Unverified Commit 2790a7b2 authored by PanZezhong1725's avatar PanZezhong1725 Committed by GitHub
Browse files

Merge pull request #308 from InfiniTensor/issue/307

issue/307 unify test tensor creation in pytorch tests
parents 70146b74 f62e952e
...@@ -4,15 +4,10 @@ ...@@ -4,15 +4,10 @@
#include "infiniop/handle.h" #include "infiniop/handle.h"
#include "infiniop/ops/add.h" #include "infiniop/ops/add.h"
#include "infiniop/ops/attention.h" #include "infiniop/ops/attention.h"
#include "infiniop/ops/avg_pool.h"
#include "infiniop/ops/causal_softmax.h" #include "infiniop/ops/causal_softmax.h"
#include "infiniop/ops/clip.h" #include "infiniop/ops/clip.h"
#include "infiniop/ops/conv.h" #include "infiniop/ops/conv.h"
#include "infiniop/ops/expand.h"
#include "infiniop/ops/gemm.h" #include "infiniop/ops/gemm.h"
#include "infiniop/ops/global_avg_pool.h"
#include "infiniop/ops/max_pool.h"
#include "infiniop/ops/mlp.h"
#include "infiniop/ops/mul.h" #include "infiniop/ops/mul.h"
#include "infiniop/ops/random_sample.h" #include "infiniop/ops/random_sample.h"
#include "infiniop/ops/rearrange.h" #include "infiniop/ops/rearrange.h"
......
#ifndef __INFINIOP_AVG_POOL_API_H__
#define __INFINIOP_AVG_POOL_API_H__
#include "../operator_descriptor.h"
/* Opaque handle to an average-pooling operator instance. */
typedef struct InfiniopDescriptor *infiniopAvgPoolDescriptor_t;
/* Create an AvgPool descriptor binding output tensor y to input tensor x.
 * kernel_shape, pads and strides each contain n entries, one per pooling
 * (spatial) dimension; n is the number of entries in those arrays. */
__C __export infiniStatus_t infiniopCreateAvgPoolDescriptor(infiniopHandle_t handle,
                                                            infiniopAvgPoolDescriptor_t *desc_ptr,
                                                            infiniopTensorDescriptor_t y,
                                                            infiniopTensorDescriptor_t x,
                                                            size_t const *kernel_shape,
                                                            size_t const *pads,
                                                            ptrdiff_t const *strides,
                                                            size_t n);
/* Query the scratch-buffer size (in bytes) required by infiniopAvgPool. */
__C __export infiniStatus_t infiniopGetAvgPoolWorkspaceSize(infiniopAvgPoolDescriptor_t desc, size_t *size);
/* Run average pooling: read x, write the pooled result into y.
 * workspace must be at least the size reported above; stream selects the
 * device stream/queue (may be NULL for the default). */
__C __export infiniStatus_t infiniopAvgPool(infiniopAvgPoolDescriptor_t desc,
                                            void *workspace, size_t workspace_size,
                                            void *y, void const *x, void *stream);
/* Release all resources owned by the descriptor. */
__C __export infiniStatus_t infiniopDestroyAvgPoolDescriptor(infiniopAvgPoolDescriptor_t desc);
#endif
#ifndef __INFINIOP_EXPAND_API_H__
#define __INFINIOP_EXPAND_API_H__
#include "../operator_descriptor.h"
/* Opaque handle to an expand (broadcast-copy) operator instance. */
typedef struct InfiniopDescriptor *infiniopExpandDescriptor_t;
/* Create an Expand descriptor binding output tensor y to input tensor x.
 * Presumably x is broadcast to y's shape — confirm with the implementation. */
__C __export infiniStatus_t infiniopCreateExpandDescriptor(infiniopHandle_t handle,
                                                           infiniopExpandDescriptor_t *desc_ptr,
                                                           infiniopTensorDescriptor_t y,
                                                           infiniopTensorDescriptor_t x);
/* Run the expand: read x, write into y. stream selects the device
 * stream/queue (may be NULL for the default). No workspace is required. */
__C __export infiniStatus_t infiniopExpand(infiniopExpandDescriptor_t desc,
                                           void *y,
                                           void const *x,
                                           void *stream);
/* Release all resources owned by the descriptor. */
__C __export infiniStatus_t infiniopDestroyExpandDescriptor(infiniopExpandDescriptor_t desc);
#endif
#ifndef __INFINIOP_GLOBAL_AVG_POOL_API_H__
#define __INFINIOP_GLOBAL_AVG_POOL_API_H__
#include "../operator_descriptor.h"
/* Opaque handle to a global average-pooling operator instance. */
typedef struct InfiniopDescriptor *infiniopGlobalAvgPoolDescriptor_t;
/* Create a GlobalAvgPool descriptor binding output tensor y to input x. */
__C __export infiniStatus_t infiniopCreateGlobalAvgPoolDescriptor(infiniopHandle_t handle,
                                                                  infiniopGlobalAvgPoolDescriptor_t *desc_ptr,
                                                                  infiniopTensorDescriptor_t y,
                                                                  infiniopTensorDescriptor_t x);
/* Query the scratch-buffer size (in bytes) required by infiniopGlobalAvgPool. */
__C __export infiniStatus_t infiniopGetGlobalAvgPoolWorkspaceSize(infiniopGlobalAvgPoolDescriptor_t desc, size_t *size);
/* Run global average pooling: read x, write the result into y.
 * workspace must be at least the size reported above; stream selects the
 * device stream/queue (may be NULL for the default). */
__C __export infiniStatus_t infiniopGlobalAvgPool(infiniopGlobalAvgPoolDescriptor_t desc,
                                                  void *workspace, size_t workspace_size,
                                                  void *y, void const *x, void *stream);
/* Release all resources owned by the descriptor. */
__C __export infiniStatus_t infiniopDestroyGlobalAvgPoolDescriptor(infiniopGlobalAvgPoolDescriptor_t desc);
#endif
#ifndef __INFINIOP_MAX_POOL_API_H__
#define __INFINIOP_MAX_POOL_API_H__
#include "../operator_descriptor.h"
/* Opaque handle to a max-pooling operator instance. */
typedef struct InfiniopDescriptor *infiniopMaxPoolDescriptor_t;
/* Create a MaxPool descriptor binding output tensor y to input tensor x.
 * kernel_shape, pads and strides each contain n entries, one per pooling
 * (spatial) dimension; n is the number of entries in those arrays. */
__C __export infiniStatus_t infiniopCreateMaxPoolDescriptor(infiniopHandle_t handle,
                                                            infiniopMaxPoolDescriptor_t *desc_ptr,
                                                            infiniopTensorDescriptor_t y,
                                                            infiniopTensorDescriptor_t x,
                                                            size_t const *kernel_shape,
                                                            size_t const *pads,
                                                            ptrdiff_t const *strides,
                                                            size_t n);
/* Query the scratch-buffer size (in bytes) required by infiniopMaxPool. */
__C __export infiniStatus_t infiniopGetMaxPoolWorkspaceSize(infiniopMaxPoolDescriptor_t desc, size_t *size);
/* Run max pooling: read x, write the pooled result into y.
 * workspace must be at least the size reported above; stream selects the
 * device stream/queue (may be NULL for the default). */
__C __export infiniStatus_t infiniopMaxPool(infiniopMaxPoolDescriptor_t desc,
                                            void *workspace, size_t workspace_size,
                                            void *y, void const *x, void *stream);
/* Release all resources owned by the descriptor. */
__C __export infiniStatus_t infiniopDestroyMaxPoolDescriptor(infiniopMaxPoolDescriptor_t desc);
#endif
#ifndef __INFINIOP_MLP_API_H__
#define __INFINIOP_MLP_API_H__
#include "../operator_descriptor.h"
#include "gemm.h"
#include "swiglu.h"
/* Opaque handle to a fused MLP operator instance (composed from GEMM and
 * SwiGLU, per the includes above). */
typedef struct InfiniopDescriptor *infiniopMLPDescriptor_t;
/* Create an MLP descriptor.
 * y_desc/x_desc: output and input activations; w12_desc/w3_desc: weights.
 * alpha: scaling factor applied inside the computation — presumably the
 * GEMM scale; confirm with the implementation.
 * residual: flag byte; presumably non-zero adds the input back into the
 * output (residual connection) — confirm with the implementation. */
__C __export infiniStatus_t infiniopCreateMLPDescriptor(infiniopHandle_t handle,
                                                        infiniopMLPDescriptor_t *desc_ptr,
                                                        infiniopTensorDescriptor_t y_desc,
                                                        infiniopTensorDescriptor_t x_desc,
                                                        infiniopTensorDescriptor_t w12_desc,
                                                        infiniopTensorDescriptor_t w3_desc,
                                                        float alpha,
                                                        char residual);
/* Query the scratch-buffer size (in bytes) required by infiniopMLP. */
__C __export infiniStatus_t infiniopGetMLPWorkspaceSize(infiniopMLPDescriptor_t desc, size_t *size);
/* Run the MLP: y = f(x, w12, w3). workspace must be at least the size
 * reported above; stream selects the device stream/queue (may be NULL). */
__C __export infiniStatus_t infiniopMLP(infiniopMLPDescriptor_t desc,
                                        void *workspace,
                                        size_t workspace_size,
                                        void *y,
                                        const void *x,
                                        const void *w12,
                                        const void *w3,
                                        void *stream);
/* Release all resources owned by the descriptor. */
__C __export infiniStatus_t infiniopDestroyMLPDescriptor(infiniopMLPDescriptor_t desc);
#endif
...@@ -13,17 +13,17 @@ def run_tests(args): ...@@ -13,17 +13,17 @@ def run_tests(args):
failed = [] failed = []
for test in [ for test in [
"add.py", "add.py",
"attention.py",
"causal_softmax.py",
"clip.py", "clip.py",
"gemm.py", "gemm.py",
"mul.py",
"random_sample.py", "random_sample.py",
"rearrange.py",
"rms_norm.py", "rms_norm.py",
"rope.py", "rope.py",
"sub.py", "sub.py",
"swiglu.py", "swiglu.py",
"attention.py",
"causal_softmax.py",
"rearrange.py",
"mul.py",
]: ]:
result = subprocess.run( result = subprocess.run(
f"python {test} {args} --debug", text=True, encoding="utf-8", shell=True f"python {test} {args} --debug", text=True, encoding="utf-8", shell=True
......
import torch import torch
import ctypes import ctypes
from ctypes import POINTER, Structure, c_int32, c_void_p, c_uint64 from ctypes import c_uint64
from libinfiniop import ( from libinfiniop import (
infiniopHandle_t, LIBINFINIOP,
infiniopTensorDescriptor_t, TestTensor,
open_lib,
to_tensor,
get_test_devices, get_test_devices,
check_error, check_error,
rearrange_if_needed,
test_operator, test_operator,
get_args, get_args,
debug, debug,
get_tolerance, get_tolerance,
profile_operation, profile_operation,
create_workspace, TestWorkspace,
InfiniDtype,
InfiniDtypeNames,
InfiniDeviceNames,
infiniopOperatorDescriptor_t,
) )
from enum import Enum, auto from enum import Enum, auto
...@@ -58,12 +59,12 @@ _TEST_CASES = [ ...@@ -58,12 +59,12 @@ _TEST_CASES = [
] ]
# Data types used for testing # Data types used for testing
_TENSOR_DTYPES = [torch.float16, torch.float32] _TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32]
# Tolerance map for different data types # Tolerance map for different data types
_TOLERANCE_MAP = { _TOLERANCE_MAP = {
torch.float16: {"atol": 1e-3, "rtol": 1e-3}, InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3},
torch.float32: {"atol": 1e-7, "rtol": 1e-7}, InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7},
} }
DEBUG = False DEBUG = False
...@@ -72,52 +73,13 @@ NUM_PRERUN = 10 ...@@ -72,52 +73,13 @@ NUM_PRERUN = 10
NUM_ITERATIONS = 1000 NUM_ITERATIONS = 1000
class AddDescriptor(Structure): def add(c, a, b):
_fields_ = [("device", c_int32)] torch.add(a, b, out=c)
infiniopAddDescriptor_t = POINTER(AddDescriptor)
def add(ans, x, y):
torch.add(x, y, out=ans)
def process_tensors(c, c_strides, a, a_stride, b, b_stride, inplace):
"""
rearrange the tensors if needed and apply the inplace config.
if inplace is true and the output (i.e., c) is placed to the broadcasted input,
the inplace config is ignored and out-of-place is used
"""
original_c_strides = c_strides if c_strides else c.stride()
def _rearrange(tensor, strides):
if strides and 0 in strides:
tensor.set_(tensor.untyped_storage(), 0, tensor.shape, strides)
return tensor
else:
return rearrange_if_needed(tensor, strides)
a, b, c = [
_rearrange(tensor, stride)
for tensor, stride in zip([a, b, c], [a_stride, b_stride, c_strides])
]
c = (
c
if inplace == Inplace.OUT_OF_PLACE
else (a if inplace == Inplace.INPLACE_A else b)
)
# if inplace is true and c has broadcasted config, reset it to the original unbroadcasted strides
if 0 in c.stride():
c.set_(c.untyped_storage(), 0, c.shape, original_c_strides)
return a, b, c
def test( def test(
lib,
handle, handle,
torch_device, device,
shape, shape,
a_stride=None, a_stride=None,
b_stride=None, b_stride=None,
...@@ -126,58 +88,64 @@ def test( ...@@ -126,58 +88,64 @@ def test(
dtype=torch.float16, dtype=torch.float16,
sync=None, sync=None,
): ):
a = TestTensor(shape, a_stride, dtype, device)
b = TestTensor(shape, b_stride, dtype, device)
if inplace == Inplace.INPLACE_A:
if a_stride != c_stride:
return
c = a
elif inplace == Inplace.INPLACE_B:
if c_stride != b_stride:
return
c = b
else:
c = TestTensor(shape, c_stride, dtype, device, mode="ones")
if c.is_broadcast():
return
print( print(
f"Testing Add on {torch_device} with shape:{shape} a_stride:{a_stride} b_stride:{b_stride} c_stride:{c_stride} " f"Testing Add on {InfiniDeviceNames[device]} with shape:{shape} a_stride:{a_stride} b_stride:{b_stride} c_stride:{c_stride} "
f"dtype:{dtype} inplace:{inplace}" f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}"
) )
a = torch.rand(shape, dtype=dtype).to(torch_device) add(c.torch_tensor(), a.torch_tensor(), b.torch_tensor())
b = torch.rand(shape, dtype=dtype).to(torch_device)
c = torch.rand(shape, dtype=dtype).to(torch_device)
ans = torch.zeros(shape, dtype=dtype).to(torch_device)
a, b, c = process_tensors(c, c_stride, a, a_stride, b, b_stride, inplace)
add(ans, a, b)
a_tensor, b_tensor = [to_tensor(tensor, lib) for tensor in [a, b]]
c_tensor = (
to_tensor(c, lib)
if inplace == Inplace.OUT_OF_PLACE
else (a_tensor if inplace == Inplace.INPLACE_A else b_tensor)
)
if sync is not None: if sync is not None:
sync() sync()
descriptor = infiniopAddDescriptor_t() descriptor = infiniopOperatorDescriptor_t()
check_error( check_error(
lib.infiniopCreateAddDescriptor( LIBINFINIOP.infiniopCreateAddDescriptor(
handle, handle,
ctypes.byref(descriptor), ctypes.byref(descriptor),
c_tensor.descriptor, c.descriptor,
a_tensor.descriptor, a.descriptor,
b_tensor.descriptor, b.descriptor,
) )
) )
# Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
for tensor in [a_tensor, b_tensor, c_tensor]: for tensor in [a, b, c]:
tensor.destroyDesc(lib) tensor.destroy_desc()
workspace_size = c_uint64(0) workspace_size = c_uint64(0)
check_error( check_error(
lib.infiniopGetAddWorkspaceSize(descriptor, ctypes.byref(workspace_size)) LIBINFINIOP.infiniopGetAddWorkspaceSize(
descriptor, ctypes.byref(workspace_size)
)
) )
workspace = create_workspace(workspace_size.value, c.device) workspace = TestWorkspace(workspace_size.value, c.device)
def lib_add(): def lib_add():
check_error( check_error(
lib.infiniopAdd( LIBINFINIOP.infiniopAdd(
descriptor, descriptor,
workspace.data_ptr() if workspace is not None else None, workspace.data(),
workspace_size.value, workspace.size(),
c_tensor.data, c.data(),
a_tensor.data, a.data(),
b_tensor.data, b.data(),
None, None,
) )
) )
...@@ -186,52 +154,20 @@ def test( ...@@ -186,52 +154,20 @@ def test(
atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
if DEBUG: if DEBUG:
debug(c, ans, atol=atol, rtol=rtol) debug(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol)
assert torch.allclose(c, ans, atol=atol, rtol=rtol) assert torch.allclose(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol)
# Profiling workflow # Profiling workflow
if PROFILE: if PROFILE:
# fmt: off # fmt: off
profile_operation("PyTorch", lambda: add(ans, a, b), torch_device, NUM_PRERUN, NUM_ITERATIONS) profile_operation("PyTorch", lambda: add(c.torch_tensor(), a.torch_tensor(), b.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS)
profile_operation(" lib", lambda: lib_add(), torch_device, NUM_PRERUN, NUM_ITERATIONS) profile_operation(" lib", lambda: lib_add(), device, NUM_PRERUN, NUM_ITERATIONS)
# fmt: on # fmt: on
check_error(lib.infiniopDestroyAddDescriptor(descriptor)) check_error(LIBINFINIOP.infiniopDestroyAddDescriptor(descriptor))
if __name__ == "__main__": if __name__ == "__main__":
args = get_args() args = get_args()
lib = open_lib()
lib.infiniopCreateAddDescriptor.restype = c_int32
lib.infiniopCreateAddDescriptor.argtypes = [
infiniopHandle_t,
POINTER(infiniopAddDescriptor_t),
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
]
lib.infiniopGetAddWorkspaceSize.restype = c_int32
lib.infiniopGetAddWorkspaceSize.argtypes = [
infiniopAddDescriptor_t,
POINTER(c_uint64),
]
lib.infiniopAdd.restype = c_int32
lib.infiniopAdd.argtypes = [
infiniopAddDescriptor_t,
c_void_p,
c_uint64,
c_void_p,
c_void_p,
c_void_p,
c_void_p,
]
lib.infiniopDestroyAddDescriptor.restype = c_int32
lib.infiniopDestroyAddDescriptor.argtypes = [
infiniopAddDescriptor_t,
]
# Configure testing options # Configure testing options
DEBUG = args.debug DEBUG = args.debug
...@@ -240,6 +176,6 @@ if __name__ == "__main__": ...@@ -240,6 +176,6 @@ if __name__ == "__main__":
NUM_ITERATIONS = args.num_iterations NUM_ITERATIONS = args.num_iterations
for device in get_test_devices(args): for device in get_test_devices(args):
test_operator(lib, device, test, _TEST_CASES, _TENSOR_DTYPES) test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES)
print("\033[92mTest passed!\033[0m") print("\033[92mTest passed!\033[0m")
from ctypes import POINTER, Structure, c_int32, c_uint64, c_void_p from ctypes import c_uint64
import ctypes import ctypes
import sys import sys
import os import os
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))) sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")))
from libinfiniop import ( from libinfiniop import (
open_lib, LIBINFINIOP,
to_tensor, TestTensor,
infiniopHandle_t,
infiniopTensorDescriptor_t,
check_error,
rearrange_tensor,
create_workspace,
get_args,
get_test_devices, get_test_devices,
check_error,
test_operator, test_operator,
get_args,
debug, debug,
get_tolerance, get_tolerance,
profile_operation, profile_operation,
TestWorkspace,
InfiniDtype,
InfiniDtypeNames,
InfiniDeviceNames,
infiniopOperatorDescriptor_t,
) )
import torch import torch
class AttentionDescriptor(Structure):
_fields_ = [("device", c_int32)]
infiniopAttentionDescriptor_t = POINTER(AttentionDescriptor)
def causal_softmax(x): def causal_softmax(x):
type = x.dtype type = x.dtype
mask = torch.tril(torch.ones_like(x), diagonal=-1).flip(dims=[-2, -1]) mask = torch.tril(torch.ones_like(x), diagonal=-1).flip(dims=[-2, -1])
...@@ -85,9 +79,8 @@ def attention(q, k, v, k_cache, v_cache, pos): ...@@ -85,9 +79,8 @@ def attention(q, k, v, k_cache, v_cache, pos):
def test( def test(
lib,
handle, handle,
torch_device, device,
n_q_head, n_q_head,
n_kv_head, n_kv_head,
seq_len, seq_len,
...@@ -100,94 +93,79 @@ def test( ...@@ -100,94 +93,79 @@ def test(
v_stride=None, v_stride=None,
k_cache_stride=None, k_cache_stride=None,
v_cache_stride=None, v_cache_stride=None,
dtype=torch.float16, dtype=InfiniDtype.F16,
sync=None, sync=None,
): ):
print( print(
f"Testing Attention on {torch_device} with n_q_head:{n_q_head} n_kv_head:{n_kv_head} seq_len:{seq_len} head_dim:{head_dim} pos:{pos} " f"Testing Attention on {InfiniDeviceNames[device]} with n_q_head:{n_q_head} n_kv_head:{n_kv_head} seq_len:{seq_len} head_dim:{head_dim} pos:{pos} "
f"dtype:{dtype} q_stride:{q_stride} k_stride:{k_stride} v_stride:{v_stride} k_cache_stride:{k_cache_stride} v_cache_stride:{v_cache_stride}" f"dtype:{InfiniDtypeNames[dtype]} q_stride:{q_stride} k_stride:{k_stride} v_stride:{v_stride} k_cache_stride:{k_cache_stride} v_cache_stride:{v_cache_stride}"
) )
out = torch.zeros([seq_len, n_q_head, head_dim], dtype=dtype, device=torch_device) out = TestTensor([seq_len, n_q_head, head_dim], None, dtype, device, mode="zeros")
q = torch.rand([n_q_head, seq_len, head_dim], dtype=dtype).to(torch_device) * 0.1 q = TestTensor([n_q_head, seq_len, head_dim], q_stride, dtype, device, scale=0.1)
k = torch.rand([n_kv_head, seq_len, head_dim], dtype=dtype).to(torch_device) * 0.1 k = TestTensor([n_kv_head, seq_len, head_dim], k_stride, dtype, device, scale=0.1)
v = torch.rand([n_kv_head, seq_len, head_dim], dtype=dtype).to(torch_device) * 0.1 v = TestTensor([n_kv_head, seq_len, head_dim], v_stride, dtype, device, scale=0.1)
k_cache = ( k_cache = TestTensor(
torch.rand([n_kv_head, k_cache_buf_len, head_dim], dtype=dtype).to(torch_device) [n_kv_head, k_cache_buf_len, head_dim], k_cache_stride, dtype, device, scale=0.1
* 0.1
) )
v_cache = ( v_cache = TestTensor(
torch.rand([n_kv_head, v_cache_buf_len, head_dim], dtype=dtype).to(torch_device) [n_kv_head, v_cache_buf_len, head_dim], v_cache_stride, dtype, device, scale=0.1
* 0.1
) )
ans = attention(q, k, v, k_cache, v_cache, pos) def torch_attention():
return attention(
if q_stride is not None: q.torch_tensor(),
q = rearrange_tensor(q, q_stride) k.torch_tensor(),
if k_stride is not None: v.torch_tensor(),
k = rearrange_tensor(k, k_stride) k_cache.torch_tensor(),
if v_stride is not None: v_cache.torch_tensor(),
v = rearrange_tensor(v, v_stride) pos,
if k_cache_stride is not None: )
k_cache = rearrange_tensor(k_cache, k_cache_stride)
if v_cache_stride is not None: ans = torch_attention()
v_cache = rearrange_tensor(v_cache, v_cache_stride)
out_tensor = to_tensor(out, lib)
q_tensor = to_tensor(q, lib)
k_tensor = to_tensor(k, lib)
v_tensor = to_tensor(v, lib)
k_cache_tensor = to_tensor(k_cache, lib)
v_cache_tensor = to_tensor(v_cache, lib)
if sync is not None: if sync is not None:
sync() sync()
descriptor = infiniopAttentionDescriptor_t() descriptor = infiniopOperatorDescriptor_t()
check_error( check_error(
lib.infiniopCreateAttentionDescriptor( LIBINFINIOP.infiniopCreateAttentionDescriptor(
handle, handle,
ctypes.byref(descriptor), ctypes.byref(descriptor),
out_tensor.descriptor, out.descriptor,
q_tensor.descriptor, q.descriptor,
k_tensor.descriptor, k.descriptor,
v_tensor.descriptor, v.descriptor,
k_cache_tensor.descriptor, k_cache.descriptor,
v_cache_tensor.descriptor, v_cache.descriptor,
pos, pos,
) )
) )
# Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
for tensor in [ for tensor in [out, q, k, v, k_cache, v_cache]:
out_tensor, tensor.destroy_desc()
q_tensor,
k_tensor,
v_tensor,
k_cache_tensor,
v_cache_tensor,
]:
tensor.destroyDesc(lib)
workspace_size = c_uint64(0) workspace_size = c_uint64(0)
check_error( check_error(
lib.infiniopGetAttentionWorkspaceSize(descriptor, ctypes.byref(workspace_size)) LIBINFINIOP.infiniopGetAttentionWorkspaceSize(
descriptor, ctypes.byref(workspace_size)
)
) )
workspace = create_workspace(workspace_size.value, out.device) workspace = TestWorkspace(workspace_size.value, out.device)
def lib_attention(): def lib_attention():
check_error( check_error(
lib.infiniopAttention( LIBINFINIOP.infiniopAttention(
descriptor, descriptor,
workspace.data_ptr() if workspace is not None else None, workspace.data(),
workspace_size.value, workspace_size.value,
out_tensor.data, out.data(),
q_tensor.data, q.data(),
k_tensor.data, k.data(),
v_tensor.data, v.data(),
k_cache_tensor.data, k_cache.data(),
v_cache_tensor.data, v_cache.data(),
None, None,
) )
) )
...@@ -197,25 +175,25 @@ def test( ...@@ -197,25 +175,25 @@ def test(
# Validate results # Validate results
atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
if DEBUG: if DEBUG:
debug(out, ans, atol=atol, rtol=rtol) debug(out.actual_tensor(), ans, atol=atol, rtol=rtol)
assert torch.allclose(out, ans, atol=atol, rtol=rtol) assert torch.allclose(out.actual_tensor(), ans, atol=atol, rtol=rtol)
# Profiling workflow # Profiling workflow
if PROFILE: if PROFILE:
# fmt: off # fmt: off
profile_operation("PyTorch", lambda: attention(q, k, v, k_cache, v_cache, pos), torch_device, NUM_PRERUN, NUM_ITERATIONS) profile_operation("PyTorch", lambda: torch_attention(), device, NUM_PRERUN, NUM_ITERATIONS)
profile_operation(" lib", lambda: lib_attention(), torch_device, NUM_PRERUN, NUM_ITERATIONS) profile_operation(" lib", lambda: lib_attention(), device, NUM_PRERUN, NUM_ITERATIONS)
# fmt: on # fmt: on
check_error(lib.infiniopDestroyAttentionDescriptor(descriptor)) check_error(LIBINFINIOP.infiniopDestroyAttentionDescriptor(descriptor))
if __name__ == "__main__": if __name__ == "__main__":
_TENSOR_DTYPES = [torch.float16, torch.float32] _TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32]
# Tolerance map for different data types # Tolerance map for different data types
_TOLERANCE_MAP = { _TOLERANCE_MAP = {
torch.float16: {"atol": 1e-4, "rtol": 1e-2}, InfiniDtype.F16: {"atol": 1e-4, "rtol": 1e-2},
torch.float32: {"atol": 1e-5, "rtol": 1e-3}, InfiniDtype.F32: {"atol": 1e-5, "rtol": 1e-3},
} }
DEBUG = False DEBUG = False
...@@ -284,45 +262,6 @@ if __name__ == "__main__": ...@@ -284,45 +262,6 @@ if __name__ == "__main__":
), ),
] ]
args = get_args() args = get_args()
lib = open_lib()
lib.infiniopCreateAttentionDescriptor.restype = c_int32
lib.infiniopCreateAttentionDescriptor.argtypes = [
infiniopHandle_t,
POINTER(infiniopAttentionDescriptor_t),
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
c_uint64,
]
lib.infiniopGetAttentionWorkspaceSize.restype = c_int32
lib.infiniopGetAttentionWorkspaceSize.argtypes = [
infiniopAttentionDescriptor_t,
POINTER(c_uint64),
]
lib.infiniopAttention.restype = c_int32
lib.infiniopAttention.argtypes = [
infiniopAttentionDescriptor_t,
c_void_p,
c_uint64,
c_void_p,
c_void_p,
c_void_p,
c_void_p,
c_void_p,
c_void_p,
c_void_p,
]
lib.infiniopDestroyAttentionDescriptor.restype = c_int32
lib.infiniopDestroyAttentionDescriptor.argtypes = [
infiniopAttentionDescriptor_t,
]
# Configure testing options # Configure testing options
DEBUG = args.debug DEBUG = args.debug
...@@ -332,5 +271,5 @@ if __name__ == "__main__": ...@@ -332,5 +271,5 @@ if __name__ == "__main__":
# Execute tests # Execute tests
for device in get_test_devices(args): for device in get_test_devices(args):
test_operator(lib, device, test, test_cases, _TENSOR_DTYPES) test_operator(device, test, test_cases, _TENSOR_DTYPES)
print("\033[92mTest passed!\033[0m") print("\033[92mTest passed!\033[0m")
from ctypes import POINTER, Structure, c_int32, c_void_p, c_uint64
import ctypes
import sys
import os
import time
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")))
from operatorspy import (
open_lib,
to_tensor,
DeviceEnum,
infiniopHandle_t,
infiniopTensorDescriptor_t,
create_handle,
destroy_handle,
check_error,
)
from operatorspy.tests.test_utils import get_args
import torch
from typing import Tuple
# Controls whether the PyTorch and library implementations are profiled.
# NOTE: profiling requires manually adding a synchronization call around the
# library function, e.g. cudaDeviceSynchronize() for CUDA.
PROFILE = False
NUM_PRERUN = 10       # warm-up iterations before timing
NUM_ITERATIONS = 1000 # timed iterations when PROFILE is enabled
class AvgPoolDescriptor(Structure):
    # ctypes view of the library's opaque operator descriptor; only the
    # leading device field is declared here.
    _fields_ = [("device", c_int32)]


# Pointer type matching the C API's infiniopAvgPoolDescriptor_t.
infiniopAvgPoolDescriptor_t = POINTER(AvgPoolDescriptor)
def pool(x, k, padding, stride, dilation=1):
    """Reference average pooling via torch.nn.AvgPool{1,2,3}d.

    x has shape (N, C, *spatial); k/padding/stride describe the window.
    Returns the pooled tensor, or None for an unsupported rank.
    NOTE(review): `dilation` is accepted but never used — torch AvgPool
    layers take no dilation argument.
    """
    layer_by_rank = {
        1: torch.nn.AvgPool1d,
        2: torch.nn.AvgPool2d,
        3: torch.nn.AvgPool3d,
    }
    spatial_rank = len(x.shape) - 2
    layer_cls = layer_by_rank.get(spatial_rank)
    if layer_cls is None:
        print("Error: Pytorch -> Unsupported tensor dimension")
        return None
    pooling = layer_cls(k, stride=stride, padding=padding)
    if spatial_rank == 3 and x.dtype == torch.float16:
        # Round-trip through fp32 for the 3-D fp16 case — presumably no
        # native fp16 kernel; confirm against the torch backend in use.
        ans = pooling(x.to(torch.float32)).to(torch.float16)
    else:
        ans = pooling(x)
    if PROFILE:
        torch.cuda.synchronize()
    return ans
def inferShape(x_shape, kernel_shape, padding, strides):
    """Return the pooled output shape for an (N, C, *spatial) input.

    Each spatial dim maps to (dim + 2*pad - kernel) // stride + 1; the
    leading two (batch, channel) dims are passed through unchanged.
    """
    assert (
        len(x_shape) - 2 == len(kernel_shape) == len(padding) == len(strides)
    ), "kernel, pads, and strides should have the same length; the length of input x should be 2 more than that of kernel"
    pooled = tuple(
        (dim + 2 * p - k) // s + 1
        for dim, k, p, s in zip(x_shape[2:], kernel_shape, padding, strides)
    )
    return x_shape[:2] + pooled
# convert a python tuple to a ctype void pointer
def tuple_to_void_p(py_tuple: Tuple):
    """Copy *py_tuple* into a ctypes int64 array and return it cast to void*.

    ctypes.cast keeps a reference to the backing array, so the returned
    pointer stays valid while it is alive.
    """
    buf = (ctypes.c_int64 * len(py_tuple))(*py_tuple)
    return ctypes.cast(buf, ctypes.c_void_p)
def test(
    lib,
    handle,
    torch_device,
    x_shape,
    k_shape,
    padding,
    strides,
    tensor_dtype=torch.float16,
    sync=None
):
    """Run one AvgPool case: compare the library result against torch.

    lib/handle: the loaded infiniop library and a device handle.
    torch_device: torch device string ("cpu"/"cuda"/"mlu").
    x_shape/k_shape/padding/strides: pooling configuration.
    sync: optional device-synchronization callable.
    Raises AssertionError when the library output diverges from torch.
    """
    print(
        f"Testing AvgPool on {torch_device} with x_shape:{x_shape} kernel_shape:{k_shape} padding:{padding} strides:{strides} dtype:{tensor_dtype}"
    )
    x = torch.rand(x_shape, dtype=tensor_dtype).to(torch_device)
    # y starts as random garbage; the library call must overwrite all of it.
    y = torch.rand(
        inferShape(x_shape, k_shape, padding, strides), dtype=tensor_dtype
    ).to(torch_device)
    # Reference result (extra warm-up runs only when profiling).
    for i in range(NUM_PRERUN if PROFILE else 1):
        ans = pool(x, k_shape, padding, strides)
    if PROFILE:
        start_time = time.time()
        for i in range(NUM_ITERATIONS):
            _ = pool(x, k_shape, padding, strides)
        elapsed = (time.time() - start_time) / NUM_ITERATIONS
        print(f"pytorch time: {elapsed :6f}")
    # Wrap the torch tensors in library tensor descriptors.
    x_tensor = to_tensor(x, lib)
    y_tensor = to_tensor(y, lib)
    if sync is not None:
        sync()
    descriptor = infiniopAvgPoolDescriptor_t()
    check_error(
        lib.infiniopCreateAvgPoolDescriptor(
            handle,
            ctypes.byref(descriptor),
            y_tensor.descriptor,
            x_tensor.descriptor,
            tuple_to_void_p(k_shape),
            tuple_to_void_p(padding),
            tuple_to_void_p(strides),
            len(k_shape),
        )
    )
    # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
    x_tensor.descriptor.contents.invalidate()
    y_tensor.descriptor.contents.invalidate()
    workspaceSize = ctypes.c_uint64(0)
    check_error(
        lib.infiniopGetAvgPoolWorkspaceSize(descriptor, ctypes.byref(workspaceSize))
    )
    # Allocate the workspace as a byte tensor on the same device.
    workspace = torch.zeros(int(workspaceSize.value), dtype=torch.uint8).to(
        torch_device
    )
    workspace_ptr = ctypes.cast(workspace.data_ptr(), ctypes.POINTER(ctypes.c_uint8))
    for i in range(NUM_PRERUN if PROFILE else 1):
        check_error(
            lib.infiniopAvgPool(
                descriptor,
                workspace_ptr,
                workspaceSize,
                y_tensor.data,
                x_tensor.data,
                None,
            )
        )
    if PROFILE:
        start_time = time.time()
        for i in range(NUM_ITERATIONS):
            check_error(
                lib.infiniopAvgPool(
                    descriptor,
                    workspace_ptr,
                    workspaceSize,
                    y_tensor.data,
                    x_tensor.data,
                    None,
                )
            )
        elapsed = (time.time() - start_time) / NUM_ITERATIONS
        print(f" lib time: {elapsed :6f}")
    assert torch.allclose(y, ans, atol=0, rtol=1e-3)
    check_error(lib.infiniopDestroyAvgPoolDescriptor(descriptor))
def test_cpu(lib, test_cases):
    """Run every test case on the CPU backend in fp16 and fp32."""
    handle = create_handle(lib, DeviceEnum.DEVICE_CPU)
    for x_shape, kernel_shape, padding, strides in test_cases:
        for dt in (torch.float16, torch.float32):
            test(lib, handle, "cpu", x_shape, kernel_shape, padding, strides, tensor_dtype=dt)
    destroy_handle(lib, handle)
def test_cuda(lib, test_cases):
    """Run every test case on the CUDA backend in fp16 and fp32."""
    handle = create_handle(lib, DeviceEnum.DEVICE_CUDA)
    for x_shape, kernel_shape, padding, strides in test_cases:
        for dt in (torch.float16, torch.float32):
            test(lib, handle, "cuda", x_shape, kernel_shape, padding, strides, tensor_dtype=dt)
    destroy_handle(lib, handle)
def test_bang(lib, test_cases):
    """Run every test case on the BANG (MLU) backend in fp16 and fp32."""
    import torch_mlu  # registers the "mlu" torch device

    handle = create_handle(lib, DeviceEnum.DEVICE_BANG)
    for x_shape, kernel_shape, padding, strides in test_cases:
        for dt in (torch.float16, torch.float32):
            test(lib, handle, "mlu", x_shape, kernel_shape, padding, strides, tensor_dtype=dt)
    destroy_handle(lib, handle)
if __name__ == "__main__":
test_cases = [
# fmt: off
# x_shape, kernel_shape, padding, strides
((1, 1, 10), (3,), (1,), (1,)),
((32, 3, 224, 224), (3, 3), (1, 1), (2, 2)),
((1, 1, 16, 16, 16), (5, 5, 5), (2, 2, 2), (2, 2, 2)),
# fmt: on
]
args = get_args()
lib = open_lib()
lib.infiniopCreateAvgPoolDescriptor.restype = c_int32
lib.infiniopCreateAvgPoolDescriptor.argtypes = [
infiniopHandle_t,
POINTER(infiniopAvgPoolDescriptor_t),
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
c_void_p,
c_void_p,
c_void_p,
c_uint64,
]
lib.infiniopGetAvgPoolWorkspaceSize.restype = c_int32
lib.infiniopGetAvgPoolWorkspaceSize.argtypes = [
infiniopAvgPoolDescriptor_t,
POINTER(c_uint64),
]
lib.infiniopAvgPool.restype = c_int32
lib.infiniopAvgPool.argtypes = [
infiniopAvgPoolDescriptor_t,
c_void_p,
c_uint64,
c_void_p,
c_void_p,
c_void_p,
]
lib.infiniopDestroyAvgPoolDescriptor.restype = c_int32
lib.infiniopDestroyAvgPoolDescriptor.argtypes = [
infiniopAvgPoolDescriptor_t,
]
if args.cpu:
test_cpu(lib, test_cases)
if args.cuda:
test_cuda(lib, test_cases)
if args.bang:
test_bang(lib, test_cases)
if not (args.cpu or args.cuda or args.bang):
test_cpu(lib, test_cases)
print("\033[92mTest passed!\033[0m")
import torch import torch
import ctypes import ctypes
from ctypes import POINTER, Structure, c_int32, c_size_t, c_uint64, c_void_p, c_float from ctypes import c_uint64
from libinfiniop import ( from libinfiniop import (
infiniopHandle_t, LIBINFINIOP,
infiniopTensorDescriptor_t, TestTensor,
open_lib,
to_tensor,
get_test_devices, get_test_devices,
check_error, check_error,
rearrange_if_needed,
create_workspace,
test_operator, test_operator,
get_args, get_args,
debug, debug,
get_tolerance, get_tolerance,
profile_operation, profile_operation,
TestWorkspace,
InfiniDtype,
InfiniDtypeNames,
InfiniDeviceNames,
infiniopOperatorDescriptor_t,
) )
from enum import Enum, auto from enum import Enum, auto
...@@ -34,13 +35,13 @@ _TEST_CASES_ = [ ...@@ -34,13 +35,13 @@ _TEST_CASES_ = [
] ]
# Data types used for testing # Data types used for testing
_TENSOR_DTYPES = [torch.float16, torch.bfloat16, torch.float32] _TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.BF16, InfiniDtype.F32]
# Tolerance map for different data types # Tolerance map for different data types
_TOLERANCE_MAP = { _TOLERANCE_MAP = {
torch.float16: {"atol": 1e-3, "rtol": 1e-2}, InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-2},
torch.bfloat16: {"atol": 5e-3, "rtol": 5e-2}, InfiniDtype.BF16: {"atol": 5e-3, "rtol": 5e-2},
torch.float32: {"atol": 1e-5, "rtol": 1e-5}, InfiniDtype.F32: {"atol": 1e-5, "rtol": 1e-5},
} }
...@@ -66,13 +67,6 @@ NUM_PRERUN = 10 ...@@ -66,13 +67,6 @@ NUM_PRERUN = 10
NUM_ITERATIONS = 1000 NUM_ITERATIONS = 1000
class CausalSoftmaxDescriptor(Structure):
_fields_ = [("device", c_int32)]
infiniopCausalSoftmaxDescriptor_t = POINTER(CausalSoftmaxDescriptor)
def causal_softmax(x): def causal_softmax(x):
type = x.dtype type = x.dtype
mask = torch.tril(torch.ones_like(x), diagonal=-1).flip(dims=[-2, -1]) mask = torch.tril(torch.ones_like(x), diagonal=-1).flip(dims=[-2, -1])
...@@ -81,66 +75,57 @@ def causal_softmax(x): ...@@ -81,66 +75,57 @@ def causal_softmax(x):
def test( def test(
lib,
handle, handle,
torch_device, device,
shape, shape,
x_stride=None, x_stride=None,
y_stride=None, y_stride=None,
inplace=Inplace.OUT_OF_PLACE, inplace=Inplace.OUT_OF_PLACE,
dtype=torch.float16, dtype=InfiniDtype.F16,
sync=None, sync=None,
): ):
print( print(
f"Testing CausalSoftmax on {torch_device} with shape:{shape} x_stride:{x_stride} y_stride:{y_stride} dtype:{dtype} inplace:{inplace}" f"Testing CausalSoftmax on {InfiniDeviceNames[device]} with shape:{shape} x_stride:{x_stride} y_stride:{y_stride} dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}"
) )
x = torch.rand(shape, dtype=dtype).to(torch_device) x = TestTensor(shape, x_stride, dtype, device)
mask = torch.tril(torch.ones_like(x), diagonal=-1).flip(dims=[-2, -1]) ans = causal_softmax(x.torch_tensor())
x = torch.where(mask == 1, torch.full_like(x, torch.finfo(x.dtype).max), x)
ans = causal_softmax(x)
x = rearrange_if_needed(x, x_stride)
x_tensor = to_tensor(x, lib)
if inplace == Inplace.INPLACE_X: if inplace == Inplace.INPLACE_X:
y = x y = x
y_tensor = x_tensor
else: else:
y = torch.zeros(shape, dtype=dtype).to(torch_device) y = TestTensor(shape, x_stride, dtype, device)
y = rearrange_if_needed(y, y_stride)
y_tensor = to_tensor(y, lib)
if sync is not None: if sync is not None:
sync() sync()
descriptor = infiniopCausalSoftmaxDescriptor_t() descriptor = infiniopOperatorDescriptor_t()
check_error( check_error(
lib.infiniopCreateCausalSoftmaxDescriptor( LIBINFINIOP.infiniopCreateCausalSoftmaxDescriptor(
handle, ctypes.byref(descriptor), y_tensor.descriptor, x_tensor.descriptor handle, ctypes.byref(descriptor), y.descriptor, x.descriptor
) )
) )
# Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
x_tensor.destroyDesc(lib) x.destroy_desc()
y.destroy_desc()
workspace_size = c_uint64(0) workspace_size = c_uint64(0)
check_error( check_error(
lib.infiniopGetCausalSoftmaxWorkspaceSize( LIBINFINIOP.infiniopGetCausalSoftmaxWorkspaceSize(
descriptor, ctypes.byref(workspace_size) descriptor, ctypes.byref(workspace_size)
) )
) )
workspace = create_workspace(workspace_size.value, x.device) workspace = TestWorkspace(workspace_size.value, x.device)
def lib_causal_softmax(): def lib_causal_softmax():
check_error( check_error(
lib.infiniopCausalSoftmax( LIBINFINIOP.infiniopCausalSoftmax(
descriptor, descriptor,
workspace.data_ptr() if workspace is not None else None, workspace.data(),
workspace_size.value, workspace_size.value,
y_tensor.data, y.data(),
x_tensor.data, x.data(),
None, None,
) )
) )
...@@ -152,49 +137,21 @@ def test( ...@@ -152,49 +137,21 @@ def test(
atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
if DEBUG: if DEBUG:
debug(y, ans, atol=atol, rtol=rtol) debug(y.actual_tensor(), ans, atol=atol, rtol=rtol)
assert torch.allclose(y, ans, atol=atol, rtol=rtol) assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol)
# Profiling workflow # Profiling workflow
if PROFILE: if PROFILE:
# fmt: off # fmt: off
profile_operation("PyTorch", lambda: causal_softmax(x), torch_device, NUM_PRERUN, NUM_ITERATIONS) profile_operation("PyTorch", lambda: causal_softmax(x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS)
profile_operation(" lib", lambda: lib_causal_softmax(), torch_device, NUM_PRERUN, NUM_ITERATIONS) profile_operation(" lib", lambda: lib_causal_softmax(), device, NUM_PRERUN, NUM_ITERATIONS)
# fmt: on # fmt: on
check_error(lib.infiniopDestroyCausalSoftmaxDescriptor(descriptor)) check_error(LIBINFINIOP.infiniopDestroyCausalSoftmaxDescriptor(descriptor))
if __name__ == "__main__": if __name__ == "__main__":
args = get_args() args = get_args()
lib = open_lib()
lib.infiniopCreateCausalSoftmaxDescriptor.restype = c_int32
lib.infiniopCreateCausalSoftmaxDescriptor.argtypes = [
infiniopHandle_t,
POINTER(infiniopCausalSoftmaxDescriptor_t),
infiniopTensorDescriptor_t,
]
lib.infiniopGetCausalSoftmaxWorkspaceSize.restype = c_int32
lib.infiniopGetCausalSoftmaxWorkspaceSize.argtypes = [
infiniopCausalSoftmaxDescriptor_t,
POINTER(c_uint64),
]
lib.infiniopCausalSoftmax.restype = c_int32
lib.infiniopCausalSoftmax.argtypes = [
infiniopCausalSoftmaxDescriptor_t,
c_void_p,
c_uint64,
c_void_p,
c_void_p,
]
lib.infiniopDestroyCausalSoftmaxDescriptor.restype = c_int32
lib.infiniopDestroyCausalSoftmaxDescriptor.argtypes = [
infiniopCausalSoftmaxDescriptor_t,
]
# Configure testing options # Configure testing options
DEBUG = args.debug DEBUG = args.debug
...@@ -203,6 +160,6 @@ if __name__ == "__main__": ...@@ -203,6 +160,6 @@ if __name__ == "__main__":
NUM_ITERATIONS = args.num_iterations NUM_ITERATIONS = args.num_iterations
for device in get_test_devices(args): for device in get_test_devices(args):
test_operator(lib, device, test, _TEST_CASES, _TENSOR_DTYPES) test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES)
print("\033[92mTest passed!\033[0m") print("\033[92mTest passed!\033[0m")
...@@ -2,21 +2,22 @@ ...@@ -2,21 +2,22 @@
import torch import torch
import ctypes import ctypes
from ctypes import POINTER, Structure, c_int32, c_size_t, c_uint64, c_void_p, c_float from ctypes import c_uint64
from libinfiniop import ( from libinfiniop import (
infiniopHandle_t, LIBINFINIOP,
infiniopTensorDescriptor_t, TestTensor,
open_lib,
to_tensor,
get_test_devices, get_test_devices,
check_error, check_error,
rearrange_if_needed,
create_workspace,
test_operator, test_operator,
get_args, get_args,
debug, debug,
get_tolerance, get_tolerance,
profile_operation, profile_operation,
TestWorkspace,
InfiniDtype,
InfiniDtypeNames,
InfiniDeviceNames,
infiniopOperatorDescriptor_t,
) )
from enum import Enum, auto from enum import Enum, auto
...@@ -51,12 +52,12 @@ _TEST_CASES_ = [ ...@@ -51,12 +52,12 @@ _TEST_CASES_ = [
] ]
_TENSOR_DTYPES = [torch.float16, torch.float32] _TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32]
_TOLERANCE_MAP = { _TOLERANCE_MAP = {
torch.float16: {"atol": 1e-3, "rtol": 1e-3}, InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3},
torch.float32: {"atol": 1e-7, "rtol": 1e-6}, InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-6},
} }
...@@ -82,156 +83,108 @@ NUM_PRERUN = 10 ...@@ -82,156 +83,108 @@ NUM_PRERUN = 10
NUM_ITERATIONS = 1000 NUM_ITERATIONS = 1000
class ClipDescriptor(Structure): def clip(y, x, min_val, max_val):
_fields_ = [("device_type", c_int32), ("device_id", c_int32)] torch.clamp(x, min_val, max_val, out=y)
infiniopClipDescriptor_t = POINTER(ClipDescriptor)
def clip(x, min_val, max_val):
return torch.clamp(x, min_val, max_val)
def test( def test(
lib,
handle, handle,
torch_device, device,
shape, shape,
x_stride=None, x_stride=None,
y_stride=None, y_stride=None,
min_val=-1.0, min_val=-1.0,
max_val=1.0, max_val=1.0,
inplace=Inplace.OUT_OF_PLACE, inplace=Inplace.OUT_OF_PLACE,
dtype=torch.float32, dtype=InfiniDtype.F32,
sync=None, sync=None,
): ):
print( x = TestTensor(shape, x_stride, dtype, device)
f"Testing Clip on {torch_device} with shape:{shape} x_stride:{x_stride} y_stride:{y_stride} " min_ = TestTensor(
f"min_val:{min_val} max_val:{max_val} dtype:{dtype} inplace:{inplace}" shape, [0 for _ in shape], dtype, device, mode="zeros", bias=min_val
) )
x = torch.rand(shape, dtype=dtype).to(torch_device) max_ = TestTensor(
ans = clip(x, min_val, max_val) shape, [0 for _ in shape], dtype, device, mode="zeros", bias=max_val
x = rearrange_if_needed(x, x_stride) )
x_tensor = to_tensor(x, lib)
if inplace == Inplace.INPLACE_X: if inplace == Inplace.INPLACE_X:
if x_stride != y_stride:
return
y = x y = x
y_tensor = x_tensor
else: else:
y = torch.zeros(shape, dtype=dtype).to(torch_device) y = TestTensor(shape, y_stride, dtype, device)
y = rearrange_if_needed(y, y_stride)
y_tensor = to_tensor(y, lib) if y.is_broadcast():
return
print(
f"Testing Clip on {InfiniDeviceNames[device]} with shape:{shape} x_stride:{x_stride} y_stride:{y_stride} "
f"min_val:{min_val} max_val:{max_val} dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}"
)
clip(y.torch_tensor(), x.torch_tensor(), min_val, max_val)
if sync is not None: if sync is not None:
sync() sync()
descriptor = infiniopClipDescriptor_t() descriptor = infiniopOperatorDescriptor_t()
min_, max_ = torch.tensor([min_val], dtype=dtype).to(torch_device), torch.tensor(
[max_val], dtype=dtype
).to(torch_device)
min_tensor = to_tensor(
min_, lib, force_shape=shape, force_strides=[0 for _ in shape]
)
max_tensor = to_tensor(
max_, lib, force_shape=shape, force_strides=[0 for _ in shape]
)
check_error( check_error(
lib.infiniopCreateClipDescriptor( LIBINFINIOP.infiniopCreateClipDescriptor(
handle, handle,
ctypes.byref(descriptor), ctypes.byref(descriptor),
y_tensor.descriptor, y.descriptor,
x_tensor.descriptor, x.descriptor,
min_tensor.descriptor, min_.descriptor,
max_tensor.descriptor, max_.descriptor,
) )
) )
workspace_size = c_uint64(0) workspace_size = c_uint64(0)
check_error( check_error(
lib.infiniopGetClipWorkspaceSize(descriptor, ctypes.byref(workspace_size)) LIBINFINIOP.infiniopGetClipWorkspaceSize(
descriptor, ctypes.byref(workspace_size)
)
) )
workspace = create_workspace(workspace_size.value, x.device) workspace = TestWorkspace(workspace_size.value, x.device)
def lib_clip(): def lib_clip():
check_error( check_error(
lib.infiniopClip( LIBINFINIOP.infiniopClip(
descriptor, descriptor,
workspace.data_ptr() if workspace is not None else None, workspace.data() if workspace is not None else None,
workspace_size.value, workspace_size.value,
y_tensor.data, y.data(),
x_tensor.data, x.data(),
min_tensor.data, min_.data(),
max_tensor.data, max_.data(),
None, None,
) )
) )
lib_clip() lib_clip()
# Now we can destroy the tensor descriptors # Destroy the tensor descriptors
x_tensor.destroyDesc(lib) for tensor in [x, y, min_, max_]:
if inplace != Inplace.INPLACE_X: tensor.destroy_desc()
y_tensor.destroyDesc(lib)
atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
if DEBUG or not torch.allclose(y, ans, atol=atol, rtol=rtol): if DEBUG:
print("\nExpected:") debug(y.actual_tensor(), y.torch_tensor(), atol=atol, rtol=rtol)
print(ans) assert torch.allclose(y.actual_tensor(), y.torch_tensor(), atol=atol, rtol=rtol)
print("\nActual:")
print(y)
print("\nDifference:")
print(torch.abs(y - ans))
print("\nMax difference:", torch.max(torch.abs(y - ans)).item())
debug(y, ans, atol=atol, rtol=rtol)
assert torch.allclose(y, ans, atol=atol, rtol=rtol)
# Profiling workflow # Profiling workflow
if PROFILE: if PROFILE:
# fmt: off # fmt: off
profile_operation("PyTorch", lambda: clip(x, min_val, max_val), torch_device, NUM_PRERUN, NUM_ITERATIONS) profile_operation("PyTorch", lambda: clip(y.torch_tensor(), x.torch_tensor(), min_val, max_val), device, NUM_PRERUN, NUM_ITERATIONS)
profile_operation(" lib", lambda: lib_clip(), torch_device, NUM_PRERUN, NUM_ITERATIONS) profile_operation(" lib", lambda: lib_clip(), device, NUM_PRERUN, NUM_ITERATIONS)
# fmt: on # fmt: on
check_error(lib.infiniopDestroyClipDescriptor(descriptor)) check_error(LIBINFINIOP.infiniopDestroyClipDescriptor(descriptor))
if __name__ == "__main__": if __name__ == "__main__":
args = get_args() args = get_args()
lib = open_lib()
lib.infiniopCreateClipDescriptor.restype = c_int32
lib.infiniopCreateClipDescriptor.argtypes = [
infiniopHandle_t,
POINTER(infiniopClipDescriptor_t),
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
]
lib.infiniopGetClipWorkspaceSize.restype = c_int32
lib.infiniopGetClipWorkspaceSize.argtypes = [
infiniopClipDescriptor_t,
POINTER(c_uint64),
]
lib.infiniopClip.restype = c_int32
lib.infiniopClip.argtypes = [
infiniopClipDescriptor_t,
c_void_p,
c_uint64,
c_void_p,
c_void_p,
c_void_p,
c_void_p,
c_void_p,
]
lib.infiniopDestroyClipDescriptor.restype = c_int32
lib.infiniopDestroyClipDescriptor.argtypes = [
infiniopClipDescriptor_t,
]
# Configure testing options # Configure testing options
DEBUG = args.debug DEBUG = args.debug
PROFILE = args.profile PROFILE = args.profile
...@@ -239,6 +192,6 @@ if __name__ == "__main__": ...@@ -239,6 +192,6 @@ if __name__ == "__main__":
NUM_ITERATIONS = args.num_iterations NUM_ITERATIONS = args.num_iterations
for device in get_test_devices(args): for device in get_test_devices(args):
test_operator(lib, device, test, _TEST_CASES, _TENSOR_DTYPES) test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES)
print("\033[92mTest passed!\033[0m") print("\033[92mTest passed!\033[0m")
from ctypes import POINTER, Structure, c_int32, c_void_p
import ctypes
import sys
import os
import time
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")))
from operatorspy import (
open_lib,
to_tensor,
DeviceEnum,
infiniopHandle_t,
infiniopTensorDescriptor_t,
create_handle,
destroy_handle,
check_error,
rearrange_tensor,
)
from operatorspy.tests.test_utils import get_args
import torch
# Controls whether the PyTorch reference and the lib implementation are timed.
# NOTE: when profiling on an async backend you must add an explicit device
# synchronization inside the lib call path (e.g. cudaDeviceSynchronize() for
# CUDA), otherwise the measured time only covers kernel launch.
PROFILE = False
NUM_PRERUN = 10       # warm-up iterations before timing starts
NUM_ITERATIONS = 1000  # timed iterations; reported time is the per-call average
# ctypes mirror of the C-side expand descriptor. Only the first field is
# declared (presumably the device enum — confirm against the C struct); the
# tests treat the descriptor as opaque and never read past it.
class ExpandDescriptor(Structure):
    _fields_ = [("device", c_int32)]

# Handle type passed to the infiniopExpand* C API functions.
infiniopExpandDescriptor_t = POINTER(ExpandDescriptor)
def expand(x, y):
    """Reference implementation: broadcast ``x`` to the shape of ``y``.

    In normal runs this returns a zero-copy expanded view. When PROFILE is
    on, the view is materialized with clone() and the CUDA device is
    synchronized so wall-clock timing reflects the actual work.
    """
    if not PROFILE:
        return x.expand_as(y)
    materialized = x.expand_as(y).clone()
    torch.cuda.synchronize()
    return materialized
def test(
    lib,
    handle,
    torch_device,
    y_shape,
    x_shape,
    y_stride=None,
    x_stride=None,
    tensor_dtype=torch.float16,
    sync=None
):
    """Run one Expand case: compute the PyTorch reference, run the lib
    operator on the same buffers, and assert the results match.

    Args:
        lib: ctypes handle to the loaded infiniop shared library.
        handle: infiniop device handle created by create_handle().
        torch_device: torch device string ("cpu", "cuda", "mlu", ...).
        y_shape / x_shape: output and input tensor shapes.
        y_stride / x_stride: optional custom strides; applied via
            rearrange_tensor when given.
        tensor_dtype: torch dtype for both tensors.
        sync: optional device-synchronization callable, run before the
            descriptor is created.
    """
    print(
        f"Testing Expand on {torch_device} with x_shape:{x_shape} y_shape:{y_shape} x_stride:{x_stride} y_stride:{y_stride} dtype:{tensor_dtype}"
    )
    x = torch.rand(x_shape, dtype=tensor_dtype).to(torch_device)
    # y is pre-filled with random data; the lib op must overwrite all of it.
    y = torch.rand(y_shape, dtype=tensor_dtype).to(torch_device)
    if x_stride is not None:
        x = rearrange_tensor(x, x_stride)
    if y_stride is not None:
        y = rearrange_tensor(y, y_stride)
    # Reference result (and PyTorch-side warm-up when profiling).
    for i in range(NUM_PRERUN if PROFILE else 1):
        ans = expand(x, y)
    if PROFILE:
        start_time = time.time()
        for i in range(NUM_ITERATIONS):
            _ = expand(x, y)
        elapsed = (time.time() - start_time) / NUM_ITERATIONS
        print(f"pytorch time: {elapsed :6f}")
    x_tensor = to_tensor(x, lib)
    y_tensor = to_tensor(y, lib)
    if sync is not None:
        sync()
    descriptor = infiniopExpandDescriptor_t()
    check_error(
        lib.infiniopCreateExpandDescriptor(
            handle,
            ctypes.byref(descriptor),
            y_tensor.descriptor,
            x_tensor.descriptor,
        )
    )
    # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
    x_tensor.descriptor.contents.invalidate()
    y_tensor.descriptor.contents.invalidate()
    # Run the lib operator, writing into y's buffer (warm-up when profiling).
    for i in range(NUM_PRERUN if PROFILE else 1):
        check_error(lib.infiniopExpand(descriptor, y_tensor.data, x_tensor.data, None))
    if PROFILE:
        start_time = time.time()
        for i in range(NUM_ITERATIONS):
            check_error(
                lib.infiniopExpand(descriptor, y_tensor.data, x_tensor.data, None)
            )
        elapsed = (time.time() - start_time) / NUM_ITERATIONS
        print(f" lib time: {elapsed :6f}")
    # Expand copies values; exact match is expected up to a small rtol.
    assert torch.allclose(y, ans, atol=0, rtol=1e-3)
    check_error(lib.infiniopDestroyExpandDescriptor(descriptor))
def test_cpu(lib, test_cases):
    """Run every expand test case on the CPU backend, in fp16 then fp32."""
    handle = create_handle(lib, DeviceEnum.DEVICE_CPU)
    for y_shape, x_shape, y_stride, x_stride in test_cases:
        for dt in (torch.float16, torch.float32):
            test(lib, handle, "cpu", y_shape, x_shape, y_stride, x_stride, tensor_dtype=dt)
    destroy_handle(lib, handle)
def test_cuda(lib, test_cases):
    """Run every expand test case on the CUDA backend, in fp16 then fp32."""
    handle = create_handle(lib, DeviceEnum.DEVICE_CUDA)
    for y_shape, x_shape, y_stride, x_stride in test_cases:
        for dt in (torch.float16, torch.float32):
            test(lib, handle, "cuda", y_shape, x_shape, y_stride, x_stride, tensor_dtype=dt)
    destroy_handle(lib, handle)
def test_bang(lib, test_cases):
    """Run every expand test case on the Cambricon BANG backend (fp16, fp32)."""
    import torch_mlu  # registers the "mlu" device with torch

    handle = create_handle(lib, DeviceEnum.DEVICE_BANG)
    for y_shape, x_shape, y_stride, x_stride in test_cases:
        for dt in (torch.float16, torch.float32):
            test(lib, handle, "mlu", y_shape, x_shape, y_stride, x_stride, tensor_dtype=dt)
    destroy_handle(lib, handle)
if __name__ == "__main__":
    # Each case is (y_shape, x_shape, y_stride, x_stride); x must be
    # broadcastable to y. None strides mean contiguous layout.
    test_cases = [
        # fmt: off
        # y_shape, x_shape, y_stride, x_stride
        ((), (), None, None),
        ((3, 3), (1,), None, None),
        ((5, 4, 3), (4, 3,), None, (6, 1)),
        ((99, 111), (111,), None, None),
        ((2, 4, 3), (1, 3), None, None),
        ((2, 20, 3), (2, 1, 3), None, None),
        ((2, 3, 4, 5), (5,), None, None),
        ((3, 2, 4, 5), (3, 2, 1, 1), None, None),
        ((32, 256, 112, 112), (32, 256, 112, 1), None, None),
        # fmt: on
    ]
    args = get_args()
    lib = open_lib()
    # Register restype/argtypes for every C entry point so ctypes marshals
    # arguments correctly; these must mirror the C API declarations.
    lib.infiniopCreateExpandDescriptor.restype = c_int32
    lib.infiniopCreateExpandDescriptor.argtypes = [
        infiniopHandle_t,
        POINTER(infiniopExpandDescriptor_t),
        infiniopTensorDescriptor_t,
        infiniopTensorDescriptor_t,
    ]
    lib.infiniopExpand.restype = c_int32
    lib.infiniopExpand.argtypes = [
        infiniopExpandDescriptor_t,
        c_void_p,
        c_void_p,
        c_void_p,
    ]
    lib.infiniopDestroyExpandDescriptor.restype = c_int32
    lib.infiniopDestroyExpandDescriptor.argtypes = [
        infiniopExpandDescriptor_t,
    ]
    # Dispatch on the requested backend flags; default to CPU if none given.
    if args.cpu:
        test_cpu(lib, test_cases)
    if args.cuda:
        test_cuda(lib, test_cases)
    if args.bang:
        test_bang(lib, test_cases)
    if not (args.cpu or args.cuda or args.bang):
        test_cpu(lib, test_cases)
    print("\033[92mTest passed!\033[0m")
import torch import torch
import ctypes import ctypes
from ctypes import POINTER, Structure, c_int32, c_size_t, c_uint64, c_void_p, c_float from ctypes import c_uint64
from libinfiniop import ( from libinfiniop import (
infiniopHandle_t, LIBINFINIOP,
infiniopTensorDescriptor_t, TestTensor,
open_lib,
to_tensor,
get_test_devices, get_test_devices,
check_error, check_error,
rearrange_if_needed,
create_workspace,
test_operator, test_operator,
get_args, get_args,
debug, debug,
get_tolerance, get_tolerance,
profile_operation, profile_operation,
TestWorkspace,
InfiniDtype,
InfiniDtypeNames,
InfiniDeviceNames,
infiniopOperatorDescriptor_t,
) )
# ============================================================================== # ==============================================================================
...@@ -31,13 +32,13 @@ _TEST_CASES = [ ...@@ -31,13 +32,13 @@ _TEST_CASES = [
] ]
# Data types used for testing # Data types used for testing
_TENSOR_DTYPES = [torch.float16, torch.float32, torch.bfloat16] _TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.BF16, InfiniDtype.F32]
# Tolerance map for different data types # Tolerance map for different data types
_TOLERANCE_MAP = { _TOLERANCE_MAP = {
torch.float16: {"atol": 0, "rtol": 1e-2}, InfiniDtype.F16: {"atol": 0, "rtol": 1e-2},
torch.float32: {"atol": 0, "rtol": 1e-3}, InfiniDtype.F32: {"atol": 0, "rtol": 1e-3},
torch.bfloat16: {"atol": 0, "rtol": 5e-2}, InfiniDtype.BF16: {"atol": 0, "rtol": 5e-2},
} }
DEBUG = False DEBUG = False
...@@ -46,16 +47,6 @@ NUM_PRERUN = 10 ...@@ -46,16 +47,6 @@ NUM_PRERUN = 10
NUM_ITERATIONS = 1000 NUM_ITERATIONS = 1000
# ==============================================================================
# Definitions
# ==============================================================================
class GemmDescriptor(Structure):
_fields_ = [("device", c_int32)]
infiniopGemmDescriptor_t = POINTER(GemmDescriptor)
# PyTorch implementation for matrix multiplication # PyTorch implementation for matrix multiplication
def gemm(d, _c, beta, _a, _b, alpha): def gemm(d, _c, beta, _a, _b, alpha):
try: try:
...@@ -73,9 +64,8 @@ def gemm(d, _c, beta, _a, _b, alpha): ...@@ -73,9 +64,8 @@ def gemm(d, _c, beta, _a, _b, alpha):
# The argument list should be (lib, handle, torch_device, <param list>, dtype) # The argument list should be (lib, handle, torch_device, <param list>, dtype)
# The <param list> should keep the same order as the one specified in _TEST_CASES # The <param list> should keep the same order as the one specified in _TEST_CASES
def test( def test(
lib,
handle, handle,
torch_device, device,
alpha, alpha,
beta, beta,
a_shape, a_shape,
...@@ -84,65 +74,71 @@ def test( ...@@ -84,65 +74,71 @@ def test(
a_stride=None, a_stride=None,
b_stride=None, b_stride=None,
c_stride=None, c_stride=None,
dtype=torch.float16, dtype=InfiniDtype.F16,
sync=None, sync=None,
): ):
print( print(
f"Testing Gemm on {torch_device} with alpha:{alpha}, beta:{beta}," f"Testing Gemm on {InfiniDeviceNames[device]} with alpha:{alpha}, beta:{beta},"
f" a_shape:{a_shape}, b_shape:{b_shape}, c_shape:{c_shape}," f" a_shape:{a_shape}, b_shape:{b_shape}, c_shape:{c_shape},"
f" a_stride:{a_stride}, b_stride:{b_stride}, c_stride:{c_stride}, dtype:{dtype}" f" a_stride:{a_stride}, b_stride:{b_stride}, c_stride:{c_stride}, dtype:{InfiniDtypeNames[dtype]}"
) )
# Initialize tensors # Initialize tensors
a = torch.rand(a_shape, dtype=dtype).to(torch_device) a = TestTensor(a_shape, a_stride, dtype, device)
b = torch.rand(b_shape, dtype=dtype).to(torch_device) b = TestTensor(b_shape, b_stride, dtype, device)
c = torch.ones(c_shape, dtype=dtype).to(torch_device) c = TestTensor(c_shape, c_stride, dtype, device, mode="ones")
ans = torch.zeros(c_shape, dtype=dtype).to(torch_device) ans = TestTensor(c_shape, c_stride, dtype, device, mode="zeros")
# Compute the PyTorch reference result # Compute the PyTorch reference result
gemm(ans, c, beta, a, b, alpha) def torch_gemm():
gemm(
ans.torch_tensor(),
c.torch_tensor(),
beta,
a.torch_tensor(),
b.torch_tensor(),
alpha,
)
a, b, c = [ torch_gemm()
rearrange_if_needed(tensor, stride)
for tensor, stride in zip([a, b, c], [a_stride, b_stride, c_stride])
]
a_tensor, b_tensor, c_tensor = [to_tensor(tensor, lib) for tensor in [a, b, c]]
if sync is not None: if sync is not None:
sync() sync()
descriptor = infiniopGemmDescriptor_t() descriptor = infiniopOperatorDescriptor_t()
check_error( check_error(
lib.infiniopCreateGemmDescriptor( LIBINFINIOP.infiniopCreateGemmDescriptor(
handle, handle,
ctypes.byref(descriptor), ctypes.byref(descriptor),
c_tensor.descriptor, c.descriptor,
a_tensor.descriptor, a.descriptor,
b_tensor.descriptor, b.descriptor,
) )
) )
# Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
for tensor in [a_tensor, b_tensor, c_tensor]: for tensor in [a, b, c]:
tensor.destroyDesc(lib) tensor.destroy_desc()
# Get workspace size and create workspace # Get workspace size and create workspace
workspace_size = c_uint64(0) workspace_size = c_uint64(0)
check_error( check_error(
lib.infiniopGetGemmWorkspaceSize(descriptor, ctypes.byref(workspace_size)) LIBINFINIOP.infiniopGetGemmWorkspaceSize(
descriptor, ctypes.byref(workspace_size)
)
) )
workspace = create_workspace(workspace_size.value, a.device) workspace = TestWorkspace(workspace_size.value, device)
# Execute infiniop gemm operator # Execute infiniop gemm operator
def lib_gemm(): def lib_gemm():
check_error( check_error(
lib.infiniopGemm( LIBINFINIOP.infiniopGemm(
descriptor, descriptor,
workspace.data_ptr() if workspace is not None else None, workspace.data(),
workspace_size.value, workspace_size.value,
c_tensor.data, c.data(),
a_tensor.data, a.data(),
b_tensor.data, b.data(),
alpha, alpha,
beta, beta,
None, None,
...@@ -155,17 +151,17 @@ def test( ...@@ -155,17 +151,17 @@ def test(
atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
if DEBUG: if DEBUG:
debug(c, ans, atol=atol, rtol=rtol) debug(c.actual_tensor(), ans.torch_tensor(), atol=atol, rtol=rtol)
assert torch.allclose(c, ans, atol=atol, rtol=rtol) assert torch.allclose(c.actual_tensor(), ans.torch_tensor(), atol=atol, rtol=rtol)
# Profiling workflow # Profiling workflow
if PROFILE: if PROFILE:
# fmt: off # fmt: off
profile_operation("PyTorch", lambda: gemm(ans, c, beta, a, b, alpha), torch_device, NUM_PRERUN, NUM_ITERATIONS) profile_operation("PyTorch", lambda: torch_gemm(), device, NUM_PRERUN, NUM_ITERATIONS)
profile_operation(" lib", lambda: lib_gemm(), torch_device, NUM_PRERUN, NUM_ITERATIONS) profile_operation(" lib", lambda: lib_gemm(), device, NUM_PRERUN, NUM_ITERATIONS)
# fmt: on # fmt: on
check_error(lib.infiniopDestroyGemmDescriptor(descriptor)) check_error(LIBINFINIOP.infiniopDestroyGemmDescriptor(descriptor))
# ============================================================================== # ==============================================================================
...@@ -173,40 +169,6 @@ def test( ...@@ -173,40 +169,6 @@ def test(
# ============================================================================== # ==============================================================================
if __name__ == "__main__": if __name__ == "__main__":
args = get_args() args = get_args()
lib = open_lib()
lib.infiniopCreateGemmDescriptor.restype = c_int32
lib.infiniopCreateGemmDescriptor.argtypes = [
infiniopHandle_t,
POINTER(infiniopGemmDescriptor_t),
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
]
lib.infiniopGetGemmWorkspaceSize.restype = c_int32
lib.infiniopGetGemmWorkspaceSize.argtypes = [
infiniopGemmDescriptor_t,
POINTER(c_size_t),
]
lib.infiniopGemm.restype = c_int32
lib.infiniopGemm.argtypes = [
infiniopGemmDescriptor_t,
c_void_p,
c_uint64,
c_void_p,
c_void_p,
c_void_p,
c_float,
c_float,
c_void_p,
]
lib.infiniopDestroyGemmDescriptor.restype = c_int32
lib.infiniopDestroyGemmDescriptor.argtypes = [
infiniopGemmDescriptor_t,
]
# Configure testing options # Configure testing options
DEBUG = args.debug DEBUG = args.debug
...@@ -216,6 +178,6 @@ if __name__ == "__main__": ...@@ -216,6 +178,6 @@ if __name__ == "__main__":
# Execute tests # Execute tests
for device in get_test_devices(args): for device in get_test_devices(args):
test_operator(lib, device, test, _TEST_CASES, _TENSOR_DTYPES) test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES)
print("\033[92mTest passed!\033[0m") print("\033[92mTest passed!\033[0m")
from ctypes import POINTER, Structure, c_int32, c_void_p, c_uint64
import ctypes
import sys
import os
import time
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")))
from operatorspy import (
open_lib,
to_tensor,
DeviceEnum,
infiniopHandle_t,
infiniopTensorDescriptor_t,
create_handle,
destroy_handle,
check_error,
)
from operatorspy.tests.test_utils import get_args
import torch, time
# Controls whether the PyTorch reference and the lib implementation are timed.
# NOTE: when profiling on an async backend you must add an explicit device
# synchronization inside the lib call path (e.g. cudaDeviceSynchronize() for
# CUDA), otherwise the measured time only covers kernel launch.
PROFILE = False
NUM_PRERUN = 10       # warm-up iterations before timing starts
NUM_ITERATIONS = 1000  # timed iterations; reported time is the per-call average
# ctypes mirror of the C-side global-average-pool descriptor. Only the first
# field is declared (presumably the device enum — confirm against the C
# struct); the tests treat the descriptor as opaque.
class GlobalAvgPoolDescriptor(Structure):
    _fields_ = [("device", c_int32)]

# Handle type passed to the infiniopGlobalAvgPool* C API functions.
infiniopGlobalAvgPoolDescriptor_t = POINTER(GlobalAvgPoolDescriptor)
def inferShape(x):
    """Output shape for global avg pool: keep the first two dims (N, C) and
    collapse every remaining spatial dim to size 1."""
    num_spatial = x.dim() - 2
    return x.shape[:2] + (1,) * num_spatial
def globalAvgPool(x):
    """Reference global average pooling: mean over all dims >= 2, with the
    result reshaped to (N, C, 1, 1, ...). Synchronizes CUDA when profiling
    so timings reflect completed work."""
    spatial = tuple(range(2, x.dim()))
    pooled = torch.mean(x, dim=spatial, keepdim=True)
    if PROFILE:
        torch.cuda.synchronize()
    return pooled.view(*inferShape(x))
def test(
    lib,
    handle,
    torch_device,
    x_shape,
    tensor_dtype=torch.float16,
    sync=None
):
    """Run one GlobalAvgPool case: compute the PyTorch reference, run the
    lib operator with its required workspace, and assert the results match.

    Args:
        lib: ctypes handle to the loaded infiniop shared library.
        handle: infiniop device handle created by create_handle().
        torch_device: torch device string ("cpu", "cuda", "mlu", ...).
        x_shape: input tensor shape; output shape is derived by inferShape().
        tensor_dtype: torch dtype for input and output.
        sync: optional device-synchronization callable, run before the
            descriptor is created.
    """
    print(
        f"Testing GlobalAvgPool on {torch_device} with input tensor_shape: {x_shape} dtype: {tensor_dtype}"
    )
    x = torch.rand(x_shape, dtype=tensor_dtype).to(torch_device)
    y = torch.zeros(inferShape(x), dtype=tensor_dtype).to(torch_device)
    # Reference result (and PyTorch-side warm-up when profiling).
    for i in range(NUM_PRERUN if PROFILE else 1):
        ans = globalAvgPool(x)
    if PROFILE:
        start_time = time.time()
        for i in range(NUM_ITERATIONS):
            _ = globalAvgPool(x)
        elapsed = (time.time() - start_time) / NUM_ITERATIONS
        print(f"pytorch time: {elapsed :6f}")
    x_tensor = to_tensor(x, lib)
    y_tensor = to_tensor(y, lib)
    if sync is not None:
        sync()
    descriptor = infiniopGlobalAvgPoolDescriptor_t()
    check_error(
        lib.infiniopCreateGlobalAvgPoolDescriptor(
            handle,
            ctypes.byref(descriptor),
            y_tensor.descriptor,
            x_tensor.descriptor,
        )
    )
    # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
    x_tensor.descriptor.contents.invalidate()
    y_tensor.descriptor.contents.invalidate()
    # Query the operator's scratch requirement and back it with a torch
    # uint8 buffer on the same device.
    workspaceSize = ctypes.c_uint64(0)
    check_error(
        lib.infiniopGetGlobalAvgPoolWorkspaceSize(
            descriptor, ctypes.byref(workspaceSize)
        )
    )
    workspace = torch.zeros(int(workspaceSize.value), dtype=torch.uint8).to(
        torch_device
    )
    workspace_ptr = ctypes.cast(workspace.data_ptr(), ctypes.POINTER(ctypes.c_uint8))
    # Run the lib operator, writing into y's buffer (warm-up when profiling).
    for i in range(NUM_PRERUN if PROFILE else 1):
        check_error(
            lib.infiniopGlobalAvgPool(
                descriptor,
                workspace_ptr,
                workspaceSize,
                y_tensor.data,
                x_tensor.data,
                None,
            )
        )
    if PROFILE:
        start_time = time.time()
        for i in range(NUM_ITERATIONS):
            check_error(
                lib.infiniopGlobalAvgPool(
                    descriptor,
                    workspace_ptr,
                    workspaceSize,
                    y_tensor.data,
                    x_tensor.data,
                    None,
                )
            )
        elapsed = (time.time() - start_time) / NUM_ITERATIONS
        print(f" lib time: {elapsed :6f}")
    assert torch.allclose(y, ans, atol=0, rtol=1e-3)
    check_error(lib.infiniopDestroyGlobalAvgPoolDescriptor(descriptor))
def test_cpu(lib, test_cases):
    """Run every global-avg-pool case on the CPU backend, in fp16 then fp32."""
    handle = create_handle(lib, DeviceEnum.DEVICE_CPU)
    for shape in test_cases:
        for dt in (torch.float16, torch.float32):
            test(lib, handle, "cpu", shape, tensor_dtype=dt)
    destroy_handle(lib, handle)
def test_cuda(lib, test_cases):
    """Run every global-avg-pool case on the CUDA backend, in fp16 then fp32."""
    handle = create_handle(lib, DeviceEnum.DEVICE_CUDA)
    for shape in test_cases:
        for dt in (torch.float16, torch.float32):
            test(lib, handle, "cuda", shape, tensor_dtype=dt)
    destroy_handle(lib, handle)
def test_bang(lib, test_cases):
    """Run every global-avg-pool case on the Cambricon BANG backend (fp16, fp32)."""
    import torch_mlu  # registers the "mlu" device with torch

    handle = create_handle(lib, DeviceEnum.DEVICE_BANG)
    for shape in test_cases:
        for dt in (torch.float16, torch.float32):
            test(lib, handle, "mlu", shape, tensor_dtype=dt)
    destroy_handle(lib, handle)
if __name__ == "__main__":
    # Input shapes only; the output shape is derived via inferShape().
    # NOTE(review): ((32, 256, 1, 112, 112)) appears twice in this list —
    # looks like an accidental duplicate; confirm before removing.
    test_cases = [
        # x_shape
        ((1, 3, 3)),
        ((1, 3, 1, 1, 3)),
        ((1, 3, 1, 1, 257)),
        ((1, 2, 1, 1, 514)),
        ((1, 3, 1, 1, 1025)),
        ((32, 256, 1, 112, 112)),
        ((2, 3, 2048000)),
        ((2, 1, 10243)),
        ((2, 20, 100)),
        ((3, 33, 333)),
        ((32, 20, 512)),
        ((3, 3, 11, 11, 11, 3, 2)),
        ((32, 256, 1, 112, 112)),
        ((32, 256, 112, 112)),
    ]
    args = get_args()
    lib = open_lib()
    # Register restype/argtypes for every C entry point so ctypes marshals
    # arguments correctly; these must mirror the C API declarations.
    lib.infiniopCreateGlobalAvgPoolDescriptor.restype = c_int32
    lib.infiniopCreateGlobalAvgPoolDescriptor.argtypes = [
        infiniopHandle_t,
        POINTER(infiniopGlobalAvgPoolDescriptor_t),
        infiniopTensorDescriptor_t,
        infiniopTensorDescriptor_t,
    ]
    lib.infiniopGetGlobalAvgPoolWorkspaceSize.restype = c_int32
    lib.infiniopGetGlobalAvgPoolWorkspaceSize.argtypes = [
        infiniopGlobalAvgPoolDescriptor_t,
        POINTER(c_uint64),
    ]
    lib.infiniopGlobalAvgPool.restype = c_int32
    lib.infiniopGlobalAvgPool.argtypes = [
        infiniopGlobalAvgPoolDescriptor_t,
        c_void_p,
        c_uint64,
        c_void_p,
        c_void_p,
        c_void_p,
    ]
    lib.infiniopDestroyGlobalAvgPoolDescriptor.restype = c_int32
    lib.infiniopDestroyGlobalAvgPoolDescriptor.argtypes = [
        infiniopGlobalAvgPoolDescriptor_t,
    ]
    # Dispatch on the requested backend flags; default to CPU if none given.
    if args.cpu:
        test_cpu(lib, test_cases)
    if args.cuda:
        test_cuda(lib, test_cases)
    if args.bang:
        test_bang(lib, test_cases)
    if not (args.cpu or args.cuda or args.bang):
        test_cpu(lib, test_cases)
    print("\033[92mTest passed!\033[0m")
...@@ -4,10 +4,11 @@ import sys ...@@ -4,10 +4,11 @@ import sys
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "."))) sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), ".")))
from .liboperators import ( from .liboperators import (
open_lib, open_lib,
CTensor,
infiniopHandle_t, infiniopHandle_t,
infiniopTensorDescriptor_t, infiniopTensorDescriptor_t,
LIBINFINIOP,
) )
from .devices import * from .devices import *
from .utils import * from .utils import *
from .datatypes import * from .datatypes import *
from .structs import *
...@@ -19,3 +19,27 @@ class InfiniDtype: ...@@ -19,3 +19,27 @@ class InfiniDtype:
C32 = 17 C32 = 17
C64 = 18 C64 = 18
BF16 = 19 BF16 = 19
# Human-readable name for each InfiniDtype code.  The display string is
# identical to the attribute name, so the table is built by comprehension
# (same keys, same values, same insertion order as an explicit literal).
InfiniDtypeNames = {
    getattr(InfiniDtype, name): name
    for name in (
        "INVALID", "BYTE", "BOOL",
        "I8", "I16", "I32", "I64",
        "U8", "U16", "U32", "U64",
        "F8", "F16", "F32", "F64",
        "C8", "C16", "C32", "C64",
        "BF16",
    )
}
...@@ -10,8 +10,20 @@ class InfiniDeviceEnum: ...@@ -10,8 +10,20 @@ class InfiniDeviceEnum:
SUGON = 8 SUGON = 8
# Display name for each InfiniDeviceEnum value.  Vendor names are mixed-case
# (e.g. "Cambricon"), so the labels are listed explicitly rather than derived
# from the attribute names.
InfiniDeviceNames = dict(
    (getattr(InfiniDeviceEnum, attr), label)
    for attr, label in (
        ("CPU", "CPU"),
        ("NVIDIA", "NVIDIA"),
        ("CAMBRICON", "Cambricon"),
        ("ASCEND", "Ascend"),
        ("METAX", "Metax"),
        ("MOORE", "Moore"),
        ("ILUVATAR", "Iluvatar"),
        ("KUNLUN", "Kunlun"),
        ("SUGON", "Sugon"),
    )
)
# Mapping that maps InfiniDeviceEnum to torch device string # Mapping that maps InfiniDeviceEnum to torch device string
infiniDeviceEnum_str_map = { torch_device_map = {
InfiniDeviceEnum.CPU: "cpu", InfiniDeviceEnum.CPU: "cpu",
InfiniDeviceEnum.NVIDIA: "cuda", InfiniDeviceEnum.NVIDIA: "cuda",
InfiniDeviceEnum.CAMBRICON: "mlu", InfiniDeviceEnum.CAMBRICON: "mlu",
......
import os import os
import platform import platform
import ctypes import ctypes
from ctypes import c_int, c_int64, c_uint64, Structure, POINTER from ctypes import c_int, c_int64, c_uint64, POINTER
from .datatypes import * from .datatypes import *
from .devices import * from .devices import *
from .op_register import OpRegister
from pathlib import Path from pathlib import Path
from .structs import *
Device = c_int
Optype = c_int
INFINI_ROOT = os.getenv("INFINI_ROOT") or str(Path.home() / ".infini") INFINI_ROOT = os.getenv("INFINI_ROOT") or str(Path.home() / ".infini")
# Opaque ctypes stand-in for the C-side tensor descriptor; it is only ever
# handled through pointers, so no fields are declared.
class TensorDescriptor(Structure):
    _fields_ = []
# Descriptors are always passed by pointer across the C API.
infiniopTensorDescriptor_t = ctypes.POINTER(TensorDescriptor)
class CTensor:
    """Pairs an infiniop tensor descriptor with the torch tensor backing it."""

    def __init__(self, desc, torch_tensor):
        self.descriptor = desc
        # Keep a reference so the tensor's storage stays alive while C code
        # uses the raw `data` pointer below.
        self.torch_tensor_ = torch_tensor
        self.data = torch_tensor.data_ptr()

    def destroyDesc(self, lib_):
        # Release the C-side descriptor via the library, then drop the handle
        # so it is not used (or destroyed) again.
        lib_.infiniopDestroyTensorDescriptor(self.descriptor)
        self.descriptor = None
# ctypes view of the library handle: device kind and device index fields.
class Handle(Structure):
    _fields_ = [("device", c_int), ("device_id", c_int)]
infiniopHandle_t = POINTER(Handle)
class InfiniLib: class InfiniLib:
def __init__(self, librt, libop): def __init__(self, librt, libop):
self.librt = librt self.librt = librt
...@@ -98,4 +72,9 @@ def open_lib(): ...@@ -98,4 +72,9 @@ def open_lib():
lib.infinirtSetDevice.argtypes = [c_int, c_int] lib.infinirtSetDevice.argtypes = [c_int, c_int]
lib.infinirtSetDevice.restype = c_int lib.infinirtSetDevice.restype = c_int
OpRegister.register_lib(lib)
return lib return lib
LIBINFINIOP = open_lib()
from .structs import (
infiniopHandle_t,
infiniopTensorDescriptor_t,
infiniopOperatorDescriptor_t,
)
from ctypes import c_int32, c_void_p, c_size_t, POINTER, c_float
class OpRegister:
    """Registry of operator-binding callbacks applied to the loaded library."""

    # All callables decorated with @OpRegister.operator, in decoration order.
    registry = []

    @classmethod
    def operator(cls, op):
        """Decorator hook: remember *op* in the registry and return it unchanged."""
        cls.registry.append(op)
        return op

    @classmethod
    def register_lib(cls, lib):
        """Run every recorded binding callback against *lib*."""
        for bind in cls.registry:
            bind(lib)
@OpRegister.operator
def add_(lib):
    """Declare ctypes signatures for the Add operator's C entry points."""
    # Every entry point returns an int32 status code.
    bindings = [
        (lib.infiniopCreateAddDescriptor, [
            infiniopHandle_t,
            POINTER(infiniopOperatorDescriptor_t),
            infiniopTensorDescriptor_t,
            infiniopTensorDescriptor_t,
            infiniopTensorDescriptor_t,
        ]),
        (lib.infiniopGetAddWorkspaceSize, [
            infiniopOperatorDescriptor_t,
            POINTER(c_size_t),
        ]),
        (lib.infiniopAdd, [
            infiniopOperatorDescriptor_t,
            c_void_p,  # workspace
            c_size_t,  # workspace size
            c_void_p,
            c_void_p,
            c_void_p,
            c_void_p,  # last pointer is presumably the stream — confirm in header
        ]),
        (lib.infiniopDestroyAddDescriptor, [infiniopOperatorDescriptor_t]),
    ]
    for fn, argtypes in bindings:
        fn.restype = c_int32
        fn.argtypes = argtypes
@OpRegister.operator
def attention_(lib):
    """Declare ctypes signatures for the Attention operator's C entry points."""
    # Every entry point returns an int32 status code.
    bindings = [
        (lib.infiniopCreateAttentionDescriptor, [
            infiniopHandle_t,
            POINTER(infiniopOperatorDescriptor_t),
            infiniopTensorDescriptor_t,
            infiniopTensorDescriptor_t,
            infiniopTensorDescriptor_t,
            infiniopTensorDescriptor_t,
            infiniopTensorDescriptor_t,
            infiniopTensorDescriptor_t,
            c_size_t,
        ]),
        (lib.infiniopGetAttentionWorkspaceSize, [
            infiniopOperatorDescriptor_t,
            POINTER(c_size_t),
        ]),
        (lib.infiniopAttention, [
            infiniopOperatorDescriptor_t,
            c_void_p,  # workspace
            c_size_t,  # workspace size
            c_void_p,
            c_void_p,
            c_void_p,
            c_void_p,
            c_void_p,
            c_void_p,
            c_void_p,  # last pointer is presumably the stream — confirm in header
        ]),
        (lib.infiniopDestroyAttentionDescriptor, [infiniopOperatorDescriptor_t]),
    ]
    for fn, argtypes in bindings:
        fn.restype = c_int32
        fn.argtypes = argtypes
@OpRegister.operator
def causal_softmax_(lib):
    """Declare ctypes signatures for the CausalSoftmax operator's C entry points."""
    # Every entry point returns an int32 status code.
    bindings = [
        (lib.infiniopCreateCausalSoftmaxDescriptor, [
            infiniopHandle_t,
            POINTER(infiniopOperatorDescriptor_t),
            infiniopTensorDescriptor_t,
        ]),
        (lib.infiniopGetCausalSoftmaxWorkspaceSize, [
            infiniopOperatorDescriptor_t,
            POINTER(c_size_t),
        ]),
        (lib.infiniopCausalSoftmax, [
            infiniopOperatorDescriptor_t,
            c_void_p,  # workspace
            c_size_t,  # workspace size
            c_void_p,
            c_void_p,  # last pointer is presumably the stream — confirm in header
        ]),
        (lib.infiniopDestroyCausalSoftmaxDescriptor, [infiniopOperatorDescriptor_t]),
    ]
    for fn, argtypes in bindings:
        fn.restype = c_int32
        fn.argtypes = argtypes
@OpRegister.operator
def clip_(lib):
    """Declare ctypes signatures for the Clip operator's C entry points."""
    # Every entry point returns an int32 status code.
    bindings = [
        (lib.infiniopCreateClipDescriptor, [
            infiniopHandle_t,
            POINTER(infiniopOperatorDescriptor_t),
            infiniopTensorDescriptor_t,
            infiniopTensorDescriptor_t,
            infiniopTensorDescriptor_t,
            infiniopTensorDescriptor_t,
        ]),
        (lib.infiniopGetClipWorkspaceSize, [
            infiniopOperatorDescriptor_t,
            POINTER(c_size_t),
        ]),
        (lib.infiniopClip, [
            infiniopOperatorDescriptor_t,
            c_void_p,  # workspace
            c_size_t,  # workspace size
            c_void_p,
            c_void_p,
            c_void_p,
            c_void_p,
            c_void_p,  # last pointer is presumably the stream — confirm in header
        ]),
        (lib.infiniopDestroyClipDescriptor, [infiniopOperatorDescriptor_t]),
    ]
    for fn, argtypes in bindings:
        fn.restype = c_int32
        fn.argtypes = argtypes
@OpRegister.operator
def conv_(lib):
    # Placeholder: Conv ctypes signatures are not bound yet (the C header
    # infiniop/ops/conv.h exists); registration is currently a no-op.
    pass
@OpRegister.operator
def gemm_(lib):
    """Declare ctypes signatures for the Gemm operator's C entry points."""
    # Every entry point returns an int32 status code.
    bindings = [
        (lib.infiniopCreateGemmDescriptor, [
            infiniopHandle_t,
            POINTER(infiniopOperatorDescriptor_t),
            infiniopTensorDescriptor_t,
            infiniopTensorDescriptor_t,
            infiniopTensorDescriptor_t,
        ]),
        (lib.infiniopGetGemmWorkspaceSize, [
            infiniopOperatorDescriptor_t,
            POINTER(c_size_t),
        ]),
        (lib.infiniopGemm, [
            infiniopOperatorDescriptor_t,
            c_void_p,  # workspace
            c_size_t,  # workspace size
            c_void_p,
            c_void_p,
            c_void_p,
            c_float,   # presumably alpha — confirm in header
            c_float,   # presumably beta — confirm in header
            c_void_p,  # last pointer is presumably the stream — confirm in header
        ]),
        (lib.infiniopDestroyGemmDescriptor, [infiniopOperatorDescriptor_t]),
    ]
    for fn, argtypes in bindings:
        fn.restype = c_int32
        fn.argtypes = argtypes
@OpRegister.operator
def mul_(lib):
    """Declare ctypes signatures for the Mul operator's C entry points."""
    # Every entry point returns an int32 status code.
    bindings = [
        (lib.infiniopCreateMulDescriptor, [
            infiniopHandle_t,
            POINTER(infiniopOperatorDescriptor_t),
            infiniopTensorDescriptor_t,
            infiniopTensorDescriptor_t,
            infiniopTensorDescriptor_t,
        ]),
        (lib.infiniopGetMulWorkspaceSize, [
            infiniopOperatorDescriptor_t,
            POINTER(c_size_t),
        ]),
        (lib.infiniopMul, [
            infiniopOperatorDescriptor_t,
            c_void_p,  # workspace
            c_size_t,  # workspace size
            c_void_p,
            c_void_p,
            c_void_p,
            c_void_p,  # last pointer is presumably the stream — confirm in header
        ]),
        (lib.infiniopDestroyMulDescriptor, [infiniopOperatorDescriptor_t]),
    ]
    for fn, argtypes in bindings:
        fn.restype = c_int32
        fn.argtypes = argtypes
@OpRegister.operator
def random_sample_(lib):
    """Declare ctypes signatures for the RandomSample operator's C entry points."""
    # NOTE(review): the create call declares only ONE tensor descriptor —
    # verify against infiniop/ops/random_sample.h (sampling ops often take
    # both a result and a probabilities descriptor).
    # Every entry point returns an int32 status code.
    bindings = [
        (lib.infiniopCreateRandomSampleDescriptor, [
            infiniopHandle_t,
            POINTER(infiniopOperatorDescriptor_t),
            infiniopTensorDescriptor_t,
        ]),
        (lib.infiniopGetRandomSampleWorkspaceSize, [
            infiniopOperatorDescriptor_t,
            POINTER(c_size_t),
        ]),
        (lib.infiniopRandomSample, [
            infiniopOperatorDescriptor_t,
            c_void_p,  # workspace
            c_size_t,  # workspace size
            c_size_t,
            c_void_p,
            c_float,
            c_float,
            c_int32,
            c_float,
            c_void_p,  # last pointer is presumably the stream — confirm in header
        ]),
        (lib.infiniopDestroyRandomSampleDescriptor, [infiniopOperatorDescriptor_t]),
    ]
    for fn, argtypes in bindings:
        fn.restype = c_int32
        fn.argtypes = argtypes
@OpRegister.operator
def rearrange_(lib):
    """Declare ctypes signatures for the Rearrange operator's C entry points."""
    # NOTE(review): unlike the other operators here, no GetWorkspaceSize
    # binding is declared and infiniopRearrange takes no workspace arguments —
    # confirm this matches infiniop/ops/rearrange.h.
    # Every entry point returns an int32 status code.
    bindings = [
        (lib.infiniopCreateRearrangeDescriptor, [
            infiniopHandle_t,
            POINTER(infiniopOperatorDescriptor_t),
            infiniopTensorDescriptor_t,
            infiniopTensorDescriptor_t,
        ]),
        (lib.infiniopRearrange, [
            infiniopOperatorDescriptor_t,
            c_void_p,
            c_void_p,
            c_void_p,  # last pointer is presumably the stream — confirm in header
        ]),
        (lib.infiniopDestroyRearrangeDescriptor, [infiniopOperatorDescriptor_t]),
    ]
    for fn, argtypes in bindings:
        fn.restype = c_int32
        fn.argtypes = argtypes
@OpRegister.operator
def rms_norm_(lib):
    """Declare ctypes signatures for the RMSNorm operator's C entry points."""
    # Every entry point returns an int32 status code.
    bindings = [
        (lib.infiniopCreateRMSNormDescriptor, [
            infiniopHandle_t,
            POINTER(infiniopOperatorDescriptor_t),
            infiniopTensorDescriptor_t,
            infiniopTensorDescriptor_t,
            infiniopTensorDescriptor_t,
            c_float,   # presumably epsilon — confirm in header
        ]),
        (lib.infiniopGetRMSNormWorkspaceSize, [
            infiniopOperatorDescriptor_t,
            POINTER(c_size_t),
        ]),
        (lib.infiniopRMSNorm, [
            infiniopOperatorDescriptor_t,
            c_void_p,  # workspace
            c_size_t,  # workspace size
            c_void_p,
            c_void_p,
            c_void_p,
            c_void_p,  # last pointer is presumably the stream — confirm in header
        ]),
        (lib.infiniopDestroyRMSNormDescriptor, [infiniopOperatorDescriptor_t]),
    ]
    for fn, argtypes in bindings:
        fn.restype = c_int32
        fn.argtypes = argtypes
@OpRegister.operator
def rope_(lib):
    """Declare ctypes signatures for the RoPE operator's C entry points."""
    # Every entry point returns an int32 status code.
    bindings = [
        (lib.infiniopCreateRoPEDescriptor, [
            infiniopHandle_t,
            POINTER(infiniopOperatorDescriptor_t),
            infiniopTensorDescriptor_t,
            infiniopTensorDescriptor_t,
            infiniopTensorDescriptor_t,
            infiniopTensorDescriptor_t,
        ]),
        (lib.infiniopGetRoPEWorkspaceSize, [
            infiniopOperatorDescriptor_t,
            POINTER(c_size_t),
        ]),
        (lib.infiniopRoPE, [
            infiniopOperatorDescriptor_t,
            c_void_p,  # workspace
            c_size_t,  # workspace size
            c_void_p,
            c_void_p,
            c_void_p,
            c_void_p,
            c_void_p,  # last pointer is presumably the stream — confirm in header
        ]),
        (lib.infiniopDestroyRoPEDescriptor, [infiniopOperatorDescriptor_t]),
    ]
    for fn, argtypes in bindings:
        fn.restype = c_int32
        fn.argtypes = argtypes
@OpRegister.operator
def sub_(lib):
    """Declare ctypes signatures for the Sub operator's C entry points."""
    # Every entry point returns an int32 status code.
    bindings = [
        (lib.infiniopCreateSubDescriptor, [
            infiniopHandle_t,
            POINTER(infiniopOperatorDescriptor_t),
            infiniopTensorDescriptor_t,
            infiniopTensorDescriptor_t,
            infiniopTensorDescriptor_t,
        ]),
        (lib.infiniopGetSubWorkspaceSize, [
            infiniopOperatorDescriptor_t,
            POINTER(c_size_t),
        ]),
        (lib.infiniopSub, [
            infiniopOperatorDescriptor_t,
            c_void_p,  # workspace
            c_size_t,  # workspace size
            c_void_p,
            c_void_p,
            c_void_p,
            c_void_p,  # last pointer is presumably the stream — confirm in header
        ]),
        (lib.infiniopDestroySubDescriptor, [infiniopOperatorDescriptor_t]),
    ]
    for fn, argtypes in bindings:
        fn.restype = c_int32
        fn.argtypes = argtypes
@OpRegister.operator
def swiglu_(lib):
    """Declare ctypes signatures for the SwiGLU operator's C entry points."""
    # Every entry point returns an int32 status code.
    bindings = [
        (lib.infiniopCreateSwiGLUDescriptor, [
            infiniopHandle_t,
            POINTER(infiniopOperatorDescriptor_t),
            infiniopTensorDescriptor_t,
            infiniopTensorDescriptor_t,
            infiniopTensorDescriptor_t,
        ]),
        (lib.infiniopGetSwiGLUWorkspaceSize, [
            infiniopOperatorDescriptor_t,
            POINTER(c_size_t),
        ]),
        (lib.infiniopSwiGLU, [
            infiniopOperatorDescriptor_t,
            c_void_p,  # workspace
            c_size_t,  # workspace size
            c_void_p,
            c_void_p,
            c_void_p,
            c_void_p,  # last pointer is presumably the stream — confirm in header
        ]),
        (lib.infiniopDestroySwiGLUDescriptor, [infiniopOperatorDescriptor_t]),
    ]
    for fn, argtypes in bindings:
        fn.restype = c_int32
        fn.argtypes = argtypes
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment