Unverified Commit 0166515c authored by PanZezhong1725's avatar PanZezhong1725 Committed by GitHub
Browse files

Merge branch 'main' into issue/300

parents f0300ff3 a23c4d13
from ctypes import POINTER, Structure, c_int32, c_void_p, c_uint64
import ctypes
import sys
import os
import time
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")))
from operatorspy import (
open_lib,
to_tensor,
DeviceEnum,
infiniopHandle_t,
infiniopTensorDescriptor_t,
create_handle,
destroy_handle,
check_error,
)
from operatorspy.tests.test_utils import get_args
import torch
from typing import Tuple
# constant for control whether profile the pytorch and lib functions
# NOTE: need to manually add synchronization function to the lib function,
# e.g., cudaDeviceSynchronize() for CUDA
PROFILE = False
NUM_PRERUN = 10
NUM_ITERATIONS = 1000
class MaxPoolDescriptor(Structure):
    # Opaque handle mirroring the C-side MaxPool descriptor; only the
    # device id is exposed to Python.
    _fields_ = [("device", c_int32)]


infiniopMaxPoolDescriptor_t = POINTER(MaxPoolDescriptor)
def pool(x, k, padding, stride, dilation=1):
    """Reference max pooling computed with PyTorch.

    Args:
        x: input tensor of shape (N, C, *spatial); 1 to 3 spatial dims supported.
        k: kernel size.
        padding: per-dimension padding.
        stride: per-dimension stride.
        dilation: kernel dilation (default 1).

    Returns:
        The pooled tensor, or None if the spatial rank is unsupported.
    """
    pooling_layers = {
        1: torch.nn.MaxPool1d,
        2: torch.nn.MaxPool2d,
        3: torch.nn.MaxPool3d,
    }
    ndim = len(x.shape) - 2  # spatial rank (excludes batch and channel dims)
    if ndim not in pooling_layers:
        print("Error: Pytorch -> Unsupported tensor dimension")
        return None
    ans = pooling_layers[ndim](k, stride=stride, padding=padding, dilation=dilation)(x)
    if PROFILE:
        # Fix: the original called torch.cuda.synchronize() unconditionally,
        # which raises when profiling on a CPU-only build. Only synchronize
        # when CUDA is actually available.
        if torch.cuda.is_available():
            torch.cuda.synchronize()
    return ans
def inferShape(x_shape, kernel_shape, padding, strides):
    """Compute the pooled output shape for the given input and pooling params.

    The leading (batch, channel) dims are passed through; each spatial dim
    follows the standard pooling formula (dim + 2*pad - kernel) // stride + 1.
    """
    assert (
        len(x_shape) - 2 == len(kernel_shape) == len(padding) == len(strides)
    ), "kernel, pads, and strides should have the same length; the length of input x should be 2 more than that of kernel"
    pooled = tuple(
        (dim + 2 * p - k) // s + 1
        for dim, k, p, s in zip(x_shape[2:], kernel_shape, padding, strides)
    )
    return x_shape[:2] + pooled
def tuple_to_void_p(py_tuple: Tuple):
    """Pack a Python tuple of ints into a C int64 array and return it as void*."""
    arr_type = ctypes.c_int64 * len(py_tuple)
    buf = arr_type(*py_tuple)
    # ctypes.cast keeps a reference to `buf` on the returned object,
    # so the underlying memory stays alive as long as the pointer does.
    return ctypes.cast(buf, ctypes.c_void_p)
def test(
    lib,
    handle,
    torch_device,
    x_shape,
    k_shape,
    padding,
    strides,
    tensor_dtype=torch.float16,
    sync=None
):
    """Run one MaxPool case and compare the lib's output against PyTorch.

    Creates the descriptor, queries and allocates the workspace, runs the
    operator (optionally profiling both implementations), then asserts the
    results match within rtol=1e-3.
    """
    print(
        f"Testing MaxPool on {torch_device} with x_shape:{x_shape} kernel_shape:{k_shape} padding:{padding} strides:{strides} dtype:{tensor_dtype}"
    )
    x = torch.rand(x_shape, dtype=tensor_dtype).to(torch_device)
    # y is pre-allocated with the inferred output shape; the lib overwrites it.
    y = torch.rand(
        inferShape(x_shape, k_shape, padding, strides), dtype=tensor_dtype
    ).to(torch_device)
    # Warm-up runs (and the reference answer); extra iterations only when profiling.
    for i in range(NUM_PRERUN if PROFILE else 1):
        ans = pool(x, k_shape, padding, strides)
    if PROFILE:
        start_time = time.time()
        for i in range(NUM_ITERATIONS):
            _ = pool(x, k_shape, padding, strides)
        elapsed = (time.time() - start_time) / NUM_ITERATIONS
        print(f"pytorch time: {elapsed :6f}")
    x_tensor = to_tensor(x, lib)
    y_tensor = to_tensor(y, lib)
    if sync is not None:
        sync()
    descriptor = infiniopMaxPoolDescriptor_t()
    check_error(
        lib.infiniopCreateMaxPoolDescriptor(
            handle,
            ctypes.byref(descriptor),
            y_tensor.descriptor,
            x_tensor.descriptor,
            tuple_to_void_p(k_shape),
            tuple_to_void_p(padding),
            tuple_to_void_p(strides),
            len(k_shape),
        )
    )
    # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
    x_tensor.descriptor.contents.invalidate()
    y_tensor.descriptor.contents.invalidate()
    workspaceSize = ctypes.c_uint64(0)
    check_error(
        lib.infiniopGetMaxPoolWorkspaceSize(descriptor, ctypes.byref(workspaceSize))
    )
    # Workspace is a raw byte buffer allocated on the target device.
    workspace = torch.zeros(int(workspaceSize.value), dtype=torch.uint8).to(
        torch_device
    )
    workspace_ptr = ctypes.cast(workspace.data_ptr(), ctypes.POINTER(ctypes.c_uint8))
    for i in range(NUM_PRERUN if PROFILE else 1):
        check_error(
            lib.infiniopMaxPool(
                descriptor,
                workspace_ptr,
                workspaceSize,
                y_tensor.data,
                x_tensor.data,
                None,
            )
        )
    if PROFILE:
        start_time = time.time()
        for i in range(NUM_ITERATIONS):
            check_error(
                lib.infiniopMaxPool(
                    descriptor,
                    workspace_ptr,
                    workspaceSize,
                    y_tensor.data,
                    x_tensor.data,
                    None,
                )
            )
        elapsed = (time.time() - start_time) / NUM_ITERATIONS
        print(f" lib time: {elapsed :6f}")
    # The lib writes its result into y in place; compare with the reference.
    assert torch.allclose(y, ans, atol=0, rtol=1e-3)
    check_error(lib.infiniopDestroyMaxPoolDescriptor(descriptor))
def test_cpu(lib, test_cases):
    """Run every MaxPool test case on the CPU device in fp16 and fp32."""
    device = DeviceEnum.DEVICE_CPU
    handle = create_handle(lib, device)
    for case in test_cases:
        # case = (x_shape, kernel_shape, padding, strides)
        # fmt: off
        test(lib, handle, "cpu", *case, tensor_dtype=torch.float16)
        test(lib, handle, "cpu", *case, tensor_dtype=torch.float32)
        # fmt: on
    destroy_handle(lib, handle)
def test_cuda(lib, test_cases):
    """Run every MaxPool test case on the CUDA device in fp16 and fp32."""
    device = DeviceEnum.DEVICE_CUDA
    handle = create_handle(lib, device)
    for case in test_cases:
        # case = (x_shape, kernel_shape, padding, strides)
        # fmt: off
        test(lib, handle, "cuda", *case, tensor_dtype=torch.float16)
        test(lib, handle, "cuda", *case, tensor_dtype=torch.float32)
        # fmt: on
    destroy_handle(lib, handle)
def test_bang(lib, test_cases):
    """Run every MaxPool test case on the Cambricon BANG (MLU) device."""
    import torch_mlu  # deferred: only needed when MLU is requested

    device = DeviceEnum.DEVICE_BANG
    handle = create_handle(lib, device)
    for case in test_cases:
        # case = (x_shape, kernel_shape, padding, strides)
        # fmt: off
        test(lib, handle, "mlu", *case, tensor_dtype=torch.float16)
        test(lib, handle, "mlu", *case, tensor_dtype=torch.float32)
        # fmt: on
    destroy_handle(lib, handle)
if __name__ == "__main__":
    test_cases = [
        # x_shape, kernel_shape, padding, strides
        ((1, 1, 10), (3,), (1,), (1,)),
        ((32, 3, 224, 224), (3, 3), (1, 1), (2, 2)),
        ((1, 1, 16, 16, 16), (5, 5, 5), (2, 2, 2), (2, 2, 2)),
    ]
    args = get_args()
    lib = open_lib()
    # Declare the C signatures of the MaxPool entry points so ctypes
    # marshals arguments and return codes correctly.
    lib.infiniopCreateMaxPoolDescriptor.restype = c_int32
    lib.infiniopCreateMaxPoolDescriptor.argtypes = [
        infiniopHandle_t,
        POINTER(infiniopMaxPoolDescriptor_t),
        infiniopTensorDescriptor_t,
        infiniopTensorDescriptor_t,
        c_void_p,
        c_void_p,
        c_void_p,
        c_uint64,
    ]
    lib.infiniopGetMaxPoolWorkspaceSize.restype = c_int32
    lib.infiniopGetMaxPoolWorkspaceSize.argtypes = [
        infiniopMaxPoolDescriptor_t,
        POINTER(c_uint64),
    ]
    lib.infiniopMaxPool.restype = c_int32
    lib.infiniopMaxPool.argtypes = [
        infiniopMaxPoolDescriptor_t,
        c_void_p,
        c_uint64,
        c_void_p,
        c_void_p,
        c_void_p,
    ]
    lib.infiniopDestroyMaxPoolDescriptor.restype = c_int32
    lib.infiniopDestroyMaxPoolDescriptor.argtypes = [
        infiniopMaxPoolDescriptor_t,
    ]
    if args.cpu:
        test_cpu(lib, test_cases)
    if args.cuda:
        test_cuda(lib, test_cases)
    if args.bang:
        test_bang(lib, test_cases)
    # Default to CPU when no device flag was given.
    if not (args.cpu or args.cuda or args.bang):
        test_cpu(lib, test_cases)
    print("\033[92mTest passed!\033[0m")
from ctypes import POINTER, Structure, c_int32, c_uint64, c_void_p, c_float, c_bool
import ctypes
import sys
import os
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")))
from operatorspy import (
open_lib,
to_tensor,
CTensor,
DeviceEnum,
infiniopHandle_t,
infiniopTensorDescriptor_t,
create_handle,
destroy_handle,
check_error,
rearrange_tensor,
create_workspace,
)
from operatorspy.tests.test_utils import get_args
import torch
import torch.nn as nn
class MLPDescriptor(Structure):
    # Opaque handle mirroring the C-side MLP descriptor; only the
    # device id is exposed to Python.
    _fields_ = [("device", c_int32)]


infiniopMLPDescriptor_t = POINTER(MLPDescriptor)
def swiglu(a, b):
    """SwiGLU gating: a * b * sigmoid(b).

    The exponential is evaluated in fp32 for accuracy and cast back to
    b's dtype before the division.
    """
    denom = 1 + torch.exp(-b.float()).to(b.dtype)
    return a * b / denom
def mlp(y, x, w12, w3, alpha, residual):
    """Reference gated MLP.

    Splits w12 into gate (first half of columns) and up (second half),
    applies SwiGLU, projects with alpha * w3, and optionally adds the
    residual y. All matmuls run in fp32 and are cast back to x's dtype.
    """
    input_dtype = x.dtype
    intermediate_size = w3.shape[0]
    up = torch.matmul(
        x.to(torch.float32), w12[:, intermediate_size:].to(torch.float32)
    ).to(input_dtype)
    gate = torch.matmul(
        x.to(torch.float32), w12[:, 0:intermediate_size].to(torch.float32)
    ).to(input_dtype)
    # SwiGLU inlined: up * gate * sigmoid(gate), exp evaluated in fp32.
    act = up * gate / (1 + torch.exp(-gate.float()).to(gate.dtype))
    proj = torch.matmul(act.to(torch.float32), alpha * w3.to(torch.float32)).to(
        input_dtype
    )
    return proj + y if residual else proj
def test(
    lib,
    handle,
    torch_device,
    num_tokens,
    hidden_size,
    intermediate_size,
    alpha,
    residual,
    dtype=torch.float16,
    x_stride=None,
    y_stride=None,
    w12_stride=None,
    w3_stride=None,
    sync=None
):
    """Run one MLP case and compare the lib's in-place result in y against PyTorch."""
    print(
        f"Testing MLP on {torch_device} with num_tokens:{num_tokens} hidden_size:{hidden_size} intermediate_size:{intermediate_size}"
        f" alpha:{alpha} residual:{residual} dtype:{dtype} x_stride:{x_stride} y_stride:{y_stride} w12_stride:{w12_stride} w3_stride:{w3_stride}"
    )
    # Small magnitudes (* 0.01) keep the fp16 matmuls away from overflow.
    y = torch.rand([num_tokens, hidden_size], dtype=dtype).to(torch_device) * 0.01
    x = torch.rand([num_tokens, hidden_size], dtype=dtype).to(torch_device) * 0.01
    w12 = (
        torch.rand([hidden_size, 2 * intermediate_size], dtype=dtype).to(torch_device)
        * 0.01
    )
    w3 = (
        torch.rand([intermediate_size, hidden_size], dtype=dtype).to(torch_device)
        * 0.01
    )
    # Reference answer is computed before any re-striding.
    ans = mlp(y, x, w12, w3, alpha, residual)
    if x_stride is not None:
        x = rearrange_tensor(x, x_stride)
    if y_stride is not None:
        y = rearrange_tensor(y, y_stride)
    if w12_stride is not None:
        w12 = rearrange_tensor(w12, w12_stride)
    if w3_stride is not None:
        w3 = rearrange_tensor(w3, w3_stride)
    y_tensor = to_tensor(y, lib)
    x_tensor = to_tensor(x, lib)
    w12_tensor = to_tensor(w12, lib)
    w3_tensor = to_tensor(w3, lib)
    if sync is not None:
        sync()
    descriptor = infiniopMLPDescriptor_t()
    check_error(
        lib.infiniopCreateMLPDescriptor(
            handle,
            ctypes.byref(descriptor),
            y_tensor.descriptor,
            x_tensor.descriptor,
            w12_tensor.descriptor,
            w3_tensor.descriptor,
            alpha,
            residual,
        )
    )
    # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
    y_tensor.descriptor.contents.invalidate()
    x_tensor.descriptor.contents.invalidate()
    w12_tensor.descriptor.contents.invalidate()
    w3_tensor.descriptor.contents.invalidate()
    workspace_size = c_uint64(0)
    check_error(
        lib.infiniopGetMLPWorkspaceSize(descriptor, ctypes.byref(workspace_size))
    )
    workspace = create_workspace(workspace_size.value, x.device)
    check_error(
        lib.infiniopMLP(
            descriptor,
            workspace.data_ptr() if workspace is not None else None,
            workspace_size.value,
            y_tensor.data,
            x_tensor.data,
            w12_tensor.data,
            w3_tensor.data,
            None,
        )
    )
    # The lib writes its result into y in place; looser rtol for fp16 matmul chains.
    assert torch.allclose(y, ans, atol=0, rtol=2e-2)
    check_error(lib.infiniopDestroyMLPDescriptor(descriptor))
def test_cpu(lib, test_cases):
    """Run every MLP test case on the CPU device."""
    device = DeviceEnum.DEVICE_CPU
    handle = create_handle(lib, device)
    for case in test_cases:
        # case = (num_tokens, hidden_size, intermediate_size, alpha,
        #         residual, dtype, x_stride, y_stride, w12_stride, w3_stride)
        test(lib, handle, "cpu", *case)
    destroy_handle(lib, handle)
def test_cuda(lib, test_cases):
    """Run every MLP test case on the CUDA device."""
    device = DeviceEnum.DEVICE_CUDA
    handle = create_handle(lib, device)
    for case in test_cases:
        # case = (num_tokens, hidden_size, intermediate_size, alpha,
        #         residual, dtype, x_stride, y_stride, w12_stride, w3_stride)
        test(lib, handle, "cuda", *case)
    destroy_handle(lib, handle)
def test_bang(lib, test_cases):
    """Run every MLP test case on the Cambricon BANG (MLU) device."""
    import torch_mlu  # deferred: only needed when MLU is requested

    device = DeviceEnum.DEVICE_BANG
    handle = create_handle(lib, device)
    for case in test_cases:
        # case = (num_tokens, hidden_size, intermediate_size, alpha,
        #         residual, dtype, x_stride, y_stride, w12_stride, w3_stride)
        test(lib, handle, "mlu", *case)
    destroy_handle(lib, handle)
if __name__ == "__main__":
    test_cases = [
        # num_tokens, hidden_size, intermediate_size, alpha, residual, dtype, x_stride, y_stride, w12_stride, w3_stride
        (4, 4096, 11008, 1.0, True, torch.float16, None, None, None, None),
        (4, 4096, 11008, 1.0, True, torch.float16, [8192, 1], [8192, 1], None, None),
        (
            4,
            4096,
            11008,
            1.0,
            True,
            torch.float16,
            None,
            None,
            [1, 4096],
            [1, 11008],
        ),
        (4, 4096, 11008, 1.0, False, torch.float16, None, None, None, None),
        (4, 4096, 11008, 1.0, False, torch.float16, [8192, 1], [8192, 1], None, None),
    ]
    args = get_args()
    lib = open_lib()
    # Declare the C signatures of the MLP entry points so ctypes
    # marshals arguments and return codes correctly.
    lib.infiniopCreateMLPDescriptor.restype = c_int32
    lib.infiniopCreateMLPDescriptor.argtypes = [
        infiniopHandle_t,
        POINTER(infiniopMLPDescriptor_t),
        infiniopTensorDescriptor_t,
        infiniopTensorDescriptor_t,
        infiniopTensorDescriptor_t,
        infiniopTensorDescriptor_t,
        c_float,
        c_bool,
    ]
    lib.infiniopGetMLPWorkspaceSize.restype = c_int32
    lib.infiniopGetMLPWorkspaceSize.argtypes = [
        infiniopMLPDescriptor_t,
        POINTER(c_uint64),
    ]
    lib.infiniopMLP.restype = c_int32
    lib.infiniopMLP.argtypes = [
        infiniopMLPDescriptor_t,
        c_void_p,
        c_uint64,
        c_void_p,
        c_void_p,
        c_void_p,
        c_void_p,
        c_void_p,
    ]
    lib.infiniopDestroyMLPDescriptor.restype = c_int32
    lib.infiniopDestroyMLPDescriptor.argtypes = [
        infiniopMLPDescriptor_t,
    ]
    if args.cpu:
        test_cpu(lib, test_cases)
    if args.cuda:
        test_cuda(lib, test_cases)
    if args.bang:
        test_bang(lib, test_cases)
    # Default to CPU when no device flag was given.
    if not (args.cpu or args.cuda or args.bang):
        test_cpu(lib, test_cases)
    print("\033[92mTest passed!\033[0m")
import torch
import ctypes
from ctypes import POINTER, Structure, c_int32, c_void_p, c_uint64
from ctypes import c_uint64
from libinfiniop import (
infiniopHandle_t,
infiniopTensorDescriptor_t,
open_lib,
to_tensor,
LIBINFINIOP,
TestTensor,
get_test_devices,
check_error,
rearrange_if_needed,
test_operator,
get_args,
debug,
get_tolerance,
profile_operation,
create_workspace,
TestWorkspace,
InfiniDtype,
InfiniDtypeNames,
InfiniDeviceNames,
infiniopOperatorDescriptor_t,
)
from enum import Enum, auto
......@@ -58,126 +59,93 @@ _TEST_CASES = [
]
# Data types used for testing
_TENSOR_DTYPES = [torch.float16, torch.float32]
_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16]
# Tolerance map for different data types
_TOLERANCE_MAP = {
torch.float16: {"atol": 1e-3, "rtol": 1e-3},
torch.float32: {"atol": 1e-7, "rtol": 1e-7},
InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3},
InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7},
InfiniDtype.BF16: {"atol": 1e-3, "rtol": 1e-3},
}
DEBUG = False
PROFILE = False
NUM_PRERUN = 10
NUM_ITERATIONS = 1000
class MulDescriptor(Structure):
    # Legacy opaque handle for the C-side Mul descriptor; only the
    # device id is exposed to Python.
    _fields_ = [("device", c_int32)]


infiniopMulDescriptor_t = POINTER(MulDescriptor)
def mul(x, y):
    """Element-wise product of x and y (PyTorch reference)."""
    return x * y
def process_tensors(c, c_strides, a, a_stride, b, b_stride, inplace):
    """
    rearrange the tensors if needed and apply the inplace config.
    if inplace is true and the output (i.e., c) is placed to the broadcasted input,
    the inplace config is ignored and out-of-place is used
    """
    # Remember c's unbroadcast strides so they can be restored below.
    original_c_strides = c_strides if c_strides else c.stride()

    def _rearrange(tensor, strides):
        # A 0 stride marks a broadcast dimension; install the view via set_()
        # directly, since a data copy cannot reproduce broadcast strides.
        if strides and 0 in strides:
            tensor.set_(tensor.untyped_storage(), 0, tensor.shape, strides)
            return tensor
        else:
            return rearrange_if_needed(tensor, strides)

    a, b, c = [
        _rearrange(tensor, stride)
        for tensor, stride in zip([a, b, c], [a_stride, b_stride, c_strides])
    ]
    # Apply the inplace config: the output aliases a or b when requested.
    c = (
        c
        if inplace == Inplace.OUT_OF_PLACE
        else (a if inplace == Inplace.INPLACE_A else b)
    )
    # if inplace is true and c has broadcasted config, reset it to the original unbroadcasted strides
    if 0 in c.stride():
        c.set_(c.untyped_storage(), 0, c.shape, original_c_strides)
    return a, b, c
def mul(c, a, b):
    # In-place reference: writes a * b into the pre-allocated output c.
    torch.mul(a, b, out=c)
def test(
lib,
handle,
torch_device,
device,
shape,
a_stride=None,
b_stride=None,
c_stride=None,
inplace=Inplace.OUT_OF_PLACE,
dtype=torch.float16,
dtype=InfiniDtype.F16,
sync=None,
):
a = TestTensor(shape, a_stride, dtype, device)
b = TestTensor(shape, b_stride, dtype, device)
if inplace == Inplace.INPLACE_A:
if c_stride is not None and c_stride != a_stride:
return
c = a
elif inplace == Inplace.INPLACE_B:
if c_stride is not None and c_stride != b_stride:
return
c = b
else:
c = TestTensor(shape, c_stride, dtype, device)
if c.is_broadcast():
return
print(
f"Testing Mul on {torch_device} with shape:{shape} a_stride:{a_stride} b_stride:{b_stride} c_stride:{c_stride} "
f"dtype:{dtype} inplace:{inplace}"
f"Testing Mul on {InfiniDeviceNames[device]} with shape:{shape} a_stride:{a_stride} b_stride:{b_stride} c_stride:{c_stride} "
f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}"
)
mul(c.torch_tensor(), a.torch_tensor(), b.torch_tensor())
a = torch.rand(shape, dtype=dtype).to(torch_device)
b = torch.rand(shape, dtype=dtype).to(torch_device)
c = torch.rand(shape, dtype=dtype).to(torch_device)
a, b, c = process_tensors(c, c_stride, a, a_stride, b, b_stride, inplace)
ans = mul(a, b)
a_tensor, b_tensor = [to_tensor(tensor, lib) for tensor in [a, b]]
c_tensor = (
to_tensor(c, lib)
if inplace == Inplace.OUT_OF_PLACE
else (a_tensor if inplace == Inplace.INPLACE_A else b_tensor)
)
if sync is not None:
sync()
descriptor = infiniopMulDescriptor_t()
descriptor = infiniopOperatorDescriptor_t()
check_error(
lib.infiniopCreateMulDescriptor(
LIBINFINIOP.infiniopCreateMulDescriptor(
handle,
ctypes.byref(descriptor),
c_tensor.descriptor,
a_tensor.descriptor,
b_tensor.descriptor,
c.descriptor,
a.descriptor,
b.descriptor,
)
)
# Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
for tensor in [a_tensor, b_tensor, c_tensor]:
tensor.destroyDesc(lib)
for tensor in [a, b, c]:
tensor.destroy_desc()
workspace_size = c_uint64(0)
check_error(
lib.infiniopGetMulWorkspaceSize(descriptor, ctypes.byref(workspace_size))
LIBINFINIOP.infiniopGetMulWorkspaceSize(
descriptor, ctypes.byref(workspace_size)
)
)
workspace = create_workspace(workspace_size.value, c.device)
workspace = TestWorkspace(workspace_size.value, c.device)
def lib_mul():
check_error(
lib.infiniopMul(
LIBINFINIOP.infiniopMul(
descriptor,
workspace.data_ptr() if workspace is not None else None,
workspace.data(),
workspace_size.value,
c_tensor.data,
a_tensor.data,
b_tensor.data,
c.data(),
a.data(),
b.data(),
None,
)
)
......@@ -186,52 +154,20 @@ def test(
atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
if DEBUG:
debug(c, ans, atol=atol, rtol=rtol)
assert torch.allclose(c, ans, atol=atol, rtol=rtol)
debug(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol)
assert torch.allclose(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol)
# Profiling workflow
if PROFILE:
# fmt: off
profile_operation("PyTorch", lambda: mul(a, b), torch_device, NUM_PRERUN, NUM_ITERATIONS)
profile_operation(" lib", lambda: lib_mul(), torch_device, NUM_PRERUN, NUM_ITERATIONS)
profile_operation("PyTorch", lambda: mul(c.torch_tensor(), a.torch_tensor(), b.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS)
profile_operation(" lib", lambda: lib_mul(), device, NUM_PRERUN, NUM_ITERATIONS)
# fmt: on
check_error(lib.infiniopDestroyMulDescriptor(descriptor))
check_error(LIBINFINIOP.infiniopDestroyMulDescriptor(descriptor))
if __name__ == "__main__":
args = get_args()
lib = open_lib()
lib.infiniopCreateMulDescriptor.restype = c_int32
lib.infiniopCreateMulDescriptor.argtypes = [
infiniopHandle_t,
POINTER(infiniopMulDescriptor_t),
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
]
lib.infiniopGetMulWorkspaceSize.restype = c_int32
lib.infiniopGetMulWorkspaceSize.argtypes = [
infiniopMulDescriptor_t,
POINTER(c_uint64),
]
lib.infiniopMul.restype = c_int32
lib.infiniopMul.argtypes = [
infiniopMulDescriptor_t,
c_void_p,
c_uint64,
c_void_p,
c_void_p,
c_void_p,
c_void_p,
]
lib.infiniopDestroyMulDescriptor.restype = c_int32
lib.infiniopDestroyMulDescriptor.argtypes = [
infiniopMulDescriptor_t,
]
# Configure testing options
DEBUG = args.debug
......@@ -240,7 +176,6 @@ if __name__ == "__main__":
NUM_ITERATIONS = args.num_iterations
for device in get_test_devices(args):
test_operator(lib, device, test, _TEST_CASES, _TENSOR_DTYPES)
test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES)
print("\033[92mTest passed!\033[0m")
import torch
import ctypes
from ctypes import POINTER, Structure, c_int32, c_uint64, c_void_p, c_float
from ctypes import c_uint64
from libinfiniop import (
InfiniDtype,
infiniopHandle_t,
infiniopTensorDescriptor_t,
open_lib,
to_tensor,
LIBINFINIOP,
TestTensor,
get_test_devices,
check_error,
create_workspace,
test_operator,
get_args,
debug_all,
get_tolerance,
profile_operation,
synchronize_device,
TestWorkspace,
InfiniDtype,
InfiniDtypeNames,
InfiniDeviceNames,
infiniopOperatorDescriptor_t,
)
# ==============================================================================
......@@ -37,11 +37,11 @@ _TEST_CASES = [
]
# Data types used for testing
_TENSOR_DTYPES = [torch.float16, torch.bfloat16]
_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.BF16]
_TOLERANCE_MAP = {
torch.float16: {"atol": 0, "rtol": 0},
torch.bfloat16: {"atol": 0, "rtol": 0},
InfiniDtype.F16: {"atol": 0, "rtol": 0},
InfiniDtype.BF16: {"atol": 0, "rtol": 0},
}
......@@ -51,13 +51,6 @@ NUM_PRERUN = 10
NUM_ITERATIONS = 1000
class RandomSampleDescriptor(Structure):
_fields_ = [("device", c_int32)]
infiniopRandomSampleDescriptor_t = POINTER(RandomSampleDescriptor)
def random_sample(data, random_val, topp, topk, voc, temperature):
if topp > 0 and topk > 1:
sorted_vals, sorted_indices = torch.sort(data, descending=True)
......@@ -68,81 +61,83 @@ def random_sample(data, random_val, topp, topk, voc, temperature):
k_index = min(topk, voc) - 1
threshold = min(cum_probs[k_index], topp) * random_val
try:
idx = torch.searchsorted(cum_probs, threshold)
except Exception:
# Fallback for manual search if torch.searchsorted is not supported
indices = (cum_probs >= threshold).nonzero(as_tuple=True)[0]
idx = indices[0] if indices.numel() > 0 else torch.tensor(len(cum_probs)-1, device=cum_probs.device)
idx = (
indices[0]
if indices.numel() > 0
else torch.tensor(len(cum_probs) - 1, device=cum_probs.device)
)
return sorted_indices[idx]
return torch.argmax(data)
def test(
lib,
handle,
torch_device,
device,
voc,
random_val,
topp,
topk,
temperature,
dtype=torch.float16,
dtype=InfiniDtype.F16,
sync=None,
):
print(
f"Testing RandomSample on {torch_device} with voc:{voc} random_val:{random_val} topp:{topp} topk:{topk} temperature:{temperature} dtype:{dtype}"
f"Testing RandomSample on {InfiniDeviceNames[device]} with voc:{voc} random_val:{random_val} topp:{topp} topk:{topk} temperature:{temperature} dtype:{InfiniDtypeNames[dtype]}"
)
data = torch.arange(voc).float() * 0.0001
_perm = torch.randperm(voc)
data = data[_perm].to(dtype).to(torch_device)
logits = TestTensor.from_torch(
torch.arange(voc)[_perm].float() * 0.0001, dtype, device
)
ans = random_sample(
data, random_val, topp, topk, voc, temperature
logits.torch_tensor(), random_val, topp, topk, voc, temperature
).to(
torch.int32
) # 这个函数在device速度可能会很慢,可以通过data.to("cpu")方式加快计算过程
indices = torch.zeros([], dtype=torch.int64).to(torch_device)
x_tensor, indices_tensor = [to_tensor(tensor, lib) for tensor in [data, indices]]
indices_tensor.descriptor.contents.dt = InfiniDtype.U64 # treat int64 as uint64
indices = TestTensor([], None, InfiniDtype.I32, device, mode="zeros")
if sync is not None:
sync()
descriptor = infiniopRandomSampleDescriptor_t()
descriptor = infiniopOperatorDescriptor_t()
check_error(
lib.infiniopCreateRandomSampleDescriptor(
LIBINFINIOP.infiniopCreateRandomSampleDescriptor(
handle,
ctypes.byref(descriptor),
indices_tensor.descriptor,
x_tensor.descriptor,
indices.descriptor,
logits.descriptor,
)
)
# Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
for tensor in [x_tensor, indices_tensor]:
tensor.destroyDesc(lib)
for tensor in [logits, indices]:
tensor.destroy_desc()
workspace_size = c_uint64(0)
check_error(
lib.infiniopGetRandomSampleWorkspaceSize(
LIBINFINIOP.infiniopGetRandomSampleWorkspaceSize(
descriptor, ctypes.byref(workspace_size)
)
)
workspace = create_workspace(workspace_size.value, torch_device)
workspace = TestWorkspace(workspace_size.value, device)
def lib_random_sample():
check_error(
lib.infiniopRandomSample(
LIBINFINIOP.infiniopRandomSample(
descriptor,
workspace.data_ptr() if workspace is not None else None,
workspace.data(),
workspace_size.value,
indices_tensor.data,
x_tensor.data,
indices.data(),
logits.data(),
random_val,
topp,
topk,
......@@ -153,66 +148,36 @@ def test(
lib_random_sample()
if torch_device == "npu":
synchronize_device(torch_device)
if sync is not None:
sync()
atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
if DEBUG:
debug_all(
(indices.type(ans.dtype), data[indices]),
(ans, data[ans]),
(indices.actual_tensor(), logits.actual_tensor()[indices.actual_tensor()]),
(ans, logits.torch_tensor()[ans]),
"or",
atol=atol,
rtol=rtol,
)
assert indices.type(ans.dtype) == ans or data[ans] == data[indices]
assert (
indices.actual_tensor() == ans
or logits.actual_tensor()[indices.actual_tensor()] == logits.torch_tensor()[ans]
)
# Profiling workflow
if PROFILE:
# fmt: off
profile_operation("PyTorch", lambda: random_sample(
data, random_val, topp, topk, voc, temperature
), torch_device, NUM_PRERUN, NUM_ITERATIONS)
profile_operation(" lib", lambda: lib_random_sample(), torch_device, NUM_PRERUN, NUM_ITERATIONS)
logits.torch_tensor(), random_val, topp, topk, voc, temperature
), device, NUM_PRERUN, NUM_ITERATIONS)
profile_operation(" lib", lambda: lib_random_sample(), device, NUM_PRERUN, NUM_ITERATIONS)
# fmt: on
check_error(lib.infiniopDestroyRandomSampleDescriptor(descriptor))
check_error(LIBINFINIOP.infiniopDestroyRandomSampleDescriptor(descriptor))
if __name__ == "__main__":
args = get_args()
lib = open_lib()
lib.infiniopCreateRandomSampleDescriptor.restype = c_int32
lib.infiniopCreateRandomSampleDescriptor.argtypes = [
infiniopHandle_t,
POINTER(infiniopRandomSampleDescriptor_t),
infiniopTensorDescriptor_t,
]
lib.infiniopGetRandomSampleWorkspaceSize.restype = c_int32
lib.infiniopGetRandomSampleWorkspaceSize.argtypes = [
infiniopRandomSampleDescriptor_t,
POINTER(c_uint64),
]
lib.infiniopRandomSample.restype = c_int32
lib.infiniopRandomSample.argtypes = [
infiniopRandomSampleDescriptor_t,
c_void_p,
c_uint64,
c_uint64,
c_void_p,
c_float,
c_float,
c_int32,
c_float,
c_void_p,
]
lib.infiniopDestroyRandomSampleDescriptor.restype = c_int32
lib.infiniopDestroyRandomSampleDescriptor.argtypes = [
infiniopRandomSampleDescriptor_t,
]
DEBUG = args.debug
PROFILE = args.profile
......@@ -221,6 +186,6 @@ if __name__ == "__main__":
# Execute tests
for device in get_test_devices(args):
test_operator(lib, device, test, _TEST_CASES, _TENSOR_DTYPES)
test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES)
print("\033[92mTest passed!\033[0m")
import torch
import ctypes
from ctypes import POINTER, Structure, c_int32, c_void_p
from libinfiniop import (
infiniopHandle_t,
infiniopTensorDescriptor_t,
open_lib,
to_tensor,
LIBINFINIOP,
TestTensor,
get_test_devices,
check_error,
rearrange_if_needed,
rearrange_tensor,
test_operator,
get_args,
debug,
get_tolerance,
profile_operation,
InfiniDtype,
InfiniDtypeNames,
InfiniDeviceNames,
infiniopOperatorDescriptor_t,
)
def row_major_strides(shape):
"""生成张量的行优先(C风格)stride
Args:
shape: 张量形状
Returns:
行优先strides列表
"""
......@@ -34,12 +34,13 @@ def row_major_strides(shape):
strides.insert(0, stride)
return strides
def column_major_strides(shape):
"""生成张量的列优先(Fortran风格)stride
Args:
shape: 张量形状
Returns:
列优先strides列表
"""
......@@ -52,62 +53,37 @@ def column_major_strides(shape):
return strides
# ==============================================================================
# Configuration (Internal Use Only)
# ==============================================================================
# These are not meant to be imported from other modules
_TEST_CASES = [
# (shape, x_stride, y_stride)
((100, 100), (1, 100), (100, 1)), # shape # x_stride # y_stride
((4, 4), (1, 4), (4, 1)), # shape # x_stride # y_stride
((4, 6, 64), (64, 4 * 64, 1), (6 * 64, 64, 1)), # shape # x_stride # y_stride
((2000, 2000), (1, 2000), (2000, 1)), # shape # x_stride # y_stride
((2001, 2001), (1, 2001), (2001, 1)), # shape # x_stride # y_stride
((2, 2, 2, 4), (16, 8, 4, 1), (16, 8, 1, 2)), # shape # x_stride # y_stride
(
(100, 100), # shape
(1, 100), # x_stride
(100, 1) # y_stride
),
(
(4, 4), # shape
(1, 4), # x_stride
(4, 1) # y_stride
),
(
(4, 6, 64), # shape
(64, 4*64, 1), # x_stride
(6*64, 64, 1) # y_stride
),
(
(2000, 2000), # shape
(1, 2000), # x_stride
(2000, 1) # y_stride
(3, 4, 7, 53, 9), # shape
row_major_strides((3, 4, 7, 53, 9)), # x_stride
column_major_strides((3, 4, 7, 53, 9)), # y_stride
),
(
(2001, 2001), # shape
(1, 2001), # x_stride
(2001, 1) # y_stride
),
(
(2, 2, 2, 4), # shape
(16, 8, 4, 1), # x_stride
(16, 8, 1, 2) # y_stride
),
(
(3, 4, 7, 53, 9), # shape
row_major_strides((3, 4, 7, 53, 9)), # x_stride
column_major_strides((3, 4, 7, 53, 9)) # y_stride
),
(
(3, 4, 50, 50, 5, 7), # shape
(3, 4, 50, 50, 5, 7), # shape
row_major_strides((3, 4, 50, 50, 5, 7)), # x_stride
column_major_strides((3, 4, 50, 50, 5, 7)) # y_stride
column_major_strides((3, 4, 50, 50, 5, 7)), # y_stride
),
]
# Data types used for testing
_TENSOR_DTYPES = [torch.float16, torch.float32]
_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32]
# Tolerance map for different data types
_TOLERANCE_MAP = {
torch.float16: {"atol": 0, "rtol": 0},
torch.float32: {"atol": 0, "rtol": 0},
InfiniDtype.F16: {"atol": 0, "rtol": 0},
InfiniDtype.F32: {"atol": 0, "rtol": 0},
}
DEBUG = False
......@@ -116,106 +92,60 @@ NUM_PRERUN = 10
NUM_ITERATIONS = 1000
class RearrangeDescriptor(Structure):
_fields_ = [("device", c_int32)]
infiniopRearrangeDescriptor_t = POINTER(RearrangeDescriptor)
def rearrange_torch(x, x_shape, y_stride):
    # Build a copy of x whose underlying layout uses y_stride, then fill it
    # element-wise so the values match x under the new strides.
    y_ = x.clone()
    y_.set_(y_.untyped_storage(), 0, x_shape, y_stride)
    y_[:] = x.view_as(y_)
    return y_
def rearrange_torch(y, x, x_shape, y_stride):
    # Re-stride y in place to y_stride, then copy x's values into it.
    # NOTE(review): mutates y; returns None by design.
    y.set_(y.untyped_storage(), 0, x_shape, y_stride)
    y[:] = x.view_as(y)
def test(
lib,
handle,
torch_device,
shape,
x_stride,
y_stride,
dtype=torch.float16,
sync=None
handle, torch_device, shape, x_stride, y_stride, dtype=InfiniDtype.F16, sync=None
):
print(
f"Testing Rerrange on {torch_device} with shape:{shape} x_stride:{x_stride} y_stride:{y_stride} dtype:{dtype}"
f"Testing Rerrange on {InfiniDeviceNames[torch_device]} with shape:{shape} x_stride:{x_stride} y_stride:{y_stride} dtype:{InfiniDtypeNames[dtype]}"
)
x = torch.rand(shape, dtype=dtype).to(torch_device)
y = torch.zeros(shape, dtype=dtype).to(torch_device)
x = TestTensor(shape, x_stride, dtype, device)
y = TestTensor(shape, y_stride, dtype, device, mode="ones")
rearrange_torch(x, shape, y_stride)
rearrange_torch(y.torch_tensor(), x.torch_tensor(), shape, y_stride)
x, y = [
rearrange_if_needed(tensor, stride)
for tensor, stride in zip([x, y], [x_stride, y_stride])
]
x_tensor, y_tensor = [to_tensor(tensor, lib) for tensor in [x, y]]
if sync is not None:
sync()
descriptor = infiniopRearrangeDescriptor_t()
descriptor = infiniopOperatorDescriptor_t()
check_error(
lib.infiniopCreateRearrangeDescriptor(
handle, ctypes.byref(descriptor), y_tensor.descriptor, x_tensor.descriptor
LIBINFINIOP.infiniopCreateRearrangeDescriptor(
handle, ctypes.byref(descriptor), y.descriptor, x.descriptor
)
)
# Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
for tensor in [x_tensor, y_tensor]:
tensor.destroyDesc(lib)
for tensor in [x, y]:
tensor.destroy_desc()
def lib_rearrange():
check_error(
lib.infiniopRearrange(descriptor, y_tensor.data, x_tensor.data, None)
)
check_error(LIBINFINIOP.infiniopRearrange(descriptor, y.data(), x.data(), None))
lib_rearrange()
# Validate results
atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
if DEBUG:
debug(x, y, atol=atol, rtol=rtol)
assert torch.allclose(x, y, atol=atol, rtol=rtol)
debug(y.actual_tensor(), y.torch_tensor(), atol=atol, rtol=rtol)
assert torch.allclose(y.actual_tensor(), y.torch_tensor(), atol=atol, rtol=rtol)
# Profiling workflow
if PROFILE:
# fmt: off
profile_operation("PyTorch", lambda: rearrange_torch(x, shape, y_stride), torch_device, NUM_PRERUN, NUM_ITERATIONS)
profile_operation(" lib", lambda: lib_rearrange(), torch_device, NUM_PRERUN, NUM_ITERATIONS)
profile_operation("PyTorch", lambda: rearrange_torch(y.torch_tensor(), x.torch_tensor(), shape, y_stride), device, NUM_PRERUN, NUM_ITERATIONS)
profile_operation(" lib", lambda: lib_rearrange(), device, NUM_PRERUN, NUM_ITERATIONS)
# fmt: on
check_error(lib.infiniopDestroyRearrangeDescriptor(descriptor))
check_error(LIBINFINIOP.infiniopDestroyRearrangeDescriptor(descriptor))
if __name__ == "__main__":
args = get_args()
lib = open_lib()
lib.infiniopCreateRearrangeDescriptor.restype = c_int32
lib.infiniopCreateRearrangeDescriptor.argtypes = [
infiniopHandle_t,
POINTER(infiniopRearrangeDescriptor_t),
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
]
lib.infiniopRearrange.restype = c_int32
lib.infiniopRearrange.argtypes = [
infiniopRearrangeDescriptor_t,
c_void_p,
c_void_p,
c_void_p,
]
lib.infiniopDestroyRearrangeDescriptor.restype = c_int32
lib.infiniopDestroyRearrangeDescriptor.argtypes = [infiniopRearrangeDescriptor_t]
# Configure testing options
DEBUG = args.debug
PROFILE = args.profile
......@@ -224,6 +154,6 @@ if __name__ == "__main__":
# Execute tests
for device in get_test_devices(args):
test_operator(lib, device, test, _TEST_CASES, _TENSOR_DTYPES)
test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES)
print("\033[92mTest passed!\033[0m")
from ctypes import POINTER, Structure, c_int32, c_void_p
import ctypes
import sys
import os
import time
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")))
from operatorspy import (
open_lib,
to_tensor,
DeviceEnum,
infiniopHandle_t,
infiniopTensorDescriptor_t,
create_handle,
destroy_handle,
check_error,
)
from operatorspy.tests.test_utils import get_args
from ctypes import c_uint64
from enum import Enum, auto
import torch
from libinfiniop import (
LIBINFINIOP,
InfiniDeviceNames,
InfiniDtype,
InfiniDtypeNames,
TestTensor,
TestWorkspace,
check_error,
debug,
get_args,
get_test_devices,
get_tolerance,
infiniopOperatorDescriptor_t,
profile_operation,
test_operator,
)
# constant for control whether profile the pytorch and lib functions
# NOTE: need to manually add synchronization function to the lib function,
# e.g., cudaDeviceSynchronize() for CUDA
PROFILE = False
NUM_PRERUN = 10
NUM_ITERATIONS = 1000
# ==============================================================================
# Configuration (Internal Use Only)
# ==============================================================================
# These are not meant to be imported from other modules
_TEST_CASES_ = [
# tensor_shape, inplace
# TODO: Uncomment the following line.
# ((),),
((1, 3),),
((3, 3),),
((32, 20, 512),),
((33, 333, 333),),
((32, 256, 112, 112),),
((3, 3, 13, 9, 17),),
]
class Inplace(Enum):
......@@ -33,160 +42,121 @@ class Inplace(Enum):
INPLACE_X = auto()
class ReluDescriptor(Structure):
    """ctypes mirror of the library's Relu descriptor handle.

    Only the int32 `device` field is declared here; the rest of the native
    struct is opaque to Python and accessed solely through POINTER(...) handles.
    """

    _fields_ = [("device", c_int32)]
# Inplace options applied for each test case in _TEST_CASES_
_INPLACE = [
Inplace.OUT_OF_PLACE,
Inplace.INPLACE_X,
]
# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_
_TEST_CASES = [
test_case + (inplace_item,)
for test_case in _TEST_CASES_
for inplace_item in _INPLACE
]
infiniopReluDescriptor_t = POINTER(ReluDescriptor)
# Data types used for testing
_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16]
# Tolerance map for different data types
_TOLERANCE_MAP = {
InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3},
InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7},
InfiniDtype.BF16: {"atol": 1e-3, "rtol": 1e-3},
}
DEBUG = False
PROFILE = False
NUM_PRERUN = 10
NUM_ITERATIONS = 1000
def relu(x):
    """Reference ReLU computed with PyTorch, cast back to the input dtype.

    When the module-level PROFILE flag is set, a CUDA synchronize is issued
    after the op so wall-clock timing of the PyTorch path is accurate.
    """
    out = torch.nn.functional.relu(x).to(x.dtype)
    if PROFILE:
        torch.cuda.synchronize()
    return out
def test(
lib,
handle,
torch_device,
tensor_shape,
tensor_dtype=torch.float16,
inplace=Inplace.OUT_OF_PLACE,
sync=None
handle, device, shape, inplace=Inplace.OUT_OF_PLACE, dtype=torch.float16, sync=None
):
print(
f"Testing Relu on {torch_device} with tensor_shape:{tensor_shape} dtype:{tensor_dtype} inplace: {inplace.name}"
x_torch_tensor = torch.rand(shape) * 2 - 1
x = TestTensor(
shape,
x_torch_tensor.stride(),
dtype,
device,
mode="manual",
set_tensor=x_torch_tensor,
)
x = torch.rand(tensor_shape, dtype=tensor_dtype).to(torch_device) * 2 - 1
y = (
torch.rand(tensor_shape, dtype=tensor_dtype).to(torch_device)
if inplace == Inplace.OUT_OF_PLACE
else x
)
if inplace == Inplace.INPLACE_X:
y = x
else:
y = TestTensor(shape, None, dtype, device)
for i in range(NUM_PRERUN if PROFILE else 1):
ans = relu(x)
if PROFILE:
start_time = time.time()
for i in range(NUM_ITERATIONS):
_ = relu(x)
elapsed = (time.time() - start_time) / NUM_ITERATIONS
print(f"pytorch time: {elapsed :6f}")
if y.is_broadcast():
return
x_tensor = to_tensor(x, lib)
y_tensor = to_tensor(y, lib) if inplace == Inplace.OUT_OF_PLACE else x_tensor
print(
f"Testing Relu on {InfiniDeviceNames[device]} with shape:{shape} dtype:{InfiniDtypeNames[dtype]} inplace: {inplace}"
)
ans = relu(x.torch_tensor())
if sync is not None:
sync()
sync()
descriptor = infiniopReluDescriptor_t()
descriptor = infiniopOperatorDescriptor_t()
check_error(
lib.infiniopCreateReluDescriptor(
handle,
ctypes.byref(descriptor),
y_tensor.descriptor,
x_tensor.descriptor,
LIBINFINIOP.infiniopCreateReluDescriptor(
handle, ctypes.byref(descriptor), y.descriptor, x.descriptor
)
)
# Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
x_tensor.descriptor.contents.invalidate()
y_tensor.descriptor.contents.invalidate()
for tensor in [x, y]:
tensor.destroy_desc()
for i in range(NUM_PRERUN if PROFILE else 1):
check_error(lib.infiniopRelu(descriptor, y_tensor.data, x_tensor.data, None))
if PROFILE:
start_time = time.time()
for i in range(NUM_ITERATIONS):
check_error(
lib.infiniopRelu(descriptor, y_tensor.data, x_tensor.data, None)
)
elapsed = (time.time() - start_time) / NUM_ITERATIONS
print(f" lib time: {elapsed :6f}")
assert torch.allclose(y, ans, atol=0, rtol=1e-3)
check_error(lib.infiniopDestroyReluDescriptor(descriptor))
def test_cpu(lib, test_cases):
device = DeviceEnum.DEVICE_CPU
handle = create_handle(lib, device)
for tensor_shape, inplace in test_cases:
# fmt: off
test(lib, handle, "cpu", tensor_shape, tensor_dtype=torch.float16, inplace=inplace)
test(lib, handle, "cpu", tensor_shape, tensor_dtype=torch.float32, inplace=inplace)
# fmt: on
destroy_handle(lib, handle)
def test_cuda(lib, test_cases):
workspace_size = c_uint64(0)
check_error(
LIBINFINIOP.infiniopGetReluWorkspaceSize(
descriptor, ctypes.byref(workspace_size)
)
)
workspace = TestWorkspace(workspace_size.value, y.device)
device = DeviceEnum.DEVICE_CUDA
handle = create_handle(lib, device)
for tensor_shape, inplace in test_cases:
# fmt: off
test(lib, handle, "cuda", tensor_shape, tensor_dtype=torch.float16, inplace=inplace)
test(lib, handle, "cuda", tensor_shape, tensor_dtype=torch.float32, inplace=inplace)
# fmt: on
destroy_handle(lib, handle)
def lib_relu():
LIBINFINIOP.infiniopRelu(
descriptor, workspace.data(), workspace.size(), y.data(), x.data(), None
)
lib_relu()
def test_bang(lib, test_cases):
import torch_mlu
atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
if DEBUG:
debug(y.actual_tensor(), ans, atol=atol, rtol=rtol)
assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol)
device = DeviceEnum.DEVICE_BANG
handle = create_handle(lib, device)
for tensor_shape, inplace in test_cases:
# Profiling workflow
if PROFILE:
# fmt: off
test(lib, handle, "mlu", tensor_shape, tensor_dtype=torch.float16, inplace=inplace)
test(lib, handle, "mlu", tensor_shape, tensor_dtype=torch.float32, inplace=inplace)
profile_operation("PyTorch", lambda: relu(x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS)
profile_operation(" lib", lambda: lib_relu(), device, NUM_PRERUN, NUM_ITERATIONS)
# fmt: on
destroy_handle(lib, handle)
check_error(LIBINFINIOP.infiniopDestroyReluDescriptor(descriptor))
if __name__ == "__main__":
test_cases = [
# tensor_shape, inplace
((), Inplace.OUT_OF_PLACE),
((), Inplace.INPLACE_X),
((1, 3), Inplace.OUT_OF_PLACE),
((3, 3), Inplace.OUT_OF_PLACE),
((3, 3, 13, 9, 17), Inplace.INPLACE_X),
((32, 20, 512), Inplace.INPLACE_X),
((33, 333, 333), Inplace.OUT_OF_PLACE),
((32, 256, 112, 112), Inplace.OUT_OF_PLACE),
]
args = get_args()
lib = open_lib()
lib.infiniopCreateReluDescriptor.restype = c_int32
lib.infiniopCreateReluDescriptor.argtypes = [
infiniopHandle_t,
POINTER(infiniopReluDescriptor_t),
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
]
lib.infiniopRelu.restype = c_int32
lib.infiniopRelu.argtypes = [
infiniopReluDescriptor_t,
c_void_p,
c_void_p,
c_void_p,
]
lib.infiniopDestroyReluDescriptor.restype = c_int32
lib.infiniopDestroyReluDescriptor.argtypes = [
infiniopReluDescriptor_t,
]
if args.cpu:
test_cpu(lib, test_cases)
if args.cuda:
test_cuda(lib, test_cases)
if args.bang:
test_bang(lib, test_cases)
if not (args.cpu or args.cuda or args.bang):
test_cpu(lib, test_cases)
# Configure testing options
DEBUG = args.debug
PROFILE = args.profile
NUM_PRERUN = args.num_prerun
NUM_ITERATIONS = args.num_iterations
for device in get_test_devices(args):
test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES)
print("\033[92mTest passed!\033[0m")
from ctypes import POINTER, Structure, c_int32, c_uint64, c_void_p, c_float
import ctypes
import torch
import ctypes
from ctypes import POINTER, Structure, c_int32, c_uint64, c_void_p, c_float
from ctypes import c_uint64
from libinfiniop import (
infiniopHandle_t,
infiniopTensorDescriptor_t,
open_lib,
to_tensor,
LIBINFINIOP,
TestTensor,
get_test_devices,
check_error,
rearrange_if_needed,
create_workspace,
test_operator,
get_args,
debug,
get_tolerance,
profile_operation,
TestWorkspace,
InfiniDtype,
InfiniDtypeNames,
InfiniDeviceNames,
infiniopOperatorDescriptor_t,
)
# ==============================================================================
......@@ -33,23 +32,21 @@ _TEST_CASES_ = [
((16, 2048), (16, 2048), (2048,), (4096, 1), (4096, 1)),
]
# w (weight) types
# w (weight) types
# Note: 'None' means the same as input dtype
_WEIGHT_DTYPES = [None, torch.float32]
_WEIGHT_DTYPES = [None, InfiniDtype.F32]
# x types used for testing
_TENSOR_DTYPES = [torch.float16, torch.bfloat16]
_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.BF16]
# Form the test cases by appending each element of _WEIGHT_DTYPES to each tuple in _TEST_CASES_
_TEST_CASES = [
test_case + (w_dtype,)
for test_case in _TEST_CASES_
for w_dtype in _WEIGHT_DTYPES
test_case + (w_dtype,) for test_case in _TEST_CASES_ for w_dtype in _WEIGHT_DTYPES
]
# Tolerance map for different data types
_TOLERANCE_MAP = {
torch.float16: {"atol": 2e-3, "rtol": 2e-3},
torch.bfloat16: {"atol": 8e-3, "rtol": 8e-3},
InfiniDtype.F16: {"atol": 2e-3, "rtol": 2e-3},
InfiniDtype.BF16: {"atol": 8e-3, "rtol": 8e-3},
}
DEBUG = False
......@@ -58,13 +55,6 @@ NUM_PRERUN = 10
NUM_ITERATIONS = 1000
class RMSNormDescriptor(Structure):
    """ctypes mirror of the library's RMSNorm descriptor handle.

    Only the int32 `device` field is declared; the native struct is otherwise
    opaque and passed around via POINTER(...) handles.
    """

    _fields_ = [("device", c_int32)]
infiniopRMSNormDescriptor_t = POINTER(RMSNormDescriptor)
def rms_norm(ans, x, w, eps):
torch.pow(x, 2, out=ans)
mean = torch.mean(ans, dim=-1, keepdim=True)
......@@ -75,73 +65,67 @@ def rms_norm(ans, x, w, eps):
def test(
lib,
handle,
torch_device,
device,
y_shape,
x_shape,
w_shape,
y_stride,
x_stride,
w_dtype=torch.float16,
dtype=torch.float16,
w_dtype=InfiniDtype.F32,
dtype=InfiniDtype.F16,
sync=None,
):
w_dtype = w_dtype if w_dtype else dtype
print(
f"Testing RMS_Norm on {torch_device} with y_shape:{y_shape} x_shape:{x_shape} w_shape:{w_shape}"
f" y_stride:{y_stride} x_stride:{x_stride} w_dtype:{w_dtype} dtype:{dtype}"
f"Testing RMS_Norm on {InfiniDeviceNames[device]} with y_shape:{y_shape} x_shape:{x_shape} w_shape:{w_shape}"
f" y_stride:{y_stride} x_stride:{x_stride} w_dtype:{InfiniDtypeNames[w_dtype]} dtype:{InfiniDtypeNames[dtype]}"
)
w_dtype = w_dtype if w_dtype else dtype
y = torch.zeros(y_shape, dtype=dtype).to(torch_device)
x = torch.rand(x_shape, dtype=dtype).to(torch_device)
w = torch.rand(w_shape, dtype=w_dtype).to(torch_device)
ans = torch.zeros(y_shape, dtype=dtype).to(torch_device)
eps = 1e-5
rms_norm(ans, x, w, eps)
y = TestTensor(y_shape, y_stride, dtype, device, mode="ones")
x = TestTensor(x_shape, x_stride, dtype, device, scale=0.01)
w = TestTensor(w_shape, None, w_dtype, device)
x, y = [
rearrange_if_needed(tensor, stride)
for tensor, stride in zip([x, y], [x_stride, y_stride])
]
x_tensor, y_tensor, w_tensor = [to_tensor(tensor, lib) for tensor in [x, y, w]]
eps = 1e-6
rms_norm(y.torch_tensor(), x.torch_tensor(), w.torch_tensor(), eps)
if sync is not None:
sync()
descriptor = infiniopRMSNormDescriptor_t()
descriptor = infiniopOperatorDescriptor_t()
check_error(
lib.infiniopCreateRMSNormDescriptor(
LIBINFINIOP.infiniopCreateRMSNormDescriptor(
handle,
ctypes.byref(descriptor),
y_tensor.descriptor,
x_tensor.descriptor,
w_tensor.descriptor,
y.descriptor,
x.descriptor,
w.descriptor,
eps,
)
)
# Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
for tensor in [x_tensor, y_tensor, w_tensor]:
tensor.destroyDesc(lib)
for tensor in [x, y, w]:
tensor.destroy_desc()
workspace_size = c_uint64(0)
check_error(
lib.infiniopGetRMSNormWorkspaceSize(descriptor, ctypes.byref(workspace_size))
LIBINFINIOP.infiniopGetRMSNormWorkspaceSize(
descriptor, ctypes.byref(workspace_size)
)
)
workspace = create_workspace(workspace_size.value, y.device)
workspace = TestWorkspace(workspace_size.value, y.device)
def lib_rms_norm():
check_error(
lib.infiniopRMSNorm(
LIBINFINIOP.infiniopRMSNorm(
descriptor,
workspace.data_ptr() if workspace is not None else None,
workspace.data(),
workspace_size.value,
y_tensor.data,
x_tensor.data,
w_tensor.data,
y.data(),
x.data(),
w.data(),
None,
)
)
......@@ -150,53 +134,20 @@ def test(
atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
if DEBUG:
debug(y, ans, atol=atol, rtol=rtol)
assert torch.allclose(y, ans, atol=atol, rtol=rtol)
debug(y.actual_tensor(), y.torch_tensor(), atol=atol, rtol=rtol)
assert torch.allclose(y.actual_tensor(), y.torch_tensor(), atol=atol, rtol=rtol)
# Profiling workflow
if PROFILE:
# fmt: off
profile_operation("PyTorch", lambda: rms_norm(ans, x, w, eps), torch_device, NUM_PRERUN, NUM_ITERATIONS)
profile_operation(" lib", lambda: lib_rms_norm(), torch_device, NUM_PRERUN, NUM_ITERATIONS)
profile_operation("PyTorch", lambda: rms_norm(y.torch_tensor(), x.torch_tensor(), w.torch_tensor(), eps), device, NUM_PRERUN, NUM_ITERATIONS)
profile_operation(" lib", lambda: lib_rms_norm(), device, NUM_PRERUN, NUM_ITERATIONS)
# fmt: on
check_error(lib.infiniopDestroyRMSNormDescriptor(descriptor))
check_error(LIBINFINIOP.infiniopDestroyRMSNormDescriptor(descriptor))
if __name__ == "__main__":
args = get_args()
lib = open_lib()
lib.infiniopCreateRMSNormDescriptor.restype = c_int32
lib.infiniopCreateRMSNormDescriptor.argtypes = [
infiniopHandle_t,
POINTER(infiniopRMSNormDescriptor_t),
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
c_float,
]
lib.infiniopGetRMSNormWorkspaceSize.restype = c_int32
lib.infiniopGetRMSNormWorkspaceSize.argtypes = [
infiniopRMSNormDescriptor_t,
POINTER(c_uint64),
]
lib.infiniopRMSNorm.restype = c_int32
lib.infiniopRMSNorm.argtypes = [
infiniopRMSNormDescriptor_t,
c_void_p,
c_uint64,
c_void_p,
c_void_p,
c_void_p,
c_void_p,
]
lib.infiniopDestroyRMSNormDescriptor.restype = c_int32
lib.infiniopDestroyRMSNormDescriptor.argtypes = [
infiniopRMSNormDescriptor_t,
]
# Configure testing options
DEBUG = args.debug
......@@ -206,6 +157,6 @@ if __name__ == "__main__":
# Execute tests
for device in get_test_devices(args):
test_operator(lib, device, test, _TEST_CASES, _TENSOR_DTYPES)
test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES)
print("\033[92mTest passed!\033[0m")
import torch
import ctypes
from ctypes import POINTER, Structure, c_int32, c_uint64, c_void_p
from ctypes import c_uint64
from libinfiniop import (
infiniopHandle_t,
infiniopTensorDescriptor_t,
open_lib,
to_tensor,
LIBINFINIOP,
TestTensor,
get_test_devices,
check_error,
rearrange_if_needed,
create_workspace,
test_operator,
get_args,
debug,
get_tolerance,
profile_operation,
synchronize_device,
TestWorkspace,
InfiniDtype,
InfiniDtypeNames,
InfiniDeviceEnum,
InfiniDeviceNames,
infiniopOperatorDescriptor_t,
)
from enum import Enum, auto
......@@ -35,13 +36,13 @@ _TEST_CASES_ = [
]
# Data types used for testing
_TENSOR_DTYPES = [torch.float16, torch.bfloat16, torch.float32]
_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.BF16, InfiniDtype.F32]
# Tolerance map for different data types
_TOLERANCE_MAP = {
torch.float16: {"atol": 1e-3, "rtol": 1e-2},
torch.bfloat16: {"atol": 5e-3, "rtol": 5e-2},
torch.float32: {"atol": 1e-4, "rtol": 1e-3},
InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-2},
InfiniDtype.BF16: {"atol": 5e-3, "rtol": 5e-2},
InfiniDtype.F32: {"atol": 1e-4, "rtol": 1e-3},
}
......@@ -67,14 +68,7 @@ NUM_PRERUN = 10
NUM_ITERATIONS = 1000
class RoPEDescriptor(Structure):
_fields_ = [("device", c_int32)]
infiniopRoPEDescriptor_t = POINTER(RoPEDescriptor)
def rotary_embedding(t, sin, cos, torch_device):
def rotary_embedding(ans, t, sin, cos, device):
dh = t.shape[2]
dt = t.dtype
assert dh % 2 == 0, "Embedding dimension must be even."
......@@ -82,7 +76,7 @@ def rotary_embedding(t, sin, cos, torch_device):
t_odd = t[..., 1::2] # [seq_len, n_head, dh // 2]
cos = cos.unsqueeze(1) # [seq_len, 1, dh // 2]
sin = sin.unsqueeze(1) # [seq_len, 1, dh // 2]
if torch_device == "cpu":
if device == InfiniDeviceEnum.CPU:
(t_even, t_odd, cos, sin) = (
t_even.float(),
t_odd.float(),
......@@ -93,26 +87,23 @@ def rotary_embedding(t, sin, cos, torch_device):
t_out_even = t_even * cos - t_odd * sin
t_out_odd = t_even * sin + t_odd * cos
t_out = torch.empty_like(t)
t_out[..., 0::2] = t_out_even
t_out[..., 1::2] = t_out_odd
return t_out.to(dt).to(torch_device)
ans[..., 0::2] = t_out_even.to(dt)
ans[..., 1::2] = t_out_odd.to(dt)
def sin_cos_table(pos, dim, torch_device, theta, dtype):
def sin_cos_table(pos, dim, device, theta, dtype):
assert dim % 2 == 0, "Embedding dimension must be even."
freqs = (1.0 / (theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim))).to(
torch_device
freqs = 1.0 / (theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim))
angles = torch.outer(pos.cpu(), freqs)
return (
TestTensor.from_torch(torch.sin(angles), dtype, device),
TestTensor.from_torch(torch.cos(angles), dtype, device),
)
angles = torch.outer(pos, freqs)
return torch.sin(angles).to(dtype), torch.cos(angles).to(dtype)
def test(
lib,
handle,
torch_device,
device,
shape,
x_strides=None,
y_strides=None,
......@@ -120,71 +111,71 @@ def test(
dtype=torch.float32,
sync=None,
):
x = TestTensor(shape, x_strides, dtype, device)
if inplace == Inplace.INPLACE_X:
y_strides = x_strides
print(
f"Testing Rotary Positional Embedding on {torch_device} with shape:{shape} x_strides:{x_strides} y_strides:{y_strides} and dtype:{dtype} inplace:{inplace}"
)
x = torch.rand(shape, dtype=dtype).to(torch_device)
x = rearrange_if_needed(x, x_strides)
if inplace == Inplace.INPLACE_X:
if x_strides != y_strides:
return
y = x
else:
y = torch.rand(shape, dtype=dtype).to(torch_device)
y = rearrange_if_needed(y, y_strides)
y = TestTensor(shape, y_strides, dtype, device)
print(
f"Testing Rotary Positional Embedding on {InfiniDeviceNames[device]} with shape:{shape} x_strides:{x_strides} y_strides:{y_strides} and dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}"
)
theta = 1e5
pos = torch.arange(0, x.shape[0], dtype=torch.int32).to(torch_device)
sin_table, cos_table = sin_cos_table(pos, x.shape[2], x.device, theta, dtype)
pos = TestTensor.from_torch(torch.arange(0, x.shape[0]), InfiniDtype.I32, device)
sin_table, cos_table = sin_cos_table(
pos.torch_tensor(), x.shape[2], x.device, theta, dtype
)
ans = rotary_embedding(x, sin_table, cos_table, torch_device)
rotary_embedding(
y.torch_tensor(),
x.torch_tensor(),
sin_table.torch_tensor(),
cos_table.torch_tensor(),
device,
)
descriptor = infiniopRoPEDescriptor_t()
x_tensor, pos_tensor, sin_table_tensor, cos_table_tensor = [
to_tensor(tensor, lib, force_unsigned=True)
for tensor in [x, pos, sin_table, cos_table]
]
if inplace == Inplace.INPLACE_X:
y_tensor = x_tensor
else:
y_tensor = to_tensor(y, lib)
descriptor = infiniopOperatorDescriptor_t()
if sync is not None:
sync()
check_error(
lib.infiniopCreateRoPEDescriptor(
LIBINFINIOP.infiniopCreateRoPEDescriptor(
handle,
ctypes.byref(descriptor),
y_tensor.descriptor,
x_tensor.descriptor,
pos_tensor.descriptor,
sin_table_tensor.descriptor,
cos_table_tensor.descriptor,
y.descriptor,
x.descriptor,
pos.descriptor,
sin_table.descriptor,
cos_table.descriptor,
)
)
# Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
for tensor in [y_tensor, x_tensor, pos_tensor, sin_table_tensor, cos_table_tensor]:
tensor.destroyDesc(lib)
for tensor in [y, x, pos, sin_table, cos_table]:
tensor.destroy_desc()
workspace_size = c_uint64(0)
check_error(
lib.infiniopGetRoPEWorkspaceSize(descriptor, ctypes.byref(workspace_size))
LIBINFINIOP.infiniopGetRoPEWorkspaceSize(
descriptor, ctypes.byref(workspace_size)
)
)
workspace = create_workspace(workspace_size.value, x.device)
workspace = TestWorkspace(workspace_size.value, x.device)
def lib_rope():
check_error(
lib.infiniopRoPE(
LIBINFINIOP.infiniopRoPE(
descriptor,
workspace.data_ptr() if workspace is not None else None,
workspace.data(),
workspace_size.value,
y_tensor.data,
x_tensor.data,
pos_tensor.data,
sin_table_tensor.data,
cos_table_tensor.data,
y.data(),
x.data(),
pos.data(),
sin_table.data(),
cos_table.data(),
None,
)
)
......@@ -196,60 +187,32 @@ def test(
atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
if DEBUG:
debug(y, ans, atol=atol, rtol=rtol)
assert torch.allclose(y, ans, atol=atol, rtol=rtol)
debug(y.actual_tensor(), y.torch_tensor(), atol=atol, rtol=rtol)
assert torch.allclose(y.actual_tensor(), y.torch_tensor(), atol=atol, rtol=rtol)
if PROFILE:
profile_operation(
"PyTorch",
lambda: rotary_embedding(x, sin_table, cos_table, torch_device),
torch_device,
lambda: rotary_embedding(
y.torch_tensor(),
x.torch_tensor(),
sin_table.torch_tensor(),
cos_table.torch_tensor(),
device,
),
device,
NUM_PRERUN,
NUM_ITERATIONS,
)
profile_operation(
" lib", lambda: lib_rope(), torch_device, NUM_PRERUN, NUM_ITERATIONS
" lib", lambda: lib_rope(), device, NUM_PRERUN, NUM_ITERATIONS
)
check_error(lib.infiniopDestroyRoPEDescriptor(descriptor))
check_error(LIBINFINIOP.infiniopDestroyRoPEDescriptor(descriptor))
if __name__ == "__main__":
args = get_args()
lib = open_lib()
lib.infiniopCreateRoPEDescriptor.restype = c_int32
lib.infiniopCreateRoPEDescriptor.argtypes = [
infiniopHandle_t,
POINTER(infiniopRoPEDescriptor_t),
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
]
lib.infiniopGetRoPEWorkspaceSize.restype = c_int32
lib.infiniopGetRoPEWorkspaceSize.argtypes = [
infiniopRoPEDescriptor_t,
POINTER(c_uint64),
]
lib.infiniopRoPE.restype = c_int32
lib.infiniopRoPE.argtypes = [
infiniopRoPEDescriptor_t,
c_void_p,
c_uint64,
c_void_p,
c_void_p,
c_void_p,
c_void_p,
c_void_p,
]
lib.infiniopDestroyRoPEDescriptor.restype = c_int32
lib.infiniopDestroyRoPEDescriptor.argtypes = [
infiniopRoPEDescriptor_t,
]
# Configure testing options
DEBUG = args.debug
......@@ -259,6 +222,6 @@ if __name__ == "__main__":
# Execute tests
for device in get_test_devices(args):
test_operator(lib, device, test, _TEST_CASES, _TENSOR_DTYPES)
test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES)
print("\033[92mTest passed!\033[0m")
import torch
import ctypes
from ctypes import POINTER, Structure, c_int32, c_void_p, c_uint64
from ctypes import c_uint64
from libinfiniop import (
infiniopHandle_t,
infiniopTensorDescriptor_t,
open_lib,
to_tensor,
LIBINFINIOP,
TestTensor,
get_test_devices,
check_error,
rearrange_if_needed,
test_operator,
get_args,
debug,
get_tolerance,
profile_operation,
create_workspace,
TestWorkspace,
InfiniDtype,
InfiniDtypeNames,
InfiniDeviceNames,
infiniopOperatorDescriptor_t,
)
from enum import Enum, auto
......@@ -58,12 +59,13 @@ _TEST_CASES = [
]
# Data types used for testing
_TENSOR_DTYPES = [torch.float16, torch.float32]
_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16]
# Tolerance map for different data types
_TOLERANCE_MAP = {
torch.float16: {"atol": 1e-3, "rtol": 1e-3},
torch.float32: {"atol": 1e-7, "rtol": 1e-7},
InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3},
InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7},
InfiniDtype.BF16: {"atol": 1e-3, "rtol": 1e-3},
}
DEBUG = False
......@@ -72,111 +74,78 @@ NUM_PRERUN = 10
NUM_ITERATIONS = 1000
class SubDescriptor(Structure):
    """ctypes mirror of the library's Sub descriptor handle.

    Only the int32 `device` field is declared; the native struct is otherwise
    opaque and passed around via POINTER(...) handles.
    """

    _fields_ = [("device", c_int32)]
infiniopSubDescriptor_t = POINTER(SubDescriptor)
def sub(x, y):
    """Reference elementwise subtraction: return x - y as a new tensor."""
    return x - y
def process_tensors(c, c_strides, a, a_stride, b, b_stride, inplace):
    """
    Rearrange a, b, c to the requested strides and apply the inplace config.

    Strides containing 0 are treated as broadcast views and installed directly
    on the tensor's storage via set_(); other strides go through
    rearrange_if_needed. When inplace is requested, c is aliased to a or b; if
    the chosen output then carries broadcast (zero) strides, they are reset to
    the original unbroadcasted c strides so the output stays writable.

    Returns the (possibly rearranged / aliased) tensors as (a, b, c).
    """
    # Remember c's unbroadcasted strides so they can be restored below.
    original_c_strides = c_strides if c_strides else c.stride()

    def _rearrange(tensor, strides):
        # Zero strides denote broadcasting: install them in-place on the raw
        # storage, since a copy-based rearrange cannot realize zero strides.
        if strides and 0 in strides:
            tensor.set_(tensor.untyped_storage(), 0, tensor.shape, strides)
            return tensor
        else:
            return rearrange_if_needed(tensor, strides)

    a, b, c = [
        _rearrange(tensor, stride)
        for tensor, stride in zip([a, b, c], [a_stride, b_stride, c_strides])
    ]
    # Alias the output onto an input when an inplace mode is selected.
    c = (
        c
        if inplace == Inplace.OUT_OF_PLACE
        else (a if inplace == Inplace.INPLACE_A else b)
    )
    # if inplace is true and c has broadcasted config, reset it to the original unbroadcasted strides
    if 0 in c.stride():
        c.set_(c.untyped_storage(), 0, c.shape, original_c_strides)
    return a, b, c
def sub(c, a, b):
    """Compute a - b elementwise, writing the result into c; returns c."""
    return torch.subtract(a, b, out=c)
def test(
lib,
handle,
torch_device,
device,
shape,
a_stride=None,
b_stride=None,
c_stride=None,
inplace=Inplace.OUT_OF_PLACE,
dtype=torch.float16,
dtype=InfiniDtype.F16,
sync=None,
):
a = TestTensor(shape, a_stride, dtype, device)
b = TestTensor(shape, b_stride, dtype, device)
if inplace == Inplace.INPLACE_A:
if c_stride is not None and c_stride != a_stride:
return
c = a
elif inplace == Inplace.INPLACE_B:
if c_stride is not None and c_stride != b_stride:
return
c = b
else:
c = TestTensor(shape, c_stride, dtype, device)
if c.is_broadcast():
return
print(
f"Testing Sub on {torch_device} with shape:{shape} a_stride:{a_stride} b_stride:{b_stride} c_stride:{c_stride} "
f"dtype:{dtype} inplace:{inplace}"
f"Testing Sub on {InfiniDeviceNames[device]} with shape:{shape} a_stride:{a_stride} b_stride:{b_stride} c_stride:{c_stride} "
f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}"
)
sub(c.torch_tensor(), a.torch_tensor(), b.torch_tensor())
a = torch.rand(shape, dtype=dtype).to(torch_device)
b = torch.rand(shape, dtype=dtype).to(torch_device)
c = torch.rand(shape, dtype=dtype).to(torch_device)
a, b, c = process_tensors(c, c_stride, a, a_stride, b, b_stride, inplace)
ans = sub(a, b)
a_tensor, b_tensor = [to_tensor(tensor, lib) for tensor in [a, b]]
c_tensor = (
to_tensor(c, lib)
if inplace == Inplace.OUT_OF_PLACE
else (a_tensor if inplace == Inplace.INPLACE_A else b_tensor)
)
if sync is not None:
sync()
descriptor = infiniopSubDescriptor_t()
descriptor = infiniopOperatorDescriptor_t()
check_error(
lib.infiniopCreateSubDescriptor(
LIBINFINIOP.infiniopCreateSubDescriptor(
handle,
ctypes.byref(descriptor),
c_tensor.descriptor,
a_tensor.descriptor,
b_tensor.descriptor,
c.descriptor,
a.descriptor,
b.descriptor,
)
)
# Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
for tensor in [a_tensor, b_tensor, c_tensor]:
tensor.destroyDesc(lib)
for tensor in [a, b, c]:
tensor.destroy_desc()
workspace_size = c_uint64(0)
check_error(
lib.infiniopGetSubWorkspaceSize(descriptor, ctypes.byref(workspace_size))
LIBINFINIOP.infiniopGetSubWorkspaceSize(
descriptor, ctypes.byref(workspace_size)
)
)
workspace = create_workspace(workspace_size.value, c.device)
workspace = TestWorkspace(workspace_size.value, device)
def lib_sub():
check_error(
lib.infiniopSub(
LIBINFINIOP.infiniopSub(
descriptor,
workspace.data_ptr() if workspace is not None else None,
workspace.data(),
workspace_size.value,
c_tensor.data,
a_tensor.data,
b_tensor.data,
c.data(),
a.data(),
b.data(),
None,
)
)
......@@ -185,53 +154,20 @@ def test(
atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
if DEBUG:
debug(c, ans, atol=atol, rtol=rtol)
assert torch.allclose(c, ans, atol=atol, rtol=rtol)
debug(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol)
assert torch.allclose(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol)
# Profiling workflow
if PROFILE:
# fmt: off
profile_operation("PyTorch", lambda: sub(a, b), torch_device, NUM_PRERUN, NUM_ITERATIONS)
profile_operation(" lib", lambda: lib_sub(), torch_device, NUM_PRERUN, NUM_ITERATIONS)
profile_operation("PyTorch", lambda: sub(c.torch_tensor(), a.torch_tensor(), b.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS)
profile_operation(" lib", lambda: lib_sub(), device, NUM_PRERUN, NUM_ITERATIONS)
# fmt: on
check_error(lib.infiniopDestroySubDescriptor(descriptor))
check_error(LIBINFINIOP.infiniopDestroySubDescriptor(descriptor))
if __name__ == "__main__":
args = get_args()
lib = open_lib()
lib.infiniopCreateSubDescriptor.restype = c_int32
lib.infiniopCreateSubDescriptor.argtypes = [
infiniopHandle_t,
POINTER(infiniopSubDescriptor_t),
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
]
lib.infiniopGetSubWorkspaceSize.restype = c_int32
lib.infiniopGetSubWorkspaceSize.argtypes = [
infiniopSubDescriptor_t,
POINTER(c_uint64),
]
lib.infiniopSub.restype = c_int32
lib.infiniopSub.argtypes = [
infiniopSubDescriptor_t,
c_void_p,
c_uint64,
c_void_p,
c_void_p,
c_void_p,
c_void_p,
]
lib.infiniopDestroySubDescriptor.restype = c_int32
lib.infiniopDestroySubDescriptor.argtypes = [
infiniopSubDescriptor_t,
]
# Configure testing options
DEBUG = args.debug
PROFILE = args.profile
......@@ -239,6 +175,6 @@ if __name__ == "__main__":
NUM_ITERATIONS = args.num_iterations
for device in get_test_devices(args):
test_operator(lib, device, test, _TEST_CASES, _TENSOR_DTYPES)
test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES)
print("\033[92mTest passed!\033[0m")
import torch
import ctypes
from ctypes import POINTER, Structure, c_int32, c_void_p, c_uint64
from ctypes import c_uint64
from libinfiniop import (
infiniopHandle_t,
infiniopTensorDescriptor_t,
open_lib,
to_tensor,
LIBINFINIOP,
TestTensor,
get_test_devices,
check_error,
rearrange_if_needed,
test_operator,
get_args,
debug,
get_tolerance,
profile_operation,
create_workspace,
TestWorkspace,
InfiniDtype,
InfiniDtypeNames,
InfiniDeviceNames,
infiniopOperatorDescriptor_t,
)
from enum import Enum, auto
......@@ -58,13 +59,13 @@ _TEST_CASES = [
]
# Data types used for testing
_TENSOR_DTYPES = [torch.float16, torch.bfloat16, torch.float32]
_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.BF16, InfiniDtype.F32]
# Tolerance map for different data types
_TOLERANCE_MAP = {
torch.float16: {"atol": 1e-3, "rtol": 1e-3},
torch.bfloat16: {"atol": 5e-3, "rtol": 5e-3},
torch.float32: {"atol": 2e-7, "rtol": 1e-7},
InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3},
InfiniDtype.BF16: {"atol": 5e-3, "rtol": 5e-3},
InfiniDtype.F32: {"atol": 2e-7, "rtol": 1e-7},
}
DEBUG = False
......@@ -73,111 +74,79 @@ NUM_PRERUN = 10
NUM_ITERATIONS = 1000
class SwiGLUDescriptor(Structure):
    """ctypes mirror of the library's SwiGLU descriptor handle.

    Only the int32 `device` field is declared; the native struct is otherwise
    opaque and passed around via POINTER(...) handles.
    """

    _fields_ = [("device", c_int32)]
infiniopSwiGLUDescriptor_t = POINTER(SwiGLUDescriptor)
def swiglu(a, b):
    """Reference SwiGLU: a * b / (1 + exp(-b)), i.e. a * b * sigmoid(b).

    The exponential is evaluated in float32 and cast back to b's dtype
    before the division.
    """
    exp_neg_b = torch.exp(-b.float()).to(b.dtype)
    return a * b / (1 + exp_neg_b)
def process_tensors(c, c_strides, a, a_stride, b, b_stride, inplace):
    """
    Rearrange a, b, c to the requested strides and apply the inplace config.

    Strides containing 0 are treated as broadcast views and installed directly
    on the tensor's storage via set_(); other strides go through
    rearrange_if_needed. When inplace is requested, c is aliased to a or b; if
    the chosen output then carries broadcast (zero) strides, they are reset to
    the original unbroadcasted c strides so the output stays writable.

    Returns the (possibly rearranged / aliased) tensors as (a, b, c).
    """
    # Remember c's unbroadcasted strides so they can be restored below.
    original_c_strides = c_strides if c_strides else c.stride()

    def _rearrange(tensor, strides):
        # Zero strides denote broadcasting: install them in-place on the raw
        # storage, since a copy-based rearrange cannot realize zero strides.
        if strides and 0 in strides:
            tensor.set_(tensor.untyped_storage(), 0, tensor.shape, strides)
            return tensor
        else:
            return rearrange_if_needed(tensor, strides)

    a, b, c = [
        _rearrange(tensor, stride)
        for tensor, stride in zip([a, b, c], [a_stride, b_stride, c_strides])
    ]
    # Alias the output onto an input when an inplace mode is selected.
    c = (
        c
        if inplace == Inplace.OUT_OF_PLACE
        else (a if inplace == Inplace.INPLACE_A else b)
    )
    # if inplace is true and c has broadcasted config, reset it to the original unbroadcasted strides
    if 0 in c.stride():
        c.set_(c.untyped_storage(), 0, c.shape, original_c_strides)
    return a, b, c
def test(
lib,
handle,
torch_device,
device,
shape,
a_stride=None,
b_stride=None,
c_stride=None,
inplace=Inplace.OUT_OF_PLACE,
dtype=torch.float16,
dtype=InfiniDtype.F16,
sync=None,
):
a = TestTensor(shape, a_stride, dtype, device)
b = TestTensor(shape, b_stride, dtype, device)
if inplace == Inplace.INPLACE_A:
if c_stride is not None and c_stride != a_stride:
return
c = a
elif inplace == Inplace.INPLACE_B:
if c_stride is not None and c_stride != b_stride:
return
c = b
else:
c = TestTensor(shape, c_stride, dtype, device)
if c.is_broadcast():
return
print(
f"Testing SwiGLU on {torch_device} with shape:{shape} a_stride:{a_stride} b_stride:{b_stride} c_stride:{c_stride} "
f"dtype:{dtype} inplace:{inplace}"
f"Testing SwiGLU on {InfiniDeviceNames[device]} with shape:{shape} a_stride:{a_stride} b_stride:{b_stride} c_stride:{c_stride} "
f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}"
)
a = torch.rand(shape, dtype=dtype).to(torch_device)
b = torch.rand(shape, dtype=dtype).to(torch_device)
c = torch.rand(shape, dtype=dtype).to(torch_device)
a, b, c = process_tensors(c, c_stride, a, a_stride, b, b_stride, inplace)
ans = swiglu(a.torch_tensor(), b.torch_tensor())
ans = swiglu(a, b)
a_tensor, b_tensor = [to_tensor(tensor, lib) for tensor in [a, b]]
c_tensor = (
to_tensor(c, lib)
if inplace == Inplace.OUT_OF_PLACE
else (a_tensor if inplace == Inplace.INPLACE_A else b_tensor)
)
if sync is not None:
sync()
descriptor = infiniopSwiGLUDescriptor_t()
descriptor = infiniopOperatorDescriptor_t()
check_error(
lib.infiniopCreateSwiGLUDescriptor(
LIBINFINIOP.infiniopCreateSwiGLUDescriptor(
handle,
ctypes.byref(descriptor),
c_tensor.descriptor,
a_tensor.descriptor,
b_tensor.descriptor,
c.descriptor,
a.descriptor,
b.descriptor,
)
)
# Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
for tensor in [a_tensor, b_tensor, c_tensor]:
tensor.destroyDesc(lib)
for tensor in [a, b, c]:
tensor.destroy_desc()
workspace_size = c_uint64(0)
check_error(
lib.infiniopGetSwiGLUWorkspaceSize(descriptor, ctypes.byref(workspace_size))
LIBINFINIOP.infiniopGetSwiGLUWorkspaceSize(
descriptor, ctypes.byref(workspace_size)
)
)
workspace = create_workspace(workspace_size.value, c.device)
workspace = TestWorkspace(workspace_size.value, c.device)
def lib_swiglu():
check_error(
lib.infiniopSwiGLU(
LIBINFINIOP.infiniopSwiGLU(
descriptor,
workspace.data_ptr() if workspace is not None else None,
workspace.data(),
workspace_size.value,
c_tensor.data,
a_tensor.data,
b_tensor.data,
c.data(),
a.data(),
b.data(),
None,
)
)
......@@ -186,52 +155,20 @@ def test(
atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
if DEBUG:
debug(c, ans, atol=atol, rtol=rtol)
assert torch.allclose(c, ans, atol=atol, rtol=rtol)
debug(c.actual_tensor(), ans, atol=atol, rtol=rtol)
assert torch.allclose(c.actual_tensor(), ans, atol=atol, rtol=rtol)
# Profiling workflow
if PROFILE:
# fmt: off
profile_operation("PyTorch", lambda: swiglu(a, b), torch_device, NUM_PRERUN, NUM_ITERATIONS)
profile_operation(" lib", lambda: lib_swiglu(), torch_device, NUM_PRERUN, NUM_ITERATIONS)
profile_operation("PyTorch", lambda: swiglu(a.torch_tensor(), b.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS)
profile_operation(" lib", lambda: lib_swiglu(), device, NUM_PRERUN, NUM_ITERATIONS)
# fmt: on
check_error(lib.infiniopDestroySwiGLUDescriptor(descriptor))
check_error(LIBINFINIOP.infiniopDestroySwiGLUDescriptor(descriptor))
if __name__ == "__main__":
args = get_args()
lib = open_lib()
lib.infiniopCreateSwiGLUDescriptor.restype = c_int32
lib.infiniopCreateSwiGLUDescriptor.argtypes = [
infiniopHandle_t,
POINTER(infiniopSwiGLUDescriptor_t),
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
]
lib.infiniopGetSwiGLUWorkspaceSize.restype = c_int32
lib.infiniopGetSwiGLUWorkspaceSize.argtypes = [
infiniopSwiGLUDescriptor_t,
POINTER(c_uint64),
]
lib.infiniopSwiGLU.restype = c_int32
lib.infiniopSwiGLU.argtypes = [
infiniopSwiGLUDescriptor_t,
c_void_p,
c_uint64,
c_void_p,
c_void_p,
c_void_p,
c_void_p,
]
lib.infiniopDestroySwiGLUDescriptor.restype = c_int32
lib.infiniopDestroySwiGLUDescriptor.argtypes = [
infiniopSwiGLUDescriptor_t,
]
# Configure testing options
DEBUG = args.debug
......@@ -240,6 +177,6 @@ if __name__ == "__main__":
NUM_ITERATIONS = args.num_iterations
for device in get_test_devices(args):
test_operator(lib, device, test, _TEST_CASES, _TENSOR_DTYPES)
test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES)
print("\033[92mTest passed!\033[0m")
......@@ -12,11 +12,17 @@ if is_mode("debug") then
add_defines("DEBUG_MODE")
end
if is_plat("windows") then
set_runtimes("MD")
add_ldflags("/utf-8", {force = true})
add_cxflags("/utf-8", {force = true})
end
-- CPU
option("cpu")
set_default(true)
set_showmenu(true)
set_description("Whether to complie implementations for CPU")
set_description("Whether to compile implementations for CPU")
option_end()
option("omp")
......@@ -38,32 +44,29 @@ end
option("nv-gpu")
set_default(false)
set_showmenu(true)
set_description("Whether to complie implementations for Nvidia GPU")
set_description("Whether to compile implementations for Nvidia GPU")
option_end()
if has_config("nv-gpu") then
add_defines("ENABLE_CUDA_API")
includes("xmake/cuda.lua")
add_defines("ENABLE_NVIDIA_API")
includes("xmake/nvidia.lua")
end
-- 天数智芯
option("iluvatar-gpu")
set_default(false)
option("cudnn")
set_default(true)
set_showmenu(true)
set_description("Whether to complie implementations for Iluvatar GPU")
set_description("Whether to compile cudnn for Nvidia GPU")
option_end()
if has_config("iluvatar-gpu") then
add_defines("ENABLE_CUDA_API")
add_defines("ENABLE_ILUVATAR_CUDA_API")
includes("xmake/iluvatar.lua")
if has_config("cudnn") then
add_defines("ENABLE_CUDNN_API")
end
-- 寒武纪
option("cambricon-mlu")
set_default(false)
set_showmenu(true)
set_description("Whether to complie implementations for Cambricon MLU")
set_description("Whether to compile implementations for Cambricon MLU")
option_end()
if has_config("cambricon-mlu") then
......@@ -75,7 +78,7 @@ end
option("ascend-npu")
set_default(false)
set_showmenu(true)
set_description("Whether to complie implementations for Huawei Ascend NPU")
set_description("Whether to compile implementations for Huawei Ascend NPU")
option_end()
if has_config("ascend-npu") then
......@@ -83,23 +86,35 @@ if has_config("ascend-npu") then
includes("xmake/ascend.lua")
end
-- 天数智芯
option("iluvatar-gpu")
set_default(false)
set_showmenu(true)
set_description("Whether to compile implementations for Iluvatar GPU")
option_end()
if has_config("iluvatar-gpu") then
add_defines("ENABLE_ILUVATAR_API")
includes("xmake/iluvatar.lua")
end
-- 沐曦
option("metax-gpu")
set_default(false)
set_showmenu(true)
set_description("Whether to complie implementations for MetaX GPU")
set_description("Whether to compile implementations for MetaX GPU")
option_end()
if has_config("metax-gpu") then
add_defines("ENABLE_METAX_API")
includes("xmake/maca.lua")
includes("xmake/metax.lua")
end
-- 摩尔线程
option("moore-gpu")
set_default(false)
set_showmenu(true)
set_description("Whether to complie implementations for Moore Threads GPU")
set_description("Whether to compile implementations for Moore Threads GPU")
option_end()
if has_config("moore-gpu") then
......@@ -111,11 +126,10 @@ end
option("sugon-dcu")
set_default(false)
set_showmenu(true)
set_description("Whether to complie implementations for Sugon DCU")
set_description("Whether to compile implementations for Sugon DCU")
option_end()
if has_config("sugon-dcu") then
add_defines("ENABLE_CUDA_API")
add_defines("ENABLE_SUGON_CUDA_API")
end
......@@ -131,12 +145,22 @@ if has_config("kunlun-xpu") then
includes("xmake/kunlun.lua")
end
-- 九齿
option("ninetoothed")
set_default(false)
set_showmenu(true)
set_description("Whether to complie NineToothed implementations")
option_end()
if has_config("ninetoothed") then
add_defines("ENABLE_NINETOOTHED")
end
-- InfiniCCL
option("ccl")
set_default(false)
set_default(false)
set_showmenu(true)
set_description("Wether to complie implementations for InfiniCCL")
set_description("Wether to compile implementations for InfiniCCL")
option_end()
if has_config("ccl") then
......@@ -159,7 +183,7 @@ target("infini-utils")
add_cxflags("-fPIC", "-Wno-unknown-pragmas")
if has_config("omp") then
add_cxflags("-fopenmp")
add_ldflags("-fopenmp")
add_ldflags("-fopenmp", {force = true})
end
end
......@@ -173,7 +197,7 @@ target("infinirt")
add_deps("infinirt-cpu")
end
if has_config("nv-gpu") then
add_deps("infinirt-cuda")
add_deps("infinirt-nvidia")
end
if has_config("cambricon-mlu") then
add_deps("infinirt-cambricon")
......@@ -207,7 +231,7 @@ target("infiniop")
add_deps("infiniop-cpu")
end
if has_config("nv-gpu") then
add_deps("infiniop-cuda")
add_deps("infiniop-nvidia")
end
if has_config("iluvatar-gpu") then
add_deps("infiniop-iluvatar")
......@@ -221,9 +245,9 @@ target("infiniop")
)
add_shflags("-s", "-shared", "-fPIC")
add_links("cublas", "cudnn", "cudadevrt", "cudart_static", "rt", "pthread", "dl")
-- Using -linfiniop-cuda will fail, manually link the target using full path
-- Using -linfiniop-nvidia will fail, manually link the target using full path
add_deps("nv-gpu", {inherit = false})
add_links(builddir.."/libinfiniop-cuda.a")
add_links(builddir.."/libinfiniop-nvidia.a")
set_toolchains("sugon-dcu-linker")
end
......@@ -259,7 +283,7 @@ target("infiniccl")
add_deps("infinirt")
if has_config("nv-gpu") then
add_deps("infiniccl-cuda")
add_deps("infiniccl-nvidia")
end
if has_config("ascend-npu") then
add_deps("infiniccl-ascend")
......@@ -270,6 +294,9 @@ target("infiniccl")
if has_config("metax-gpu") then
add_deps("infiniccl-metax")
end
if has_config("iluvatar-gpu") then
add_deps("infiniccl-iluvatar")
end
set_languages("cxx17")
......
......@@ -42,13 +42,17 @@ target("infiniop-iluvatar")
add_links("cudart", "cublas", "cudnn")
set_warnings("all", "error")
add_cuflags("-Wno-error=unused-private-field")
add_cuflags("-fPIC", "-x", "ivcore", "-std=c++17", {force = true})
add_cuflags("-fPIC")
add_culdflags("-fPIC")
add_cxflags("-fPIC")
-- set_languages("cxx17") 天数似乎不能用这个配置
add_files("../src/infiniop/devices/cuda/*.cu", "../src/infiniop/ops/*/cuda/*.cu")
add_files("../src/infiniop/devices/nvidia/*.cu", "../src/infiniop/ops/*/nvidia/*.cu")
if has_config("ninetoothed") then
add_files("../build/ninetoothed/*.c", {cxflags = {"-Wno-return-type"}})
end
target_end()
target("infinirt-iluvatar")
......@@ -64,10 +68,39 @@ target("infinirt-iluvatar")
set_warnings("all", "error")
add_cuflags("-fPIC", "-x", "ivcore", "-std=c++17", {force = true})
add_cuflags("-fPIC")
add_culdflags("-fPIC")
add_cxflags("-fPIC")
-- set_languages("cxx17") 天数似乎不能用这个配置
add_files("../src/infinirt/cuda/*.cu")
target_end()
target("infiniccl-iluvatar")
set_kind("static")
add_deps("infinirt")
on_install(function (target) end)
if has_config("ccl") then
set_toolchains("iluvatar.toolchain")
add_rules("iluvatar.env")
set_values("cuda.rdc", false)
add_links("cudart")
set_warnings("all", "error")
add_cuflags("-fPIC", "-x", "ivcore", "-std=c++17", {force = true})
add_culdflags("-fPIC")
add_cxflags("-fPIC")
local nccl_root = os.getenv("NCCL_ROOT")
if nccl_root then
add_includedirs(nccl_root .. "/include")
add_links(nccl_root .. "/lib/libnccl.so")
else
add_links("nccl") -- Fall back to default nccl linking
end
-- set_languages("cxx17") 天数似乎不能用这个配置
add_files("../src/infiniccl/cuda/*.cu")
end
target_end()
......@@ -23,6 +23,11 @@ rule("maca")
table.insert(args, "-I" .. includedir)
end
local defines = target:get("defines")
for _, define in ipairs(defines) do
table.insert(args, "-D" .. define)
end
os.execv(htcc, args)
table.insert(target:objectfiles(), objectfile)
end)
......@@ -34,8 +39,12 @@ target("infiniop-metax")
set_languages("cxx17")
set_warnings("all", "error")
add_cxflags("-lstdc++", "-fPIC", "-Wno-defaulted-function-deleted", "-Wno-strict-aliasing")
add_files("../src/infiniop/devices/maca/*.cc", "../src/infiniop/ops/*/maca/*.cc")
add_files("../src/infiniop/ops/*/maca/*.maca", {rule = "maca"})
add_files("../src/infiniop/devices/metax/*.cc", "../src/infiniop/ops/*/metax/*.cc")
add_files("../src/infiniop/ops/*/metax/*.maca", {rule = "maca"})
if has_config("ninetoothed") then
add_files("../build/ninetoothed/*.c", {cxflags = {"-include stdlib.h", "-Wno-return-type"}})
end
target_end()
target("infinirt-metax")
......@@ -45,7 +54,7 @@ target("infinirt-metax")
add_deps("infini-utils")
set_warnings("all", "error")
add_cxflags("-lstdc++ -fPIC")
add_files("../src/infinirt/maca/*.cc")
add_files("../src/infinirt/metax/*.cc")
target_end()
target("infiniccl-metax")
......@@ -58,8 +67,8 @@ target("infiniccl-metax")
end
if has_config("ccl") then
add_links("libhccl.so")
add_files("../src/infiniccl/maca/*.cc")
add_files("../src/infiniccl/metax/*.cc")
end
set_languages("cxx17")
target_end()
local CUDA_ROOT = os.getenv("CUDA_ROOT") or os.getenv("CUDA_HOME") or os.getenv("CUDA_PATH")
local CUDNN_ROOT = os.getenv("CUDNN_ROOT") or os.getenv("CUDNN_HOME") or os.getenv("CUDNN_PATH")
if CUDA_ROOT ~= nil then
add_includedirs(CUDA_ROOT .. "/include")
end
if CUDNN_ROOT ~= nil then
add_includedirs(CUDNN_ROOT .. "/include")
end
target("infiniop-cuda")
target("infiniop-nvidia")
set_kind("static")
add_deps("infini-utils")
on_install(function (target) end)
set_policy("build.cuda.devlink", true)
set_toolchains("cuda")
add_links("cublas", "cudnn")
add_links("cudart", "cublas")
if has_config("cudnn") then
add_links("cudnn")
end
add_cugencodes("native")
on_load(function (target)
import("lib.detect.find_tool")
local nvcc = find_tool("nvcc")
if nvcc ~= nil then
if is_plat("windows") then
nvcc_path = os.iorun("where nvcc"):match("(.-)\r?\n")
else
nvcc_path = nvcc.program
end
target:add("linkdirs", path.directory(path.directory(nvcc_path)) .. "/lib64/stubs")
target:add("links", "cuda")
end
end)
if is_plat("windows") then
add_cuflags("-Xcompiler=/utf-8", "--expt-relaxed-constexpr", "--allow-unsupported-compiler")
add_cuflags("-Xcompiler=/W3", "-Xcompiler=/WX")
......@@ -31,13 +44,23 @@ target("infiniop-cuda")
add_cuflags("--extended-lambda")
add_culdflags("-Xcompiler=-fPIC")
add_cxxflags("-fPIC")
add_cuflags("--expt-relaxed-constexpr")
if CUDNN_ROOT ~= nil then
add_linkdirs(CUDNN_ROOT .. "/lib")
end
end
add_cuflags("-Xcompiler=-Wno-error=deprecated-declarations")
set_languages("cxx17")
add_files("../src/infiniop/devices/cuda/*.cu", "../src/infiniop/ops/*/cuda/*.cu")
add_files("../src/infiniop/devices/nvidia/*.cu", "../src/infiniop/ops/*/nvidia/*.cu")
if has_config("ninetoothed") then
add_files("../build/ninetoothed/*.c")
end
target_end()
target("infinirt-cuda")
target("infinirt-nvidia")
set_kind("static")
add_deps("infini-utils")
on_install(function (target) end)
......@@ -59,7 +82,7 @@ target("infinirt-cuda")
add_files("../src/infinirt/cuda/*.cu")
target_end()
target("infiniccl-cuda")
target("infiniccl-nvidia")
set_kind("static")
add_deps("infinirt")
on_install(function (target) end)
......@@ -87,5 +110,5 @@ target("infiniccl-cuda")
end
end
set_languages("cxx17")
target_end()
......@@ -51,3 +51,15 @@ target("infiniccl-test")
set_installdir(os.getenv("INFINI_ROOT") or (os.getenv(is_host("windows") and "HOMEPATH" or "HOME") .. "/.infini"))
target_end()
target("infinirt-test")
set_kind("binary")
add_deps("infinirt")
on_install(function (target) end)
set_languages("cxx17")
set_warnings("all", "error")
add_files(os.projectdir().."/src/infinirt-test/*.cc")
set_installdir(os.getenv("INFINI_ROOT") or (os.getenv(is_host("windows") and "HOMEPATH" or "HOME") .. "/.infini"))
target_end()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment