# Commit 46da1a27 authored by PanZezhongQY
# feat: cpu and cuda matmul
import libinfiniop
# (no newline at end of original file)
from ctypes import POINTER, Structure, c_int32, c_void_p
import ctypes
import sys
import os
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")))
from operatorspy import (
open_lib,
to_tensor,
DeviceEnum,
infiniopHandle_t,
infiniopTensorDescriptor_t,
create_handle,
destroy_handle,
check_error,
)
from operatorspy.tests.test_utils import get_args
from enum import Enum, auto
import torch
class Inplace(Enum):
    """Selects which tensor, if any, the Add output aliases."""
    OUT_OF_PLACE = auto()
    INPLACE_A = auto()  # result is written into operand a
    INPLACE_B = auto()  # result is written into operand b
class AddDescriptor(Structure):
    """ctypes mirror of the opaque C-side Add descriptor (device id only)."""
    _fields_ = [("device", c_int32)]


# Pointer type passed to/returned by the C library.
infiniopAddDescriptor_t = POINTER(AddDescriptor)
def add(x, y):
    """PyTorch reference for elementwise addition (broadcasts like torch.add)."""
    return x + y
def test(
    lib,
    handle,
    torch_device,
    c_shape,
    a_shape,
    b_shape,
    tensor_dtype=torch.float16,
    inplace=Inplace.OUT_OF_PLACE,
):
    """Check the library's Add operator against the PyTorch reference.

    Builds random a/b (and c for the out-of-place case), creates an Add
    descriptor through the C library, runs it, and asserts the result
    matches add(a, b) within rtol=1e-3.
    """
    print(
        f"Testing Add on {torch_device} with c_shape:{c_shape} a_shape:{a_shape} b_shape:{b_shape} dtype:{tensor_dtype} inplace: {inplace.name}"
    )
    # In-place variants write into one of the operands, which only makes
    # sense when no broadcasting is involved.
    if a_shape != b_shape and inplace != Inplace.OUT_OF_PLACE:
        print("Unsupported test: broadcasting does not support in-place")
        return
    a = torch.rand(a_shape, dtype=tensor_dtype).to(torch_device)
    b = torch.rand(b_shape, dtype=tensor_dtype).to(torch_device)
    # For in-place runs, c aliases the operand that receives the result.
    c = torch.rand(c_shape, dtype=tensor_dtype).to(torch_device) if inplace == Inplace.OUT_OF_PLACE else (a if inplace == Inplace.INPLACE_A else b)
    ans = add(a, b)
    a_tensor = to_tensor(a, lib)
    b_tensor = to_tensor(b, lib)
    c_tensor = to_tensor(c, lib) if inplace == Inplace.OUT_OF_PLACE else (a_tensor if inplace == Inplace.INPLACE_A else b_tensor)
    descriptor = infiniopAddDescriptor_t()
    check_error(
        lib.infiniopCreateAddDescriptor(
            handle,
            ctypes.byref(descriptor),
            c_tensor.descriptor,
            a_tensor.descriptor,
            b_tensor.descriptor,
        )
    )
    # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
    c_tensor.descriptor.contents.invalidate()
    a_tensor.descriptor.contents.invalidate()
    b_tensor.descriptor.contents.invalidate()
    check_error(
        lib.infiniopAdd(descriptor, c_tensor.data, a_tensor.data, b_tensor.data, None)
    )
    assert torch.allclose(c, ans, atol=0, rtol=1e-3)
    check_error(lib.infiniopDestroyAddDescriptor(descriptor))
def test_cpu(lib, test_cases):
    """Run every Add test case on CPU in both fp16 and fp32."""
    handle = create_handle(lib, DeviceEnum.DEVICE_CPU)
    for c_shape, a_shape, b_shape, inplace in test_cases:
        for dtype in (torch.float16, torch.float32):
            test(lib, handle, "cpu", c_shape, a_shape, b_shape, tensor_dtype=dtype, inplace=inplace)
    destroy_handle(lib, handle)
def test_cuda(lib, test_cases):
    """Run every Add test case on CUDA in both fp16 and fp32."""
    handle = create_handle(lib, DeviceEnum.DEVICE_CUDA)
    for c_shape, a_shape, b_shape, inplace in test_cases:
        for dtype in (torch.float16, torch.float32):
            test(lib, handle, "cuda", c_shape, a_shape, b_shape, tensor_dtype=dtype, inplace=inplace)
    destroy_handle(lib, handle)
def test_bang(lib, test_cases):
    """Run every Add test case on a Cambricon MLU in both fp16 and fp32."""
    import torch_mlu

    handle = create_handle(lib, DeviceEnum.DEVICE_BANG)
    for c_shape, a_shape, b_shape, inplace in test_cases:
        for dtype in (torch.float16, torch.float32):
            test(lib, handle, "mlu", c_shape, a_shape, b_shape, tensor_dtype=dtype, inplace=inplace)
    destroy_handle(lib, handle)
if __name__ == "__main__":
    test_cases = [
        # c_shape, a_shape, b_shape, inplace
        # ((32, 150, 512000), (32, 150, 512000), (32, 150, 512000), Inplace.OUT_OF_PLACE),
        # ((32, 150, 51200), (32, 150, 51200), (32, 150, 1), Inplace.OUT_OF_PLACE),
        # ((32, 150, 51200), (32, 150, 51200), (32, 150, 51200), Inplace.OUT_OF_PLACE),
        ((1, 3), (1, 3), (1, 3), Inplace.OUT_OF_PLACE),
        ((), (), (), Inplace.OUT_OF_PLACE),
        ((3, 3), (3, 3), (3, 3), Inplace.OUT_OF_PLACE),
        ((2, 20, 3), (2, 1, 3), (2, 20, 3), Inplace.OUT_OF_PLACE),
        ((32, 20, 512), (32, 20, 512), (32, 20, 512), Inplace.INPLACE_A),
        ((32, 20, 512), (32, 20, 512), (32, 20, 512), Inplace.INPLACE_B),
        ((32, 256, 112, 112), (32, 256, 112, 1), (32, 256, 112, 112), Inplace.OUT_OF_PLACE),
        ((32, 256, 112, 112), (32, 256, 112, 112), (32, 256, 112, 112), Inplace.OUT_OF_PLACE),
        ((2, 4, 3), (2, 1, 3), (4, 3), Inplace.OUT_OF_PLACE),
        ((2, 3, 4, 5), (2, 3, 4, 5), (5,), Inplace.OUT_OF_PLACE),
        ((3, 2, 4, 5), (4, 5), (3, 2, 1, 1), Inplace.OUT_OF_PLACE),
    ]
    args = get_args()
    lib = open_lib()
    # Declare C signatures so ctypes marshals arguments correctly.
    lib.infiniopCreateAddDescriptor.restype = c_int32
    lib.infiniopCreateAddDescriptor.argtypes = [
        infiniopHandle_t,
        POINTER(infiniopAddDescriptor_t),
        infiniopTensorDescriptor_t,
        infiniopTensorDescriptor_t,
        infiniopTensorDescriptor_t,
    ]
    lib.infiniopAdd.restype = c_int32
    lib.infiniopAdd.argtypes = [
        infiniopAddDescriptor_t,
        c_void_p,
        c_void_p,
        c_void_p,
        c_void_p,
    ]
    lib.infiniopDestroyAddDescriptor.restype = c_int32
    lib.infiniopDestroyAddDescriptor.argtypes = [
        infiniopAddDescriptor_t,
    ]
    if args.cpu:
        test_cpu(lib, test_cases)
    if args.cuda:
        test_cuda(lib, test_cases)
    if args.bang:
        test_bang(lib, test_cases)
    # Default to CPU when no device flag is given.
    if not (args.cpu or args.cuda or args.bang):
        test_cpu(lib, test_cases)
    print("\033[92mTest passed!\033[0m")
from ctypes import POINTER, Structure, c_int32, c_uint64, c_void_p, c_float, c_bool
import ctypes
import sys
import os
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")))
from operatorspy import (
open_lib,
to_tensor,
CTensor,
DeviceEnum,
infiniopHandle_t,
infiniopTensorDescriptor_t,
create_handle,
destroy_handle,
check_error,
rearrange_tensor,
create_workspace,
)
from operatorspy.tests.test_utils import get_args
import torch
import torch.nn.functional as F
class AttentionDescriptor(Structure):
    """ctypes mirror of the opaque C-side Attention descriptor (device id only)."""
    _fields_ = [("device", c_int32)]


# Pointer type passed to/returned by the C library.
infiniopAttentionDescriptor_t = POINTER(AttentionDescriptor)
def causal_softmax(x):
    """Reference causal softmax over the last dimension.

    Positions above the flipped strict-lower-triangular band are masked to
    -inf so each row only attends to its causal prefix. The softmax runs in
    float32 for accuracy and the result is cast back to x's dtype.
    """
    # `dtype` instead of `type`: don't shadow the builtin.
    dtype = x.dtype
    mask = torch.tril(torch.ones_like(x), diagonal=-1).flip(dims=[-2, -1])
    # torch.where never mutates its inputs, so no defensive clone is needed.
    masked = torch.where(mask == 1, -torch.inf, x.to(torch.float32))
    return torch.nn.functional.softmax(masked, dim=-1).to(dtype)
def attention(q, k, v, k_cache, v_cache, pos):
    """PyTorch reference attention with a KV cache and grouped-query heads.

    q: (n_q_head, seq_len, head_dim); k/v: (n_kv_head, seq_len, head_dim);
    the first `pos` cached positions of k_cache/v_cache are prepended to k/v.
    Returns (seq_len, n_q_head, head_dim).
    """
    type = q.dtype  # NOTE(review): shadows the builtin `type`; kept as-is
    n_q_head = q.shape[0]
    n_kv_head = k.shape[0]
    # Concatenate key and value caches
    k_cache = k_cache[:, :pos, :]  # (n_kv_head, pos, head_dim)
    v_cache = v_cache[:, :pos, :]  # (n_kv_head, pos, head_dim)
    k = torch.cat([k_cache, k], dim=1)  # (n_kv_head, total_seq_len, head_dim)
    v = torch.cat([v_cache, v], dim=1)  # (n_kv_head, total_seq_len, head_dim)
    total_seq_len = k.shape[1]
    head_dim = v.shape[-1]
    # Grouped-query attention: fold each kv head's query group into the
    # sequence axis so one batched einsum serves all groups.
    if n_q_head != n_kv_head:
        q = q.reshape(
            n_kv_head, -1, head_dim
        )  # (n_kv_head, n_group * seq_len, head_dim)
    # Scaled dot-product attention (scores computed in float32)
    attn_scores = (
        torch.einsum("hqd,hkd->hqk", q.to(torch.float32), k.to(torch.float32))
        .to(type)
        .reshape(n_q_head, -1, total_seq_len)
    )  # (n_q_head, seq_len, total_seq_len)
    attn_scores = attn_scores / (head_dim**0.5)
    attn_weights = causal_softmax(attn_scores).reshape(
        n_kv_head, -1, total_seq_len
    )  # (n_kv_head, seq_len, total_seq_len)
    # Weighted sum of values
    attn_output = (
        torch.einsum(
            "hqk,hkd->hqd", attn_weights.to(torch.float32), v.to(torch.float32)
        )
        .to(type)
        .reshape(n_q_head, -1, head_dim)
        .permute(1, 0, 2)
    )  # ([seq_len, n_q_head, head_dim])
    return attn_output
def test(
    lib,
    handle,
    torch_device,
    n_q_head,
    n_kv_head,
    seq_len,
    head_dim,
    pos,
    k_cache_buf_len,
    v_cache_buf_len,
    dtype=torch.float16,
    q_stride=None,
    k_stride=None,
    v_stride=None,
    k_cache_stride=None,
    v_cache_stride=None,
):
    """Check the library's Attention operator against attention().

    Optionally re-lays-out q/k/v and the caches with explicit strides before
    handing them to the C library; the output must match within atol=1e-4,
    rtol=1e-2.
    """
    print(
        f"Testing Attention on {torch_device} with n_q_head:{n_q_head} n_kv_head:{n_kv_head} seq_len:{seq_len} head_dim:{head_dim} pos:{pos} "
        f"dtype:{dtype} q_stride:{q_stride} k_stride:{k_stride} v_stride:{v_stride} k_cache_stride:{k_cache_stride} v_cache_stride:{v_cache_stride}"
    )
    out = torch.zeros([seq_len, n_q_head, head_dim], dtype=dtype, device=torch_device)
    # Inputs scaled by 0.1 to keep float16 accumulation numerically tame.
    q = torch.rand([n_q_head, seq_len, head_dim], dtype=dtype).to(torch_device) * 0.1
    k = torch.rand([n_kv_head, seq_len, head_dim], dtype=dtype).to(torch_device) * 0.1
    v = torch.rand([n_kv_head, seq_len, head_dim], dtype=dtype).to(torch_device) * 0.1
    k_cache = (
        torch.rand([n_kv_head, k_cache_buf_len, head_dim], dtype=dtype).to(torch_device)
        * 0.1
    )
    v_cache = (
        torch.rand([n_kv_head, v_cache_buf_len, head_dim], dtype=dtype).to(torch_device)
        * 0.1
    )
    # Reference result is computed before any re-striding.
    ans = attention(q, k, v, k_cache, v_cache, pos)
    if q_stride is not None:
        q = rearrange_tensor(q, q_stride)
    if k_stride is not None:
        k = rearrange_tensor(k, k_stride)
    if v_stride is not None:
        v = rearrange_tensor(v, v_stride)
    if k_cache_stride is not None:
        k_cache = rearrange_tensor(k_cache, k_cache_stride)
    if v_cache_stride is not None:
        v_cache = rearrange_tensor(v_cache, v_cache_stride)
    out_tensor = to_tensor(out, lib)
    q_tensor = to_tensor(q, lib)
    k_tensor = to_tensor(k, lib)
    v_tensor = to_tensor(v, lib)
    k_cache_tensor = to_tensor(k_cache, lib)
    v_cache_tensor = to_tensor(v_cache, lib)
    descriptor = infiniopAttentionDescriptor_t()
    check_error(
        lib.infiniopCreateAttentionDescriptor(
            handle,
            ctypes.byref(descriptor),
            out_tensor.descriptor,
            q_tensor.descriptor,
            k_tensor.descriptor,
            v_tensor.descriptor,
            k_cache_tensor.descriptor,
            v_cache_tensor.descriptor,
            pos,
        )
    )
    # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
    out_tensor.descriptor.contents.invalidate()
    q_tensor.descriptor.contents.invalidate()
    k_tensor.descriptor.contents.invalidate()
    v_tensor.descriptor.contents.invalidate()
    k_cache_tensor.descriptor.contents.invalidate()
    v_cache_tensor.descriptor.contents.invalidate()
    workspace_size = c_uint64(0)
    check_error(
        lib.infiniopGetAttentionWorkspaceSize(descriptor, ctypes.byref(workspace_size))
    )
    workspace = create_workspace(workspace_size.value, out.device)
    check_error(
        lib.infiniopAttention(
            descriptor,
            workspace.data_ptr() if workspace is not None else None,
            workspace_size.value,
            out_tensor.data,
            q_tensor.data,
            k_tensor.data,
            v_tensor.data,
            k_cache_tensor.data,
            v_cache_tensor.data,
            None,
        )
    )
    assert torch.allclose(out, ans, atol=1e-4, rtol=1e-2)
    check_error(lib.infiniopDestroyAttentionDescriptor(descriptor))
def test_cpu(lib, test_cases):
    """Run every Attention test case on CPU.

    Each case tuple is positional and matches test()'s parameter order
    after `torch_device`, so it can be splatted straight through.
    """
    handle = create_handle(lib, DeviceEnum.DEVICE_CPU)
    for case in test_cases:
        test(lib, handle, "cpu", *case)
    destroy_handle(lib, handle)
def test_cuda(lib, test_cases):
    """Run every Attention test case on CUDA.

    Case tuples are positional and match test()'s parameter order after
    `torch_device`, so they are splatted straight through.
    """
    handle = create_handle(lib, DeviceEnum.DEVICE_CUDA)
    for case in test_cases:
        test(lib, handle, "cuda", *case)
    destroy_handle(lib, handle)
def test_bang(lib, test_cases):
    """Run every Attention test case on a Cambricon MLU.

    Case tuples are positional and match test()'s parameter order after
    `torch_device`, so they are splatted straight through.
    """
    import torch_mlu

    handle = create_handle(lib, DeviceEnum.DEVICE_BANG)
    for case in test_cases:
        test(lib, handle, "mlu", *case)
    destroy_handle(lib, handle)
if __name__ == "__main__":
    # Each tuple matches test()'s positional parameters after torch_device.
    test_cases = [
        # prefill
        (
            32,  # n_q_head
            4,  # n_kv_head
            5,  # seq_len
            64,  # head_dim
            0,  # pos
            2048,  # k_cache_buf_len
            2048,  # v_cache_buf_len
            torch.float16,  # dtype
            [64, 2560, 1],  # q_stride
            [64, 2560, 1],  # k_stride
            [64, 2560, 1],  # v_stride
            [64, 11264, 1],  # k_cache_stride
            [64, 11264, 1],  # v_cache_stride
        ),
        # decode
        (
            32,  # n_q_head
            4,  # n_kv_head
            1,  # seq_len
            64,  # head_dim
            3,  # pos
            2048,  # k_cache_buf_len
            2048,  # v_cache_buf_len
            torch.float16,  # dtype
            [64, 2560, 1],  # q_stride
            [64, 2560, 1],  # k_stride
            [64, 2560, 1],  # v_stride
            [64, 11264, 1],  # k_cache_stride
            [64, 11264, 1],  # v_cache_stride
        ),
        # for test
        (
            8,  # n_q_head
            4,  # n_kv_head
            2,  # seq_len
            16,  # head_dim
            1,  # pos
            8,  # k_cache_buf_len
            8,  # v_cache_buf_len
            torch.float16,  # dtype
            None,  # q_stride
            None,  # k_stride
            None,  # v_stride
            None,  # k_cache_stride
            None,  # v_cache_stride
        ),
    ]
    args = get_args()
    lib = open_lib()
    # Declare C signatures so ctypes marshals arguments correctly.
    lib.infiniopCreateAttentionDescriptor.restype = c_int32
    lib.infiniopCreateAttentionDescriptor.argtypes = [
        infiniopHandle_t,
        POINTER(infiniopAttentionDescriptor_t),
        infiniopTensorDescriptor_t,
        infiniopTensorDescriptor_t,
        infiniopTensorDescriptor_t,
        infiniopTensorDescriptor_t,
        infiniopTensorDescriptor_t,
        infiniopTensorDescriptor_t,
        c_uint64,
    ]
    lib.infiniopGetAttentionWorkspaceSize.restype = c_int32
    lib.infiniopGetAttentionWorkspaceSize.argtypes = [
        infiniopAttentionDescriptor_t,
        POINTER(c_uint64),
    ]
    lib.infiniopAttention.restype = c_int32
    lib.infiniopAttention.argtypes = [
        infiniopAttentionDescriptor_t,
        c_void_p,
        c_uint64,
        c_void_p,
        c_void_p,
        c_void_p,
        c_void_p,
        c_void_p,
        c_void_p,
        c_void_p,
    ]
    lib.infiniopDestroyAttentionDescriptor.restype = c_int32
    lib.infiniopDestroyAttentionDescriptor.argtypes = [
        infiniopAttentionDescriptor_t,
    ]
    if args.cpu:
        test_cpu(lib, test_cases)
    if args.cuda:
        test_cuda(lib, test_cases)
    if args.bang:
        test_bang(lib, test_cases)
    # Default to CPU when no device flag is given.
    if not (args.cpu or args.cuda or args.bang):
        test_cpu(lib, test_cases)
    print("\033[92mTest passed!\033[0m")
from ctypes import POINTER, Structure, c_int32, c_void_p, c_uint64
import ctypes
import sys
import os
import time
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")))
from operatorspy import (
open_lib,
to_tensor,
DeviceEnum,
infiniopHandle_t,
infiniopTensorDescriptor_t,
create_handle,
destroy_handle,
check_error,
)
from operatorspy.tests.test_utils import get_args
import torch
from typing import Tuple
# constant for control whether profile the pytorch and lib functions
# NOTE: need to manually add synchronization function to the lib function,
# e.g., cudaDeviceSynchronize() for CUDA
PROFILE = False  # when True, time both the PyTorch reference and the lib call
NUM_PRERUN = 10  # warm-up iterations run before timing
NUM_ITERATIONS = 1000  # timed iterations averaged into the reported elapsed time
class AvgPoolDescriptor(Structure):
    """ctypes mirror of the opaque C-side AvgPool descriptor (device id only)."""
    _fields_ = [("device", c_int32)]


# Pointer type passed to/returned by the C library.
infiniopAvgPoolDescriptor_t = POINTER(AvgPoolDescriptor)
def pool(x, k, padding, stride, dilation = 1):
    """PyTorch reference average pooling for 1-D/2-D/3-D inputs.

    x must be (N, C, *spatial). Returns None for unsupported ranks.
    The `dilation` parameter is accepted for interface parity but unused
    (torch's AvgPool layers take no dilation).
    """
    layer_by_ndim = {
        1: torch.nn.AvgPool1d,
        2: torch.nn.AvgPool2d,
        3: torch.nn.AvgPool3d,
    }
    spatial_ndim = len(x.shape) - 2
    layer_cls = layer_by_ndim.get(spatial_ndim)
    if layer_cls is None:
        print("Error: Pytorch -> Unsupported tensor dimension")
        return None
    layer = layer_cls(k, stride=stride, padding=padding)
    # 3-D fp16 pooling is not supported natively; round-trip through fp32.
    if spatial_ndim == 3 and x.dtype == torch.float16:
        ans = layer(x.to(torch.float32)).to(torch.float16)
    else:
        ans = layer(x)
    if PROFILE:
        torch.cuda.synchronize()
    return ans
def inferShape(x_shape, kernel_shape, padding, strides):
    """Infer the output shape of an N-D average pool.

    x_shape is (N, C, *spatial); the leading two dims pass through and each
    spatial dim becomes (dim + 2*pad - kernel) // stride + 1.
    """
    assert (
        len(x_shape) - 2 == len(kernel_shape) == len(padding) == len(strides)
    ), "kernel, pads, and strides should have the same length; the length of input x should be 2 more than that of kernel"
    spatial = [
        (dim + 2 * p - k) // s + 1
        for dim, k, p, s in zip(x_shape[2:], kernel_shape, padding, strides)
    ]
    return x_shape[:2] + tuple(spatial)
# convert a python tuple to a ctype void pointer
def tuple_to_void_p(py_tuple: Tuple):
    """Copy a Python tuple into a C int64 array and return it cast to void*.

    ctypes.cast keeps a reference to the backing array, so the memory stays
    alive as long as the returned pointer does.
    """
    c_array = (ctypes.c_int64 * len(py_tuple))(*py_tuple)
    return ctypes.cast(c_array, ctypes.c_void_p)
def test(
    lib,
    handle,
    torch_device,
    x_shape,
    k_shape,
    padding,
    strides,
    tensor_dtype=torch.float16,
):
    """Check the library's AvgPool operator against the PyTorch reference.

    When PROFILE is set, also reports averaged wall-clock times for both
    the PyTorch and the library implementations.
    """
    print(
        f"Testing AvgPool on {torch_device} with x_shape:{x_shape} kernel_shape:{k_shape} padding:{padding} strides:{strides} dtype:{tensor_dtype}"
    )
    x = torch.rand(x_shape, dtype=tensor_dtype).to(torch_device)
    y = torch.rand(inferShape(x_shape, k_shape, padding, strides), dtype=tensor_dtype).to(torch_device)
    for i in range(NUM_PRERUN if PROFILE else 1):
        ans = pool(x, k_shape, padding, strides)
    if PROFILE:
        start_time = time.time()
        for i in range(NUM_ITERATIONS):
            _ = pool(x, k_shape, padding, strides)
        elapsed = (time.time() - start_time) / NUM_ITERATIONS
        print(f"pytorch time: {elapsed :6f}")
    x_tensor = to_tensor(x, lib)
    y_tensor = to_tensor(y, lib)
    descriptor = infiniopAvgPoolDescriptor_t()
    check_error(
        lib.infiniopCreateAvgPoolDescriptor(
            handle,
            ctypes.byref(descriptor),
            y_tensor.descriptor,
            x_tensor.descriptor,
            tuple_to_void_p(k_shape),
            tuple_to_void_p(padding),
            tuple_to_void_p(strides),
            len(k_shape),
        )
    )
    # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
    x_tensor.descriptor.contents.invalidate()
    y_tensor.descriptor.contents.invalidate()
    workspaceSize = ctypes.c_uint64(0)
    check_error(
        lib.infiniopGetAvgPoolWorkspaceSize(descriptor, ctypes.byref(workspaceSize))
    )
    # Workspace is allocated as a raw uint8 tensor on the target device.
    workspace = torch.zeros(int(workspaceSize.value), dtype=torch.uint8).to(torch_device)
    workspace_ptr = ctypes.cast(workspace.data_ptr(), ctypes.POINTER(ctypes.c_uint8))
    for i in range(NUM_PRERUN if PROFILE else 1):
        check_error(
            lib.infiniopAvgPool(
                descriptor,
                workspace_ptr,
                workspaceSize,
                y_tensor.data,
                x_tensor.data,
                None,
            )
        )
    if PROFILE:
        start_time = time.time()
        for i in range(NUM_ITERATIONS):
            check_error(
                lib.infiniopAvgPool(
                    descriptor,
                    workspace_ptr,
                    workspaceSize,
                    y_tensor.data,
                    x_tensor.data,
                    None,
                )
            )
        elapsed = (time.time() - start_time) / NUM_ITERATIONS
        print(f" lib time: {elapsed :6f}")
    assert torch.allclose(y, ans, atol=0, rtol=1e-3)
    check_error(lib.infiniopDestroyAvgPoolDescriptor(descriptor))
def test_cpu(lib, test_cases):
    """Run every AvgPool test case on CPU in both fp16 and fp32."""
    handle = create_handle(lib, DeviceEnum.DEVICE_CPU)
    for x_shape, kernel_shape, padding, strides in test_cases:
        for dtype in (torch.float16, torch.float32):
            test(lib, handle, "cpu", x_shape, kernel_shape, padding, strides, tensor_dtype=dtype)
    destroy_handle(lib, handle)
def test_cuda(lib, test_cases):
    """Run every AvgPool test case on CUDA in both fp16 and fp32."""
    handle = create_handle(lib, DeviceEnum.DEVICE_CUDA)
    for x_shape, kernel_shape, padding, strides in test_cases:
        for dtype in (torch.float16, torch.float32):
            test(lib, handle, "cuda", x_shape, kernel_shape, padding, strides, tensor_dtype=dtype)
    destroy_handle(lib, handle)
def test_bang(lib, test_cases):
    """Run every AvgPool test case on a Cambricon MLU in both fp16 and fp32."""
    import torch_mlu

    handle = create_handle(lib, DeviceEnum.DEVICE_BANG)
    for x_shape, kernel_shape, padding, strides in test_cases:
        for dtype in (torch.float16, torch.float32):
            test(lib, handle, "mlu", x_shape, kernel_shape, padding, strides, tensor_dtype=dtype)
    destroy_handle(lib, handle)
if __name__ == "__main__":
    test_cases = [
        # x_shape, kernel_shape, padding, strides
        ((1, 1, 10), (3,), (1,), (1,)),
        ((32, 3, 224, 224), (3, 3), (1, 1), (2, 2)),
        ((1, 1, 16, 16, 16), (5, 5, 5), (2, 2, 2), (2, 2, 2)),
    ]
    args = get_args()
    lib = open_lib()
    # Declare C signatures so ctypes marshals arguments correctly.
    lib.infiniopCreateAvgPoolDescriptor.restype = c_int32
    lib.infiniopCreateAvgPoolDescriptor.argtypes = [
        infiniopHandle_t,
        POINTER(infiniopAvgPoolDescriptor_t),
        infiniopTensorDescriptor_t,
        infiniopTensorDescriptor_t,
        c_void_p,
        c_void_p,
        c_void_p,
        c_uint64,
    ]
    lib.infiniopGetAvgPoolWorkspaceSize.restype = c_int32
    lib.infiniopGetAvgPoolWorkspaceSize.argtypes = [
        infiniopAvgPoolDescriptor_t,
        POINTER(c_uint64),
    ]
    lib.infiniopAvgPool.restype = c_int32
    lib.infiniopAvgPool.argtypes = [
        infiniopAvgPoolDescriptor_t,
        c_void_p,
        c_uint64,
        c_void_p,
        c_void_p,
        c_void_p,
    ]
    lib.infiniopDestroyAvgPoolDescriptor.restype = c_int32
    lib.infiniopDestroyAvgPoolDescriptor.argtypes = [
        infiniopAvgPoolDescriptor_t,
    ]
    if args.cpu:
        test_cpu(lib, test_cases)
    if args.cuda:
        test_cuda(lib, test_cases)
    if args.bang:
        test_bang(lib, test_cases)
    # Default to CPU when no device flag is given.
    if not (args.cpu or args.cuda or args.bang):
        test_cpu(lib, test_cases)
    print("\033[92mTest passed!\033[0m")
from ctypes import POINTER, Structure, c_int32, c_uint64, c_void_p
import ctypes
import sys
import os
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")))
from operatorspy import (
open_lib,
to_tensor,
DeviceEnum,
infiniopHandle_t,
infiniopTensorDescriptor_t,
create_handle,
destroy_handle,
check_error,
rearrange_tensor,
create_workspace,
)
from operatorspy.tests.test_utils import get_args
import torch
class CausalSoftmaxDescriptor(Structure):
    """ctypes mirror of the opaque C-side CausalSoftmax descriptor (device id only)."""
    _fields_ = [("device", c_int32)]


# Pointer type passed to/returned by the C library.
infiniopCausalSoftmaxDescriptor_t = POINTER(CausalSoftmaxDescriptor)
def causal_softmax(x):
    """Reference causal softmax over the last dimension.

    Positions above the flipped strict-lower-triangular band are masked to
    -inf so each row only attends to its causal prefix. The softmax runs in
    float32 for accuracy and the result is cast back to x's dtype.
    """
    # `dtype` instead of `type`: don't shadow the builtin.
    dtype = x.dtype
    mask = torch.tril(torch.ones_like(x), diagonal=-1).flip(dims=[-2, -1])
    # torch.where never mutates its inputs, so no defensive clone is needed.
    masked = torch.where(mask == 1, -torch.inf, x.to(torch.float32))
    return torch.nn.functional.softmax(masked, dim=-1).to(dtype)
def test(lib, handle, torch_device, x_shape, x_stride=None, x_dtype=torch.float16):
    """Check the library's in-place CausalSoftmax against causal_softmax().

    x is overwritten by the library call and must match the reference
    within rtol=1e-2.
    """
    print(
        f"Testing CausalSoftmax on {torch_device} with x_shape:{x_shape} x_stride:{x_stride} dtype:{x_dtype}"
    )
    x = torch.rand(x_shape, dtype=x_dtype).to(torch_device)
    if x_stride is not None:
        x = rearrange_tensor(x, x_stride)
    # Reference result computed before the library mutates x.
    ans = causal_softmax(x)
    x_tensor = to_tensor(x, lib)
    descriptor = infiniopCausalSoftmaxDescriptor_t()
    check_error(
        lib.infiniopCreateCausalSoftmaxDescriptor(
            handle, ctypes.byref(descriptor), x_tensor.descriptor
        )
    )
    workspace_size = c_uint64(0)
    check_error(
        lib.infiniopGetCausalSoftmaxWorkspaceSize(
            descriptor, ctypes.byref(workspace_size)
        )
    )
    # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
    x_tensor.descriptor.contents.invalidate()
    workspace = create_workspace(workspace_size.value, x.device)
    check_error(
        lib.infiniopCausalSoftmax(
            descriptor,
            workspace.data_ptr() if workspace is not None else None,
            workspace_size.value,
            x_tensor.data,
            None,
        )
    )
    # The operator runs in place: x now holds the softmax result.
    assert torch.allclose(x, ans, atol=0, rtol=1e-2)
    check_error(lib.infiniopDestroyCausalSoftmaxDescriptor(descriptor))
def test_cpu(lib, test_cases):
    """Run every CausalSoftmax test case on CPU."""
    handle = create_handle(lib, DeviceEnum.DEVICE_CPU)
    for shape, stride in test_cases:
        test(lib, handle, "cpu", shape, stride)
    destroy_handle(lib, handle)
def test_cuda(lib, test_cases):
    """Run every CausalSoftmax test case on CUDA."""
    handle = create_handle(lib, DeviceEnum.DEVICE_CUDA)
    for shape, stride in test_cases:
        test(lib, handle, "cuda", shape, stride)
    destroy_handle(lib, handle)
def test_bang(lib, test_cases):
    """Run every CausalSoftmax test case on a Cambricon MLU."""
    import torch_mlu

    handle = create_handle(lib, DeviceEnum.DEVICE_BANG)
    for shape, stride in test_cases:
        test(lib, handle, "mlu", shape, stride)
    destroy_handle(lib, handle)
def test_ascend(lib, test_cases):
    """Run every CausalSoftmax test case on an Ascend NPU."""
    import torch_npu

    handle = create_handle(lib, DeviceEnum.DEVICE_ASCEND)
    for shape, stride in test_cases:
        test(lib, handle, "npu", shape, stride)
    destroy_handle(lib, handle)
if __name__ == "__main__":
    test_cases = [
        # x_shape, x_stride
        ((32, 20, 512), None),
        ((32, 20, 512), (20480, 512, 1)),  # Ascend does not yet support non-contiguous layouts
    ]
    args = get_args()
    lib = open_lib()
    # Declare C signatures so ctypes marshals arguments correctly.
    lib.infiniopCreateCausalSoftmaxDescriptor.restype = c_int32
    lib.infiniopCreateCausalSoftmaxDescriptor.argtypes = [
        infiniopHandle_t,
        POINTER(infiniopCausalSoftmaxDescriptor_t),
        infiniopTensorDescriptor_t,
    ]
    lib.infiniopGetCausalSoftmaxWorkspaceSize.restype = c_int32
    lib.infiniopGetCausalSoftmaxWorkspaceSize.argtypes = [
        infiniopCausalSoftmaxDescriptor_t,
        POINTER(c_uint64),
    ]
    lib.infiniopCausalSoftmax.restype = c_int32
    lib.infiniopCausalSoftmax.argtypes = [
        infiniopCausalSoftmaxDescriptor_t,
        c_void_p,
        c_uint64,
        c_void_p,
        c_void_p,
    ]
    lib.infiniopDestroyCausalSoftmaxDescriptor.restype = c_int32
    lib.infiniopDestroyCausalSoftmaxDescriptor.argtypes = [
        infiniopCausalSoftmaxDescriptor_t,
    ]
    if args.cpu:
        test_cpu(lib, test_cases)
    if args.cuda:
        test_cuda(lib, test_cases)
    if args.bang:
        test_bang(lib, test_cases)
    if args.ascend:
        test_ascend(lib, test_cases)
    # Default to CPU when no device flag is given.
    if not (args.cpu or args.cuda or args.bang or args.ascend):
        test_cpu(lib, test_cases)
    print("\033[92mTest passed!\033[0m")
from ctypes import POINTER, Structure, c_int32, c_uint64, c_void_p
import ctypes
import sys
import os
import time
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")))
from operatorspy import (
open_lib,
to_tensor,
DeviceEnum,
infiniopHandle_t,
infiniopTensorDescriptor_t,
create_handle,
destroy_handle,
check_error,
)
from operatorspy.tests.test_utils import get_args
import torch
import math
import ctypes
from torch.nn import functional as F
from typing import List, Tuple
# constant for control whether profile the pytorch and lib functions
# NOTE: need to manually add synchronization function to the lib function,
# e.g., cudaDeviceSynchronize() for CUDA
PROFILE = False  # when True, time both the PyTorch reference and the lib call
NUM_PRERUN = 10  # warm-up iterations run before timing
NUM_ITERATIONS = 1000  # timed iterations averaged into the reported elapsed time
class ConvDescriptor(Structure):
    """ctypes mirror of the opaque C-side Conv descriptor (device id only)."""
    _fields_ = [("device", c_int32)]


# Pointer type passed to/returned by the C library.
infiniopConvDescriptor_t = POINTER(ConvDescriptor)
def conv(x, w, stride, padding, dilation):
    """PyTorch reference convolution for 1-D/2-D/3-D inputs.

    x must be (N, C, *spatial); dispatches to F.conv1d/2d/3d on the number
    of spatial dims. Returns None for unsupported ranks.
    """
    conv_by_ndim = {1: F.conv1d, 2: F.conv2d, 3: F.conv3d}
    conv_fn = conv_by_ndim.get(len(x.shape) - 2)
    if conv_fn is None:
        print("Error: Pytorch -> Unsupported tensor dimension")
        return None
    return conv_fn(x, w, stride=stride, padding=padding, dilation=dilation)
# infer the shape of the output given the inputs for a N-ary convolution
def inferShape(
    x_shape: List[int],
    w_shape: List[int],
    pads: List[int],
    strides: List[int],
    dilations: List[int],
) -> Tuple[int, ...]:
    """Infer the output shape of an N-D convolution.

    Output is (batch, out_channels, *spatial) where each spatial dim is
    floor((in + 2*pad - dilation*(kernel-1) - 1) / stride + 1).
    """
    assert (
        len(x_shape) == len(w_shape) == len(pads) + 2 == len(dilations) + 2 == len(strides) + 2
    ), "x and w should have the same length; pads, strides, and dilatinos should have the same length; the length of pads should be that of x - 2"
    spatial = []
    for i, (p, s, d) in enumerate(zip(pads, strides, dilations)):
        effective_kernel = d * (w_shape[i + 2] - 1) + 1
        spatial.append(math.floor((x_shape[i + 2] + 2 * p - effective_kernel) / s + 1))
    return (x_shape[0], w_shape[0]) + tuple(spatial)
# convert a python tuple to a ctype void pointer
def tuple_to_void_p(py_tuple: Tuple):
    """Copy a Python tuple into a C int64 array and return it cast to void*.

    ctypes.cast keeps a reference to the backing array, so the memory stays
    alive as long as the returned pointer does.
    """
    c_array = (ctypes.c_int64 * len(py_tuple))(*py_tuple)
    return ctypes.cast(c_array, ctypes.c_void_p)
def test(
    lib,
    handle,
    torch_device,
    x_shape,
    w_shape,
    pads,
    strides,
    dilations,
    tensor_stride=None,
    tensor_dtype=torch.float16,
):
    """Check the library's Conv operator against the PyTorch reference.

    When PROFILE is set, also reports averaged wall-clock times for both
    implementations. float16 results are compared with a looser rtol than
    float32.
    """
    assert len(pads) == len(strides) == len(dilations)
    print(
        f"Testing Conv on {torch_device} with x_shape: {x_shape}, w_shape: {w_shape}, b_shape: {w_shape[0]}, pads: {pads}, strides: {strides}, dilations: {dilations}, x_stride: {tensor_stride} dtype:{tensor_dtype}"
    )
    x = torch.rand(x_shape, dtype=tensor_dtype).to(torch_device)
    w = torch.rand(w_shape, dtype=tensor_dtype).to(torch_device)
    y = torch.zeros(
        inferShape(x.shape, w.shape, pads, strides, dilations), dtype=tensor_dtype
    ).to(torch_device)
    for i in range(NUM_PRERUN if PROFILE else 1):
        ans = conv(x, w, strides, pads, dilations)
    if PROFILE:
        start_time = time.time()
        for i in range(NUM_ITERATIONS):
            _ = conv(x, w, strides, pads, dilations)
        elapsed = (time.time() - start_time) / NUM_ITERATIONS
        print(f"pytorch time: {elapsed :6f}")
    x_tensor = to_tensor(x, lib)
    w_tensor = to_tensor(w, lib)
    y_tensor = to_tensor(y, lib)
    descriptor = infiniopConvDescriptor_t()
    check_error(
        lib.infiniopCreateConvDescriptor(
            handle,
            ctypes.byref(descriptor),
            y_tensor.descriptor,
            x_tensor.descriptor,
            w_tensor.descriptor,
            tuple_to_void_p(pads),
            tuple_to_void_p(strides),
            tuple_to_void_p(dilations),
            len(pads),
        )
    )
    # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
    x_tensor.descriptor.contents.invalidate()
    w_tensor.descriptor.contents.invalidate()
    y_tensor.descriptor.contents.invalidate()
    workspaceSize = ctypes.c_uint64(0)
    check_error(
        lib.infiniopGetConvWorkspaceSize(descriptor, ctypes.byref(workspaceSize))
    )
    # Workspace is allocated as a raw uint8 tensor on the target device.
    workspace = torch.zeros(int(workspaceSize.value), dtype=torch.uint8).to(torch_device)
    workspace_ptr = ctypes.cast(workspace.data_ptr(), ctypes.POINTER(ctypes.c_uint8))
    for i in range(NUM_PRERUN if PROFILE else 1):
        check_error(
            lib.infiniopConv(
                descriptor,
                workspace_ptr,
                workspaceSize,
                y_tensor.data,
                x_tensor.data,
                w_tensor.data,
                None,
            )
        )
    if PROFILE:
        start_time = time.time()
        for i in range(NUM_ITERATIONS):
            check_error(
                lib.infiniopConv(
                    descriptor,
                    workspace_ptr,
                    workspaceSize,
                    y_tensor.data,
                    x_tensor.data,
                    w_tensor.data,
                    None,
                )
            )
        elapsed = (time.time() - start_time) / NUM_ITERATIONS
        print(f" lib time: {elapsed :6f}")
    # fp16 accumulates more rounding error, so allow a looser tolerance.
    if (tensor_dtype == torch.float16):
        assert torch.allclose(y, ans, atol=0, rtol=1e-2)
    else:
        assert torch.allclose(y, ans, atol=0, rtol=1e-3)
    check_error(lib.infiniopDestroyConvDescriptor(descriptor))
def test_cpu(lib, test_cases):
    """Run every Conv test case on CPU in both fp16 and fp32."""
    handle = create_handle(lib, DeviceEnum.DEVICE_CPU)
    for x_shape, w_shape, pads, strides, dilations, x_strides in test_cases:
        for dtype in (torch.float16, torch.float32):
            test(lib, handle, "cpu", x_shape, w_shape, pads, strides, dilations, x_strides, tensor_dtype=dtype)
    destroy_handle(lib, handle)
def test_cuda(lib, test_cases):
    """Run every Conv test case on CUDA in both fp16 and fp32."""
    handle = create_handle(lib, DeviceEnum.DEVICE_CUDA)
    for x_shape, w_shape, pads, strides, dilations, x_strides in test_cases:
        for dtype in (torch.float16, torch.float32):
            test(lib, handle, "cuda", x_shape, w_shape, pads, strides, dilations, x_strides, tensor_dtype=dtype)
    destroy_handle(lib, handle)
def test_bang(lib, test_cases):
    """Run every Conv test case on a Cambricon MLU in both fp16 and fp32."""
    import torch_mlu

    handle = create_handle(lib, DeviceEnum.DEVICE_BANG)
    for x_shape, w_shape, pads, strides, dilations, x_strides in test_cases:
        for dtype in (torch.float16, torch.float32):
            test(lib, handle, "mlu", x_shape, w_shape, pads, strides, dilations, x_strides, tensor_dtype=dtype)
    destroy_handle(lib, handle)
if __name__ == "__main__":
    test_cases = [
        # x_shape, w_shape, pads, strides, dilations, x_strides
        (
            (32, 3, 4),
            (32, 3, 5),
            (1,),
            (1,),
            (1,),
            None,
        ),
        (
            (1, 3, 4, 4),
            (2, 3, 3, 3),
            (1, 1),
            (1, 2),
            (2, 1),
            None,
        ),
        (
            (32, 3, 128, 128),
            (64, 3, 5, 5),
            (2, 2),
            (2, 2),
            (1, 1),
            None,
        ),
        (
            (1, 1, 4, 4, 4),
            (1, 1, 5, 5, 5),
            (1, 1, 1),
            (1, 1, 1),
            (1, 1, 1),
            None,
        ),
        (
            (32, 3, 32, 32, 32),
            (64, 3, 5, 5, 5),
            (3, 2, 2),
            (4, 3, 3),
            (2, 2, 1),
            None,
        ),
    ]
    args = get_args()
    lib = open_lib()
    # Declare C signatures so ctypes marshals arguments correctly.
    lib.infiniopCreateConvDescriptor.restype = c_int32
    lib.infiniopCreateConvDescriptor.argtypes = [
        infiniopHandle_t,
        POINTER(infiniopConvDescriptor_t),
        infiniopTensorDescriptor_t,
        infiniopTensorDescriptor_t,
        infiniopTensorDescriptor_t,
        c_void_p,
        c_void_p,
        c_void_p,
        c_uint64,
    ]
    lib.infiniopConv.restype = c_int32
    lib.infiniopConv.argtypes = [
        infiniopConvDescriptor_t,
        c_void_p,
        c_uint64,
        c_void_p,
        c_void_p,
        c_void_p,
        c_void_p,
    ]
    lib.infiniopDestroyConvDescriptor.restype = c_int32
    lib.infiniopDestroyConvDescriptor.argtypes = [
        infiniopConvDescriptor_t,
    ]
    if args.cpu:
        test_cpu(lib, test_cases)
    if args.cuda:
        test_cuda(lib, test_cases)
    if args.bang:
        test_bang(lib, test_cases)
    # Default to CPU when no device flag is given.
    if not (args.cpu or args.cuda or args.bang):
        test_cpu(lib, test_cases)
    print("\033[92mTest passed!\033[0m")
from ctypes import POINTER, Structure, c_int32, c_void_p
import ctypes
import sys
import os
import time
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")))
from operatorspy import (
open_lib,
to_tensor,
DeviceEnum,
infiniopHandle_t,
infiniopTensorDescriptor_t,
create_handle,
destroy_handle,
check_error,
rearrange_tensor,
)
from operatorspy.tests.test_utils import get_args
import torch
# constant for control whether profile the pytorch and lib functions
# NOTE: need to manually add synchronization function to the lib function,
# e.g., cudaDeviceSynchronize() for CUDA
PROFILE = False  # when True, time both the PyTorch reference and the lib call
NUM_PRERUN = 10  # warm-up iterations run before timing
NUM_ITERATIONS = 1000  # timed iterations averaged into the reported elapsed time
class ExpandDescriptor(Structure):
    """ctypes mirror of the opaque C-side Expand descriptor (device id only)."""
    _fields_ = [("device", c_int32)]


# Pointer type passed to/returned by the C library.
infiniopExpandDescriptor_t = POINTER(ExpandDescriptor)
def expand(x, y):
    """PyTorch reference for Expand: broadcast x to y's shape."""
    if not PROFILE:
        return x.expand_as(y)
    # Profiling path: materialize the view and sync so timing is meaningful.
    ans = x.expand_as(y).clone()
    torch.cuda.synchronize()
    return ans
def test(
    lib,
    handle,
    torch_device,
    y_shape,
    x_shape,
    y_stride=None,
    x_stride=None,
    tensor_dtype=torch.float16,
):
    """Run one Expand case: broadcast x (x_shape) into y (y_shape) with the lib
    kernel and compare against torch's expand_as. Optional strides rearrange
    the inputs into non-contiguous layouts first."""
    print(
        f"Testing Expand on {torch_device} with x_shape:{x_shape} y_shape:{y_shape} x_stride:{x_stride} y_stride:{y_stride} dtype:{tensor_dtype}"
    )
    x = torch.rand(x_shape, dtype=tensor_dtype).to(torch_device)
    y = torch.rand(y_shape, dtype=tensor_dtype).to(torch_device)
    if x_stride is not None:
        x = rearrange_tensor(x, x_stride)
    if y_stride is not None:
        y = rearrange_tensor(y, y_stride)
    # Reference answer (extra iterations only warm up when profiling).
    for i in range(NUM_PRERUN if PROFILE else 1):
        ans = expand(x, y)
    if PROFILE:
        start_time = time.time()
        for i in range(NUM_ITERATIONS):
            _ = expand(x, y)
        elapsed = (time.time() - start_time) / NUM_ITERATIONS
        print(f"pytorch time: {elapsed :6f}")
    x_tensor = to_tensor(x, lib)
    y_tensor = to_tensor(y, lib)
    descriptor = infiniopExpandDescriptor_t()
    check_error(
        lib.infiniopCreateExpandDescriptor(
            handle,
            ctypes.byref(descriptor),
            y_tensor.descriptor,
            x_tensor.descriptor,
        )
    )
    # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
    x_tensor.descriptor.contents.invalidate()
    y_tensor.descriptor.contents.invalidate()
    # The lib kernel writes its result into y in place.
    for i in range(NUM_PRERUN if PROFILE else 1):
        check_error(lib.infiniopExpand(descriptor, y_tensor.data, x_tensor.data, None))
    if PROFILE:
        start_time = time.time()
        for i in range(NUM_ITERATIONS):
            check_error(
                lib.infiniopExpand(descriptor, y_tensor.data, x_tensor.data, None)
            )
        elapsed = (time.time() - start_time) / NUM_ITERATIONS
        print(f" lib time: {elapsed :6f}")
    assert torch.allclose(y, ans, atol=0, rtol=1e-3)
    check_error(lib.infiniopDestroyExpandDescriptor(descriptor))
def test_cpu(lib, test_cases):
    """Run every Expand case on the CPU backend, in fp16 then fp32."""
    handle = create_handle(lib, DeviceEnum.DEVICE_CPU)
    for case in test_cases:
        for dt in (torch.float16, torch.float32):
            test(lib, handle, "cpu", *case, tensor_dtype=dt)
    destroy_handle(lib, handle)
def test_cuda(lib, test_cases):
    """Run every Expand case on the CUDA backend, in fp16 then fp32."""
    handle = create_handle(lib, DeviceEnum.DEVICE_CUDA)
    for case in test_cases:
        for dt in (torch.float16, torch.float32):
            test(lib, handle, "cuda", *case, tensor_dtype=dt)
    destroy_handle(lib, handle)
def test_bang(lib, test_cases):
    """Run every Expand case on the Cambricon MLU backend, in fp16 then fp32."""
    import torch_mlu

    handle = create_handle(lib, DeviceEnum.DEVICE_BANG)
    for case in test_cases:
        for dt in (torch.float16, torch.float32):
            test(lib, handle, "mlu", *case, tensor_dtype=dt)
    destroy_handle(lib, handle)
if __name__ == "__main__":
    test_cases = [
        # y_shape, x_shape, y_stride, x_stride
        ((), (), None, None),
        ((3, 3), (1,), None, None),
        ((5, 4, 3), (4, 3,), None, (6, 1)),
        ((99, 111), (111,), None, None),
        ((2, 4, 3), (1, 3), None, None),
        ((2, 20, 3), (2, 1, 3), None, None),
        ((2, 3, 4, 5), (5,), None, None),
        ((3, 2, 4, 5), (3, 2, 1, 1), None, None),
        ((32, 256, 112, 112), (32, 256, 112, 1), None, None),
    ]
    args = get_args()
    lib = open_lib()
    # Declare ctypes signatures for the Expand entry points.
    lib.infiniopCreateExpandDescriptor.restype = c_int32
    lib.infiniopCreateExpandDescriptor.argtypes = [
        infiniopHandle_t,
        POINTER(infiniopExpandDescriptor_t),
        infiniopTensorDescriptor_t,
        infiniopTensorDescriptor_t,
    ]
    lib.infiniopExpand.restype = c_int32
    lib.infiniopExpand.argtypes = [
        infiniopExpandDescriptor_t,
        c_void_p,
        c_void_p,
        c_void_p,
    ]
    lib.infiniopDestroyExpandDescriptor.restype = c_int32
    lib.infiniopDestroyExpandDescriptor.argtypes = [
        infiniopExpandDescriptor_t,
    ]
    if args.cpu:
        test_cpu(lib, test_cases)
    if args.cuda:
        test_cuda(lib, test_cases)
    if args.bang:
        test_bang(lib, test_cases)
    # Default to CPU when no device flag was given.
    if not (args.cpu or args.cuda or args.bang):
        test_cpu(lib, test_cases)
    print("\033[92mTest passed!\033[0m")
from ctypes import POINTER, Structure, c_int32, c_uint64, c_void_p, c_float, c_bool
import ctypes
import sys
import os
import time
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")))
from operatorspy import (
open_lib,
to_tensor,
DeviceEnum,
infiniopHandle_t,
infiniopTensorDescriptor_t,
create_handle,
destroy_handle,
check_error,
rearrange_tensor,
)
from operatorspy.tests.test_utils import get_args
import torch
# constant for control whether profile the pytorch and lib functions
# NOTE: need to manually add synchronization function to the lib function,
# e.g., cudaDeviceSynchronize() for CUDA
PROFILE = False
NUM_PRERUN = 10
NUM_ITERATIONS = 1000
class GEMMDescriptor(Structure):
    """Opaque mirror of the C-side GEMM descriptor; only the device tag is visible."""

    _fields_ = [("device", c_int32)]


# Pointer type passed across the C API boundary.
infiniopGEMMDescriptor_t = POINTER(GEMMDescriptor)
def gemm(A, B, C=None, transA=False, transB=False, alpha=1.0, beta=0.0, dtype=torch.float32):
    """Reference GEMM: alpha * op(A) @ op(B) + beta * C, cast to dtype.

    op(X) is X.T when the corresponding trans flag is set. For float16 the
    whole computation is performed in float32 and only the final result is
    cast back, mirroring what accelerator GEMM kernels typically accumulate in.
    """
    A = A.T if transA else A
    B = B.T if transB else B
    # FIX: the original `result += beta * C if dtype != torch.float16 else
    # C.to(torch.float32)` parsed as `result += (beta*C if ... else C.to(f32))`,
    # silently dropping beta in the fp16 branch; and the in-place += of a
    # float32 tensor into an already-downcast float16 result raises a dtype
    # promotion error in modern PyTorch. Accumulate in the compute dtype first,
    # then cast once at the end.
    compute_dtype = torch.float32 if dtype == torch.float16 else dtype
    result = alpha * torch.matmul(A.to(compute_dtype), B.to(compute_dtype))
    if C is not None:
        result = result + beta * C.to(compute_dtype)
    result = result.to(dtype)
    if PROFILE:
        torch.cuda.synchronize()
    return result
def test(
    lib,
    handle,
    torch_device,
    alpha,
    beta,
    transA,
    transB,
    a_shape,
    b_shape,
    c_shape,
    y_shape,
    a_stride=None,
    b_stride=None,
    c_stride=None,
    y_stride=None,
    dtype=torch.float16,
):
    """Run one GEMM case: y = alpha * op(a) @ op(b) + beta * c with the lib
    kernel and compare against the torch reference. c_shape may be falsy to
    test the no-bias path; optional strides produce non-contiguous inputs."""
    print(
        f"Testing GEMM on {torch_device} with transA: {transA} transB: {transB} "
        f"a_shape:{a_shape} b_shape:{b_shape} c_shape:{c_shape} y_shape:{y_shape} "
        f"a_stride:{a_stride} b_stride:{b_stride} c_stride:{c_stride} y_stride:{y_stride} dtype:{dtype}"
    )
    a = torch.rand(a_shape, dtype=dtype).to(torch_device)
    b = torch.rand(b_shape, dtype=dtype).to(torch_device)
    # A falsy c_shape means "no C operand" (descriptor receives NULL below).
    c = torch.rand(c_shape, dtype=dtype).to(torch_device) if c_shape else None
    y = torch.rand(y_shape, dtype=dtype).to(torch_device)
    if a_stride is not None:
        a = rearrange_tensor(a, a_stride)
    if b_stride is not None:
        b = rearrange_tensor(b, b_stride)
    if c_stride is not None and c is not None:
        c = rearrange_tensor(c, c_stride)
    if y_stride is not None:
        y = rearrange_tensor(y, y_stride)
    # Reference answer (extra iterations only warm up when profiling).
    for i in range(NUM_PRERUN if PROFILE else 1):
        ans = gemm(a, b, c, transA, transB, alpha, beta, dtype)
    if PROFILE:
        start_time = time.time()
        for i in range(NUM_ITERATIONS):
            _ = gemm(a, b, c, transA, transB, alpha, beta, dtype)
        elapsed = (time.time() - start_time) / NUM_ITERATIONS
        print(f"pytorch time: {elapsed :6f}")
    a_tensor = to_tensor(a, lib)
    b_tensor = to_tensor(b, lib)
    c_tensor = to_tensor(c, lib) if c is not None else None
    y_tensor = to_tensor(y, lib)
    descriptor = infiniopGEMMDescriptor_t()
    check_error(
        lib.infiniopCreateGEMMDescriptor(
            handle,
            ctypes.byref(descriptor),
            y_tensor.descriptor,
            a_tensor.descriptor,
            b_tensor.descriptor,
            c_tensor.descriptor if c_tensor else None,
            alpha,
            beta,
            transA,
            transB,
        )
    )
    # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
    a_tensor.descriptor.contents.invalidate()
    b_tensor.descriptor.contents.invalidate()
    if c_tensor is not None:
        c_tensor.descriptor.contents.invalidate()
    y_tensor.descriptor.contents.invalidate()
    workspace_size = ctypes.c_uint64(0)
    check_error(
        lib.infiniopGetGEMMWorkspaceSize(
            descriptor, ctypes.byref(workspace_size)
        )
    )
    # Workspace is allocated as a torch byte tensor so it lives on torch_device.
    workspace = torch.zeros(int(workspace_size.value), dtype=torch.uint8).to(
        torch_device
    )
    workspace_ptr = ctypes.cast(workspace.data_ptr(), ctypes.POINTER(ctypes.c_uint8))
    for i in range(NUM_PRERUN if PROFILE else 1):
        check_error(
            lib.infiniopGEMM(
                descriptor,
                workspace_ptr,
                workspace_size,
                y_tensor.data,
                a_tensor.data,
                b_tensor.data,
                c_tensor.data if c_tensor else None,
                None,
            )
        )
    if PROFILE:
        start_time = time.time()
        for i in range(NUM_ITERATIONS):
            check_error(
                lib.infiniopGEMM(
                    descriptor,
                    workspace_ptr,
                    workspace_size,
                    y_tensor.data,
                    a_tensor.data,
                    b_tensor.data,
                    c_tensor.data if c_tensor else None,
                    None,
                )
            )
        elapsed = (time.time() - start_time) / NUM_ITERATIONS
        print(f" lib time: {elapsed :6f}")
    assert torch.allclose(y, ans, atol=0, rtol=1e-2)
    check_error(lib.infiniopDestroyGEMMDescriptor(descriptor))
def test_cpu(lib, test_cases):
    """Run every GEMM case on the CPU backend, in fp16 then fp32."""
    handle = create_handle(lib, DeviceEnum.DEVICE_CPU)
    for case in test_cases:
        for dt in (torch.float16, torch.float32):
            test(lib, handle, "cpu", *case, dtype=dt)
    destroy_handle(lib, handle)
def test_cuda(lib, test_cases):
    """Run every GEMM case on the CUDA backend, in fp16 then fp32."""
    handle = create_handle(lib, DeviceEnum.DEVICE_CUDA)
    for case in test_cases:
        for dt in (torch.float16, torch.float32):
            test(lib, handle, "cuda", *case, dtype=dt)
    destroy_handle(lib, handle)
def test_bang(lib, test_cases):
    """Run every GEMM case on the Cambricon MLU backend, in fp16 then fp32."""
    import torch_mlu

    handle = create_handle(lib, DeviceEnum.DEVICE_BANG)
    for case in test_cases:
        for dt in (torch.float16, torch.float32):
            test(lib, handle, "mlu", *case, dtype=dt)
    destroy_handle(lib, handle)
if __name__ == "__main__":
    test_cases = [
        # alpha, beta, transA, transB, a_shape, b_shape, c_shape, y_shape, a_stride, b_stride, c_stride, y_stride
        (
            1.0,
            1.0,
            False,
            False,
            (1, 2048),
            (2048, 2048),
            (1, 2048),
            (1, 2048),
            None,
            None,
            None,
            None,
        ),
        (
            1.0,
            1.0,
            True,
            True,
            (2048, 4),
            (2048, 2048),
            (4, 2048),
            (4, 2048),
            None,
            None,
            None,
            None,
        ),
        (
            1.0,
            1.0,
            False,
            True,
            (1, 2048),
            (1000, 2048),
            # NOTE(review): (1000) is an int, not a 1-tuple, so this exercises a
            # broadcast bias; torch.rand(1000) still yields shape (1000,).
            (1000),
            (1, 1000),
            None,
            None,
            None,
            None,
        ),
        (
            1.0,
            1.0,
            True,
            False,
            (2048, 4),
            (2048, 2048),
            (2048),
            (4, 2048),
            (4096, 1),
            (4096, 1),
            (2,),
            (4096, 1),
        ),
        (
            1.0,
            1.0,
            False,
            False,
            (3, 1, 2048),
            (3, 2048, 2048),
            (1,),
            (3, 1, 2048),
            None,
            None,
            None,
            None,
        ),
        # c_shape None exercises the bias-free path.
        (
            1.0,
            1.0,
            True,
            False,
            (2048, 4),
            (2048, 2048),
            None,
            (4, 2048),
            (4096, 1),
            (4096, 1),
            (2,),
            (4096, 1),
        ),
    ]
    args = get_args()
    lib = open_lib()
    # Declare ctypes signatures for the GEMM entry points.
    lib.infiniopCreateGEMMDescriptor.restype = c_int32
    lib.infiniopCreateGEMMDescriptor.argtypes = [
        infiniopHandle_t,
        POINTER(infiniopGEMMDescriptor_t),
        infiniopTensorDescriptor_t,
        infiniopTensorDescriptor_t,
        infiniopTensorDescriptor_t,
        infiniopTensorDescriptor_t,
        c_float,
        c_float,
        c_bool,
        c_bool,
    ]
    lib.infiniopGetGEMMWorkspaceSize.restype = c_int32
    lib.infiniopGetGEMMWorkspaceSize.argtypes = [
        infiniopGEMMDescriptor_t,
        POINTER(c_uint64),
    ]
    lib.infiniopGEMM.restype = c_int32
    lib.infiniopGEMM.argtypes = [
        infiniopGEMMDescriptor_t,
        c_void_p,
        c_uint64,
        c_void_p,
        c_void_p,
        c_void_p,
        c_void_p,
        c_void_p,
    ]
    lib.infiniopDestroyGEMMDescriptor.restype = c_int32
    lib.infiniopDestroyGEMMDescriptor.argtypes = [
        infiniopGEMMDescriptor_t,
    ]
    if args.cpu:
        test_cpu(lib, test_cases)
    if args.cuda:
        test_cuda(lib, test_cases)
    if args.bang:
        test_bang(lib, test_cases)
    # Default to CPU when no device flag was given.
    if not (args.cpu or args.cuda or args.bang):
        test_cpu(lib, test_cases)
    print("\033[92mTest passed!\033[0m")
from ctypes import POINTER, Structure, c_int32, c_void_p, c_uint64
import ctypes
import sys
import os
import time
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")))
from operatorspy import (
open_lib,
to_tensor,
DeviceEnum,
infiniopHandle_t,
infiniopTensorDescriptor_t,
create_handle,
destroy_handle,
check_error,
)
from operatorspy.tests.test_utils import get_args
import torch, time
# constant for control whether profile the pytorch and lib functions
# NOTE: need to manually add synchronization function to the lib function,
# e.g., cudaDeviceSynchronize() for CUDA
PROFILE = False
NUM_PRERUN = 10
NUM_ITERATIONS = 1000
class GlobalAvgPoolDescriptor(Structure):
    """Opaque mirror of the C-side GlobalAvgPool descriptor; only the device tag is visible."""

    _fields_ = [("device", c_int32)]


# Pointer type passed across the C API boundary.
infiniopGlobalAvgPoolDescriptor_t = POINTER(GlobalAvgPoolDescriptor)
def inferShape(x):
    """Output shape of a global average pool: keep (N, C), collapse every spatial dim to 1."""
    spatial_ndim = x.dim() - 2
    return x.shape[:2] + (1,) * spatial_ndim
def globalAvgPool(x):
    """Reference global average pooling: mean over all spatial dims, shape (N, C, 1, ..., 1)."""
    reduce_dims = tuple(range(2, x.dim()))
    y = torch.mean(x, dim=reduce_dims, keepdim=True)
    if PROFILE:
        # Drain the CUDA queue so profiling timings are meaningful.
        torch.cuda.synchronize()
    return y.view(*inferShape(x))
def test(
    lib,
    handle,
    torch_device,
    x_shape,
    tensor_dtype=torch.float16,
):
    """Run one GlobalAvgPool case against the torch.mean reference for x_shape."""
    print(
        f"Testing GlobalAvgPool on {torch_device} with input tensor_shape: {x_shape} dtype: {tensor_dtype}"
    )
    x = torch.rand(x_shape, dtype=tensor_dtype).to(torch_device)
    y = torch.zeros(inferShape(x), dtype=tensor_dtype).to(torch_device)
    # Reference answer (extra iterations only warm up when profiling).
    for i in range(NUM_PRERUN if PROFILE else 1):
        ans = globalAvgPool(x)
    if PROFILE:
        start_time = time.time()
        for i in range(NUM_ITERATIONS):
            _ = globalAvgPool(x)
        elapsed = (time.time() - start_time) / NUM_ITERATIONS
        print(f"pytorch time: {elapsed :6f}")
    x_tensor = to_tensor(x, lib)
    y_tensor = to_tensor(y, lib)
    descriptor = infiniopGlobalAvgPoolDescriptor_t()
    check_error(
        lib.infiniopCreateGlobalAvgPoolDescriptor(
            handle,
            ctypes.byref(descriptor),
            y_tensor.descriptor,
            x_tensor.descriptor,
        )
    )
    # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
    x_tensor.descriptor.contents.invalidate()
    y_tensor.descriptor.contents.invalidate()
    workspaceSize = ctypes.c_uint64(0)
    check_error(
        lib.infiniopGetGlobalAvgPoolWorkspaceSize(
            descriptor, ctypes.byref(workspaceSize)
        )
    )
    # Workspace is allocated as a torch byte tensor so it lives on torch_device.
    workspace = torch.zeros(int(workspaceSize.value), dtype=torch.uint8).to(
        torch_device
    )
    workspace_ptr = ctypes.cast(workspace.data_ptr(), ctypes.POINTER(ctypes.c_uint8))
    for i in range(NUM_PRERUN if PROFILE else 1):
        check_error(
            lib.infiniopGlobalAvgPool(
                descriptor, workspace_ptr, workspaceSize, y_tensor.data, x_tensor.data, None
            )
        )
    if PROFILE:
        start_time = time.time()
        for i in range(NUM_ITERATIONS):
            check_error(
                lib.infiniopGlobalAvgPool(
                    descriptor,
                    workspace_ptr,
                    workspaceSize,
                    y_tensor.data,
                    x_tensor.data,
                    None,
                )
            )
        elapsed = (time.time() - start_time) / NUM_ITERATIONS
        print(f" lib time: {elapsed :6f}")
    assert torch.allclose(y, ans, atol=0, rtol=1e-3)
    check_error(lib.infiniopDestroyGlobalAvgPoolDescriptor(descriptor))
def test_cpu(lib, test_cases):
    """Run every GlobalAvgPool case on the CPU backend, in fp16 then fp32."""
    handle = create_handle(lib, DeviceEnum.DEVICE_CPU)
    for x_shape in test_cases:
        for dt in (torch.float16, torch.float32):
            test(lib, handle, "cpu", x_shape, tensor_dtype=dt)
    destroy_handle(lib, handle)
def test_cuda(lib, test_cases):
    """Run every GlobalAvgPool case on the CUDA backend, in fp16 then fp32."""
    handle = create_handle(lib, DeviceEnum.DEVICE_CUDA)
    for x_shape in test_cases:
        for dt in (torch.float16, torch.float32):
            test(lib, handle, "cuda", x_shape, tensor_dtype=dt)
    destroy_handle(lib, handle)
def test_bang(lib, test_cases):
    """Run every GlobalAvgPool case on the Cambricon MLU backend, in fp16 then fp32."""
    import torch_mlu

    handle = create_handle(lib, DeviceEnum.DEVICE_BANG)
    for x_shape in test_cases:
        for dt in (torch.float16, torch.float32):
            test(lib, handle, "mlu", x_shape, tensor_dtype=dt)
    destroy_handle(lib, handle)
if __name__ == "__main__":
    test_cases = [
        # x_shape
        # NOTE(review): the extra parentheses are redundant — each entry is
        # already a tuple, not a 1-tuple of tuples.
        ((1, 3, 3)),
        ((1, 3, 1, 1, 3)),
        ((1, 3, 1, 1, 257)),
        ((1, 2, 1, 1, 514)),
        ((1, 3, 1, 1, 1025)),
        ((32, 256, 1, 112, 112)),
        ((2, 3, 2048000)),
        ((2, 1, 10243)),
        ((2, 20, 100)),
        ((3, 33, 333)),
        ((32, 20, 512)),
        ((3, 3, 11, 11, 11, 3, 2)),
        ((32, 256, 1, 112, 112)),
        ((32, 256, 112, 112)),
    ]
    args = get_args()
    lib = open_lib()
    # Declare ctypes signatures for the GlobalAvgPool entry points.
    lib.infiniopCreateGlobalAvgPoolDescriptor.restype = c_int32
    lib.infiniopCreateGlobalAvgPoolDescriptor.argtypes = [
        infiniopHandle_t,
        POINTER(infiniopGlobalAvgPoolDescriptor_t),
        infiniopTensorDescriptor_t,
        infiniopTensorDescriptor_t,
    ]
    lib.infiniopGetGlobalAvgPoolWorkspaceSize.restype = c_int32
    lib.infiniopGetGlobalAvgPoolWorkspaceSize.argtypes = [
        infiniopGlobalAvgPoolDescriptor_t,
        POINTER(c_uint64),
    ]
    lib.infiniopGlobalAvgPool.restype = c_int32
    lib.infiniopGlobalAvgPool.argtypes = [
        infiniopGlobalAvgPoolDescriptor_t,
        c_void_p,
        c_uint64,
        c_void_p,
        c_void_p,
        c_void_p,
    ]
    lib.infiniopDestroyGlobalAvgPoolDescriptor.restype = c_int32
    lib.infiniopDestroyGlobalAvgPoolDescriptor.argtypes = [
        infiniopGlobalAvgPoolDescriptor_t,
    ]
    if args.cpu:
        test_cpu(lib, test_cases)
    if args.cuda:
        test_cuda(lib, test_cases)
    if args.bang:
        test_bang(lib, test_cases)
    # Default to CPU when no device flag was given.
    if not (args.cpu or args.cuda or args.bang):
        test_cpu(lib, test_cases)
    print("\033[92mTest passed!\033[0m")
import os
import sys
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '.')))
from .liboperators import open_lib, CTensor, infiniopHandle_t, infiniopTensorDescriptor_t
from .devices import *
from .utils import *
from .datatypes import *
class InfiniDtype:
    """Integer dtype tags; presumably mirror the C library's dtype enum, so the
    numeric values must never be renumbered — TODO confirm against the C header."""

    INVALID = 0
    BYTE = 1
    BOOL = 2
    # signed integers
    I8 = 3
    I16 = 4
    I32 = 5
    I64 = 6
    # unsigned integers
    U8 = 7
    U16 = 8
    U32 = 9
    U64 = 10
    # floating point
    F8 = 11
    F16 = 12
    F32 = 13
    F64 = 14
    # complex
    C8 = 15
    C16 = 16
    C32 = 17
    C64 = 18
    BF16 = 19
class InfiniDeviceEnum:
    """Integer device identifiers passed to infiniopCreateHandle.

    Values mirror the C-side device enum and must stay in sync with it.
    """

    CPU = 0
    NVIDIA = 1
    CAMBRICON = 2
    ASCEND = 3
    # FIX: the entries below previously ended with trailing commas, which made
    # each of them a 1-tuple (e.g. METAX == (4,)), not an int — any ctypes call
    # expecting a c_int device tag would raise ArgumentError for these devices.
    METAX = 4
    MOORE = 5
    ILUVATAR = 6
    KUNLUN = 7
    SUGON = 8
from calendar import c
import os
import platform
import ctypes
from ctypes import c_int, c_int64, c_uint64, Structure, POINTER, c_size_t
from .datatypes import *
from .devices import *
Device = c_int
Optype = c_int
INFINI_ROOT = os.environ.get("INFINI_ROOT")
class TensorDescriptor(Structure):
    """ctypes mirror of the C tensor descriptor (dtype tag, rank, shape, strides).

    Field layout must match the C struct exactly; strides are expressed in
    elements (to_tensor fills them from torch's Tensor.stride()).
    """

    _fields_ = [
        ("dtype", c_int),                 # InfiniDtype tag
        ("ndim", c_size_t),               # tensor rank
        ("shape", POINTER(c_size_t)),     # length-ndim array
        ("strides", POINTER(c_int64)),    # length-ndim array, in elements
    ]

    def invalidate(self):
        """Zero the shape/strides arrays so kernels cannot read them directly
        (tests use this to prove the kernel relies only on descriptor-creation-time data)."""
        for i in range(self.ndim):
            self.shape[i] = 0
            self.strides[i] = 0


# Pointer type passed across the C API boundary.
infiniopTensorDescriptor_t = ctypes.POINTER(TensorDescriptor)
class CTensor:
    """Pairs a library tensor descriptor with the raw data pointer it describes.

    The backing torch tensor must be kept alive by the caller for as long as
    `data` is used — presumably the C side never copies it; TODO confirm.
    """

    def __init__(self, desc, data):
        self.descriptor = desc  # infiniopTensorDescriptor_t
        self.data = data        # raw pointer (int) from Tensor.data_ptr()
class Handle(Structure):
    """ctypes mirror of the C infiniop handle: device kind plus device index."""

    _fields_ = [("device", c_int), ("device_id", c_int)]


# Pointer type passed across the C API boundary.
infiniopHandle_t = POINTER(Handle)
# Open operators library
def open_lib():
    """Locate and load the infiniop shared library under INFINI_ROOT.

    Returns the ctypes.CDLL handle with argtypes/restype declared for the
    handle and tensor-descriptor entry points. Raises AssertionError when the
    library cannot be found (INFINI_ROOT unset/wrong, or unsupported OS).
    """

    def find_library_in_ld_path(subdir, library_name):
        # INFINI_ROOT/<subdir> may itself be an os.pathsep-separated list.
        ld_library_path = os.path.join(INFINI_ROOT, subdir)
        paths = ld_library_path.split(os.pathsep)
        for path in paths:
            full_path = os.path.join(path, library_name)
            if os.path.isfile(full_path):
                return full_path
        return None

    system_name = platform.system()
    # FIX: library_path was previously unbound on systems other than
    # Windows/Linux, turning the friendly assert below into a NameError.
    library_path = None
    # Load the library
    if system_name == "Windows":
        library_path = find_library_in_ld_path("bin", "infiniop.dll")
        # Best-effort preload of cuDNN so infiniop.dll's dependency resolves.
        # FIX: this hard-coded CDLL call used to run unconditionally, which
        # raised OSError on every non-Windows host (and on Windows machines
        # without CUDA 12.2 at this exact path), making open_lib unusable.
        try:
            ctypes.CDLL(r"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.2\bin\cudnn64_9.dll")
        except OSError:
            pass
    elif system_name == "Linux":
        library_path = find_library_in_ld_path("lib", "libinfiniop.so")
    assert (
        library_path is not None
    ), "Cannot find infiniop.dll or libinfiniop.so. Check if INFINI_ROOT is set correctly."
    lib = ctypes.CDLL(library_path)
    # Declare ctypes signatures for the core entry points.
    lib.infiniopCreateTensorDescriptor.argtypes = [
        POINTER(infiniopTensorDescriptor_t),
        c_uint64,
        POINTER(c_uint64),
        POINTER(c_int64),
        c_int,
    ]
    lib.infiniopCreateHandle.argtypes = [POINTER(infiniopHandle_t), c_int, c_int]
    lib.infiniopCreateHandle.restype = c_int
    lib.infiniopDestroyHandle.argtypes = [infiniopHandle_t]
    lib.infiniopDestroyHandle.restype = c_int
    return lib
import ctypes
from .datatypes import *
from .liboperators import infiniopTensorDescriptor_t, CTensor, infiniopHandle_t
def check_error(status):
    """Raise when a library call returns a nonzero status code; no-op on 0."""
    if status != 0:
        raise Exception(f"Error code {status}")
def to_tensor(tensor, lib):
    """
    Convert a PyTorch tensor to a library Tensor(descriptor, data).

    Copies ndim/shape/strides (strides in elements, from Tensor.stride()) into
    a freshly created library descriptor and pairs it with the tensor's raw
    data pointer. The torch tensor is not moved or copied, so the caller must
    keep it alive while the returned CTensor is in use.
    """
    import torch
    ndim = tensor.ndimension()
    shape = (ctypes.c_size_t * ndim)(*tensor.shape)
    strides = (ctypes.c_int64 * ndim)(*(tensor.stride()))
    data_ptr = tensor.data_ptr()
    # Map the torch dtype to the matching InfiniDtype tag. The chain is kept
    # lazy on purpose: the uint16/32/64 attributes are only touched when no
    # earlier branch matched, so older PyTorch builds without them still work.
    # fmt: off
    dt = (
        InfiniDtype.I8 if tensor.dtype == torch.int8 else
        InfiniDtype.I16 if tensor.dtype == torch.int16 else
        InfiniDtype.I32 if tensor.dtype == torch.int32 else
        InfiniDtype.I64 if tensor.dtype == torch.int64 else
        InfiniDtype.U8 if tensor.dtype == torch.uint8 else
        InfiniDtype.F16 if tensor.dtype == torch.float16 else
        InfiniDtype.BF16 if tensor.dtype == torch.bfloat16 else
        InfiniDtype.F32 if tensor.dtype == torch.float32 else
        InfiniDtype.F64 if tensor.dtype == torch.float64 else
        # TODO: These following types may not be supported by older
        # versions of PyTorch.
        InfiniDtype.U16 if tensor.dtype == torch.uint16 else
        InfiniDtype.U32 if tensor.dtype == torch.uint32 else
        InfiniDtype.U64 if tensor.dtype == torch.uint64 else
        None
    )
    # fmt: on
    assert dt is not None
    # Create TensorDescriptor
    tensor_desc = infiniopTensorDescriptor_t()
    lib.infiniopCreateTensorDescriptor(
        ctypes.byref(tensor_desc), ndim, shape, strides, dt
    )
    # Create Tensor
    return CTensor(tensor_desc, data_ptr)
def create_workspace(size, torch_device):
    """Allocate a zeroed uint8 workspace tensor on `torch_device`; None when size == 0."""
    if not size:
        return None
    import torch

    return torch.zeros(size=(size,), dtype=torch.uint8, device=torch_device)
def create_handle(lib, device, id=0):
    """Create and return an infiniop handle for `device` (index `id`); raises on failure."""
    handle = infiniopHandle_t()
    status = lib.infiniopCreateHandle(ctypes.byref(handle), device, id)
    check_error(status)
    return handle
def destroy_handle(lib, handle):
    """Release a handle created by create_handle; raises on a nonzero status."""
    status = lib.infiniopDestroyHandle(handle)
    check_error(status)
def rearrange_tensor(tensor, new_strides):
    """
    Given a PyTorch tensor and a list of new strides, return a new PyTorch tensor with the given strides.

    The result owns fresh zero-filled storage sized for the strided layout and
    holds the same element values as `tensor`.

    Raises ValueError for non-positive strides (not supported yet).
    """
    import torch

    shape = tensor.shape
    # Highest linear index reachable under the new strides. Only positive
    # strides are supported, so the lowest index is always 0. (FIX: removed the
    # unused `new_size` list and the always-zero `left`/`offset` bookkeeping.)
    right = 0
    for i in range(len(shape)):
        if new_strides[i] > 0:
            right += new_strides[i] * (shape[i] - 1)
        else:  # TODO: Support negative strides in the future
            raise ValueError("Negative strides are not supported yet")
    # Flat zero-filled storage covering the whole layout (gaps stay zero).
    new_tensor = torch.zeros(
        (right + 1,), dtype=tensor.dtype, device=tensor.device
    )
    # Cartesian index grid over the original shape.
    indices = [torch.arange(s) for s in shape]
    mesh = torch.meshgrid(*indices, indexing="ij")
    linear_indices = [m.flatten() for m in mesh]
    # Linear destination of every element under the new strides.
    new_positions = sum(
        linear_indices[i] * new_strides[i] for i in range(len(shape))
    ).to(tensor.device)
    # Scatter the original data into the strided storage.
    new_tensor.view(-1).index_add_(0, new_positions, tensor.view(-1))
    # Reinterpret the flat storage with the requested shape/strides.
    new_tensor.set_(new_tensor.untyped_storage(), 0, shape, tuple(new_strides))
    return new_tensor
from ctypes import POINTER, Structure, c_int32, c_size_t, c_uint64, c_void_p, c_float
import ctypes
import sys
import os
import time
sys.path.append("..")
from libinfiniop import (
open_lib,
to_tensor,
CTensor,
InfiniDeviceEnum,
infiniopHandle_t,
infiniopTensorDescriptor_t,
create_handle,
destroy_handle,
check_error,
rearrange_tensor,
create_workspace,
)
from test_utils import get_args, synchronize_device
import torch
PROFILE = False
NUM_PRERUN = 10
NUM_ITERATIONS = 1000
class MatmulDescriptor(Structure):
    """Opaque mirror of the C-side Matmul descriptor; only the device tag is visible."""

    _fields_ = [("device", c_int32)]


# Pointer type passed across the C API boundary.
infiniopMatmulDescriptor_t = POINTER(MatmulDescriptor)
def matmul(_c, beta, _a, _b, alpha):
    """Reference result alpha * (a @ b) + beta * c.

    Inputs are cloned so the caller's tensors are untouched; the product is
    accumulated in float32 and cast back to c's dtype before scaling.
    """
    a, b, c = _a.clone(), _b.clone(), _c.clone()
    out_dtype = c.dtype
    product = torch.matmul(a.to(torch.float32), b.to(torch.float32)).to(out_dtype)
    return alpha * product + beta * c
def test(
    lib,
    handle,
    torch_device,
    alpha,
    beta,
    a_shape,
    b_shape,
    c_shape,
    a_stride=None,
    b_stride=None,
    c_stride=None,
    dtype=torch.float16,
):
    """Run one Matmul case: c = alpha * a @ b + beta * c with the lib kernel
    and compare against the torch reference; optionally profile both.

    Optional strides rearrange the inputs into non-contiguous layouts before
    the library sees them. The lib kernel writes into c in place.
    """
    print(
        f"Testing Matmul on {torch_device} with a_shape:{a_shape} b_shape:{b_shape} c_shape:{c_shape}"
        f" a_stride:{a_stride} b_stride:{b_stride} c_stride:{c_stride} dtype:{dtype}"
    )
    a = torch.rand(a_shape, dtype=dtype).to(torch_device)
    b = torch.rand(b_shape, dtype=dtype).to(torch_device)
    c = torch.ones(c_shape, dtype=dtype).to(torch_device)
    # Reference answer; safe to compute before rearrange since values are preserved.
    ans = matmul(c, beta, a, b, alpha)
    if a_stride is not None:
        a = rearrange_tensor(a, a_stride)
    if b_stride is not None:
        b = rearrange_tensor(b, b_stride)
    if c_stride is not None:
        c = rearrange_tensor(c, c_stride)
    a_tensor = to_tensor(a, lib)
    b_tensor = to_tensor(b, lib)
    c_tensor = to_tensor(c, lib)
    descriptor = infiniopMatmulDescriptor_t()
    check_error(
        lib.infiniopCreateMatmulDescriptor(
            handle,
            ctypes.byref(descriptor),
            c_tensor.descriptor,
            a_tensor.descriptor,
            b_tensor.descriptor
        )
    )
    # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
    a_tensor.descriptor.contents.invalidate()
    b_tensor.descriptor.contents.invalidate()
    c_tensor.descriptor.contents.invalidate()
    workspace_size = c_uint64(0)
    check_error(
        lib.infiniopGetMatmulWorkspaceSize(descriptor, ctypes.byref(workspace_size))
    )
    workspace = create_workspace(workspace_size.value, a.device)
    check_error(
        lib.infiniopMatmul(
            descriptor,
            workspace.data_ptr() if workspace is not None else None,
            workspace_size.value,
            c_tensor.data,
            a_tensor.data,
            b_tensor.data,
            alpha,
            beta,
            None,
        )
    )
    assert torch.allclose(c, ans, atol=0, rtol=1e-2)
    if PROFILE:
        for i in range(NUM_PRERUN):
            _ = matmul(c, beta, a, b, alpha)
        synchronize_device(torch_device)
        start_time = time.time()
        for i in range(NUM_ITERATIONS):
            _ = matmul(c, beta, a, b, alpha)
        synchronize_device(torch_device)
        elapsed = (time.time() - start_time) / NUM_ITERATIONS
        print(f" pytorch time: {elapsed * 1000 :6f} ms")
        # FIX: the two profiling loops below previously omitted the alpha/beta
        # arguments, so ctypes raised ArgumentError (argtypes declares nine
        # parameters, as used by the correctness call above) whenever PROFILE
        # was enabled.
        for i in range(NUM_PRERUN):
            check_error(
                lib.infiniopMatmul(
                    descriptor,
                    workspace.data_ptr() if workspace is not None else None,
                    workspace_size.value,
                    c_tensor.data,
                    a_tensor.data,
                    b_tensor.data,
                    alpha,
                    beta,
                    None,
                )
            )
        synchronize_device(torch_device)
        start_time = time.time()
        for i in range(NUM_ITERATIONS):
            check_error(
                lib.infiniopMatmul(
                    descriptor,
                    workspace.data_ptr() if workspace is not None else None,
                    workspace_size.value,
                    c_tensor.data,
                    a_tensor.data,
                    b_tensor.data,
                    alpha,
                    beta,
                    None,
                )
            )
        synchronize_device(torch_device)
        elapsed = (time.time() - start_time) / NUM_ITERATIONS
        print(f" lib time: {elapsed * 1000 :6f} ms")
    check_error(lib.infiniopDestroyMatmulDescriptor(descriptor))
def test_cpu(lib, test_cases):
    """Run every Matmul case on the CPU backend (each case carries its own dtype)."""
    handle = create_handle(lib, InfiniDeviceEnum.CPU)
    for case in test_cases:
        test(lib, handle, "cpu", *case)
    destroy_handle(lib, handle)
def test_cuda(lib, test_cases):
    """Run every Matmul case on the NVIDIA CUDA backend (each case carries its own dtype)."""
    handle = create_handle(lib, InfiniDeviceEnum.NVIDIA)
    for case in test_cases:
        test(lib, handle, "cuda", *case)
    destroy_handle(lib, handle)
def test_bang(lib, test_cases):
    """Run every Matmul case on the Cambricon MLU backend (each case carries its own dtype)."""
    import torch_mlu

    handle = create_handle(lib, InfiniDeviceEnum.CAMBRICON)
    for case in test_cases:
        test(lib, handle, "mlu", *case)
    destroy_handle(lib, handle)
def test_ascend(lib, test_cases):
    """Run every Matmul case on the Ascend NPU backend (each case carries its own dtype)."""
    import torch_npu

    handle = create_handle(lib, InfiniDeviceEnum.ASCEND)
    for case in test_cases:
        test(lib, handle, "npu", *case)
    destroy_handle(lib, handle)
if __name__ == "__main__":
    test_cases = [
        # alpha, beta, a_shape, b_shape, c_shape, a_stride, b_stride, c_stride, dtype
        (1.0, 0.0, (1, 2048), (2048, 2048), (1, 2048), None, None, None, torch.float16),
        (1.0, 0.0, (1, 2048), (2048, 2048), (1, 2048), None, None, None, torch.float32),
        (1.0, 0.0, (2, 4, 2048), (2, 2048, 2048), (2, 4, 2048), None, None, None, torch.float16),
        (1.0, 0.0, (2, 4, 2048), (2, 2048, 2048), (2, 4, 2048), None, None, None, torch.float32),
        (1.0, 0.0, (1, 2048), (2048, 2048), (1, 2048), (4096, 1), (4096, 1), (4096, 1), torch.float16),
        (1.0, 0.0, (1, 2048), (2048, 2048), (1, 2048), (4096, 1), (4096, 1), (4096, 1), torch.float32),
        # transposed-B layout expressed through strides
        (1.0, 1.0, (6, 2048), (2048, 2560), (6, 2560), (2048, 1), (1, 2048), (2560, 1), torch.float16),
        (1.0, 1.0, (6, 2048), (2048, 2560), (6, 2560), (2048, 1), (1, 2048), (2560, 1), torch.float32),
        # attention-style scaled batched matmul
        (1.0 / 8.0, 0.0, (4, 8 * 6, 64), (4, 64, 6), (4, 8 * 6, 6), None, None, None, torch.float16),
        (1.0 / 8.0, 0.0, (4, 8 * 6, 64), (4, 64, 6), (4, 8 * 6, 6), None, None, None, torch.float32),
    ]
    args = get_args()
    lib = open_lib()
    # Declare ctypes signatures for the Matmul entry points.
    lib.infiniopCreateMatmulDescriptor.restype = c_int32
    lib.infiniopCreateMatmulDescriptor.argtypes = [
        infiniopHandle_t,
        POINTER(infiniopMatmulDescriptor_t),
        infiniopTensorDescriptor_t,
        infiniopTensorDescriptor_t,
        infiniopTensorDescriptor_t
    ]
    lib.infiniopGetMatmulWorkspaceSize.restype = c_int32
    lib.infiniopGetMatmulWorkspaceSize.argtypes = [
        infiniopMatmulDescriptor_t,
        POINTER(c_size_t),
    ]
    lib.infiniopMatmul.restype = c_int32
    lib.infiniopMatmul.argtypes = [
        infiniopMatmulDescriptor_t,
        c_void_p,
        c_uint64,
        c_void_p,
        c_void_p,
        c_void_p,
        c_float,
        c_float,
        c_void_p,
    ]
    lib.infiniopDestroyMatmulDescriptor.restype = c_int32
    lib.infiniopDestroyMatmulDescriptor.argtypes = [
        infiniopMatmulDescriptor_t,
    ]
    if args.profile:
        PROFILE = True
    if args.cpu:
        test_cpu(lib, test_cases)
    if args.cuda:
        test_cuda(lib, test_cases)
    if args.bang:
        test_bang(lib, test_cases)
    if args.ascend:
        test_ascend(lib, test_cases)
    # Default to CPU when no device flag was given.
    if not (args.cpu or args.cuda or args.bang or args.ascend):
        test_cpu(lib, test_cases)
    print("\033[92mTest passed!\033[0m")
from ctypes import POINTER, Structure, c_int32, c_void_p, c_uint64
import ctypes
import sys
import os
import time
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")))
from operatorspy import (
open_lib,
to_tensor,
DeviceEnum,
infiniopHandle_t,
infiniopTensorDescriptor_t,
create_handle,
destroy_handle,
check_error,
)
from operatorspy.tests.test_utils import get_args
import torch
from typing import Tuple
# constant for control whether profile the pytorch and lib functions
# NOTE: need to manually add synchronization function to the lib function,
# e.g., cudaDeviceSynchronize() for CUDA
PROFILE = False
NUM_PRERUN = 10
NUM_ITERATIONS = 1000
class MaxPoolDescriptor(Structure):
    """Opaque mirror of the C-side MaxPool descriptor; only the device tag is visible."""

    _fields_ = [("device", c_int32)]


# Pointer type passed across the C API boundary.
infiniopMaxPoolDescriptor_t = POINTER(MaxPoolDescriptor)
def pool(x, k, padding, stride, dilation = 1):
    """Reference max pooling via torch.nn.MaxPool{1,2,3}d.

    Dispatches on the number of spatial dims (rank - 2); prints an error and
    returns None for unsupported ranks.
    """
    layer_by_ndim = {
        1: torch.nn.MaxPool1d,
        2: torch.nn.MaxPool2d,
        3: torch.nn.MaxPool3d,
    }
    layer_cls = layer_by_ndim.get(x.dim() - 2)
    if layer_cls is None:
        print("Error: Pytorch -> Unsupported tensor dimension")
        return None
    out = layer_cls(k, stride=stride, padding=padding, dilation=dilation)(x)
    if PROFILE:
        torch.cuda.synchronize()
    return out
def inferShape(x_shape, kernel_shape, padding, strides):
    """Compute the max-pool output shape: (N, C) followed by
    (dim + 2*pad - kernel) // stride + 1 for every spatial dim."""
    assert (
        len(x_shape) - 2 == len(kernel_shape) == len(padding) == len(strides)
    ), "kernel, pads, and strides should have the same length; the length of input x should be 2 more than that of kernel"
    pooled = [
        (dim + 2 * p - k) // s + 1
        for dim, k, p, s in zip(x_shape[2:], kernel_shape, padding, strides)
    ]
    return x_shape[:2] + tuple(pooled)
# convert a python tuple to a ctype void pointer
def tuple_to_void_p(py_tuple: Tuple):
    """Pack a tuple of ints into a C int64 array and return it as a void pointer.

    ctypes.cast keeps a reference to the backing array on the returned object,
    so the memory stays valid for the pointer's lifetime.
    """
    packed = (ctypes.c_int64 * len(py_tuple))(*py_tuple)
    return ctypes.cast(packed, ctypes.c_void_p)
def test(
    lib,
    handle,
    torch_device,
    x_shape,
    k_shape,
    padding,
    strides,
    tensor_dtype=torch.float16,
):
    """Run one MaxPool case: library kernel vs. the PyTorch reference.

    Builds x and an output buffer y of the inferred shape, computes the
    reference with `pool`, then creates the lib descriptor, queries and
    allocates the workspace, launches the kernel, and checks y against
    the reference with allclose. Optionally profiles both paths.
    """
    print(
        f"Testing MaxPool on {torch_device} with x_shape:{x_shape} kernel_shape:{k_shape} padding:{padding} strides:{strides} dtype:{tensor_dtype}"
    )
    x = torch.rand(x_shape, dtype=tensor_dtype).to(torch_device)
    y = torch.rand(inferShape(x_shape, k_shape, padding, strides), dtype=tensor_dtype).to(torch_device)
    # Warm-up runs also produce the reference answer `ans`.
    for i in range(NUM_PRERUN if PROFILE else 1):
        ans = pool(x, k_shape, padding, strides)
    if PROFILE:
        start_time = time.time()
        for i in range(NUM_ITERATIONS):
            _ = pool(x, k_shape, padding, strides)
        elapsed = (time.time() - start_time) / NUM_ITERATIONS
        print(f"pytorch time: {elapsed :6f}")
    x_tensor = to_tensor(x, lib)
    y_tensor = to_tensor(y, lib)
    descriptor = infiniopMaxPoolDescriptor_t()
    # Kernel/padding/stride tuples are marshalled as raw int64 arrays.
    check_error(
        lib.infiniopCreateMaxPoolDescriptor(
            handle,
            ctypes.byref(descriptor),
            y_tensor.descriptor,
            x_tensor.descriptor,
            tuple_to_void_p(k_shape),
            tuple_to_void_p(padding),
            tuple_to_void_p(strides),
            len(k_shape),
        )
    )
    # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
    x_tensor.descriptor.contents.invalidate()
    y_tensor.descriptor.contents.invalidate()
    workspaceSize = ctypes.c_uint64(0)
    check_error(
        lib.infiniopGetMaxPoolWorkspaceSize(descriptor, ctypes.byref(workspaceSize))
    )
    # Workspace is a raw byte buffer owned by torch so it lives on the right device.
    workspace = torch.zeros(int(workspaceSize.value), dtype=torch.uint8).to(torch_device)
    workspace_ptr = ctypes.cast(workspace.data_ptr(), ctypes.POINTER(ctypes.c_uint8))
    for i in range(NUM_PRERUN if PROFILE else 1):
        check_error(
            lib.infiniopMaxPool(
                descriptor,
                workspace_ptr,
                workspaceSize,
                y_tensor.data,
                x_tensor.data,
                None,
            )
        )
    if PROFILE:
        start_time = time.time()
        for i in range(NUM_ITERATIONS):
            check_error(
                lib.infiniopMaxPool(
                    descriptor,
                    workspace_ptr,
                    workspaceSize,
                    y_tensor.data,
                    x_tensor.data,
                    None,
                )
            )
        elapsed = (time.time() - start_time) / NUM_ITERATIONS
        print(f" lib time: {elapsed :6f}")
    assert torch.allclose(y, ans, atol=0, rtol=1e-3)
    check_error(lib.infiniopDestroyMaxPoolDescriptor(descriptor))
def test_cpu(lib, test_cases):
    """Run every MaxPool case on the CPU backend in fp16 and fp32."""
    handle = create_handle(lib, DeviceEnum.DEVICE_CPU)
    for x_shape, kernel_shape, padding, strides in test_cases:
        for dtype in (torch.float16, torch.float32):
            test(lib, handle, "cpu", x_shape, kernel_shape, padding, strides, tensor_dtype=dtype)
    destroy_handle(lib, handle)
def test_cuda(lib, test_cases):
    """Run every MaxPool case on the CUDA backend in fp16 and fp32."""
    handle = create_handle(lib, DeviceEnum.DEVICE_CUDA)
    for x_shape, kernel_shape, padding, strides in test_cases:
        for dtype in (torch.float16, torch.float32):
            test(lib, handle, "cuda", x_shape, kernel_shape, padding, strides, tensor_dtype=dtype)
    destroy_handle(lib, handle)
def test_bang(lib, test_cases):
    """Run every MaxPool case on the Cambricon MLU backend in fp16 and fp32."""
    import torch_mlu  # registers the "mlu" torch device
    handle = create_handle(lib, DeviceEnum.DEVICE_BANG)
    for x_shape, kernel_shape, padding, strides in test_cases:
        for dtype in (torch.float16, torch.float32):
            test(lib, handle, "mlu", x_shape, kernel_shape, padding, strides, tensor_dtype=dtype)
    destroy_handle(lib, handle)
if __name__ == "__main__":
    test_cases = [
        # x_shape, kernel_shape, padding, strides
        ((1, 1, 10), (3,), (1,), (1,)),
        ((32, 3, 224, 224), (3, 3), (1, 1), (2, 2)),
        ((1, 1, 16, 16, 16), (5, 5, 5), (2, 2, 2), (2, 2, 2)),
    ]
    args = get_args()
    lib = open_lib()
    # Declare C signatures so ctypes marshals arguments/returns correctly.
    lib.infiniopCreateMaxPoolDescriptor.restype = c_int32
    lib.infiniopCreateMaxPoolDescriptor.argtypes = [
        infiniopHandle_t,
        POINTER(infiniopMaxPoolDescriptor_t),
        infiniopTensorDescriptor_t,  # y (output)
        infiniopTensorDescriptor_t,  # x (input)
        c_void_p,  # kernel shape (int64 array)
        c_void_p,  # padding (int64 array)
        c_void_p,  # strides (int64 array)
        c_uint64,  # number of spatial dims
    ]
    lib.infiniopGetMaxPoolWorkspaceSize.restype = c_int32
    lib.infiniopGetMaxPoolWorkspaceSize.argtypes = [
        infiniopMaxPoolDescriptor_t,
        POINTER(c_uint64),
    ]
    lib.infiniopMaxPool.restype = c_int32
    lib.infiniopMaxPool.argtypes = [
        infiniopMaxPoolDescriptor_t,
        c_void_p,  # workspace
        c_uint64,  # workspace size
        c_void_p,  # y data
        c_void_p,  # x data
        c_void_p,  # stream (None -> default)
    ]
    lib.infiniopDestroyMaxPoolDescriptor.restype = c_int32
    lib.infiniopDestroyMaxPoolDescriptor.argtypes = [
        infiniopMaxPoolDescriptor_t,
    ]
    if args.cpu:
        test_cpu(lib, test_cases)
    if args.cuda:
        test_cuda(lib, test_cases)
    if args.bang:
        test_bang(lib, test_cases)
    # Default to the CPU backend when no device flag was given.
    if not (args.cpu or args.cuda or args.bang):
        test_cpu(lib, test_cases)
    print("\033[92mTest passed!\033[0m")
from ctypes import POINTER, Structure, c_int32, c_uint64, c_void_p, c_float, c_bool
import ctypes
import sys
import os
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")))
from operatorspy import (
open_lib,
to_tensor,
CTensor,
DeviceEnum,
infiniopHandle_t,
infiniopTensorDescriptor_t,
create_handle,
destroy_handle,
check_error,
rearrange_tensor,
create_workspace,
)
from operatorspy.tests.test_utils import get_args
import torch
import torch.nn as nn
class MLPDescriptor(Structure):
    # Opaque mirror of the native MLP descriptor; only the leading `device`
    # field is declared here, the full layout lives in the C library.
    _fields_ = [("device", c_int32)]
# Handle type passed to/returned from the infiniop MLP C API.
infiniopMLPDescriptor_t = POINTER(MLPDescriptor)
def swiglu(a, b):
    """SwiGLU combine: a * b * sigmoid(b).

    The exponential is evaluated in fp32 for accuracy, then cast back to
    b's dtype before forming the denominator.
    """
    gate_denom = 1 + torch.exp(-b.float()).to(b.dtype)
    return a * b / gate_denom
def mlp(y, x, w12, w3, alpha, residual):
    """Reference SwiGLU MLP: up-projections in fp32, gated combine, scaled
    down-projection, and optional residual add of y.

    w12 packs both up-projection weights: columns [0, inter) feed the gate
    input b, columns [inter, 2*inter) feed a, where inter = w3.shape[0].
    """
    out_dtype = x.dtype
    inter = w3.shape[0]
    x32 = x.to(torch.float32)
    a = torch.matmul(x32, w12[:, inter:].to(torch.float32)).to(out_dtype)
    b = torch.matmul(x32, w12[:, 0:inter].to(torch.float32)).to(out_dtype)
    # SwiGLU gate (inlined from swiglu): a * b * sigmoid(b), exp in fp32.
    gated = a * b / (1 + torch.exp(-b.float()).to(b.dtype))
    down = torch.matmul(gated.to(torch.float32), alpha * w3.to(torch.float32)).to(out_dtype)
    return down + y if residual else down
def test(
    lib,
    handle,
    torch_device,
    num_tokens,
    hidden_size,
    intermediate_size,
    alpha,
    residual,
    dtype=torch.float16,
    x_stride=None,
    y_stride=None,
    w12_stride=None,
    w3_stride=None,
):
    """Run one MLP case: library kernel vs. the PyTorch reference `mlp`.

    Inputs are scaled by 0.01 to keep fp16 matmuls well within range. The
    optional *_stride arguments rearrange each tensor to a non-contiguous
    layout AFTER the reference is computed, so the kernel is exercised on
    strided memory while the expected values stay the same.
    """
    print(
        f"Testing MLP on {torch_device} with num_tokens:{num_tokens} hidden_size:{hidden_size} intermediate_size:{intermediate_size}"
        f" alpha:{alpha} residual:{residual} dtype:{dtype} x_stride:{x_stride} y_stride:{y_stride} w12_stride:{w12_stride} w3_stride:{w3_stride}"
    )
    y = torch.rand([num_tokens, hidden_size], dtype=dtype).to(torch_device) * 0.01
    x = torch.rand([num_tokens, hidden_size], dtype=dtype).to(torch_device) * 0.01
    w12 = (
        torch.rand([hidden_size, 2 * intermediate_size], dtype=dtype).to(torch_device)
        * 0.01
    )
    w3 = (
        torch.rand([intermediate_size, hidden_size], dtype=dtype).to(torch_device)
        * 0.01
    )
    ans = mlp(y, x, w12, w3, alpha, residual)
    if x_stride is not None:
        x = rearrange_tensor(x, x_stride)
    if y_stride is not None:
        y = rearrange_tensor(y, y_stride)
    if w12_stride is not None:
        w12 = rearrange_tensor(w12, w12_stride)
    if w3_stride is not None:
        w3 = rearrange_tensor(w3, w3_stride)
    y_tensor = to_tensor(y, lib)
    x_tensor = to_tensor(x, lib)
    w12_tensor = to_tensor(w12, lib)
    w3_tensor = to_tensor(w3, lib)
    descriptor = infiniopMLPDescriptor_t()
    check_error(
        lib.infiniopCreateMLPDescriptor(
            handle,
            ctypes.byref(descriptor),
            y_tensor.descriptor,
            x_tensor.descriptor,
            w12_tensor.descriptor,
            w3_tensor.descriptor,
            alpha,
            residual,
        )
    )
    # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
    y_tensor.descriptor.contents.invalidate()
    x_tensor.descriptor.contents.invalidate()
    w12_tensor.descriptor.contents.invalidate()
    w3_tensor.descriptor.contents.invalidate()
    workspace_size = c_uint64(0)
    check_error(
        lib.infiniopGetMLPWorkspaceSize(descriptor, ctypes.byref(workspace_size))
    )
    workspace = create_workspace(workspace_size.value, x.device)
    # The kernel writes its result into y in place.
    check_error(
        lib.infiniopMLP(
            descriptor,
            workspace.data_ptr() if workspace is not None else None,
            workspace_size.value,
            y_tensor.data,
            x_tensor.data,
            w12_tensor.data,
            w3_tensor.data,
            None,
        )
    )
    # Looser tolerance (2e-2): fp16 matmul accumulation differs between paths.
    assert torch.allclose(y, ans, atol=0, rtol=2e-2)
    check_error(lib.infiniopDestroyMLPDescriptor(descriptor))
def test_cpu(lib, test_cases):
    """Run every MLP case on the CPU backend."""
    handle = create_handle(lib, DeviceEnum.DEVICE_CPU)
    # Each case tuple matches test()'s positional parameters after the device.
    for case in test_cases:
        test(lib, handle, "cpu", *case)
    destroy_handle(lib, handle)
def test_cuda(lib, test_cases):
    """Run every MLP case on the CUDA backend."""
    handle = create_handle(lib, DeviceEnum.DEVICE_CUDA)
    # Each case tuple matches test()'s positional parameters after the device.
    for case in test_cases:
        test(lib, handle, "cuda", *case)
    destroy_handle(lib, handle)
def test_bang(lib, test_cases):
    """Run every MLP case on the Cambricon MLU backend."""
    import torch_mlu  # registers the "mlu" torch device
    handle = create_handle(lib, DeviceEnum.DEVICE_BANG)
    # Each case tuple matches test()'s positional parameters after the device.
    for case in test_cases:
        test(lib, handle, "mlu", *case)
    destroy_handle(lib, handle)
if __name__ == "__main__":
    test_cases = [
        # num_tokens, hidden_size, intermediate_size, alpha, residual, dtype, x_stride, y_stride, w12_stride, w3_stride
        (4, 4096, 11008, 1.0, True, torch.float16, None, None, None, None),
        (4, 4096, 11008, 1.0, True, torch.float16, [8192, 1], [8192, 1], None, None),
        (
            4,
            4096,
            11008,
            1.0,
            True,
            torch.float16,
            None,
            None,
            [1, 4096],  # column-major w12
            [1, 11008],  # column-major w3
        ),
        (4, 4096, 11008, 1.0, False, torch.float16, None, None, None, None),
        (4, 4096, 11008, 1.0, False, torch.float16, [8192, 1], [8192, 1], None, None),
    ]
    args = get_args()
    lib = open_lib()
    # Declare C signatures so ctypes marshals arguments/returns correctly.
    lib.infiniopCreateMLPDescriptor.restype = c_int32
    lib.infiniopCreateMLPDescriptor.argtypes = [
        infiniopHandle_t,
        POINTER(infiniopMLPDescriptor_t),
        infiniopTensorDescriptor_t,  # y
        infiniopTensorDescriptor_t,  # x
        infiniopTensorDescriptor_t,  # w12
        infiniopTensorDescriptor_t,  # w3
        c_float,  # alpha
        c_bool,   # residual
    ]
    lib.infiniopGetMLPWorkspaceSize.restype = c_int32
    lib.infiniopGetMLPWorkspaceSize.argtypes = [
        infiniopMLPDescriptor_t,
        POINTER(c_uint64),
    ]
    lib.infiniopMLP.restype = c_int32
    lib.infiniopMLP.argtypes = [
        infiniopMLPDescriptor_t,
        c_void_p,  # workspace
        c_uint64,  # workspace size
        c_void_p,  # y data
        c_void_p,  # x data
        c_void_p,  # w12 data
        c_void_p,  # w3 data
        c_void_p,  # stream (None -> default)
    ]
    lib.infiniopDestroyMLPDescriptor.restype = c_int32
    lib.infiniopDestroyMLPDescriptor.argtypes = [
        infiniopMLPDescriptor_t,
    ]
    if args.cpu:
        test_cpu(lib, test_cases)
    if args.cuda:
        test_cuda(lib, test_cases)
    if args.bang:
        test_bang(lib, test_cases)
    # Default to the CPU backend when no device flag was given.
    if not (args.cpu or args.cuda or args.bang):
        test_cpu(lib, test_cases)
    print("\033[92mTest passed!\033[0m")
from ctypes import POINTER, Structure, c_int32, c_uint64, c_void_p, c_float
import ctypes
import sys
import os
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")))
from operatorspy import (
open_lib,
to_tensor,
DeviceEnum,
infiniopHandle_t,
infiniopTensorDescriptor_t,
create_handle,
destroy_handle,
check_error,
rearrange_tensor,
create_workspace,
U64,
)
from operatorspy.tests.test_utils import get_args
import torch
class RandomSampleDescriptor(Structure):
    # Opaque mirror of the native RandomSample descriptor; only the leading
    # `device` field is declared on the Python side.
    _fields_ = [("device", c_int32)]
# Handle type passed to/returned from the infiniop RandomSample C API.
infiniopRandomSampleDescriptor_t = POINTER(RandomSampleDescriptor)
def random_sample(data, random_val, topp, topk, voc, temperature, torch_device):
    """Reference temperature + top-k/top-p sampling over 1-D logits.

    Args:
        data: 1-D logits tensor of length `voc` (called with CPU data).
        random_val: uniform sample in [0, 1) used to pick the token.
        topp: nucleus (cumulative-probability) threshold.
        topk: number of top candidates to keep.
        voc: vocabulary size, == len(data).
        temperature: softmax temperature.
        torch_device: unused here; kept for signature symmetry.

    Returns:
        The original index (tensor scalar) of the sampled token.
    """
    indices = torch.zeros([topk], dtype = torch.int64)
    dataNp = data.clone().detach()
    sorted_indices = torch.arange(voc)
    # Partial selection sort (descending) of both the values and a parallel
    # index array: after i outer iterations, positions [0, i) hold the i
    # largest values seen so far.
    for i in range(topk):
        for j in range(i + 1, voc):
            if(dataNp[i] < dataNp[j]):
                tmp = dataNp[i].clone().detach()
                dataNp[i] = dataNp[j].clone().detach()
                dataNp[j] = tmp
                tmpInd = sorted_indices[i].clone().detach()
                sorted_indices[i] = sorted_indices[j].clone().detach()
                sorted_indices[j] = tmpInd
    #sorted_indices = torch.argsort(dataNp, descending=True)
    indices = sorted_indices[:topk]
    # NOTE(review): dataNp was already permuted in place by the sort above;
    # gathering it again with sorted_indices applies a second permutation,
    # and globalM below is then not necessarily the global max — confirm
    # this matches the C kernel's intent.
    dataNp = dataNp[sorted_indices]
    globalM = dataNp[0]
    # Temperature-scaled softmax, shifted by the leading entry for stability.
    dataNp = (dataNp - globalM) / temperature
    dataNp = torch.softmax(dataNp.float(), dim = 0)
    sum_s = 0
    # Nucleus cutoff: smallest prefix of the top-k whose mass reaches topp.
    for end in range(topk):
        sum_s += dataNp[end]
        if(sum_s >= topp):
            break
    # `end` leaks from the loop: bump it to an exclusive bound (or topk).
    if(end < topk - 1):
        end += 1
    else:
        end = topk
    # Rescale random_val by the truncated mass, then walk the prefix again
    # until the cumulative probability exceeds it.
    sum_s = 0
    for i in range(end):
        sum_s += dataNp[i]
    random_val *= sum_s
    sum_s = 0
    for i in range(end):
        sum_s += dataNp[i]
        if(random_val < sum_s):
            return indices[i]
    # NOTE(review): falls through (returns None) only if random_val never
    # drops below the running sum; float rounding could make this reachable.
def random_sample_0(data):
    """Greedy fallback: return the index of the largest logit."""
    return data.argmax()
def test(lib, handle, torch_device, voc, random_val, topp, topk, temperature, x_dtype=torch.float16):
    """Run one RandomSample case: library kernel vs. the Python reference.

    Builds distinct, shuffled logits so the expected index is unambiguous
    (up to exact ties in the sampled logit value), then compares the
    kernel's sampled index against `random_sample` / `random_sample_0`.
    """
    print(
        f"Testing RandomSample on {torch_device} with voc:{voc} dtype:{x_dtype}"
    )
    # Distinct values 0, 1e-4, 2e-4, ... shuffled over the vocabulary.
    data = torch.arange(voc).float() * 0.0001
    _perm = torch.randperm(voc)
    data = data[_perm].to(x_dtype).to(torch_device)
    if(topp > 0 and topk > 1):
        ans = random_sample(data.to("cpu"), random_val, topp, topk, voc, temperature, "cpu")
    else:
        # Degenerate configuration: plain argmax.
        ans = random_sample_0(data)
    indices = torch.zeros([1], dtype=torch.int64).to(torch_device)
    x_tensor = to_tensor(data, lib)
    indices_tensor = to_tensor(indices, lib)
    indices_tensor.descriptor.contents.dt = U64 # treat int64 as uint64
    descriptor = infiniopRandomSampleDescriptor_t()
    check_error(
        lib.infiniopCreateRandomSampleDescriptor(
            handle, ctypes.byref(descriptor), indices_tensor.descriptor, x_tensor.descriptor
        )
    )
    # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
    x_tensor.descriptor.contents.invalidate()
    indices_tensor.descriptor.contents.invalidate()
    workspace_size = c_uint64(0)
    check_error(
        lib.infiniopGetRandomSampleWorkspaceSize(
            descriptor, ctypes.byref(workspace_size)
        )
    )
    workspace = create_workspace(workspace_size.value, torch_device)
    check_error(
        lib.infiniopRandomSample(
            descriptor,
            workspace.data_ptr() if workspace is not None else None,
            workspace_size.value,
            indices_tensor.data,
            x_tensor.data,
            random_val,
            topp,
            topk,
            temperature,
            None,
        )
    )
    if torch_device == "npu":
        torch.npu.synchronize()
    # Accept either the exact same index or a different index with an equal
    # logit value (a tie is an equally valid sample).
    assert indices[0].type(ans.dtype) == ans or data[ans] == data[indices[0]]
    check_error(lib.infiniopDestroyRandomSampleDescriptor(descriptor))
def test_cpu(lib, test_cases):
    """Run every RandomSample case on the CPU backend."""
    handle = create_handle(lib, DeviceEnum.DEVICE_CPU)
    for case in test_cases:
        test(lib, handle, "cpu", *case)
    destroy_handle(lib, handle)
def test_cuda(lib, test_cases):
    """Run every RandomSample case on the CUDA backend."""
    handle = create_handle(lib, DeviceEnum.DEVICE_CUDA)
    for case in test_cases:
        test(lib, handle, "cuda", *case)
    destroy_handle(lib, handle)
def test_bang(lib, test_cases):
    """Run every RandomSample case on the Cambricon MLU backend."""
    import torch_mlu  # registers the "mlu" torch device
    handle = create_handle(lib, DeviceEnum.DEVICE_BANG)
    for case in test_cases:
        test(lib, handle, "mlu", *case)
    destroy_handle(lib, handle)
def test_ascend(lib, test_cases):
    """Run every RandomSample case on the Ascend NPU backend."""
    import torch_npu  # registers the "npu" torch device
    handle = create_handle(lib, DeviceEnum.DEVICE_ASCEND)
    for case in test_cases:
        test(lib, handle, "npu", *case)
    destroy_handle(lib, handle)
if __name__ == "__main__":
    test_cases = [
        # voc, random_val, topp, topk, temperature
        (512, 0.8, 0.8, 3, 0.5),
        (4096, 0.05, 0.9, 5, 1.0),
        (16384, 0.15, 0.85, 10, 2.0),
        (512, 0.08, 0, 3, 0.5),
        (4096, 0.5, 0.9, 1, 1.0),
        (16384, 0.15, 0, 1, 2.0),
        (16384, 0.15, 0, 1, 2.0),
        (32000, 0.08, 0.8, 50, 1.0),
        (32000, 0.08, 1.0, 25, 1.0),
        # (119696, 0.01, 1.0, 100, 1.0),
    ]
    args = get_args()
    lib = open_lib()
    # Declare C signatures so ctypes marshals arguments/returns correctly.
    lib.infiniopCreateRandomSampleDescriptor.restype = c_int32
    # FIX: test() passes FOUR arguments (handle, out-descriptor pointer,
    # result-tensor descriptor, probs-tensor descriptor), but argtypes
    # previously declared only one tensor descriptor, so the fourth
    # argument was marshalled unchecked. Declare both descriptors.
    lib.infiniopCreateRandomSampleDescriptor.argtypes = [
        infiniopHandle_t,
        POINTER(infiniopRandomSampleDescriptor_t),
        infiniopTensorDescriptor_t,  # result (sampled index)
        infiniopTensorDescriptor_t,  # probs (logits)
    ]
    lib.infiniopGetRandomSampleWorkspaceSize.restype = c_int32
    lib.infiniopGetRandomSampleWorkspaceSize.argtypes = [
        infiniopRandomSampleDescriptor_t,
        POINTER(c_uint64),
    ]
    lib.infiniopRandomSample.restype = c_int32
    lib.infiniopRandomSample.argtypes = [
        infiniopRandomSampleDescriptor_t,
        c_void_p,  # workspace
        c_uint64,  # workspace size
        c_uint64,  # NOTE(review): result pointer declared as u64 — the call
                   # site passes indices_tensor.data here; confirm intended.
        c_void_p,  # probs data
        c_float,   # random_val
        c_float,   # topp
        c_int32,   # topk
        c_float,   # temperature
        c_void_p,  # stream (None -> default)
    ]
    lib.infiniopDestroyRandomSampleDescriptor.restype = c_int32
    lib.infiniopDestroyRandomSampleDescriptor.argtypes = [
        infiniopRandomSampleDescriptor_t,
    ]
    if args.cpu:
        test_cpu(lib, test_cases)
    if args.cuda:
        test_cuda(lib, test_cases)
    if args.bang:
        test_bang(lib, test_cases)
    if args.ascend:
        test_ascend(lib, test_cases)
    # Default to the CPU backend when no device flag was given.
    if not (args.cpu or args.cuda or args.bang or args.ascend):
        test_cpu(lib, test_cases)
    print("\033[92mTest passed!\033[0m")
import ctypes
from ctypes import POINTER, Structure, c_int32, c_uint64, c_void_p
import sys
import os
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")))
from operatorspy import (
open_lib,
to_tensor,
CTensor,
DeviceEnum,
infiniopHandle_t,
infiniopTensorDescriptor_t,
create_handle,
destroy_handle,
check_error,
rearrange_tensor,
)
from operatorspy.tests.test_utils import get_args
import torch
# NOTE(review): class name drops an "a" ("Rerrange" vs "Rearrange"); kept
# as-is since the public alias below is spelled correctly.
class RerrangeDescriptor(Structure):
    # Opaque mirror of the native Rearrange descriptor; only the leading
    # `device` field is declared on the Python side.
    _fields_ = [("device", c_int32)]
# Handle type passed to/returned from the infiniop Rearrange C API.
infiniopRearrangeDescriptor_t = POINTER(RerrangeDescriptor)
def test(
    lib,
    handle,
    torch_device,
    x_shape,
    x_stride,
    y_shape,
    y_stride,
    x_dtype=torch.float16,
):
    """Run one Rearrange case: copy strided x into strided y via the lib.

    A None stride means the tensor keeps torch's default contiguous layout;
    otherwise `rearrange_tensor` relays the data out with the given strides.
    Correctness check: after the kernel runs, y must equal x elementwise.
    """
    print(
        f"Testing Rerrange on {torch_device} with x_shape:{x_shape} x_stride:{x_stride} y_shape:{y_shape} y_stride:{y_stride} x_dtype:{x_dtype}"
    )
    x = torch.rand(x_shape, dtype=x_dtype).to(torch_device)
    y = torch.zeros(y_shape, dtype=x_dtype).to(torch_device)
    if x_stride is not None:
        x = rearrange_tensor(x, x_stride)
    if y_stride is not None:
        y = rearrange_tensor(y, y_stride)
    x_tensor = to_tensor(x, lib)
    y_tensor = to_tensor(y, lib)
    descriptor = infiniopRearrangeDescriptor_t()
    check_error(
        lib.infiniopCreateRearrangeDescriptor(
            handle, ctypes.byref(descriptor), y_tensor.descriptor, x_tensor.descriptor
        )
    )
    # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
    x_tensor.descriptor.contents.invalidate()
    y_tensor.descriptor.contents.invalidate()
    check_error(
        lib.infiniopRearrange(descriptor, y_tensor.data, x_tensor.data, None)
    )
    assert torch.allclose(x, y, atol=0, rtol=1e-3)
    check_error(lib.infiniopDestroyRearrangeDescriptor(descriptor))
def test_cpu(lib, test_cases):
    """Run every Rearrange case on the CPU backend."""
    handle = create_handle(lib, DeviceEnum.DEVICE_CPU)
    for (x_shape, x_stride), (y_shape, y_stride) in test_cases:
        test(lib, handle, "cpu", x_shape, x_stride, y_shape, y_stride)
    destroy_handle(lib, handle)
def test_cuda(lib, test_cases):
    """Run every Rearrange case on the CUDA backend."""
    handle = create_handle(lib, DeviceEnum.DEVICE_CUDA)
    for (x_shape, x_stride), (y_shape, y_stride) in test_cases:
        test(lib, handle, "cuda", x_shape, x_stride, y_shape, y_stride)
    destroy_handle(lib, handle)
def test_bang(lib, test_cases):
    """Run every Rearrange case on the Cambricon MLU backend."""
    import torch_mlu  # registers the "mlu" torch device
    handle = create_handle(lib, DeviceEnum.DEVICE_BANG)
    for (x_shape, x_stride), (y_shape, y_stride) in test_cases:
        test(lib, handle, "mlu", x_shape, x_stride, y_shape, y_stride)
    destroy_handle(lib, handle)
def test_ascend(lib, test_cases):
    """Run every Rearrange case on the Ascend NPU backend."""
    import torch_npu  # registers the "npu" torch device
    handle = create_handle(lib, DeviceEnum.DEVICE_ASCEND)
    for (x_shape, x_stride), (y_shape, y_stride) in test_cases:
        test(lib, handle, "npu", x_shape, x_stride, y_shape, y_stride)
    destroy_handle(lib, handle)
if __name__ == "__main__":
    args = get_args()
    test_cases = [
        # ((src_shape, src_stride), (dst_shape, dst_stride))
        (((2, 4, 32), None), ((2, 4, 32), (256, 64, 1))),
        (((32, 6, 64), (64, 2560, 1)), ((32, 6, 64), None)),
        (((4, 6, 64), (64, 2560, 1)), ((4, 6, 64), (131072, 64, 1))),
        (((1, 32, 64), (2048, 64, 1)), ((1, 32, 64), (2048, 64, 1))),
        (((32, 1, 64), (64, 2560, 1)), ((32, 1, 64), (64, 64, 1))),
        (((4, 1, 64), (64, 2560, 1)), ((4, 1, 64), (64, 11264, 1))),
        (((64,), (1,)), ((64,), (1,))),
    ]
    lib = open_lib()
    # Declare C signatures so ctypes marshals arguments/returns correctly.
    lib.infiniopCreateRearrangeDescriptor.restype = c_int32
    lib.infiniopCreateRearrangeDescriptor.argtypes = [
        infiniopHandle_t,
        POINTER(infiniopRearrangeDescriptor_t),
        infiniopTensorDescriptor_t,  # y (destination)
        infiniopTensorDescriptor_t,  # x (source)
    ]
    lib.infiniopRearrange.restype = c_int32
    lib.infiniopRearrange.argtypes = [
        infiniopRearrangeDescriptor_t,
        c_void_p,  # y data
        c_void_p,  # x data
        c_void_p,  # stream (None -> default)
    ]
    lib.infiniopDestroyRearrangeDescriptor.restype = c_int32
    lib.infiniopDestroyRearrangeDescriptor.argtypes = [infiniopRearrangeDescriptor_t]
    if args.cpu:
        test_cpu(lib, test_cases)
    if args.cuda:
        test_cuda(lib, test_cases)
    if args.bang:
        test_bang(lib, test_cases)
    if args.ascend:
        test_ascend(lib, test_cases)
    # FIX: every sibling operator test script falls back to the CPU backend
    # when no device flag is supplied; this script was the only one that
    # silently ran nothing (and still printed "Test passed!").
    if not (args.cpu or args.cuda or args.bang or args.ascend):
        test_cpu(lib, test_cases)
    print("\033[92mTest passed!\033[0m")
from ctypes import POINTER, Structure, c_int32, c_void_p
import ctypes
import sys
import os
import time
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")))
from operatorspy import (
open_lib,
to_tensor,
DeviceEnum,
infiniopHandle_t,
infiniopTensorDescriptor_t,
create_handle,
destroy_handle,
check_error,
)
from operatorspy.tests.test_utils import get_args
from enum import Enum, auto
import torch
# constant for control whether profile the pytorch and lib functions
# NOTE: need to manually add synchronization function to the lib function,
# e.g., cudaDeviceSynchronize() for CUDA
PROFILE = False
NUM_PRERUN = 10
NUM_ITERATIONS = 1000
class Inplace(Enum):
    # Whether the op writes its result into a fresh tensor or reuses x.
    OUT_OF_PLACE = auto()
    INPLACE_X = auto()
class ReluDescriptor(Structure):
    # Opaque mirror of the native Relu descriptor; only the leading
    # `device` field is declared on the Python side.
    _fields_ = [("device", c_int32)]
# Handle type passed to/returned from the infiniop Relu C API.
infiniopReluDescriptor_t = POINTER(ReluDescriptor)
def relu(x):
    """PyTorch reference ReLU, cast back to the input dtype."""
    result = torch.nn.functional.relu(x).to(x.dtype)
    if PROFILE:
        # Profiling path waits for the (CUDA) kernel so timings are real.
        torch.cuda.synchronize()
    return result
def test(
    lib,
    handle,
    torch_device,
    tensor_shape,
    tensor_dtype=torch.float16,
    inplace=Inplace.OUT_OF_PLACE,
):
    """Run one Relu case: library kernel vs. torch.nn.functional.relu.

    Input is shifted to [-1, 1) so both signs of the activation are hit.
    For INPLACE_X the output tensor and CTensor alias x, so the kernel
    overwrites its own input. Optionally profiles both paths.
    """
    print(
        f"Testing Relu on {torch_device} with tensor_shape:{tensor_shape} dtype:{tensor_dtype} inplace: {inplace.name}"
    )
    x = torch.rand(tensor_shape, dtype=tensor_dtype).to(torch_device) * 2 - 1
    y = torch.rand(tensor_shape, dtype=tensor_dtype).to(torch_device) if inplace == Inplace.OUT_OF_PLACE else x
    # Warm-up runs also produce the reference answer `ans`.
    for i in range(NUM_PRERUN if PROFILE else 1):
        ans = relu(x)
    if PROFILE:
        start_time = time.time()
        for i in range(NUM_ITERATIONS):
            _ = relu(x)
        elapsed = (time.time() - start_time) / NUM_ITERATIONS
        print(f"pytorch time: {elapsed :6f}")
    x_tensor = to_tensor(x, lib)
    y_tensor = to_tensor(y, lib) if inplace == Inplace.OUT_OF_PLACE else x_tensor
    descriptor = infiniopReluDescriptor_t()
    check_error(
        lib.infiniopCreateReluDescriptor(
            handle,
            ctypes.byref(descriptor),
            y_tensor.descriptor,
            x_tensor.descriptor,
        )
    )
    # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
    x_tensor.descriptor.contents.invalidate()
    y_tensor.descriptor.contents.invalidate()
    for i in range(NUM_PRERUN if PROFILE else 1):
        check_error(lib.infiniopRelu(descriptor, y_tensor.data, x_tensor.data, None))
    if PROFILE:
        start_time = time.time()
        for i in range(NUM_ITERATIONS):
            check_error(
                lib.infiniopRelu(descriptor, y_tensor.data, x_tensor.data, None)
            )
        elapsed = (time.time() - start_time) / NUM_ITERATIONS
        print(f" lib time: {elapsed :6f}")
    assert torch.allclose(y, ans, atol=0, rtol=1e-3)
    check_error(lib.infiniopDestroyReluDescriptor(descriptor))
def test_cpu(lib, test_cases):
    """Run every Relu case on the CPU backend in fp16 and fp32."""
    handle = create_handle(lib, DeviceEnum.DEVICE_CPU)
    for tensor_shape, inplace in test_cases:
        for dtype in (torch.float16, torch.float32):
            test(lib, handle, "cpu", tensor_shape, tensor_dtype=dtype, inplace=inplace)
    destroy_handle(lib, handle)
def test_cuda(lib, test_cases):
    """Run every Relu case on the CUDA backend in fp16 and fp32."""
    handle = create_handle(lib, DeviceEnum.DEVICE_CUDA)
    for tensor_shape, inplace in test_cases:
        for dtype in (torch.float16, torch.float32):
            test(lib, handle, "cuda", tensor_shape, tensor_dtype=dtype, inplace=inplace)
    destroy_handle(lib, handle)
def test_bang(lib, test_cases):
    """Run every Relu case on the Cambricon MLU backend in fp16 and fp32."""
    import torch_mlu  # registers the "mlu" torch device
    handle = create_handle(lib, DeviceEnum.DEVICE_BANG)
    for tensor_shape, inplace in test_cases:
        for dtype in (torch.float16, torch.float32):
            test(lib, handle, "mlu", tensor_shape, tensor_dtype=dtype, inplace=inplace)
    destroy_handle(lib, handle)
if __name__ == "__main__":
    test_cases = [
        # tensor_shape, inplace
        ((), Inplace.OUT_OF_PLACE),  # 0-d (scalar) tensor
        ((), Inplace.INPLACE_X),
        ((1, 3), Inplace.OUT_OF_PLACE),
        ((3, 3), Inplace.OUT_OF_PLACE),
        ((3, 3, 13, 9, 17), Inplace.INPLACE_X),
        ((32, 20, 512), Inplace.INPLACE_X),
        ((33, 333, 333), Inplace.OUT_OF_PLACE),
        ((32, 256, 112, 112), Inplace.OUT_OF_PLACE),
    ]
    args = get_args()
    lib = open_lib()
    # Declare C signatures so ctypes marshals arguments/returns correctly.
    lib.infiniopCreateReluDescriptor.restype = c_int32
    lib.infiniopCreateReluDescriptor.argtypes = [
        infiniopHandle_t,
        POINTER(infiniopReluDescriptor_t),
        infiniopTensorDescriptor_t,  # y (output)
        infiniopTensorDescriptor_t,  # x (input)
    ]
    lib.infiniopRelu.restype = c_int32
    lib.infiniopRelu.argtypes = [
        infiniopReluDescriptor_t,
        c_void_p,  # y data
        c_void_p,  # x data
        c_void_p,  # stream (None -> default)
    ]
    lib.infiniopDestroyReluDescriptor.restype = c_int32
    lib.infiniopDestroyReluDescriptor.argtypes = [
        infiniopReluDescriptor_t,
    ]
    if args.cpu:
        test_cpu(lib, test_cases)
    if args.cuda:
        test_cuda(lib, test_cases)
    if args.bang:
        test_bang(lib, test_cases)
    # Default to the CPU backend when no device flag was given.
    if not (args.cpu or args.cuda or args.bang):
        test_cpu(lib, test_cases)
    print("\033[92mTest passed!\033[0m")
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment