Commit 46da1a27 authored by PanZezhongQY

feat: cpu and cuda matmul

import libinfiniop
from ctypes import POINTER, Structure, c_int32, c_void_p
import ctypes
import sys
import os
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")))
from operatorspy import (
open_lib,
to_tensor,
DeviceEnum,
infiniopHandle_t,
infiniopTensorDescriptor_t,
create_handle,
destroy_handle,
check_error,
)
from operatorspy.tests.test_utils import get_args
from enum import Enum, auto
import torch
class Inplace(Enum):
OUT_OF_PLACE = auto()
INPLACE_A = auto()
INPLACE_B = auto()
class AddDescriptor(Structure):
_fields_ = [("device", c_int32)]
infiniopAddDescriptor_t = POINTER(AddDescriptor)
def add(x, y):
return torch.add(x, y)
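# Added illustration (not part of the original test): torch.add broadcasts its
# operands, e.g. shapes (2, 1, 3) and (4, 3) broadcast to (2, 4, 3), which matches
# the mixed-rank test case listed below.
assert add(torch.zeros(2, 1, 3), torch.zeros(4, 3)).shape == (2, 4, 3)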
def test(
lib,
handle,
torch_device,
c_shape,
a_shape,
b_shape,
tensor_dtype=torch.float16,
inplace=Inplace.OUT_OF_PLACE,
):
print(
f"Testing Add on {torch_device} with c_shape:{c_shape} a_shape:{a_shape} b_shape:{b_shape} dtype:{tensor_dtype} inplace: {inplace.name}"
)
if a_shape != b_shape and inplace != Inplace.OUT_OF_PLACE:
print("Unsupported test: broadcasting does not support in-place")
return
a = torch.rand(a_shape, dtype=tensor_dtype).to(torch_device)
b = torch.rand(b_shape, dtype=tensor_dtype).to(torch_device)
c = torch.rand(c_shape, dtype=tensor_dtype).to(torch_device) if inplace == Inplace.OUT_OF_PLACE else (a if inplace == Inplace.INPLACE_A else b)
ans = add(a, b)
a_tensor = to_tensor(a, lib)
b_tensor = to_tensor(b, lib)
c_tensor = to_tensor(c, lib) if inplace == Inplace.OUT_OF_PLACE else (a_tensor if inplace == Inplace.INPLACE_A else b_tensor)
descriptor = infiniopAddDescriptor_t()
check_error(
lib.infiniopCreateAddDescriptor(
handle,
ctypes.byref(descriptor),
c_tensor.descriptor,
a_tensor.descriptor,
b_tensor.descriptor,
)
)
# Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
c_tensor.descriptor.contents.invalidate()
a_tensor.descriptor.contents.invalidate()
b_tensor.descriptor.contents.invalidate()
check_error(
lib.infiniopAdd(descriptor, c_tensor.data, a_tensor.data, b_tensor.data, None)
)
assert torch.allclose(c, ans, atol=0, rtol=1e-3)
check_error(lib.infiniopDestroyAddDescriptor(descriptor))
def test_cpu(lib, test_cases):
device = DeviceEnum.DEVICE_CPU
handle = create_handle(lib, device)
for c_shape, a_shape, b_shape, inplace in test_cases:
test(lib, handle, "cpu", c_shape, a_shape, b_shape, tensor_dtype=torch.float16, inplace=inplace)
test(lib, handle, "cpu", c_shape, a_shape, b_shape, tensor_dtype=torch.float32, inplace=inplace)
destroy_handle(lib, handle)
def test_cuda(lib, test_cases):
device = DeviceEnum.DEVICE_CUDA
handle = create_handle(lib, device)
for c_shape, a_shape, b_shape, inplace in test_cases:
test(lib, handle, "cuda", c_shape, a_shape, b_shape, tensor_dtype=torch.float16, inplace=inplace)
test(lib, handle, "cuda", c_shape, a_shape, b_shape, tensor_dtype=torch.float32, inplace=inplace)
destroy_handle(lib, handle)
def test_bang(lib, test_cases):
import torch_mlu
device = DeviceEnum.DEVICE_BANG
handle = create_handle(lib, device)
for c_shape, a_shape, b_shape, inplace in test_cases:
test(lib, handle, "mlu", c_shape, a_shape, b_shape, tensor_dtype=torch.float16, inplace=inplace)
test(lib, handle, "mlu", c_shape, a_shape, b_shape, tensor_dtype=torch.float32, inplace=inplace)
destroy_handle(lib, handle)
if __name__ == "__main__":
test_cases = [
# c_shape, a_shape, b_shape, inplace
# ((32, 150, 512000), (32, 150, 512000), (32, 150, 512000), Inplace.OUT_OF_PLACE),
# ((32, 150, 51200), (32, 150, 51200), (32, 150, 1), Inplace.OUT_OF_PLACE),
# ((32, 150, 51200), (32, 150, 51200), (32, 150, 51200), Inplace.OUT_OF_PLACE),
((1, 3), (1, 3), (1, 3), Inplace.OUT_OF_PLACE),
((), (), (), Inplace.OUT_OF_PLACE),
((3, 3), (3, 3), (3, 3), Inplace.OUT_OF_PLACE),
((2, 20, 3), (2, 1, 3), (2, 20, 3), Inplace.OUT_OF_PLACE),
((32, 20, 512), (32, 20, 512), (32, 20, 512), Inplace.INPLACE_A),
((32, 20, 512), (32, 20, 512), (32, 20, 512), Inplace.INPLACE_B),
((32, 256, 112, 112), (32, 256, 112, 1), (32, 256, 112, 112), Inplace.OUT_OF_PLACE),
((32, 256, 112, 112), (32, 256, 112, 112), (32, 256, 112, 112), Inplace.OUT_OF_PLACE),
((2, 4, 3), (2, 1, 3), (4, 3), Inplace.OUT_OF_PLACE),
((2, 3, 4, 5), (2, 3, 4, 5), (5,), Inplace.OUT_OF_PLACE),
((3, 2, 4, 5), (4, 5), (3, 2, 1, 1), Inplace.OUT_OF_PLACE),
]
args = get_args()
lib = open_lib()
lib.infiniopCreateAddDescriptor.restype = c_int32
lib.infiniopCreateAddDescriptor.argtypes = [
infiniopHandle_t,
POINTER(infiniopAddDescriptor_t),
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
]
lib.infiniopAdd.restype = c_int32
lib.infiniopAdd.argtypes = [
infiniopAddDescriptor_t,
c_void_p,
c_void_p,
c_void_p,
c_void_p,
]
lib.infiniopDestroyAddDescriptor.restype = c_int32
lib.infiniopDestroyAddDescriptor.argtypes = [
infiniopAddDescriptor_t,
]
if args.cpu:
test_cpu(lib, test_cases)
if args.cuda:
test_cuda(lib, test_cases)
if args.bang:
test_bang(lib, test_cases)
if not (args.cpu or args.cuda or args.bang):
test_cpu(lib, test_cases)
print("\033[92mTest passed!\033[0m")
from ctypes import POINTER, Structure, c_int32, c_uint64, c_void_p, c_float, c_bool
import ctypes
import sys
import os
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")))
from operatorspy import (
open_lib,
to_tensor,
CTensor,
DeviceEnum,
infiniopHandle_t,
infiniopTensorDescriptor_t,
create_handle,
destroy_handle,
check_error,
rearrange_tensor,
create_workspace,
)
from operatorspy.tests.test_utils import get_args
import torch
import torch.nn.functional as F
class AttentionDescriptor(Structure):
_fields_ = [("device", c_int32)]
infiniopAttentionDescriptor_t = POINTER(AttentionDescriptor)
def causal_softmax(x):
type = x.dtype
mask = torch.tril(torch.ones_like(x), diagonal=-1).flip(dims=[-2, -1])
y = x.clone()
masked = torch.where(mask == 1, -torch.inf, y.to(torch.float32))
return torch.nn.functional.softmax(masked, dim=-1).to(type)
def attention(q, k, v, k_cache, v_cache, pos):
type = q.dtype
n_q_head = q.shape[0]
n_kv_head = k.shape[0]
# Concatenate key and value caches
k_cache = k_cache[:, :pos, :] # (n_kv_head, pos, head_dim)
v_cache = v_cache[:, :pos, :] # (n_kv_head, pos, head_dim)
k = torch.cat([k_cache, k], dim=1) # (n_kv_head, total_seq_len, head_dim)
v = torch.cat([v_cache, v], dim=1) # (n_kv_head, total_seq_len, head_dim)
total_seq_len = k.shape[1]
head_dim = v.shape[-1]
if n_q_head != n_kv_head:
q = q.reshape(
n_kv_head, -1, head_dim
) # (n_kv_head, n_group * seq_len, head_dim)
# Scaled dot-product attention
attn_scores = (
torch.einsum("hqd,hkd->hqk", q.to(torch.float32), k.to(torch.float32))
.to(type)
.reshape(n_q_head, -1, total_seq_len)
) # (n_q_head, seq_len, total_seq_len)
attn_scores = attn_scores / (head_dim**0.5)
attn_weights = causal_softmax(attn_scores).reshape(
n_kv_head, -1, total_seq_len
) # (n_kv_head, seq_len, total_seq_len)
# Weighted sum of values
attn_output = (
torch.einsum(
"hqk,hkd->hqd", attn_weights.to(torch.float32), v.to(torch.float32)
)
.to(type)
.reshape(n_q_head, -1, head_dim)
.permute(1, 0, 2)
) # ([seq_len, n_q_head, head_dim])
return attn_output
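# Added illustration (hypothetical small shapes, not part of the original test):
# for grouped-query attention with n_q_head=8, n_kv_head=4, seq_len=2, head_dim=16
# and pos=1 (the third test case below), q of shape (8, 2, 16) is regrouped to
# (4, 4, 16), i.e. n_group * seq_len rows per kv head, and the output comes back
# as (seq_len, n_q_head, head_dim) = (2, 8, 16).
_q = torch.rand(8, 2, 16)
_k = torch.rand(4, 2, 16)
_v = torch.rand(4, 2, 16)
_k_cache = torch.rand(4, 8, 16)
_v_cache = torch.rand(4, 8, 16)
assert attention(_q, _k, _v, _k_cache, _v_cache, pos=1).shape == (2, 8, 16)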
def test(
lib,
handle,
torch_device,
n_q_head,
n_kv_head,
seq_len,
head_dim,
pos,
k_cache_buf_len,
v_cache_buf_len,
dtype=torch.float16,
q_stride=None,
k_stride=None,
v_stride=None,
k_cache_stride=None,
v_cache_stride=None,
):
print(
f"Testing Attention on {torch_device} with n_q_head:{n_q_head} n_kv_head:{n_kv_head} seq_len:{seq_len} head_dim:{head_dim} pos:{pos} "
f"dtype:{dtype} q_stride:{q_stride} k_stride:{k_stride} v_stride:{v_stride} k_cache_stride:{k_cache_stride} v_cache_stride:{v_cache_stride}"
)
out = torch.zeros([seq_len, n_q_head, head_dim], dtype=dtype, device=torch_device)
q = torch.rand([n_q_head, seq_len, head_dim], dtype=dtype).to(torch_device) * 0.1
k = torch.rand([n_kv_head, seq_len, head_dim], dtype=dtype).to(torch_device) * 0.1
v = torch.rand([n_kv_head, seq_len, head_dim], dtype=dtype).to(torch_device) * 0.1
k_cache = (
torch.rand([n_kv_head, k_cache_buf_len, head_dim], dtype=dtype).to(torch_device)
* 0.1
)
v_cache = (
torch.rand([n_kv_head, v_cache_buf_len, head_dim], dtype=dtype).to(torch_device)
* 0.1
)
ans = attention(q, k, v, k_cache, v_cache, pos)
if q_stride is not None:
q = rearrange_tensor(q, q_stride)
if k_stride is not None:
k = rearrange_tensor(k, k_stride)
if v_stride is not None:
v = rearrange_tensor(v, v_stride)
if k_cache_stride is not None:
k_cache = rearrange_tensor(k_cache, k_cache_stride)
if v_cache_stride is not None:
v_cache = rearrange_tensor(v_cache, v_cache_stride)
out_tensor = to_tensor(out, lib)
q_tensor = to_tensor(q, lib)
k_tensor = to_tensor(k, lib)
v_tensor = to_tensor(v, lib)
k_cache_tensor = to_tensor(k_cache, lib)
v_cache_tensor = to_tensor(v_cache, lib)
descriptor = infiniopAttentionDescriptor_t()
check_error(
lib.infiniopCreateAttentionDescriptor(
handle,
ctypes.byref(descriptor),
out_tensor.descriptor,
q_tensor.descriptor,
k_tensor.descriptor,
v_tensor.descriptor,
k_cache_tensor.descriptor,
v_cache_tensor.descriptor,
pos,
)
)
# Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
out_tensor.descriptor.contents.invalidate()
q_tensor.descriptor.contents.invalidate()
k_tensor.descriptor.contents.invalidate()
v_tensor.descriptor.contents.invalidate()
k_cache_tensor.descriptor.contents.invalidate()
v_cache_tensor.descriptor.contents.invalidate()
workspace_size = c_uint64(0)
check_error(
lib.infiniopGetAttentionWorkspaceSize(descriptor, ctypes.byref(workspace_size))
)
workspace = create_workspace(workspace_size.value, out.device)
check_error(
lib.infiniopAttention(
descriptor,
workspace.data_ptr() if workspace is not None else None,
workspace_size.value,
out_tensor.data,
q_tensor.data,
k_tensor.data,
v_tensor.data,
k_cache_tensor.data,
v_cache_tensor.data,
None,
)
)
assert torch.allclose(out, ans, atol=1e-4, rtol=1e-2)
check_error(lib.infiniopDestroyAttentionDescriptor(descriptor))
def test_cpu(lib, test_cases):
device = DeviceEnum.DEVICE_CPU
handle = create_handle(lib, device)
for (
n_q_head,
n_kv_head,
seq_len,
head_dim,
pos,
k_cache_buf_len,
v_cache_buf_len,
dtype,
q_stride,
k_stride,
v_stride,
k_cache_stride,
v_cache_stride,
) in test_cases:
test(
lib,
handle,
"cpu",
n_q_head,
n_kv_head,
seq_len,
head_dim,
pos,
k_cache_buf_len,
v_cache_buf_len,
dtype,
q_stride,
k_stride,
v_stride,
k_cache_stride,
v_cache_stride,
)
destroy_handle(lib, handle)
def test_cuda(lib, test_cases):
device = DeviceEnum.DEVICE_CUDA
handle = create_handle(lib, device)
for (
n_q_head,
n_kv_head,
seq_len,
head_dim,
pos,
k_cache_buf_len,
v_cache_buf_len,
dtype,
q_stride,
k_stride,
v_stride,
k_cache_stride,
v_cache_stride,
) in test_cases:
test(
lib,
handle,
"cuda",
n_q_head,
n_kv_head,
seq_len,
head_dim,
pos,
k_cache_buf_len,
v_cache_buf_len,
dtype,
q_stride,
k_stride,
v_stride,
k_cache_stride,
v_cache_stride,
)
destroy_handle(lib, handle)
def test_bang(lib, test_cases):
import torch_mlu
device = DeviceEnum.DEVICE_BANG
handle = create_handle(lib, device)
for (
n_q_head,
n_kv_head,
seq_len,
head_dim,
pos,
k_cache_buf_len,
v_cache_buf_len,
dtype,
q_stride,
k_stride,
v_stride,
k_cache_stride,
v_cache_stride,
) in test_cases:
test(
lib,
handle,
"mlu",
n_q_head,
n_kv_head,
seq_len,
head_dim,
pos,
k_cache_buf_len,
v_cache_buf_len,
dtype,
q_stride,
k_stride,
v_stride,
k_cache_stride,
v_cache_stride,
)
destroy_handle(lib, handle)
if __name__ == "__main__":
test_cases = [
# prefill
(
32, # n_q_head
4, # n_kv_head
5, # seq_len
64, # head_dim
0, # pos
2048, # k_cache_buf_len
2048, # v_cache_buf_len
torch.float16, # dtype
[64, 2560, 1], # q_stride
[64, 2560, 1], # k_stride
[64, 2560, 1], # v_stride
[64, 11264, 1], # k_cache_stride
[64, 11264, 1], # v_cache_stride
),
# decode
(
32, # n_q_head
4, # n_kv_head
1, # seq_len
64, # head_dim
3, # pos
2048, # k_cache_buf_len
2048, # v_cache_buf_len
torch.float16, # dtype
[64, 2560, 1], # q_stride
[64, 2560, 1], # k_stride
[64, 2560, 1], # v_stride
[64, 11264, 1], # k_cache_stride
[64, 11264, 1], # v_cache_stride
),
# for test
(
8, # n_q_head
4, # n_kv_head
2, # seq_len
16, # head_dim
1, # pos
8, # k_cache_buf_len
8, # v_cache_buf_len
torch.float16, # dtype
None, # q_stride
None, # k_stride
None, # v_stride
None, # k_cache_stride
None, # v_cache_stride
),
]
args = get_args()
lib = open_lib()
lib.infiniopCreateAttentionDescriptor.restype = c_int32
lib.infiniopCreateAttentionDescriptor.argtypes = [
infiniopHandle_t,
POINTER(infiniopAttentionDescriptor_t),
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
c_uint64,
]
lib.infiniopGetAttentionWorkspaceSize.restype = c_int32
lib.infiniopGetAttentionWorkspaceSize.argtypes = [
infiniopAttentionDescriptor_t,
POINTER(c_uint64),
]
lib.infiniopAttention.restype = c_int32
lib.infiniopAttention.argtypes = [
infiniopAttentionDescriptor_t,
c_void_p,
c_uint64,
c_void_p,
c_void_p,
c_void_p,
c_void_p,
c_void_p,
c_void_p,
c_void_p,
]
lib.infiniopDestroyAttentionDescriptor.restype = c_int32
lib.infiniopDestroyAttentionDescriptor.argtypes = [
infiniopAttentionDescriptor_t,
]
if args.cpu:
test_cpu(lib, test_cases)
if args.cuda:
test_cuda(lib, test_cases)
if args.bang:
test_bang(lib, test_cases)
if not (args.cpu or args.cuda or args.bang):
test_cpu(lib, test_cases)
print("\033[92mTest passed!\033[0m")
from ctypes import POINTER, Structure, c_int32, c_void_p, c_uint64
import ctypes
import sys
import os
import time
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")))
from operatorspy import (
open_lib,
to_tensor,
DeviceEnum,
infiniopHandle_t,
infiniopTensorDescriptor_t,
create_handle,
destroy_handle,
check_error,
)
from operatorspy.tests.test_utils import get_args
import torch
from typing import Tuple
# constants that control whether to profile the PyTorch and lib functions
# NOTE: a synchronization call must be added manually to the lib function,
# e.g., cudaDeviceSynchronize() for CUDA
PROFILE = False
NUM_PRERUN = 10
NUM_ITERATIONS = 1000
class AvgPoolDescriptor(Structure):
_fields_ = [("device", c_int32)]
infiniopAvgPoolDescriptor_t = POINTER(AvgPoolDescriptor)
def pool(x, k, padding, stride, dilation = 1):
pooling_layers = {
1: torch.nn.AvgPool1d,
2: torch.nn.AvgPool2d,
3: torch.nn.AvgPool3d,
}
ndim = len(x.shape) - 2
if ndim not in pooling_layers:
print("Error: Pytorch -> Unsupported tensor dimension")
return None
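# Note (added): the fp16 3-D case below is routed through fp32, presumably because
# AvgPool3d does not support half precision on every backend; the result is cast
# back to fp16 for comparison.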
if ndim == 3 and x.dtype == torch.float16:
ans = pooling_layers[ndim](k, stride=stride, padding=padding)(x.to(torch.float32)).to(torch.float16)
else:
ans = pooling_layers[ndim](k, stride=stride, padding=padding)(x)
if PROFILE:
torch.cuda.synchronize()
return ans
def inferShape(x_shape, kernel_shape, padding, strides):
assert (
len(x_shape) - 2 == len(kernel_shape) == len(padding) == len(strides)
), "kernel, pads, and strides should have the same length; the length of input x should be 2 more than that of kernel"
input_shape = x_shape[2:]
output_shape = []
for dim, k, p, s in zip(input_shape, kernel_shape, padding, strides):
output_dim = (dim + 2 * p - k) // s + 1
output_shape.append(output_dim)
return x_shape[:2] + tuple(output_shape)
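# Added sanity check (illustration only): with the second test case below,
# x_shape (32, 3, 224, 224), kernel (3, 3), padding (1, 1), strides (2, 2),
# each spatial dim becomes (224 + 2*1 - 3) // 2 + 1 = 112.
assert inferShape((32, 3, 224, 224), (3, 3), (1, 1), (2, 2)) == (32, 3, 112, 112)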
# convert a Python tuple to a ctypes void pointer
def tuple_to_void_p(py_tuple: Tuple):
array = ctypes.c_int64 * len(py_tuple)
data_array = array(*py_tuple)
return ctypes.cast(data_array, ctypes.c_void_p)
def test(
lib,
handle,
torch_device,
x_shape,
k_shape,
padding,
strides,
tensor_dtype=torch.float16,
):
print(
f"Testing AvgPool on {torch_device} with x_shape:{x_shape} kernel_shape:{k_shape} padding:{padding} strides:{strides} dtype:{tensor_dtype}"
)
x = torch.rand(x_shape, dtype=tensor_dtype).to(torch_device)
y = torch.rand(inferShape(x_shape, k_shape, padding, strides), dtype=tensor_dtype).to(torch_device)
for i in range(NUM_PRERUN if PROFILE else 1):
ans = pool(x, k_shape, padding, strides)
if PROFILE:
start_time = time.time()
for i in range(NUM_ITERATIONS):
_ = pool(x, k_shape, padding, strides)
elapsed = (time.time() - start_time) / NUM_ITERATIONS
print(f"pytorch time: {elapsed :6f}")
x_tensor = to_tensor(x, lib)
y_tensor = to_tensor(y, lib)
descriptor = infiniopAvgPoolDescriptor_t()
check_error(
lib.infiniopCreateAvgPoolDescriptor(
handle,
ctypes.byref(descriptor),
y_tensor.descriptor,
x_tensor.descriptor,
tuple_to_void_p(k_shape),
tuple_to_void_p(padding),
tuple_to_void_p(strides),
len(k_shape),
)
)
# Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
x_tensor.descriptor.contents.invalidate()
y_tensor.descriptor.contents.invalidate()
workspaceSize = ctypes.c_uint64(0)
check_error(
lib.infiniopGetAvgPoolWorkspaceSize(descriptor, ctypes.byref(workspaceSize))
)
workspace = torch.zeros(int(workspaceSize.value), dtype=torch.uint8).to(torch_device)
workspace_ptr = ctypes.cast(workspace.data_ptr(), ctypes.POINTER(ctypes.c_uint8))
for i in range(NUM_PRERUN if PROFILE else 1):
check_error(
lib.infiniopAvgPool(
descriptor,
workspace_ptr,
workspaceSize,
y_tensor.data,
x_tensor.data,
None,
)
)
if PROFILE:
start_time = time.time()
for i in range(NUM_ITERATIONS):
check_error(
lib.infiniopAvgPool(
descriptor,
workspace_ptr,
workspaceSize,
y_tensor.data,
x_tensor.data,
None,
)
)
elapsed = (time.time() - start_time) / NUM_ITERATIONS
print(f" lib time: {elapsed :6f}")
assert torch.allclose(y, ans, atol=0, rtol=1e-3)
check_error(lib.infiniopDestroyAvgPoolDescriptor(descriptor))
def test_cpu(lib, test_cases):
device = DeviceEnum.DEVICE_CPU
handle = create_handle(lib, device)
for x_shape, kernel_shape, padding, strides in test_cases:
test(lib, handle, "cpu", x_shape, kernel_shape, padding, strides, tensor_dtype=torch.float16)
test(lib, handle, "cpu", x_shape, kernel_shape, padding, strides, tensor_dtype=torch.float32)
destroy_handle(lib, handle)
def test_cuda(lib, test_cases):
device = DeviceEnum.DEVICE_CUDA
handle = create_handle(lib, device)
for x_shape, kernel_shape, padding, strides in test_cases:
test(lib, handle, "cuda", x_shape, kernel_shape, padding, strides, tensor_dtype=torch.float16)
test(lib, handle, "cuda", x_shape, kernel_shape, padding, strides, tensor_dtype=torch.float32)
destroy_handle(lib, handle)
def test_bang(lib, test_cases):
import torch_mlu
device = DeviceEnum.DEVICE_BANG
handle = create_handle(lib, device)
for x_shape, kernel_shape, padding, strides in test_cases:
test(lib, handle, "mlu", x_shape, kernel_shape, padding, strides, tensor_dtype=torch.float16)
test(lib, handle, "mlu", x_shape, kernel_shape, padding, strides, tensor_dtype=torch.float32)
destroy_handle(lib, handle)
if __name__ == "__main__":
test_cases = [
# x_shape, kernel_shape, padding, strides
((1, 1, 10), (3,), (1,), (1,)),
((32, 3, 224, 224), (3, 3), (1, 1), (2, 2)),
((1, 1, 16, 16, 16), (5, 5, 5), (2, 2, 2), (2, 2, 2)),
]
args = get_args()
lib = open_lib()
lib.infiniopCreateAvgPoolDescriptor.restype = c_int32
lib.infiniopCreateAvgPoolDescriptor.argtypes = [
infiniopHandle_t,
POINTER(infiniopAvgPoolDescriptor_t),
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
c_void_p,
c_void_p,
c_void_p,
c_uint64,
]
lib.infiniopGetAvgPoolWorkspaceSize.restype = c_int32
lib.infiniopGetAvgPoolWorkspaceSize.argtypes = [
infiniopAvgPoolDescriptor_t,
POINTER(c_uint64),
]
lib.infiniopAvgPool.restype = c_int32
lib.infiniopAvgPool.argtypes = [
infiniopAvgPoolDescriptor_t,
c_void_p,
c_uint64,
c_void_p,
c_void_p,
c_void_p,
]
lib.infiniopDestroyAvgPoolDescriptor.restype = c_int32
lib.infiniopDestroyAvgPoolDescriptor.argtypes = [
infiniopAvgPoolDescriptor_t,
]
if args.cpu:
test_cpu(lib, test_cases)
if args.cuda:
test_cuda(lib, test_cases)
if args.bang:
test_bang(lib, test_cases)
if not (args.cpu or args.cuda or args.bang):
test_cpu(lib, test_cases)
print("\033[92mTest passed!\033[0m")
from ctypes import POINTER, Structure, c_int32, c_uint64, c_void_p
import ctypes
import sys
import os
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")))
from operatorspy import (
open_lib,
to_tensor,
DeviceEnum,
infiniopHandle_t,
infiniopTensorDescriptor_t,
create_handle,
destroy_handle,
check_error,
rearrange_tensor,
create_workspace,
)
from operatorspy.tests.test_utils import get_args
import torch
class CausalSoftmaxDescriptor(Structure):
_fields_ = [("device", c_int32)]
infiniopCausalSoftmaxDescriptor_t = POINTER(CausalSoftmaxDescriptor)
def causal_softmax(x):
type = x.dtype
mask = torch.tril(torch.ones_like(x), diagonal=-1).flip(dims=[-2, -1])
y = x.clone()
masked = torch.where(mask == 1, -torch.inf, y.to(torch.float32))
return torch.nn.functional.softmax(masked, dim=-1).to(type)
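# Added illustration (not part of the original test): the flipped lower-triangular
# mask is aligned to the bottom-right corner, so for a 2x3 input only the top-right
# entry is masked, and every row still sums to 1 after the softmax.
_demo = causal_softmax(torch.rand(2, 3))
assert _demo[0, -1].item() == 0.0
assert torch.allclose(_demo.sum(dim=-1), torch.ones(2))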
def test(lib, handle, torch_device, x_shape, x_stride=None, x_dtype=torch.float16):
print(
f"Testing CausalSoftmax on {torch_device} with x_shape:{x_shape} x_stride:{x_stride} dtype:{x_dtype}"
)
x = torch.rand(x_shape, dtype=x_dtype).to(torch_device)
if x_stride is not None:
x = rearrange_tensor(x, x_stride)
ans = causal_softmax(x)
x_tensor = to_tensor(x, lib)
descriptor = infiniopCausalSoftmaxDescriptor_t()
check_error(
lib.infiniopCreateCausalSoftmaxDescriptor(
handle, ctypes.byref(descriptor), x_tensor.descriptor
)
)
workspace_size = c_uint64(0)
check_error(
lib.infiniopGetCausalSoftmaxWorkspaceSize(
descriptor, ctypes.byref(workspace_size)
)
)
# Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
x_tensor.descriptor.contents.invalidate()
workspace = create_workspace(workspace_size.value, x.device)
check_error(
lib.infiniopCausalSoftmax(
descriptor,
workspace.data_ptr() if workspace is not None else None,
workspace_size.value,
x_tensor.data,
None,
)
)
assert torch.allclose(x, ans, atol=0, rtol=1e-2)
check_error(lib.infiniopDestroyCausalSoftmaxDescriptor(descriptor))
def test_cpu(lib, test_cases):
device = DeviceEnum.DEVICE_CPU
handle = create_handle(lib, device)
for x_shape, x_stride in test_cases:
test(lib, handle, "cpu", x_shape, x_stride)
destroy_handle(lib, handle)
def test_cuda(lib, test_cases):
device = DeviceEnum.DEVICE_CUDA
handle = create_handle(lib, device)
for x_shape, x_stride in test_cases:
test(lib, handle, "cuda", x_shape, x_stride)
destroy_handle(lib, handle)
def test_bang(lib, test_cases):
import torch_mlu
device = DeviceEnum.DEVICE_BANG
handle = create_handle(lib, device)
for x_shape, x_stride in test_cases:
test(lib, handle, "mlu", x_shape, x_stride)
destroy_handle(lib, handle)
def test_ascend(lib, test_cases):
import torch_npu
device = DeviceEnum.DEVICE_ASCEND
handle = create_handle(lib, device)
for x_shape, x_stride in test_cases:
test(lib, handle, "npu", x_shape, x_stride)
destroy_handle(lib, handle)
if __name__ == "__main__":
test_cases = [
# x_shape, x_stride
((32, 20, 512), None),
((32, 20, 512), (20480, 512, 1)), # Ascend does not yet support non-contiguous tensors
]
args = get_args()
lib = open_lib()
lib.infiniopCreateCausalSoftmaxDescriptor.restype = c_int32
lib.infiniopCreateCausalSoftmaxDescriptor.argtypes = [
infiniopHandle_t,
POINTER(infiniopCausalSoftmaxDescriptor_t),
infiniopTensorDescriptor_t,
]
lib.infiniopGetCausalSoftmaxWorkspaceSize.restype = c_int32
lib.infiniopGetCausalSoftmaxWorkspaceSize.argtypes = [
infiniopCausalSoftmaxDescriptor_t,
POINTER(c_uint64),
]
lib.infiniopCausalSoftmax.restype = c_int32
lib.infiniopCausalSoftmax.argtypes = [
infiniopCausalSoftmaxDescriptor_t,
c_void_p,
c_uint64,
c_void_p,
c_void_p,
]
lib.infiniopDestroyCausalSoftmaxDescriptor.restype = c_int32
lib.infiniopDestroyCausalSoftmaxDescriptor.argtypes = [
infiniopCausalSoftmaxDescriptor_t,
]
if args.cpu:
test_cpu(lib, test_cases)
if args.cuda:
test_cuda(lib, test_cases)
if args.bang:
test_bang(lib, test_cases)
if args.ascend:
test_ascend(lib, test_cases)
if not (args.cpu or args.cuda or args.bang or args.ascend):
test_cpu(lib, test_cases)
print("\033[92mTest passed!\033[0m")
from ctypes import POINTER, Structure, c_int32, c_uint64, c_void_p
import ctypes
import sys
import os
import time
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")))
from operatorspy import (
open_lib,
to_tensor,
DeviceEnum,
infiniopHandle_t,
infiniopTensorDescriptor_t,
create_handle,
destroy_handle,
check_error,
)
from operatorspy.tests.test_utils import get_args
import torch
import math
from torch.nn import functional as F
from typing import List, Tuple
# constants that control whether to profile the PyTorch and lib functions
# NOTE: a synchronization call must be added manually to the lib function,
# e.g., cudaDeviceSynchronize() for CUDA
PROFILE = False
NUM_PRERUN = 10
NUM_ITERATIONS = 1000
class ConvDescriptor(Structure):
_fields_ = [("device", c_int32)]
infiniopConvDescriptor_t = POINTER(ConvDescriptor)
def conv(x, w, stride, padding, dilation):
match len(x.shape) - 2:
case 1:
return F.conv1d(
x, w, stride=stride, padding=padding, dilation=dilation
)
case 2:
return F.conv2d(
x, w, stride=stride, padding=padding, dilation=dilation
)
case 3:
return F.conv3d(
x, w, stride=stride, padding=padding, dilation=dilation
)
case _:
print("Error: Pytorch -> Unsupported tensor dimension")
return None
# infer the shape of the output given the inputs for an N-ary convolution
def inferShape(
x_shape: List[int],
w_shape: List[int],
pads: List[int],
strides: List[int],
dilations: List[int],
) -> Tuple[int, ...]:
assert (
len(x_shape) == len(w_shape) == len(pads) + 2 == len(dilations) + 2 == len(strides) + 2
), "x and w should have the same length; pads, strides, and dilatinos should have the same length; the length of pads should be that of x - 2"
output_dims = [
math.floor(
(x_shape[i+2] + 2 * pads[i] - dilations[i] * (w_shape[i+2] - 1) - 1)
/ strides[i]
+ 1
)
for i in range(len(pads))
]
return (x_shape[0], w_shape[0]) + tuple(output_dims)
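# Added sanity check (illustration only): for the third test case below,
# x (32, 3, 128, 128), w (64, 3, 5, 5), pads (2, 2), strides (2, 2), dilations (1, 1),
# each spatial dim is floor((128 + 2*2 - 1*(5 - 1) - 1) / 2 + 1) = 64.
assert inferShape((32, 3, 128, 128), (64, 3, 5, 5), (2, 2), (2, 2), (1, 1)) == (32, 64, 64, 64)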
# convert a Python tuple to a ctypes void pointer
def tuple_to_void_p(py_tuple: Tuple):
array = ctypes.c_int64 * len(py_tuple)
data_array = array(*py_tuple)
return ctypes.cast(data_array, ctypes.c_void_p)
def test(
lib,
handle,
torch_device,
x_shape,
w_shape,
pads,
strides,
dilations,
tensor_stride=None,
tensor_dtype=torch.float16,
):
assert len(pads) == len(strides) == len(dilations)
print(
f"Testing Conv on {torch_device} with x_shape: {x_shape}, w_shape: {w_shape}, b_shape: {w_shape[0]}, pads: {pads}, strides: {strides}, dilations: {dilations}, x_stride: {tensor_stride} dtype:{tensor_dtype}"
)
x = torch.rand(x_shape, dtype=tensor_dtype).to(torch_device)
w = torch.rand(w_shape, dtype=tensor_dtype).to(torch_device)
y = torch.zeros(
inferShape(x.shape, w.shape, pads, strides, dilations), dtype=tensor_dtype
).to(torch_device)
for i in range(NUM_PRERUN if PROFILE else 1):
ans = conv(x, w, strides, pads, dilations)
if PROFILE:
start_time = time.time()
for i in range(NUM_ITERATIONS):
_ = conv(x, w, strides, pads, dilations)
elapsed = (time.time() - start_time) / NUM_ITERATIONS
print(f"pytorch time: {elapsed :6f}")
x_tensor = to_tensor(x, lib)
w_tensor = to_tensor(w, lib)
y_tensor = to_tensor(y, lib)
descriptor = infiniopConvDescriptor_t()
check_error(
lib.infiniopCreateConvDescriptor(
handle,
ctypes.byref(descriptor),
y_tensor.descriptor,
x_tensor.descriptor,
w_tensor.descriptor,
tuple_to_void_p(pads),
tuple_to_void_p(strides),
tuple_to_void_p(dilations),
len(pads),
)
)
# Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
x_tensor.descriptor.contents.invalidate()
w_tensor.descriptor.contents.invalidate()
y_tensor.descriptor.contents.invalidate()
workspaceSize = ctypes.c_uint64(0)
check_error(
lib.infiniopGetConvWorkspaceSize(descriptor, ctypes.byref(workspaceSize))
)
workspace = torch.zeros(int(workspaceSize.value), dtype=torch.uint8).to(torch_device)
workspace_ptr = ctypes.cast(workspace.data_ptr(), ctypes.POINTER(ctypes.c_uint8))
for i in range(NUM_PRERUN if PROFILE else 1):
check_error(
lib.infiniopConv(
descriptor,
workspace_ptr,
workspaceSize,
y_tensor.data,
x_tensor.data,
w_tensor.data,
None,
)
)
if PROFILE:
start_time = time.time()
for i in range(NUM_ITERATIONS):
check_error(
lib.infiniopConv(
descriptor,
workspace_ptr,
workspaceSize,
y_tensor.data,
x_tensor.data,
w_tensor.data,
None,
)
)
elapsed = (time.time() - start_time) / NUM_ITERATIONS
print(f" lib time: {elapsed :6f}")
if (tensor_dtype == torch.float16):
assert torch.allclose(y, ans, atol=0, rtol=1e-2)
else:
assert torch.allclose(y, ans, atol=0, rtol=1e-3)
check_error(lib.infiniopDestroyConvDescriptor(descriptor))
def test_cpu(lib, test_cases):
device = DeviceEnum.DEVICE_CPU
handle = create_handle(lib, device)
for x_shape, w_shape, pads, strides, dilations, x_strides in test_cases:
test(lib, handle, "cpu", x_shape, w_shape, pads, strides, dilations, x_strides, tensor_dtype=torch.float16)
test(lib, handle, "cpu", x_shape, w_shape, pads, strides, dilations, x_strides, tensor_dtype=torch.float32)
destroy_handle(lib, handle)
def test_cuda(lib, test_cases):
device = DeviceEnum.DEVICE_CUDA
handle = create_handle(lib, device)
for x_shape, w_shape, pads, strides, dilations, x_strides in test_cases:
test(lib, handle, "cuda", x_shape, w_shape, pads, strides, dilations, x_strides, tensor_dtype=torch.float16)
test(lib, handle, "cuda", x_shape, w_shape, pads, strides, dilations, x_strides, tensor_dtype=torch.float32)
destroy_handle(lib, handle)
def test_bang(lib, test_cases):
import torch_mlu
device = DeviceEnum.DEVICE_BANG
handle = create_handle(lib, device)
for x_shape, w_shape, pads, strides, dilations, x_strides in test_cases:
test(lib, handle, "mlu", x_shape, w_shape, pads, strides, dilations, x_strides, tensor_dtype=torch.float16)
test(lib, handle, "mlu", x_shape, w_shape, pads, strides, dilations, x_strides, tensor_dtype=torch.float32)
destroy_handle(lib, handle)
if __name__ == "__main__":
test_cases = [
# x_shape, w_shape, pads, strides, dilations, x_strides
(
(32, 3, 4),
(32, 3, 5),
(1,),
(1,),
(1,),
None,
),
(
(1, 3, 4, 4),
(2, 3, 3, 3),
(1, 1),
(1, 2),
(2, 1),
None,
),
(
(32, 3, 128, 128),
(64, 3, 5, 5),
(2, 2),
(2, 2),
(1, 1),
None,
),
(
(1, 1, 4, 4, 4),
(1, 1, 5, 5, 5),
(1, 1, 1),
(1, 1, 1),
(1, 1, 1),
None,
),
(
(32, 3, 32, 32, 32),
(64, 3, 5, 5, 5),
(3, 2, 2),
(4, 3, 3),
(2, 2, 1),
None,
),
]
args = get_args()
lib = open_lib()
lib.infiniopCreateConvDescriptor.restype = c_int32
lib.infiniopCreateConvDescriptor.argtypes = [
infiniopHandle_t,
POINTER(infiniopConvDescriptor_t),
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
c_void_p,
c_void_p,
c_void_p,
c_uint64,
]
lib.infiniopConv.restype = c_int32
lib.infiniopConv.argtypes = [
infiniopConvDescriptor_t,
c_void_p,
c_uint64,
c_void_p,
c_void_p,
c_void_p,
c_void_p,
]
lib.infiniopDestroyConvDescriptor.restype = c_int32
lib.infiniopDestroyConvDescriptor.argtypes = [
infiniopConvDescriptor_t,
]
if args.cpu:
test_cpu(lib, test_cases)
if args.cuda:
test_cuda(lib, test_cases)
if args.bang:
test_bang(lib, test_cases)
if not (args.cpu or args.cuda or args.bang):
test_cpu(lib, test_cases)
print("\033[92mTest passed!\033[0m")
from ctypes import POINTER, Structure, c_int32, c_void_p
import ctypes
import sys
import os
import time
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")))
from operatorspy import (
open_lib,
to_tensor,
DeviceEnum,
infiniopHandle_t,
infiniopTensorDescriptor_t,
create_handle,
destroy_handle,
check_error,
rearrange_tensor,
)
from operatorspy.tests.test_utils import get_args
import torch
# constants that control whether to profile the PyTorch and lib functions
# NOTE: a synchronization call must be added manually to the lib function,
# e.g., cudaDeviceSynchronize() for CUDA
PROFILE = False
NUM_PRERUN = 10
NUM_ITERATIONS = 1000
class ExpandDescriptor(Structure):
_fields_ = [("device", c_int32)]
infiniopExpandDescriptor_t = POINTER(ExpandDescriptor)
def expand(x, y):
if PROFILE:
ans = x.expand_as(y).clone()
torch.cuda.synchronize()
return ans
return x.expand_as(y)
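# Added illustration (not part of the original test): expand_as broadcasts x to
# y's shape without copying data, e.g. a (1, 3) tensor viewed as (2, 4, 3),
# matching one of the test cases below.
assert expand(torch.zeros(1, 3), torch.zeros(2, 4, 3)).shape == (2, 4, 3)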
def test(
lib,
handle,
torch_device,
y_shape,
x_shape,
y_stride=None,
x_stride=None,
tensor_dtype=torch.float16,
):
print(
f"Testing Expand on {torch_device} with x_shape:{x_shape} y_shape:{y_shape} x_stride:{x_stride} y_stride:{y_stride} dtype:{tensor_dtype}"
)
x = torch.rand(x_shape, dtype=tensor_dtype).to(torch_device)
y = torch.rand(y_shape, dtype=tensor_dtype).to(torch_device)
if x_stride is not None:
x = rearrange_tensor(x, x_stride)
if y_stride is not None:
y = rearrange_tensor(y, y_stride)
for i in range(NUM_PRERUN if PROFILE else 1):
ans = expand(x, y)
if PROFILE:
start_time = time.time()
for i in range(NUM_ITERATIONS):
_ = expand(x, y)
elapsed = (time.time() - start_time) / NUM_ITERATIONS
print(f"pytorch time: {elapsed :6f}")
x_tensor = to_tensor(x, lib)
y_tensor = to_tensor(y, lib)
descriptor = infiniopExpandDescriptor_t()
check_error(
lib.infiniopCreateExpandDescriptor(
handle,
ctypes.byref(descriptor),
y_tensor.descriptor,
x_tensor.descriptor,
)
)
# Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
x_tensor.descriptor.contents.invalidate()
y_tensor.descriptor.contents.invalidate()
for i in range(NUM_PRERUN if PROFILE else 1):
check_error(lib.infiniopExpand(descriptor, y_tensor.data, x_tensor.data, None))
if PROFILE:
start_time = time.time()
for i in range(NUM_ITERATIONS):
check_error(
lib.infiniopExpand(descriptor, y_tensor.data, x_tensor.data, None)
)
elapsed = (time.time() - start_time) / NUM_ITERATIONS
print(f" lib time: {elapsed :6f}")
assert torch.allclose(y, ans, atol=0, rtol=1e-3)
check_error(lib.infiniopDestroyExpandDescriptor(descriptor))
def test_cpu(lib, test_cases):
device = DeviceEnum.DEVICE_CPU
handle = create_handle(lib, device)
for y_shape, x_shape, y_stride, x_stride in test_cases:
test(lib, handle, "cpu", y_shape, x_shape, y_stride, x_stride, tensor_dtype=torch.float16)
test(lib, handle, "cpu", y_shape, x_shape, y_stride, x_stride, tensor_dtype=torch.float32)
destroy_handle(lib, handle)
def test_cuda(lib, test_cases):
device = DeviceEnum.DEVICE_CUDA
handle = create_handle(lib, device)
for y_shape, x_shape, y_stride, x_stride in test_cases:
test(lib, handle, "cuda", y_shape, x_shape, y_stride, x_stride, tensor_dtype=torch.float16)
test(lib, handle, "cuda", y_shape, x_shape, y_stride, x_stride, tensor_dtype=torch.float32)
destroy_handle(lib, handle)
def test_bang(lib, test_cases):
import torch_mlu
device = DeviceEnum.DEVICE_BANG
handle = create_handle(lib, device)
for y_shape, x_shape, y_stride, x_stride in test_cases:
test(lib, handle, "mlu", y_shape, x_shape, y_stride, x_stride, tensor_dtype=torch.float16)
test(lib, handle, "mlu", y_shape, x_shape, y_stride, x_stride, tensor_dtype=torch.float32)
destroy_handle(lib, handle)
if __name__ == "__main__":
test_cases = [
# y_shape, x_shape, y_stride, x_stride
((), (), None, None),
((3, 3), (1,), None, None),
((5, 4, 3), (4, 3,), None, (6, 1)),
((99, 111), (111,), None, None),
((2, 4, 3), (1, 3), None, None),
((2, 20, 3), (2, 1, 3), None, None),
((2, 3, 4, 5), (5,), None, None),
((3, 2, 4, 5), (3, 2, 1, 1), None, None),
((32, 256, 112, 112), (32, 256, 112, 1), None, None),
]
args = get_args()
lib = open_lib()
lib.infiniopCreateExpandDescriptor.restype = c_int32
lib.infiniopCreateExpandDescriptor.argtypes = [
infiniopHandle_t,
POINTER(infiniopExpandDescriptor_t),
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
]
lib.infiniopExpand.restype = c_int32
lib.infiniopExpand.argtypes = [
infiniopExpandDescriptor_t,
c_void_p,
c_void_p,
c_void_p,
]
lib.infiniopDestroyExpandDescriptor.restype = c_int32
lib.infiniopDestroyExpandDescriptor.argtypes = [
infiniopExpandDescriptor_t,
]
if args.cpu:
test_cpu(lib, test_cases)
if args.cuda:
test_cuda(lib, test_cases)
if args.bang:
test_bang(lib, test_cases)
if not (args.cpu or args.cuda or args.bang):
test_cpu(lib, test_cases)
print("\033[92mTest passed!\033[0m")
from ctypes import POINTER, Structure, c_int32, c_uint64, c_void_p, c_float, c_bool
import ctypes
import sys
import os
import time
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")))
from operatorspy import (
open_lib,
to_tensor,
DeviceEnum,
infiniopHandle_t,
infiniopTensorDescriptor_t,
create_handle,
destroy_handle,
check_error,
rearrange_tensor,
)
from operatorspy.tests.test_utils import get_args
import torch
# constants that control whether to profile the PyTorch and lib functions
# NOTE: a synchronization call must be added manually to the lib function,
# e.g., cudaDeviceSynchronize() for CUDA
PROFILE = False
NUM_PRERUN = 10
NUM_ITERATIONS = 1000
class GEMMDescriptor(Structure):
_fields_ = [("device", c_int32)]
infiniopGEMMDescriptor_t = POINTER(GEMMDescriptor)
def gemm(A, B, C=None, transA=False, transB=False, alpha=1.0, beta=0.0, dtype=torch.float32):
A = A.T if transA else A
B = B.T if transB else B
result = alpha * torch.matmul(A if dtype != torch.float16 else A.to(torch.float32), B if dtype != torch.float16 else B.to(torch.float32)).to(dtype)
if C is not None:
# accumulate beta * C in fp32 for fp16 inputs to avoid mixed-dtype in-place adds, then cast back
result = (result + beta * (C.to(torch.float32) if dtype == torch.float16 else C)).to(dtype)
if PROFILE:
torch.cuda.synchronize()
return result
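# Added illustration (hypothetical small shapes, not part of the original test):
# with transA=True, an A of shape (3, 2) is used as (2, 3), so its product with a
# (3, 4) B has shape (2, 4); beta only takes effect when C is provided.
assert gemm(torch.rand(3, 2), torch.rand(3, 4), transA=True, dtype=torch.float32).shape == (2, 4)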
def test(
lib,
handle,
torch_device,
alpha,
beta,
transA,
transB,
a_shape,
b_shape,
c_shape,
y_shape,
a_stride=None,
b_stride=None,
c_stride=None,
y_stride=None,
dtype=torch.float16,
):
print(
f"Testing GEMM on {torch_device} with transA: {transA} transB: {transB} "
f"a_shape:{a_shape} b_shape:{b_shape} c_shape:{c_shape} y_shape:{y_shape} "
f"a_stride:{a_stride} b_stride:{b_stride} c_stride:{c_stride} y_stride:{y_stride} dtype:{dtype}"
)
a = torch.rand(a_shape, dtype=dtype).to(torch_device)
b = torch.rand(b_shape, dtype=dtype).to(torch_device)
c = torch.rand(c_shape, dtype=dtype).to(torch_device) if c_shape else None
y = torch.rand(y_shape, dtype=dtype).to(torch_device)
if a_stride is not None:
a = rearrange_tensor(a, a_stride)
if b_stride is not None:
b = rearrange_tensor(b, b_stride)
if c_stride is not None and c is not None:
c = rearrange_tensor(c, c_stride)
if y_stride is not None:
y = rearrange_tensor(y, y_stride)
for i in range(NUM_PRERUN if PROFILE else 1):
ans = gemm(a, b, c, transA, transB, alpha, beta, dtype)
if PROFILE:
start_time = time.time()
for i in range(NUM_ITERATIONS):
_ = gemm(a, b, c, transA, transB, alpha, beta, dtype)
elapsed = (time.time() - start_time) / NUM_ITERATIONS
print(f"pytorch time: {elapsed :6f}")
a_tensor = to_tensor(a, lib)
b_tensor = to_tensor(b, lib)
c_tensor = to_tensor(c, lib) if c is not None else None
y_tensor = to_tensor(y, lib)
descriptor = infiniopGEMMDescriptor_t()
check_error(
lib.infiniopCreateGEMMDescriptor(
handle,
ctypes.byref(descriptor),
y_tensor.descriptor,
a_tensor.descriptor,
b_tensor.descriptor,
c_tensor.descriptor if c_tensor else None,
alpha,
beta,
transA,
transB,
)
)
# Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
a_tensor.descriptor.contents.invalidate()
b_tensor.descriptor.contents.invalidate()
if c_tensor is not None:
c_tensor.descriptor.contents.invalidate()
y_tensor.descriptor.contents.invalidate()
workspace_size = ctypes.c_uint64(0)
check_error(
lib.infiniopGetGEMMWorkspaceSize(
descriptor, ctypes.byref(workspace_size)
)
)
workspace = torch.zeros(int(workspace_size.value), dtype=torch.uint8).to(
torch_device
)
workspace_ptr = ctypes.cast(workspace.data_ptr(), ctypes.POINTER(ctypes.c_uint8))
for i in range(NUM_PRERUN if PROFILE else 1):
check_error(
lib.infiniopGEMM(
descriptor,
workspace_ptr,
workspace_size,
y_tensor.data,
a_tensor.data,
b_tensor.data,
c_tensor.data if c_tensor else None,
None,
)
)
if PROFILE:
start_time = time.time()
for i in range(NUM_ITERATIONS):
check_error(
lib.infiniopGEMM(
descriptor,
workspace_ptr,
workspace_size,
y_tensor.data,
a_tensor.data,
b_tensor.data,
c_tensor.data if c_tensor else None,
None,
)
)
elapsed = (time.time() - start_time) / NUM_ITERATIONS
print(f" lib time: {elapsed :6f}")
assert torch.allclose(y, ans, atol=0, rtol=1e-2)
check_error(lib.infiniopDestroyGEMMDescriptor(descriptor))
def test_cpu(lib, test_cases):
device = DeviceEnum.DEVICE_CPU
handle = create_handle(lib, device)
for (
alpha,
beta,
transA,
transB,
a_shape,
b_shape,
c_shape,
y_shape,
a_stride,
b_stride,
c_stride,
y_stride,
) in test_cases:
test(lib, handle, "cpu", alpha, beta, transA, transB, a_shape, b_shape, c_shape, y_shape, a_stride, b_stride, c_stride, y_stride, dtype=torch.float16)
test(lib, handle, "cpu", alpha, beta, transA, transB, a_shape, b_shape, c_shape, y_shape, a_stride, b_stride, c_stride, y_stride, dtype=torch.float32)
destroy_handle(lib, handle)
def test_cuda(lib, test_cases):
device = DeviceEnum.DEVICE_CUDA
handle = create_handle(lib, device)
for (
alpha,
beta,
transA,
transB,
a_shape,
b_shape,
c_shape,
y_shape,
a_stride,
b_stride,
c_stride,
y_stride,
) in test_cases:
test(lib, handle, "cuda", alpha, beta, transA, transB, a_shape, b_shape, c_shape, y_shape, a_stride, b_stride, c_stride, y_stride, dtype=torch.float16)
test(lib, handle, "cuda", alpha, beta, transA, transB, a_shape, b_shape, c_shape, y_shape, a_stride, b_stride, c_stride, y_stride, dtype=torch.float32)
destroy_handle(lib, handle)
def test_bang(lib, test_cases):
import torch_mlu
device = DeviceEnum.DEVICE_BANG
handle = create_handle(lib, device)
for (
alpha,
beta,
transA,
transB,
a_shape,
b_shape,
c_shape,
y_shape,
a_stride,
b_stride,
c_stride,
y_stride,
) in test_cases:
test(lib, handle, "mlu", alpha, beta, transA, transB, a_shape, b_shape, c_shape, y_shape, a_stride, b_stride, c_stride, y_stride, dtype=torch.float16)
test(lib, handle, "mlu", alpha, beta, transA, transB, a_shape, b_shape, c_shape, y_shape, a_stride, b_stride, c_stride, y_stride, dtype=torch.float32)
destroy_handle(lib, handle)
if __name__ == "__main__":
test_cases = [
# alpha, beta, transA, transB, a_shape, b_shape, c_shape, y_shape, a_stride, b_stride, c_stride, y_stride
(
1.0,
1.0,
False,
False,
(1, 2048),
(2048, 2048),
(1, 2048),
(1, 2048),
None,
None,
None,
None,
),
(
1.0,
1.0,
True,
True,
(2048, 4),
(2048, 2048),
(4, 2048),
(4, 2048),
None,
None,
None,
None,
),
(
1.0,
1.0,
False,
True,
(1, 2048),
(1000, 2048),
(1000,),
(1, 1000),
None,
None,
None,
None,
),
(
1.0,
1.0,
True,
False,
(2048, 4),
(2048, 2048),
(2048,),
(4, 2048),
(4096, 1),
(4096, 1),
(2,),
(4096, 1),
),
(
1.0,
1.0,
False,
False,
(3, 1, 2048),
(3, 2048, 2048),
(1,),
(3, 1, 2048),
None,
None,
None,
None,
),
(
1.0,
1.0,
True,
False,
(2048, 4),
(2048, 2048),
None,
(4, 2048),
(4096, 1),
(4096, 1),
(2,),
(4096, 1),
),
]
args = get_args()
lib = open_lib()
lib.infiniopCreateGEMMDescriptor.restype = c_int32
lib.infiniopCreateGEMMDescriptor.argtypes = [
infiniopHandle_t,
POINTER(infiniopGEMMDescriptor_t),
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
c_float,
c_float,
c_bool,
c_bool,
]
lib.infiniopGetGEMMWorkspaceSize.restype = c_int32
lib.infiniopGetGEMMWorkspaceSize.argtypes = [
infiniopGEMMDescriptor_t,
POINTER(c_uint64),
]
lib.infiniopGEMM.restype = c_int32
lib.infiniopGEMM.argtypes = [
infiniopGEMMDescriptor_t,
c_void_p,
c_uint64,
c_void_p,
c_void_p,
c_void_p,
c_void_p,
c_void_p,
]
lib.infiniopDestroyGEMMDescriptor.restype = c_int32
lib.infiniopDestroyGEMMDescriptor.argtypes = [
infiniopGEMMDescriptor_t,
]
if args.cpu:
test_cpu(lib, test_cases)
if args.cuda:
test_cuda(lib, test_cases)
if args.bang:
test_bang(lib, test_cases)
if not (args.cpu or args.cuda or args.bang):
test_cpu(lib, test_cases)
print("\033[92mTest passed!\033[0m")
from ctypes import POINTER, Structure, c_int32, c_void_p, c_uint64
import ctypes
import sys
import os
import time
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")))
from operatorspy import (
open_lib,
to_tensor,
DeviceEnum,
infiniopHandle_t,
infiniopTensorDescriptor_t,
create_handle,
destroy_handle,
check_error,
)
from operatorspy.tests.test_utils import get_args
import torch
# constants that control whether to profile the PyTorch and lib functions
# NOTE: a synchronization call must be added manually to the lib function,
# e.g., cudaDeviceSynchronize() for CUDA
PROFILE = False
NUM_PRERUN = 10
NUM_ITERATIONS = 1000
class GlobalAvgPoolDescriptor(Structure):
_fields_ = [("device", c_int32)]
infiniopGlobalAvgPoolDescriptor_t = POINTER(GlobalAvgPoolDescriptor)
def inferShape(x):
return x.shape[:2] + (1,) * (x.dim() - 2)
def globalAvgPool(x):
y = torch.mean(x, dim=tuple(range(2, x.dim())), keepdim=True)
if PROFILE:
torch.cuda.synchronize()
return y.view(*inferShape(x))
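# Added sanity check (illustration only): global average pooling keeps the batch
# and channel dims and reduces every remaining dim to 1, e.g. (2, 3, 4, 5) -> (2, 3, 1, 1).
assert globalAvgPool(torch.rand(2, 3, 4, 5)).shape == (2, 3, 1, 1)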
def test(
lib,
handle,
torch_device,
x_shape,
tensor_dtype=torch.float16,
):
print(
f"Testing GlobalAvgPool on {torch_device} with input tensor_shape: {x_shape} dtype: {tensor_dtype}"
)
x = torch.rand(x_shape, dtype=tensor_dtype).to(torch_device)
y = torch.zeros(inferShape(x), dtype=tensor_dtype).to(torch_device)
for i in range(NUM_PRERUN if PROFILE else 1):
ans = globalAvgPool(x)
if PROFILE:
start_time = time.time()
for i in range(NUM_ITERATIONS):
_ = globalAvgPool(x)
elapsed = (time.time() - start_time) / NUM_ITERATIONS
print(f"pytorch time: {elapsed :6f}")
x_tensor = to_tensor(x, lib)
y_tensor = to_tensor(y, lib)
descriptor = infiniopGlobalAvgPoolDescriptor_t()
check_error(
lib.infiniopCreateGlobalAvgPoolDescriptor(
handle,
ctypes.byref(descriptor),
y_tensor.descriptor,
x_tensor.descriptor,
)
)
# Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
x_tensor.descriptor.contents.invalidate()
y_tensor.descriptor.contents.invalidate()
workspaceSize = ctypes.c_uint64(0)
check_error(
lib.infiniopGetGlobalAvgPoolWorkspaceSize(
descriptor, ctypes.byref(workspaceSize)
)
)
workspace = torch.zeros(int(workspaceSize.value), dtype=torch.uint8).to(
torch_device
)
workspace_ptr = ctypes.cast(workspace.data_ptr(), ctypes.POINTER(ctypes.c_uint8))
for i in range(NUM_PRERUN if PROFILE else 1):
check_error(
lib.infiniopGlobalAvgPool(
descriptor, workspace_ptr, workspaceSize, y_tensor.data, x_tensor.data, None
)
)
if PROFILE:
start_time = time.time()
for i in range(NUM_ITERATIONS):
check_error(
lib.infiniopGlobalAvgPool(
descriptor,
workspace_ptr,
workspaceSize,
y_tensor.data,
x_tensor.data,
None,
)
)
elapsed = (time.time() - start_time) / NUM_ITERATIONS
print(f" lib time: {elapsed :6f}")
assert torch.allclose(y, ans, atol=0, rtol=1e-3)
check_error(lib.infiniopDestroyGlobalAvgPoolDescriptor(descriptor))
def test_cpu(lib, test_cases):
device = DeviceEnum.DEVICE_CPU
handle = create_handle(lib, device)
for x_shape in test_cases:
test(lib, handle, "cpu", x_shape, tensor_dtype=torch.float16)
test(lib, handle, "cpu", x_shape, tensor_dtype=torch.float32)
destroy_handle(lib, handle)
def test_cuda(lib, test_cases):
device = DeviceEnum.DEVICE_CUDA
handle = create_handle(lib, device)
for x_shape in test_cases:
test(lib, handle, "cuda", x_shape, tensor_dtype=torch.float16)
test(lib, handle, "cuda", x_shape, tensor_dtype=torch.float32)
destroy_handle(lib, handle)
def test_bang(lib, test_cases):
import torch_mlu
device = DeviceEnum.DEVICE_BANG
handle = create_handle(lib, device)
for x_shape in test_cases:
test(lib, handle, "mlu", x_shape, tensor_dtype=torch.float16)
test(lib, handle, "mlu", x_shape, tensor_dtype=torch.float32)
destroy_handle(lib, handle)
if __name__ == "__main__":
test_cases = [
# x_shape
((1, 3, 3)),
((1, 3, 1, 1, 3)),
((1, 3, 1, 1, 257)),
((1, 2, 1, 1, 514)),
((1, 3, 1, 1, 1025)),
((32, 256, 1, 112, 112)),
((2, 3, 2048000)),
((2, 1, 10243)),
((2, 20, 100)),
((3, 33, 333)),
((32, 20, 512)),
((3, 3, 11, 11, 11, 3, 2)),
((32, 256, 1, 112, 112)),
((32, 256, 112, 112)),
]
args = get_args()
lib = open_lib()
lib.infiniopCreateGlobalAvgPoolDescriptor.restype = c_int32
lib.infiniopCreateGlobalAvgPoolDescriptor.argtypes = [
infiniopHandle_t,
POINTER(infiniopGlobalAvgPoolDescriptor_t),
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
]
lib.infiniopGetGlobalAvgPoolWorkspaceSize.restype = c_int32
lib.infiniopGetGlobalAvgPoolWorkspaceSize.argtypes = [
infiniopGlobalAvgPoolDescriptor_t,
POINTER(c_uint64),
]
lib.infiniopGlobalAvgPool.restype = c_int32
lib.infiniopGlobalAvgPool.argtypes = [
infiniopGlobalAvgPoolDescriptor_t,
c_void_p,
c_uint64,
c_void_p,
c_void_p,
c_void_p,
]
lib.infiniopDestroyGlobalAvgPoolDescriptor.restype = c_int32
lib.infiniopDestroyGlobalAvgPoolDescriptor.argtypes = [
infiniopGlobalAvgPoolDescriptor_t,
]
if args.cpu:
test_cpu(lib, test_cases)
if args.cuda:
test_cuda(lib, test_cases)
if args.bang:
test_bang(lib, test_cases)
if not (args.cpu or args.cuda or args.bang):
test_cpu(lib, test_cases)
print("\033[92mTest passed!\033[0m")
import os
import sys
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '.')))
from .liboperators import open_lib, CTensor, infiniopHandle_t, infiniopTensorDescriptor_t
from .devices import *
from .utils import *
from .datatypes import *
class InfiniDtype:
INVALID = 0
BYTE = 1
BOOL = 2
I8 = 3
I16 = 4
I32 = 5
I64 = 6
U8 = 7
U16 = 8
U32 = 9
U64 = 10
F8 = 11
F16 = 12
F32 = 13
F64 = 14
C8 = 15
C16 = 16
C32 = 17
C64 = 18
BF16 = 19
class InfiniDeviceEnum:
CPU = 0
NVIDIA = 1
CAMBRICON = 2
ASCEND = 3
METAX = 4
MOORE = 5
ILUVATAR = 6
KUNLUN = 7
SUGON = 8