Commit 46da1a27 authored by PanZezhongQY's avatar PanZezhongQY
Browse files

feat: cpu and cuda matmul

parents
from ctypes import POINTER, Structure, c_int32, c_uint64, c_void_p, c_float
import ctypes
import sys
import os
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")))
from operatorspy import (
open_lib,
to_tensor,
DeviceEnum,
infiniopHandle_t,
infiniopTensorDescriptor_t,
create_handle,
destroy_handle,
check_error,
rearrange_tensor,
create_workspace,
)
from operatorspy.tests.test_utils import get_args
import torch
# ctypes mirror of the C-side RMSNorm descriptor. Only the leading device id
# field is declared; the rest of the C struct is opaque to this test and is
# only ever handled through a pointer.
class RMSNormDescriptor(Structure):
    _fields_ = [("device", c_int32)]


# Pointer type passed across the C ABI for RMSNorm descriptor handles.
infiniopRMSNormDescriptor_t = POINTER(RMSNormDescriptor)
def rms_norm(x, w, eps):
    """Reference RMSNorm: scale x by the reciprocal root-mean-square of its
    last dimension (computed in float32 for stability), cast back to the
    input dtype, then apply the elementwise weight w."""
    orig_dtype = x.dtype
    h = x.to(torch.float32)
    inv_rms = torch.rsqrt(h.pow(2).mean(dim=-1, keepdim=True) + eps)
    return w * (h * inv_rms).to(orig_dtype)
def test(lib, handle, torch_device, y_shape, x_shape, w_shape, dtype=torch.float16, w_dtype=torch.float16):
    """Run one RMSNorm case: invoke the library kernel through ctypes and
    compare its output y against the PyTorch reference `rms_norm`."""
    print(f"Testing RMS_Norm on {torch_device} with y_shape:{y_shape} x_shape:{x_shape} w_shape:{w_shape}"
          f" dtype:{dtype} w_dtype:{w_dtype}")
    y = torch.zeros(y_shape, dtype=dtype).to(torch_device)
    x = torch.rand(x_shape, dtype=dtype).to(torch_device)
    w = torch.ones(w_shape, dtype=w_dtype).to(torch_device)
    eps = 1e-5
    ans = rms_norm(x, w, eps)
    y_tensor = to_tensor(y, lib)
    x_tensor = to_tensor(x, lib)
    w_tensor = to_tensor(w, lib)
    descriptor = infiniopRMSNormDescriptor_t()
    # NOTE(review): computed but never passed to the library — the descriptor
    # presumably derives the weight dtype from w_tensor's descriptor; confirm
    # whether this local is dead code.
    w_dataType = 0 if w_dtype==torch.float16 else 1
    check_error(
        lib.infiniopCreateRMSNormDescriptor(
            handle, ctypes.byref(descriptor), y_tensor.descriptor, x_tensor.descriptor,
            w_tensor.descriptor, eps
        )
    )
    # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
    x_tensor.descriptor.contents.invalidate()
    y_tensor.descriptor.contents.invalidate()
    w_tensor.descriptor.contents.invalidate()
    workspace_size = c_uint64(0)
    check_error(
        lib.infiniopGetRMSNormWorkspaceSize(
            descriptor, ctypes.byref(workspace_size)
        )
    )
    workspace = create_workspace(workspace_size.value, y.device)
    check_error(
        lib.infiniopRMSNorm(
            descriptor,
            workspace.data_ptr() if workspace is not None else None,
            workspace_size.value,
            y_tensor.data,
            x_tensor.data,
            w_tensor.data,
            None,  # presumably the stream handle (default) — TODO confirm
        )
    )
    # fp16 tolerance: the kernel may accumulate in a different precision.
    assert torch.allclose(y.to(dtype), ans.to(dtype), atol=1e-3, rtol=1e-3)
    check_error(lib.infiniopDestroyRMSNormDescriptor(descriptor))
def test_cpu(lib, test_cases):
    """Run every RMSNorm case on the CPU backend."""
    handle = create_handle(lib, DeviceEnum.DEVICE_CPU)
    for y_shape, x_shape, w_shape, dtype, w_dtype in test_cases:
        test(lib, handle, "cpu", y_shape, x_shape, w_shape, dtype, w_dtype)
    destroy_handle(lib, handle)
def test_cuda(lib, test_cases):
    """Run every RMSNorm case on the CUDA backend."""
    handle = create_handle(lib, DeviceEnum.DEVICE_CUDA)
    for y_shape, x_shape, w_shape, dtype, w_dtype in test_cases:
        test(lib, handle, "cuda", y_shape, x_shape, w_shape, dtype, w_dtype)
    destroy_handle(lib, handle)
def test_bang(lib, test_cases):
    """Run every RMSNorm case on the Cambricon MLU backend."""
    import torch_mlu  # registers the "mlu" device with torch
    handle = create_handle(lib, DeviceEnum.DEVICE_BANG)
    for y_shape, x_shape, w_shape, dtype, w_dtype in test_cases:
        test(lib, handle, "mlu", y_shape, x_shape, w_shape, dtype, w_dtype)
    destroy_handle(lib, handle)
def test_ascend(lib, test_cases):
    """Run every RMSNorm case on the Ascend NPU backend."""
    import torch_npu  # registers the "npu" device with torch
    handle = create_handle(lib, DeviceEnum.DEVICE_ASCEND)
    for y_shape, x_shape, w_shape, dtype, w_dtype in test_cases:
        test(lib, handle, "npu", y_shape, x_shape, w_shape, dtype, w_dtype)
    destroy_handle(lib, handle)
if __name__ == "__main__":
    test_cases = [
        # y_shape, x_shape, w_shape, dtype, w_dtype
        ((16, 2048), (16, 2048), (2048,), torch.float16, torch.float16),
        ((16, 2048), (16, 2048), (2048,), torch.float16, torch.float32),
    ]
    args = get_args()
    lib = open_lib()
    # Declare ctypes signatures for the RMSNorm C API.
    lib.infiniopCreateRMSNormDescriptor.restype = c_int32
    lib.infiniopCreateRMSNormDescriptor.argtypes = [
        infiniopHandle_t,
        POINTER(infiniopRMSNormDescriptor_t),
        infiniopTensorDescriptor_t,
        infiniopTensorDescriptor_t,
        infiniopTensorDescriptor_t,
        c_float,
    ]
    lib.infiniopGetRMSNormWorkspaceSize.restype = c_int32
    lib.infiniopGetRMSNormWorkspaceSize.argtypes = [
        infiniopRMSNormDescriptor_t,
        POINTER(c_uint64),
    ]
    # BUG FIX: was `restypes` (typo) — that assignment created an unused
    # attribute and left the return type at the ctypes default instead of
    # declaring it explicitly.
    lib.infiniopRMSNorm.restype = c_int32
    lib.infiniopRMSNorm.argtypes = [
        infiniopRMSNormDescriptor_t,
        c_void_p,
        c_uint64,
        c_void_p,
        c_void_p,
        c_void_p,
        c_void_p,
    ]
    lib.infiniopDestroyRMSNormDescriptor.restype = c_int32
    lib.infiniopDestroyRMSNormDescriptor.argtypes = [
        infiniopRMSNormDescriptor_t,
    ]
    if args.cpu:
        test_cpu(lib, test_cases)
    if args.cuda:
        test_cuda(lib, test_cases)
    if args.bang:
        test_bang(lib, test_cases)
    if args.ascend:
        test_ascend(lib, test_cases)
    # Default to the CPU backend when no device flag was given.
    if not (args.cpu or args.cuda or args.bang or args.ascend):
        test_cpu(lib, test_cases)
    print("\033[92mTest passed!\033[0m")
import ctypes
from ctypes import c_float, POINTER, c_void_p, c_int32, c_uint64, Structure, byref
import sys
import os
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")))
from operatorspy import (
open_lib,
to_tensor,
DeviceEnum,
infiniopHandle_t,
infiniopTensorDescriptor_t,
create_handle,
destroy_handle,
check_error,
rearrange_tensor,
create_workspace,
U64,
)
from operatorspy.tests.test_utils import get_args
import torch
# ctypes mirror of the C-side RoPE descriptor. Only the leading device id
# field is declared; the remainder of the C struct is opaque to this test.
class RoPEDescriptor(Structure):
    _fields_ = [("device", c_int32)]


# Pointer type passed across the C ABI for RoPE descriptor handles.
infiniopRoPEDescriptor_t = POINTER(RoPEDescriptor)
def reshape_for_broadcast(freqs_cis: torch.Tensor, x: torch.Tensor):
    """Reshape `freqs_cis` (x.shape[0], x.shape[-1]) so it broadcasts
    against `x`: keep x's first and last dims, collapse the middle to 1s."""
    rank = x.ndim
    assert rank > 1
    assert freqs_cis.shape == (x.shape[0], x.shape[-1])
    target = [1] * rank
    target[0] = x.shape[0]
    target[-1] = x.shape[-1]
    return freqs_cis.view(*target)
def rotary_embedding(t, pos, theta, torch_device):
    """Reference RoPE: rotate adjacent channel pairs of `t` (seq, heads,
    head_dim) by position-dependent angles derived from `theta`, implemented
    as a complex multiplication."""
    head_dim = t.shape[2]
    inv_freq = 1.0 / (
        theta ** (torch.arange(0, head_dim, 2)[: (head_dim // 2)].float() / head_dim)
    )
    inv_freq = inv_freq.to(torch_device)
    angles = torch.outer(pos, inv_freq)
    # Unit-magnitude complex rotations e^{i*angle}.
    rotations = torch.polar(torch.ones_like(angles), angles)
    t_complex = torch.view_as_complex(t.reshape(*t.shape[:-1], -1, 2))
    rotations = reshape_for_broadcast(rotations, t_complex)
    return torch.view_as_real(t_complex * rotations).flatten(2).to(t.dtype)
def sin_cos_table(max_seq_len, dim, torch_device, theta):
    """Build (sin, cos) lookup tables of shape (max_seq_len, dim).

    Each of the dim//2 frequencies is duplicated so adjacent columns share
    the same angle, matching kernels that rotate interleaved channel pairs.
    """
    positions = torch.arange(
        0, max_seq_len, dtype=torch.float32, device=torch.device(torch_device)
    )
    inv_freq = 1.0 / (theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim))
    inv_freq = inv_freq.to(torch_device)
    # (a0, a1, a2) -> (a0, a0, a1, a1, a2, a2)
    inv_freq = torch.repeat_interleave(inv_freq, repeats=2)
    angles = torch.outer(positions, inv_freq)
    return torch.sin(angles), torch.cos(angles)
def test(lib, handle, torch_device, shape, strides=None, dtype=torch.float16):
    """Run one RoPE case: apply the library kernel in place to `t` and
    compare against the PyTorch complex-rotation reference."""
    print(
        f"Testing Rotary Positional Embedding on {torch_device} with shape:{shape} strides:{strides} and dtype:{dtype}"
    )
    t = torch.rand(shape, dtype=dtype)
    if strides is not None:
        t = rearrange_tensor(t, strides)
    posTmp = torch.arange(0, t.shape[0])
    # Pack each position as two consecutive int32 words (low word = position,
    # high word = 0). Combined with the U64 dtype override below, the kernel
    # presumably reads each pair as one 64-bit position — this assumes a
    # little-endian host; TODO(review): confirm against the kernel's layout.
    pos = torch.zeros(2 * posTmp.shape[0], dtype = torch.int32)
    for i in range(posTmp.shape[0]):
        pos[2 * i] = posTmp[i]
        pos[2 * i + 1] = 0
    theta = 1e4
    if torch_device == 'mlu' or torch_device == 'npu':
        # Compute the reference on the host first, then move data over.
        ans = rotary_embedding(t, posTmp, theta, "cpu").to(torch_device)
        pos = pos.to(torch_device)
        t = t.to(torch_device)
    else:
        t = t.to(torch_device)
        pos = pos.to(torch_device)
        ans = rotary_embedding(t, posTmp.to(torch_device), theta, torch_device)
    descriptor = infiniopRoPEDescriptor_t()
    # 2x table length for test
    sin_table, cos_table = sin_cos_table(t.shape[0] * 2, t.shape[2], t.device, theta)
    t_tensor = to_tensor(t, lib)
    # Slicing keeps the descriptor's logical length at seq_len while the U64
    # override doubles the element width, covering the full packed buffer.
    pos_tensor = to_tensor(pos[: t.shape[0]], lib)
    pos_tensor.descriptor.contents.dt = U64
    sin_table_tensor = to_tensor(sin_table, lib)
    cos_table_tensor = to_tensor(cos_table, lib)
    if torch_device == "npu":
        torch.npu.synchronize()
    check_error(
        lib.infiniopCreateRoPEDescriptor(
            handle,
            byref(descriptor),
            t_tensor.descriptor,
            pos_tensor.descriptor,
            sin_table_tensor.descriptor,
            cos_table_tensor.descriptor,
        )
    )
    # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
    t_tensor.descriptor.contents.invalidate()
    pos_tensor.descriptor.contents.invalidate()
    sin_table_tensor.descriptor.contents.invalidate()
    cos_table_tensor.descriptor.contents.invalidate()
    workspace_size = c_uint64(0)
    check_error(
        lib.infiniopGetRoPEWorkspaceSize(descriptor, ctypes.byref(workspace_size))
    )
    workspace = create_workspace(workspace_size.value, t.device)
    check_error(
        lib.infiniopRoPE(
            descriptor,
            workspace.data_ptr() if workspace is not None else None,
            workspace_size.value,
            t_tensor.data,
            pos_tensor.data,
            sin_table_tensor.data,
            cos_table_tensor.data,
            None,  # presumably the stream handle (default) — TODO confirm
        )
    )
    # The kernel writes its result in place into t.
    assert torch.allclose(t, ans, atol=1e-4, rtol=1e-2)
    check_error(lib.infiniopDestroyRoPEDescriptor(descriptor))
def test_cpu(lib, test_cases):
    """Run every RoPE case on the CPU backend."""
    handle = create_handle(lib, DeviceEnum.DEVICE_CPU)
    for shape, strides, dtype in test_cases:
        test(lib, handle, "cpu", shape, strides, dtype)
    destroy_handle(lib, handle)
def test_cuda(lib, test_cases):
    """Run every RoPE case on the CUDA backend."""
    handle = create_handle(lib, DeviceEnum.DEVICE_CUDA)
    for shape, strides, dtype in test_cases:
        test(lib, handle, "cuda", shape, strides, dtype)
    destroy_handle(lib, handle)
def test_bang(lib, test_cases):
    """Run every RoPE case on the Cambricon MLU backend."""
    import torch_mlu  # registers the "mlu" device with torch
    handle = create_handle(lib, DeviceEnum.DEVICE_BANG)
    for shape, strides, dtype in test_cases:
        test(lib, handle, "mlu", shape, strides, dtype)
    destroy_handle(lib, handle)
def test_ascend(lib, test_cases):
    """Run every RoPE case on the Ascend NPU backend."""
    import torch_npu  # registers the "npu" device with torch
    handle = create_handle(lib, DeviceEnum.DEVICE_ASCEND)
    for shape, strides, dtype in test_cases:
        test(lib, handle, "npu", shape, strides, dtype)
    destroy_handle(lib, handle)
if __name__ == "__main__":
    test_cases = [
        # shape, strides, dtype
        ((1, 32, 128), None, torch.float16),
        ((1, 32, 64), None, torch.float16),
        # Ascend cannot handle the next case yet: a last dimension <= 32 is
        # problematic, possibly related to the internal implementation of its
        # core GatherMask interface; 48, 64 and 128 are currently supported.
        ((4, 1, 32), None, torch.float16),
        ((1, 32, 128), None, torch.float16),
        ((3, 32, 128), (8000, 200, 1), torch.float16),
    ]
    args = get_args()
    lib = open_lib()
    # Declare ctypes signatures for the RoPE C API.
    lib.infiniopCreateRoPEDescriptor.restype = c_int32
    lib.infiniopCreateRoPEDescriptor.argtypes = [
        infiniopHandle_t,
        POINTER(infiniopRoPEDescriptor_t),
        infiniopTensorDescriptor_t,
        infiniopTensorDescriptor_t,
        infiniopTensorDescriptor_t,
        infiniopTensorDescriptor_t,
    ]
    lib.infiniopGetRoPEWorkspaceSize.restype = c_int32
    lib.infiniopGetRoPEWorkspaceSize.argtypes = [
        infiniopRoPEDescriptor_t,
        POINTER(c_uint64),
    ]
    lib.infiniopRoPE.restype = c_int32
    lib.infiniopRoPE.argtypes = [
        infiniopRoPEDescriptor_t,
        c_void_p,
        c_uint64,
        c_void_p,
        c_void_p,
        c_void_p,
        c_void_p,
        c_void_p,
    ]
    lib.infiniopDestroyRoPEDescriptor.restype = c_int32
    lib.infiniopDestroyRoPEDescriptor.argtypes = [
        infiniopRoPEDescriptor_t,
    ]
    if args.cpu:
        test_cpu(lib, test_cases)
    if args.cuda:
        test_cuda(lib, test_cases)
    if args.bang:
        test_bang(lib, test_cases)
    if args.ascend:
        test_ascend(lib, test_cases)
    # Default to the CPU backend when no device flag was given.
    if not (args.cpu or args.cuda or args.bang or args.ascend):
        test_cpu(lib, test_cases)
    print("\033[92mTest passed!\033[0m")
from ctypes import POINTER, Structure, c_int32, c_uint64, c_void_p
import ctypes
import sys
import os
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")))
from operatorspy import (
open_lib,
to_tensor,
CTensor,
DeviceEnum,
infiniopHandle_t,
infiniopTensorDescriptor_t,
create_handle,
destroy_handle,
check_error,
rearrange_tensor,
)
from operatorspy.tests.test_utils import get_args
import torch
# ctypes mirror of the C-side SwiGLU descriptor. Only the leading device id
# field is declared; the remainder of the C struct is opaque to this test.
class SwiGLUDescriptor(Structure):
    _fields_ = [("device", c_int32)]


# Pointer type passed across the C ABI for SwiGLU descriptor handles.
infiniopSwiGLUDescriptor_t = POINTER(SwiGLUDescriptor)
def swiglu(a, b):
    """Reference SwiGLU gate: a * b * sigmoid(b).

    The exponential is evaluated in float32 and cast back to b's dtype,
    mirroring the kernel's mixed-precision path.
    """
    gate_denominator = 1 + torch.exp(-b.float()).to(b.dtype)
    return a * b / gate_denominator
def test_out_of_place(
    lib,
    handle,
    torch_device,
    shape,
    a_stride=None,
    b_stride=None,
    c_stride=None,
    dtype=torch.float16,
    sync=None,
):
    """Run one SwiGLU case writing into a distinct output tensor c and
    compare against the PyTorch reference `swiglu`.

    sync: optional callable (e.g. torch.npu.synchronize) invoked after the
    reference computation for backends with asynchronous launches.
    """
    print(
        f"Testing SwiGLU on {torch_device} with shape:{shape} a_stride:{a_stride} b_stride:{b_stride} c_stride:{c_stride} dtype:{dtype}"
    )
    a = torch.rand(shape, dtype=dtype).to(torch_device)
    b = torch.rand(shape, dtype=dtype).to(torch_device)
    c = torch.rand(shape, dtype=dtype).to(torch_device)
    # Optionally re-lay-out the operands with custom (possibly padded) strides.
    if a_stride is not None:
        a = rearrange_tensor(a, a_stride)
    if b_stride is not None:
        b = rearrange_tensor(b, b_stride)
    if c_stride is not None:
        c = rearrange_tensor(c, c_stride)
    ans = swiglu(a, b)
    if sync is not None:
        sync()
    a_tensor = to_tensor(a, lib)
    b_tensor = to_tensor(b, lib)
    c_tensor = to_tensor(c, lib)
    descriptor = infiniopSwiGLUDescriptor_t()
    check_error(
        lib.infiniopCreateSwiGLUDescriptor(
            handle,
            ctypes.byref(descriptor),
            c_tensor.descriptor,
            a_tensor.descriptor,
            b_tensor.descriptor,
        )
    )
    # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
    a_tensor.descriptor.contents.invalidate()
    b_tensor.descriptor.contents.invalidate()
    c_tensor.descriptor.contents.invalidate()
    check_error(
        lib.infiniopSwiGLU(
            descriptor, c_tensor.data, a_tensor.data, b_tensor.data, None
        )
    )
    assert torch.allclose(c, ans, atol=1e-4, rtol=1e-2)
    print("out-of-place Test passed!")
    check_error(lib.infiniopDestroySwiGLUDescriptor(descriptor))
def test_in_place1(
    lib,
    handle,
    torch_device,
    shape,
    a_stride=None,
    b_stride=None,
    dtype=torch.float16,
    sync=None,
):
    """Run one SwiGLU case with the output aliased onto input a
    (c == a), verifying the kernel supports this in-place form."""
    a = torch.rand(shape, dtype=dtype).to(torch_device)
    b = torch.rand(shape, dtype=dtype).to(torch_device)
    if a_stride is not None:
        a = rearrange_tensor(a, a_stride)
    if b_stride is not None:
        b = rearrange_tensor(b, b_stride)
    # Reference must be computed before a is overwritten by the kernel.
    ans = swiglu(a, b)
    if sync is not None:
        sync()
    a_tensor = to_tensor(a, lib)
    b_tensor = to_tensor(b, lib)
    descriptor = infiniopSwiGLUDescriptor_t()
    # a_tensor serves as both output and first input descriptor.
    check_error(
        lib.infiniopCreateSwiGLUDescriptor(
            handle,
            ctypes.byref(descriptor),
            a_tensor.descriptor,
            a_tensor.descriptor,
            b_tensor.descriptor,
        )
    )
    # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
    a_tensor.descriptor.contents.invalidate()
    b_tensor.descriptor.contents.invalidate()
    check_error(
        lib.infiniopSwiGLU(
            descriptor, a_tensor.data, a_tensor.data, b_tensor.data, None
        )
    )
    assert torch.allclose(a, ans, atol=1e-4, rtol=1e-2)
    print("in-place1 Test passed!")
    check_error(lib.infiniopDestroySwiGLUDescriptor(descriptor))
def test_in_place2(
    lib,
    handle,
    torch_device,
    shape,
    a_stride=None,
    b_stride=None,
    dtype=torch.float16,
    sync=None,
):
    """Run one SwiGLU case with the output aliased onto input b
    (c == b), verifying the kernel supports this in-place form."""
    a = torch.rand(shape, dtype=dtype).to(torch_device)
    b = torch.rand(shape, dtype=dtype).to(torch_device)
    if a_stride is not None:
        a = rearrange_tensor(a, a_stride)
    if b_stride is not None:
        b = rearrange_tensor(b, b_stride)
    # Reference must be computed before b is overwritten by the kernel.
    ans = swiglu(a, b)
    if sync is not None:
        sync()
    a_tensor = to_tensor(a, lib)
    b_tensor = to_tensor(b, lib)
    descriptor = infiniopSwiGLUDescriptor_t()
    # b_tensor serves as both output and second input descriptor.
    check_error(
        lib.infiniopCreateSwiGLUDescriptor(
            handle,
            ctypes.byref(descriptor),
            b_tensor.descriptor,
            a_tensor.descriptor,
            b_tensor.descriptor,
        )
    )
    # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
    # (only a and b exist here — the output shares b's descriptor).
    a_tensor.descriptor.contents.invalidate()
    b_tensor.descriptor.contents.invalidate()
    check_error(
        lib.infiniopSwiGLU(
            descriptor, b_tensor.data, a_tensor.data, b_tensor.data, None
        )
    )
    assert torch.allclose(b, ans, atol=1e-4, rtol=1e-2)
    check_error(lib.infiniopDestroySwiGLUDescriptor(descriptor))
def test_cpu(lib, test_cases):
    """Run the out-of-place and both in-place SwiGLU variants on CPU."""
    handle = create_handle(lib, DeviceEnum.DEVICE_CPU)
    for shape, a_stride, b_stride, c_stride, dtype in test_cases:
        test_out_of_place(lib, handle, "cpu", shape, a_stride, b_stride, c_stride, dtype)
        test_in_place1(lib, handle, "cpu", shape, a_stride, b_stride, dtype)
        test_in_place2(lib, handle, "cpu", shape, a_stride, b_stride, dtype)
    destroy_handle(lib, handle)
def test_cuda(lib, test_cases):
    """Run the out-of-place and both in-place SwiGLU variants on CUDA."""
    handle = create_handle(lib, DeviceEnum.DEVICE_CUDA)
    for shape, a_stride, b_stride, c_stride, dtype in test_cases:
        test_out_of_place(lib, handle, "cuda", shape, a_stride, b_stride, c_stride, dtype)
        test_in_place1(lib, handle, "cuda", shape, a_stride, b_stride, dtype)
        test_in_place2(lib, handle, "cuda", shape, a_stride, b_stride, dtype)
    destroy_handle(lib, handle)
def test_bang(lib, test_cases):
    """Run the out-of-place and both in-place SwiGLU variants on MLU."""
    import torch_mlu  # registers the "mlu" device with torch
    handle = create_handle(lib, DeviceEnum.DEVICE_BANG)
    for shape, a_stride, b_stride, c_stride, dtype in test_cases:
        test_out_of_place(lib, handle, "mlu", shape, a_stride, b_stride, c_stride, dtype)
        test_in_place1(lib, handle, "mlu", shape, a_stride, b_stride, dtype)
        test_in_place2(lib, handle, "mlu", shape, a_stride, b_stride, dtype)
    destroy_handle(lib, handle)
def test_ascend(lib, test_cases):
    """Run the out-of-place and both in-place SwiGLU variants on the Ascend
    NPU, synchronizing the device around each reference computation."""
    import torch_npu  # registers the "npu" device with torch
    handle = create_handle(lib, DeviceEnum.DEVICE_ASCEND)
    npu_sync = torch.npu.synchronize
    for shape, a_stride, b_stride, c_stride, dtype in test_cases:
        test_out_of_place(lib, handle, "npu", shape, a_stride, b_stride, c_stride, dtype, npu_sync)
        test_in_place1(lib, handle, "npu", shape, a_stride, b_stride, dtype, npu_sync)
        test_in_place2(lib, handle, "npu", shape, a_stride, b_stride, dtype, npu_sync)
    destroy_handle(lib, handle)
if __name__ == "__main__":
    test_cases = [
        # shape, a_stride, b_stride, c_stride, dtype
        ((13, 4), None, None, None, torch.float16),
        ((13, 4), (10, 1), (10, 1), (10, 1), torch.float16),
        ((16, 5632), None, None, None, torch.float16),
        ((16, 5632), (13312, 1), (13312, 1), (13312, 1), torch.float16),
    ]
    args = get_args()
    lib = open_lib()
    # Declare ctypes signatures for the SwiGLU C API.
    lib.infiniopCreateSwiGLUDescriptor.restype = c_int32
    lib.infiniopCreateSwiGLUDescriptor.argtypes = [
        infiniopHandle_t,
        POINTER(infiniopSwiGLUDescriptor_t),
        infiniopTensorDescriptor_t,
        infiniopTensorDescriptor_t,
        infiniopTensorDescriptor_t,
    ]
    lib.infiniopSwiGLU.restype = c_int32
    lib.infiniopSwiGLU.argtypes = [
        infiniopSwiGLUDescriptor_t,
        c_void_p,
        c_void_p,
        c_void_p,
        c_void_p,
    ]
    lib.infiniopDestroySwiGLUDescriptor.restype = c_int32
    lib.infiniopDestroySwiGLUDescriptor.argtypes = [
        infiniopSwiGLUDescriptor_t,
    ]
    if args.cpu:
        test_cpu(lib, test_cases)
    if args.cuda:
        test_cuda(lib, test_cases)
    if args.bang:
        test_bang(lib, test_cases)
    if args.ascend:
        test_ascend(lib, test_cases)
    # CONSISTENCY FIX: like the other operator test scripts, default to the
    # CPU backend when no device flag is given (previously this script ran
    # nothing yet still printed "Test passed!").
    if not (args.cpu or args.cuda or args.bang or args.ascend):
        test_cpu(lib, test_cases)
    print("\033[92mTest passed!\033[0m")
def get_args():
    """Parse the command-line flags shared by the operator test scripts.

    Returns the parsed argparse namespace. In addition to the declared
    flags, `cuda` and `bang` attributes are populated because the test
    scripts check `args.cuda` / `args.bang` while the user-facing flags are
    named `--nvidia` / `--cambricon`; without the aliases those scripts
    raise AttributeError.
    """
    import argparse

    parser = argparse.ArgumentParser(description="Test Operator")
    parser.add_argument(
        "--profile",
        action="store_true",
        help="Whether profile tests",
    )
    parser.add_argument(
        "--cpu",
        action="store_true",
        help="Run CPU test",
    )
    parser.add_argument(
        "--nvidia",
        action="store_true",
        help="Run NVIDIA GPU test",
    )
    parser.add_argument(
        "--cambricon",
        action="store_true",
        help="Run Cambricon MLU test",
    )
    parser.add_argument(
        "--ascend",
        action="store_true",
        help="Run ASCEND NPU test",
    )
    args = parser.parse_args()
    # BUG FIX: expose the attribute names the test scripts actually read.
    args.cuda = args.nvidia
    args.bang = args.cambricon
    return args
def synchronize_device(torch_device):
    """Block until all queued work on the named device has finished.

    Recognizes "cuda", "npu" and "mlu"; any other device string (e.g.
    "cpu") is a no-op.
    """
    import torch

    if torch_device == "cuda":
        torch.cuda.synchronize()
        return
    if torch_device == "npu":
        torch.npu.synchronize()
        return
    if torch_device == "mlu":
        torch.mlu.synchronize()
add_rules("mode.debug", "mode.release")

-- ANSI color codes used in build messages
local GREEN = '\27[0;32m'
local YELLOW = '\27[1;33m'
local NC = '\27[0m' -- No Color

add_includedirs("include")

-- Debug builds: no optimization, debug symbols, DEBUG_MODE define.
if is_mode("debug") then
    add_cxflags("-g -O0")
    add_defines("DEBUG_MODE")
end

-- CPU
option("cpu")
    set_default(true)
    set_showmenu(true)
    -- typo fix in user-facing text: "complie" -> "compile"
    set_description("Whether to compile implementations for CPU")
option_end()

option("omp")
    set_default(false)
    set_showmenu(true)
    set_description("Enable or disable OpenMP support for cpu kernel")
option_end()

if has_config("cpu") then
    includes("xmake/cpu.lua")
    add_defines("ENABLE_CPU_API")
end
-- NVIDIA
option("nv-gpu")
    set_default(false)
    set_showmenu(true)
    set_description("Whether to compile implementations for Nvidia GPU")
option_end()

if has_config("nv-gpu") then
    add_defines("ENABLE_CUDA_API")
    includes("xmake/cuda.lua")
end

-- Cambricon
option("cambricon-mlu")
    set_default(false)
    set_showmenu(true)
    set_description("Whether to compile implementations for Cambricon MLU")
option_end()

if has_config("cambricon-mlu") then
    add_defines("ENABLE_CAMBRICON_API")
end

-- Huawei Ascend
option("ascend-npu")
    set_default(false)
    set_showmenu(true)
    set_description("Whether to compile implementations for Huawei Ascend NPU")
option_end()

if has_config("ascend-npu") then
    add_defines("ENABLE_ASCEND_API")
end

-- MetaX
option("metax-gpu")
    set_default(false)
    set_showmenu(true)
    set_description("Whether to compile implementations for MetaX GPU")
option_end()

if has_config("metax-gpu") then
    add_defines("ENABLE_MACA_API")
end

-- Moore Threads
option("moore-gpu")
    set_default(false)
    set_showmenu(true)
    set_description("Whether to compile implementations for Moore Threads GPU")
option_end()

-- BUG FIX: the option is declared as "moore-gpu" but was checked via
-- has_config("mthreads-gpu"), so ENABLE_MUSA_API could never be defined.
if has_config("moore-gpu") then
    add_defines("ENABLE_MUSA_API")
end

-- Sugon
option("sugon-dcu")
    set_default(false)
    set_showmenu(true)
    set_description("Whether to compile implementations for Sugon DCU")
option_end()

if has_config("sugon-dcu") then
    add_defines("ENABLE_CUDA_API")
    add_defines("ENABLE_SUGON_CUDA_API")
end
-- Aggregate shared library exposing every enabled backend behind one ABI.
target("infiniop")
    set_kind("shared")
    if has_config("cpu") then
        add_deps("infiniop-cpu")
    end
    if has_config("nv-gpu") then
        add_deps("infiniop-cuda")
    end
    if has_config("sugon-dcu") then
        -- Sugon DCU reuses the CUDA static objects but needs a custom
        -- linker toolchain and an explicit full-path link of the archive.
        local builddir = string.format(
            "build/%s/%s/%s",
            get_config("plat"),
            get_config("arch"),
            get_config("mode")
        )
        add_shflags("-s", "-shared", "-fPIC")
        add_links("cublas", "cudnn", "cudadevrt", "cudart_static", "rt", "pthread", "dl")
        -- Using -linfiniop-cuda will fail, manually link the target using full path
        add_deps("nv-gpu", {inherit = false})
        add_links(builddir.."/libinfiniop-cuda.a")
        set_toolchains("sugon-dcu-linker")
    end
    if has_config("cambricon-mlu") then
        add_deps("cambricon-mlu")
    end
    if has_config("ascend-npu") then
        add_deps("ascend-npu")
    end
    if has_config("metax-gpu") then
        add_deps("metax-gpu")
    end
    set_languages("cxx17")
    add_files("src/infiniop/devices/handle.cc")
    add_files("src/infiniop/ops/*/operator.cc")
    add_files("src/infiniop/*.cc")
    -- NOTE(review): YELLOW/NC are description-scope file locals; confirm they
    -- remain visible inside this script-scope closure under xmake's
    -- sandboxing — if they are nil here the concatenation will error.
    after_build(function (target) print(YELLOW .. "You can install the libraries with \"xmake install\"" .. NC) end)
    -- Install into $INFINI_ROOT, falling back to ~/.infini.
    set_installdir(os.getenv("INFINI_ROOT") or (os.getenv(is_host("windows") and "HOMEPATH" or "HOME") .. "/.infini"))
    add_installfiles("include/infiniop/(**/*.h)", {prefixdir = "include/infiniop"})
    add_installfiles("include/infiniop/*.h", {prefixdir = "include/infiniop"})
    add_installfiles("include/infiniop.h", {prefixdir = "include"})
    add_installfiles("include/infinicore.h", {prefixdir = "include"})
target_end()
-- Static library containing the CPU reference kernels.
target("infiniop-cpu")
    on_install(function (target) end) -- nothing extra to install for the static lib
    set_kind("static")
    if not is_plat("windows") then
        -- objects are later folded into the shared "infiniop" library
        add_cxflags("-fPIC")
    end
    set_languages("cxx17")
    add_files("../src/infiniop/devices/cpu/*.cc", "../src/infiniop/ops/*/cpu/*.cc")
    if has_config("omp") then
        add_cxflags("-fopenmp")
        add_ldflags("-fopenmp")
    end
target_end()
\ No newline at end of file
-- Locate CUDA and cuDNN installs from the conventional environment variables.
local CUDA_ROOT = os.getenv("CUDA_ROOT") or os.getenv("CUDA_HOME") or os.getenv("CUDA_PATH")
local CUDNN_ROOT = os.getenv("CUDNN_ROOT") or os.getenv("CUDNN_HOME") or os.getenv("CUDNN_PATH")
if CUDA_ROOT ~= nil then
    add_includedirs(CUDA_ROOT .. "/include")
end
if CUDNN_ROOT ~= nil then
    add_includedirs(CUDNN_ROOT .. "/include")
end

-- Static library containing the CUDA kernels.
target("infiniop-cuda")
    set_kind("static")
    on_install(function (target) end) -- nothing extra to install for the static lib
    set_policy("build.cuda.devlink", true)
    set_toolchains("cuda")
    add_links("cublas")
    add_links("cudnn")
    add_cugencodes("native") -- target the GPU architectures present on the build machine
    if is_plat("windows") then
        add_cuflags("-Xcompiler=/utf-8", "--expt-relaxed-constexpr", "--allow-unsupported-compiler")
        if CUDNN_ROOT ~= nil then
            add_linkdirs(CUDNN_ROOT .. "\\lib\\x64")
        end
    else
        -- position-independent code so the objects can be linked into the
        -- shared "infiniop" library
        add_cuflags("-Xcompiler=-fPIC")
        add_culdflags("-Xcompiler=-fPIC")
        add_cxxflags("-fPIC")
    end
    set_languages("cxx17")
    add_files("../src/infiniop/devices/cuda/*.cu", "../src/infiniop/ops/*/cuda/*.cu")
target_end()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment.