Commit c2e87202 authored by Catheriany

Merge remote-tracking branch 'origin/main' into issue/142

parents 41818f84 c203635b
......@@ -65,6 +65,7 @@ def test(
y_stride=None,
w12_stride=None,
w3_stride=None,
sync=None,
):
print(
f"Testing MLP on {torch_device} with num_tokens:{num_tokens} hidden_size:{hidden_size} intermediate_size:{intermediate_size}"
......@@ -97,6 +98,10 @@ def test(
x_tensor = to_tensor(x, lib)
w12_tensor = to_tensor(w12, lib)
w3_tensor = to_tensor(w3, lib)
if sync is not None:
sync()
descriptor = infiniopMLPDescriptor_t()
check_error(
lib.infiniopCreateMLPDescriptor(
......
import torch
import ctypes
from ctypes import POINTER, Structure, c_int32, c_void_p, c_uint64
from libinfiniop import (
infiniopHandle_t,
infiniopTensorDescriptor_t,
open_lib,
to_tensor,
get_test_devices,
check_error,
rearrange_if_needed,
test_operator,
get_args,
debug,
get_tolerance,
profile_operation,
create_workspace,
)
from enum import Enum, auto
# ==============================================================================
# Configuration (Internal Use Only)
# ==============================================================================
# These are not meant to be imported from other modules
_TEST_CASES_ = [
# shape, a_stride, b_stride, c_stride
((13, 4), None, None, None),
((13, 4), (10, 1), (10, 1), (10, 1)),
((13, 4), (0, 1), None, None),
((13, 4, 4), None, None, None),
((13, 4, 4), (20, 4, 1), (20, 4, 1), (20, 4, 1)),
((13, 4, 4), (4, 0, 1), (0, 4, 1), None),
((16, 5632), None, None, None),
((16, 5632), (13312, 1), (13312, 1), (13312, 1)),
((4, 4, 5632), None, None, None),
((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1), (45056, 5632, 1)),
]
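# Note: a 0 stride (e.g. (0, 1)) marks a broadcasted dimension: every index
# along that dimension reads the same underlying data.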
class Inplace(Enum):
OUT_OF_PLACE = auto()
INPLACE_A = auto()
INPLACE_B = auto()
# Inplace options applied for each test case in _TEST_CASES_
_INPLACE = [
Inplace.OUT_OF_PLACE,
Inplace.INPLACE_A,
Inplace.INPLACE_B,
]
# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_
_TEST_CASES = [
test_case + (inplace_item,)
for test_case in _TEST_CASES_
for inplace_item in _INPLACE
]
# Data types used for testing
_TENSOR_DTYPES = [torch.float16, torch.float32]
# Tolerance map for different data types
_TOLERANCE_MAP = {
torch.float16: {"atol": 1e-3, "rtol": 1e-3},
torch.float32: {"atol": 1e-7, "rtol": 1e-7},
}
DEBUG = False
PROFILE = False
NUM_PRERUN = 10
NUM_ITERATIONS = 1000
class MulDescriptor(Structure):
_fields_ = [("device", c_int32)]
infiniopMulDescriptor_t = POINTER(MulDescriptor)
def mul(x, y):
return torch.mul(x, y)
def process_tensors(c, c_strides, a, a_stride, b, b_stride, inplace):
"""
Rearrange the tensors if needed and apply the in-place config.
If in-place is requested but the output (i.e., c) would alias a
broadcasted input, c is reset to its original unbroadcasted strides.
"""
original_c_strides = c_strides if c_strides else c.stride()
def _rearrange(tensor, strides):
if strides and 0 in strides:
tensor.set_(tensor.untyped_storage(), 0, tensor.shape, strides)
return tensor
else:
return rearrange_if_needed(tensor, strides)
a, b, c = [
_rearrange(tensor, stride)
for tensor, stride in zip([a, b, c], [a_stride, b_stride, c_strides])
]
c = (
c
if inplace == Inplace.OUT_OF_PLACE
else (a if inplace == Inplace.INPLACE_A else b)
)
# if the in-place output ended up with broadcasted (zero) strides, reset them to the original unbroadcasted strides
if 0 in c.stride():
c.set_(c.untyped_storage(), 0, c.shape, original_c_strides)
return a, b, c
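# A minimal sketch (illustrative names) of the zero-stride aliasing this helper
# guards against: with a 0 stride, every index along that dim shares storage,
# so an in-place output over a broadcasted input must fall back to real strides.
_bcast = torch.zeros(1, 4).as_strided((13, 4), (0, 1))  # 13 logical rows, one shared buffer
assert _bcast[0].data_ptr() == _bcast[12].data_ptr()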
def test(
lib,
handle,
torch_device,
shape,
a_stride=None,
b_stride=None,
c_stride=None,
inplace=Inplace.OUT_OF_PLACE,
dtype=torch.float16,
sync=None,
):
print(
f"Testing Mul on {torch_device} with shape:{shape} a_stride:{a_stride} b_stride:{b_stride} c_stride:{c_stride} "
f"dtype:{dtype} inplace:{inplace}"
)
a = torch.rand(shape, dtype=dtype).to(torch_device)
b = torch.rand(shape, dtype=dtype).to(torch_device)
c = torch.rand(shape, dtype=dtype).to(torch_device)
a, b, c = process_tensors(c, c_stride, a, a_stride, b, b_stride, inplace)
ans = mul(a, b)
a_tensor, b_tensor = [to_tensor(tensor, lib) for tensor in [a, b]]
c_tensor = (
to_tensor(c, lib)
if inplace == Inplace.OUT_OF_PLACE
else (a_tensor if inplace == Inplace.INPLACE_A else b_tensor)
)
if sync is not None:
sync()
descriptor = infiniopMulDescriptor_t()
check_error(
lib.infiniopCreateMulDescriptor(
handle,
ctypes.byref(descriptor),
c_tensor.descriptor,
a_tensor.descriptor,
b_tensor.descriptor,
)
)
# Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
for tensor in [a_tensor, b_tensor, c_tensor]:
tensor.destroyDesc(lib)
workspace_size = c_uint64(0)
check_error(
lib.infiniopGetMulWorkspaceSize(descriptor, ctypes.byref(workspace_size))
)
workspace = create_workspace(workspace_size.value, c.device)
def lib_mul():
check_error(
lib.infiniopMul(
descriptor,
workspace.data_ptr() if workspace is not None else None,
workspace_size.value,
c_tensor.data,
a_tensor.data,
b_tensor.data,
None,
)
)
lib_mul()
atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
if DEBUG:
debug(c, ans, atol=atol, rtol=rtol)
assert torch.allclose(c, ans, atol=atol, rtol=rtol)
# Profiling workflow
if PROFILE:
# fmt: off
profile_operation("PyTorch", lambda: mul(a, b), torch_device, NUM_PRERUN, NUM_ITERATIONS)
profile_operation(" lib", lambda: lib_mul(), torch_device, NUM_PRERUN, NUM_ITERATIONS)
# fmt: on
check_error(lib.infiniopDestroyMulDescriptor(descriptor))
if __name__ == "__main__":
args = get_args()
lib = open_lib()
lib.infiniopCreateMulDescriptor.restype = c_int32
lib.infiniopCreateMulDescriptor.argtypes = [
infiniopHandle_t,
POINTER(infiniopMulDescriptor_t),
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
]
lib.infiniopGetMulWorkspaceSize.restype = c_int32
lib.infiniopGetMulWorkspaceSize.argtypes = [
infiniopMulDescriptor_t,
POINTER(c_uint64),
]
lib.infiniopMul.restype = c_int32
lib.infiniopMul.argtypes = [
infiniopMulDescriptor_t,
c_void_p,
c_uint64,
c_void_p,
c_void_p,
c_void_p,
c_void_p,
]
lib.infiniopDestroyMulDescriptor.restype = c_int32
lib.infiniopDestroyMulDescriptor.argtypes = [
infiniopMulDescriptor_t,
]
# Configure testing options
DEBUG = args.debug
PROFILE = args.profile
NUM_PRERUN = args.num_prerun
NUM_ITERATIONS = args.num_iterations
for device in get_test_devices(args):
test_operator(lib, device, test, _TEST_CASES, _TENSOR_DTYPES)
print("\033[92mTest passed!\033[0m")
......@@ -103,6 +103,7 @@ def test(
topk,
temperature,
dtype=torch.float16,
sync=None,
):
print(
f"Testing RandomSample on {torch_device} with voc:{voc} random_val:{random_val} topp:{topp} topk:{topk} temperature:{temperature} dtype:{dtype}"
......@@ -122,6 +123,9 @@ def test(
indices_tensor.descriptor.contents.dt = InfiniDtype.U64 # treat int64 as uint64
if sync is not None:
sync()
descriptor = infiniopRandomSampleDescriptor_t()
check_error(
lib.infiniopCreateRandomSampleDescriptor(
......
......@@ -17,19 +17,88 @@ from libinfiniop import (
profile_operation,
)
def row_major_strides(shape):
"""生成张量的行优先(C风格)stride
Args:
shape: 张量形状
Returns:
行优先strides列表
"""
# 行优先 (C风格,从最后一维到第一维)
stride = 1
strides = [1]
for dim in reversed(shape[1:]):
stride *= dim
strides.insert(0, stride)
return strides
def column_major_strides(shape):
"""生成张量的列优先(Fortran风格)stride
Args:
shape: 张量形状
Returns:
列优先strides列表
"""
# 列优先 (Fortran风格,从第一维到最后一维)
stride = 1
strides = [stride]
for dim in shape[:-1]:
stride *= dim
strides.append(stride)
return strides
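# Quick sanity check of the helpers above: for shape (2, 3, 4), row-major
# strides are [12, 4, 1] and column-major strides are [1, 2, 6].
assert row_major_strides((2, 3, 4)) == [12, 4, 1]
assert column_major_strides((2, 3, 4)) == [1, 2, 6]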
# ==============================================================================
# Configuration (Internal Use Only)
# ==============================================================================
# These are not meant to be imported from other modules
_TEST_CASES = [
# ((src_shape, src_stride), (dst_shape, dst_stride))
(((2, 4, 32), None), ((2, 4, 32), (256, 64, 1))),
(((32, 6, 64), (64, 2560, 1)), ((32, 6, 64), None)),
(((4, 6, 64), (64, 2560, 1)), ((4, 6, 64), (131072, 64, 1))),
(((1, 32, 64), (2048, 64, 1)), ((1, 32, 64), (2048, 64, 1))),
(((32, 1, 64), (64, 2560, 1)), ((32, 1, 64), (64, 64, 1))),
(((4, 1, 64), (64, 2560, 1)), ((4, 1, 64), (64, 11264, 1))),
(((64,), (1,)), ((64,), (1,))),
# (shape, x_stride, y_stride)
(
(2, 4, 64), # shape
(2, 4, 8), # x_stride
(512, 128, 2) # y_stride
),
(
(100, 100), # shape
(1, 100), # x_stride
(100, 1) # y_stride
),
(
(4, 4), # shape
(1, 4), # x_stride
(4, 1) # y_stride
),
(
(4, 6, 64), # shape
(64, 4*64, 1), # x_stride
(6*64, 64, 1) # y_stride
),
(
(2000, 2000), # shape
(1, 2000), # x_stride
(2000, 1) # y_stride
),
(
(2001, 2001), # shape
(1, 2001), # x_stride
(2001, 1) # y_stride
),
(
(3, 4, 7, 53, 9), # shape
row_major_strides((3, 4, 7, 53, 9)), # x_stride
column_major_strides((3, 4, 7, 53, 9)) # y_stride
),
(
(3, 4, 50, 50, 5, 7), # shape
row_major_strides((3, 4, 50, 50, 5, 7)), # x_stride
column_major_strides((3, 4, 50, 50, 5, 7)) # y_stride
),
]
# Data types used for testing
......@@ -58,24 +127,28 @@ def test(
lib,
handle,
torch_device,
x_shape,
shape,
x_stride,
y_shape,
y_stride,
dtype=torch.float16,
sync=None,
):
print(
f"Testing Rerrange on {torch_device} with x_shape:{x_shape} x_stride:{x_stride} y_shape:{y_shape} y_stride:{y_stride} dtype:{dtype}"
f"Testing Rerrange on {torch_device} with shape:{shape} x_stride:{x_stride} y_stride:{y_stride} dtype:{dtype}"
)
x = torch.rand(x_shape, dtype=dtype).to(torch_device)
y = torch.zeros(y_shape, dtype=dtype).to(torch_device)
x = torch.rand(shape, dtype=dtype).to(torch_device)
y = torch.zeros(shape, dtype=dtype).to(torch_device)
x, y = [
rearrange_if_needed(tensor, stride)
for tensor, stride in zip([x, y], [x_stride, y_stride])
]
x_tensor, y_tensor = [to_tensor(tensor, lib) for tensor in [x, y]]
if sync is not None:
sync()
descriptor = infiniopRearrangeDescriptor_t()
check_error(
......@@ -86,7 +159,7 @@ def test(
# Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
for tensor in [x_tensor, y_tensor]:
tensor.descriptor.contents.invalidate()
tensor.destroyDesc(lib)
def lib_rearrange():
check_error(
......
......@@ -55,6 +55,7 @@ def test(
tensor_shape,
tensor_dtype=torch.float16,
inplace=Inplace.OUT_OF_PLACE,
sync=None,
):
print(
f"Testing Relu on {torch_device} with tensor_shape:{tensor_shape} dtype:{tensor_dtype} inplace: {inplace.name}"
......@@ -78,8 +79,11 @@ def test(
x_tensor = to_tensor(x, lib)
y_tensor = to_tensor(y, lib) if inplace == Inplace.OUT_OF_PLACE else x_tensor
descriptor = infiniopReluDescriptor_t()
if sync is not None:
sync()
descriptor = infiniopReluDescriptor_t()
check_error(
lib.infiniopCreateReluDescriptor(
handle,
......
......@@ -72,6 +72,7 @@ def test(
x_stride,
w_dtype=torch.float16,
dtype=torch.float16,
sync=None,
):
print(
f"Testing RMS_Norm on {torch_device} with y_shape:{y_shape} x_shape:{x_shape} w_shape:{w_shape}"
......@@ -89,9 +90,11 @@ def test(
rearrange_if_needed(tensor, stride)
for tensor, stride in zip([x, y], [x_stride, y_stride])
]
x_tensor, y_tensor, w_tensor = [to_tensor(tensor, lib) for tensor in [x, y, w]]
if sync is not None:
sync()
descriptor = infiniopRMSNormDescriptor_t()
check_error(
......
import torch
import ctypes
from ctypes import POINTER, Structure, c_int32, c_size_t, c_uint64, c_void_p, c_float
from ctypes import POINTER, Structure, c_int32, c_uint64, c_void_p
from libinfiniop import (
InfiniDtype,
infiniopHandle_t,
infiniopTensorDescriptor_t,
open_lib,
......@@ -18,30 +17,49 @@ from libinfiniop import (
profile_operation,
synchronize_device,
)
from enum import Enum, auto
# ==============================================================================
# Configuration (Internal Use Only)
# ==============================================================================
# These are not meant to be imported from other modules
_TEST_CASES = [
# (t_shape, t_strides)
((1, 32, 128), None),
((1, 32, 64), None),
_TEST_CASES_ = [
# (shape, x_strides, y_strides)
((1, 32, 128), None, None),
((10, 32, 64), None, None),
# Ascend does not yet pass this case: a last dimension <= 32 fails, likely
# due to the internal implementation of its core GatherMask interface;
# last dimensions of 48, 64, and 128 are currently all supported
((4, 1, 32), None),
((1, 32, 128), None),
((3, 32, 128), (8000, 200, 1)),
((4, 1, 32), (64, 64, 1), None),
((11, 33, 128), None, (8000, 200, 1)),
((3, 32, 128), (8000, 200, 1), (7000, 128, 1)),
]
# Data types used for testing
_TENSOR_DTYPES = [torch.float16]
_TENSOR_DTYPES = [torch.float16, torch.float32]
# Tolerance map for different data types
_TOLERANCE_MAP = {
torch.float16: {"atol": 1e-4, "rtol": 1e-2},
torch.float16: {"atol": 1e-3, "rtol": 1e-2},
torch.float32: {"atol": 1e-4, "rtol": 1e-3},
}
class Inplace(Enum):
OUT_OF_PLACE = auto()
INPLACE_X = auto()
_INPLACE = [
Inplace.OUT_OF_PLACE,
Inplace.INPLACE_X,
]
_TEST_CASES = [
test_case + (inplace_item,)
for test_case in _TEST_CASES_
for inplace_item in _INPLACE
]
DEBUG = False
PROFILE = False
NUM_PRERUN = 10
......@@ -55,23 +73,21 @@ class RoPEDescriptor(Structure):
infiniopRoPEDescriptor_t = POINTER(RoPEDescriptor)
def reshape_for_broadcast(freqs_cis: torch.Tensor, x: torch.Tensor):
ndim = x.ndim
assert ndim > 1
assert freqs_cis.shape == (x.shape[0], x.shape[-1])
shape = [d if i == 0 or i == ndim - 1 else 1 for i, d in enumerate(x.shape)]
return freqs_cis.view(*shape)
def rotary_embedding(t, pos, theta, torch_device):
def rotary_embedding(t, sin, cos, torch_device):
dh = t.shape[2]
dt = t.dtype
assert dh % 2 == 0, "Embedding dimension must be even."
t_even = t[..., 0::2] # [seq_len, n_head, dh // 2]
t_odd = t[..., 1::2] # [seq_len, n_head, dh // 2]
freqs = (1.0 / (theta ** (torch.arange(0, dh, 2).float() / dh))).to(torch_device)
freqs = torch.outer(pos, freqs) # [seq_len, dh // 2]
cos = torch.cos(freqs).unsqueeze(1) # [seq_len, 1, dh // 2]
sin = torch.sin(freqs).unsqueeze(1) # [seq_len, 1, dh // 2]
cos = cos.unsqueeze(1) # [seq_len, 1, dh // 2]
sin = sin.unsqueeze(1) # [seq_len, 1, dh // 2]
if torch_device == "cpu":
(t_even, t_odd, cos, sin) = (
t_even.float(),
t_odd.float(),
cos.float(),
sin.float(),
)
t_out_even = t_even * cos - t_odd * sin
t_out_odd = t_even * sin + t_odd * cos
......@@ -80,60 +96,67 @@ def rotary_embedding(t, pos, theta, torch_device):
t_out[..., 0::2] = t_out_even
t_out[..., 1::2] = t_out_odd
return t_out
return t_out.to(dt).to(torch_device)
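# Sanity sketch (illustrative names): the even/odd pairing above is the complex
# rotation (t_even + i*t_odd) * (cos + i*sin); a tiny CPU check of that equivalence:
_t = torch.rand(2, 1, 4)
_angles = torch.outer(torch.arange(2.0), torch.ones(2))
_ref = rotary_embedding(_t, torch.sin(_angles), torch.cos(_angles), "cpu")
_rot = torch.view_as_complex(torch.stack((_t[..., 0::2], _t[..., 1::2]), -1))
_rot = _rot * torch.polar(torch.ones_like(_angles), _angles).unsqueeze(1)
assert torch.allclose(_ref[..., 0::2], torch.view_as_real(_rot)[..., 0], atol=1e-5)
assert torch.allclose(_ref[..., 1::2], torch.view_as_real(_rot)[..., 1], atol=1e-5)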
def sin_cos_table(max_seq_len, dim, torch_device, theta):
pos = torch.arange(
0, max_seq_len, dtype=torch.float32, device=torch.device(torch_device)
)
def sin_cos_table(pos, dim, torch_device, theta, dtype):
assert dim % 2 == 0, "Embedding dimension must be even."
freqs = (1.0 / (theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim))).to(
torch_device
)
# (a0, a1, a2) -> (a0, a0, a1, a1, a2, a2)
freqs = torch.repeat_interleave(freqs, repeats=2)
angles = torch.outer(pos, freqs)
return torch.sin(angles), torch.cos(angles)
def test(lib, handle, torch_device, shape, strides=None, dtype=torch.float16):
return torch.sin(angles).to(dtype), torch.cos(angles).to(dtype)
def test(
lib,
handle,
torch_device,
shape,
x_strides=None,
y_strides=None,
inplace=Inplace.OUT_OF_PLACE,
dtype=torch.float32,
sync=None,
):
if inplace == Inplace.INPLACE_X:
y_strides = x_strides
print(
f"Testing Rotary Positional Embedding on {torch_device} with shape:{shape} strides:{strides} and dtype:{dtype}"
f"Testing Rotary Positional Embedding on {torch_device} with shape:{shape} x_strides:{x_strides} y_strides:{y_strides} and dtype:{dtype} inplace:{inplace}"
)
t = torch.rand(shape, dtype=dtype)
x = torch.rand(shape, dtype=dtype).to(torch_device)
x = rearrange_if_needed(x, x_strides)
if inplace == Inplace.INPLACE_X:
y = x
else:
y = torch.rand(shape, dtype=dtype).to(torch_device)
y = rearrange_if_needed(y, y_strides)
theta = 1e5
pos = torch.arange(0, x.shape[0], dtype=torch.int32).to(torch_device)
sin_table, cos_table = sin_cos_table(pos, x.shape[2], x.device, theta, dtype)
t = rearrange_if_needed(t, strides)
posTmp = torch.arange(0, t.shape[0]).to(torch_device)
pos = torch.zeros(2 * posTmp.shape[0], dtype=torch.int32)
for i in range(posTmp.shape[0]):
pos[2 * i] = posTmp[i]
pos[2 * i + 1] = 0
pos = pos.to(torch_device)
theta = 1e4
ans = rotary_embedding(t, posTmp, theta, torch_device)
ans = rotary_embedding(x, sin_table, cos_table, torch_device)
descriptor = infiniopRoPEDescriptor_t()
# 2x table length for test
sin_table, cos_table = sin_cos_table(t.shape[0] * 2, t.shape[2], t.device, theta)
t_tensor, sin_table_tensor, cos_table_tensor = [
to_tensor(tensor, lib) for tensor in [t, sin_table, cos_table]
x_tensor, pos_tensor, sin_table_tensor, cos_table_tensor = [
to_tensor(tensor, lib, force_unsigned=True)
for tensor in [x, pos, sin_table, cos_table]
]
if inplace == Inplace.INPLACE_X:
y_tensor = x_tensor
else:
y_tensor = to_tensor(y, lib)
pos_tensor = to_tensor(pos[: t.shape[0]], lib)
pos_tensor.descriptor.contents.dtype = InfiniDtype.U64
if torch_device == "npu":
synchronize_device(torch_device)
if sync is not None:
sync()
check_error(
lib.infiniopCreateRoPEDescriptor(
handle,
ctypes.byref(descriptor),
t_tensor.descriptor,
y_tensor.descriptor,
x_tensor.descriptor,
pos_tensor.descriptor,
sin_table_tensor.descriptor,
cos_table_tensor.descriptor,
......@@ -141,14 +164,14 @@ def test(lib, handle, torch_device, shape, strides=None, dtype=torch.float16):
)
# Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
for tensor in [t_tensor, pos_tensor, sin_table_tensor, cos_table_tensor]:
tensor.descriptor.contents.invalidate()
for tensor in [y_tensor, x_tensor, pos_tensor, sin_table_tensor, cos_table_tensor]:
tensor.destroyDesc(lib)
workspace_size = c_uint64(0)
check_error(
lib.infiniopGetRoPEWorkspaceSize(descriptor, ctypes.byref(workspace_size))
)
workspace = create_workspace(workspace_size.value, t.device)
workspace = create_workspace(workspace_size.value, x.device)
def lib_rope():
check_error(
......@@ -156,7 +179,8 @@ def test(lib, handle, torch_device, shape, strides=None, dtype=torch.float16):
descriptor,
workspace.data_ptr() if workspace is not None else None,
workspace_size.value,
t_tensor.data,
y_tensor.data,
x_tensor.data,
pos_tensor.data,
sin_table_tensor.data,
cos_table_tensor.data,
......@@ -165,16 +189,19 @@ def test(lib, handle, torch_device, shape, strides=None, dtype=torch.float16):
)
lib_rope()
if sync is not None:
sync()
atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
if DEBUG:
debug(t, ans, atol=atol, rtol=rtol)
assert torch.allclose(t, ans, atol=atol, rtol=rtol)
debug(y, ans, atol=atol, rtol=rtol)
assert torch.allclose(y, ans, atol=atol, rtol=rtol)
if PROFILE:
profile_operation(
"PyTorch",
lambda: rotary_embedding(t, posTmp, theta, torch_device),
lambda: rotary_embedding(x, sin_table, cos_table, torch_device),
torch_device,
NUM_PRERUN,
NUM_ITERATIONS,
......@@ -232,5 +259,5 @@ if __name__ == "__main__":
# Execute tests
for device in get_test_devices(args):
test_operator(lib, device, test, _TEST_CASES, _TENSOR_DTYPES)
print("\033[92mTest passed!\033[0m")
import torch
import ctypes
from ctypes import POINTER, Structure, c_int32, c_void_p
from ctypes import POINTER, Structure, c_int32, c_void_p, c_uint64
from libinfiniop import (
infiniopHandle_t,
infiniopTensorDescriptor_t,
......@@ -14,6 +14,7 @@ from libinfiniop import (
debug,
get_tolerance,
profile_operation,
create_workspace,
)
from enum import Enum, auto
......@@ -25,8 +26,10 @@ _TEST_CASES_ = [
# shape, a_stride, b_stride, c_stride
((13, 4), None, None, None),
((13, 4), (10, 1), (10, 1), (10, 1)),
((13, 4), (0, 1), None, None),
((13, 4, 4), None, None, None),
((13, 4, 4), (20, 4, 1), (20, 4, 1), (20, 4, 1)),
((13, 4, 4), (4, 0, 1), (0, 4, 1), None),
((16, 5632), None, None, None),
((16, 5632), (13312, 1), (13312, 1), (13312, 1)),
((4, 4, 5632), None, None, None),
......@@ -58,7 +61,8 @@ _TENSOR_DTYPES = [torch.float16, torch.float32]
# Tolerance map for different data types
_TOLERANCE_MAP = {
torch.float16: {"atol": 1e-4, "rtol": 1e-2},
torch.float16: {"atol": 1e-3, "rtol": 1e-3},
torch.float32: {"atol": 2e-7, "rtol": 1e-7},
}
DEBUG = False
......@@ -76,6 +80,38 @@ infiniopSwiGLUDescriptor_t = POINTER(SwiGLUDescriptor)
def swiglu(a, b):
return a * b / (1 + torch.exp(-b.float()).to(b.dtype))
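# Sanity sketch: since b / (1 + exp(-b)) == b * sigmoid(b) == silu(b), the
# reference above is equivalent to a * F.silu(b) for float32 inputs:
_sa, _sb = torch.rand(8), torch.rand(8)
assert torch.allclose(swiglu(_sa, _sb), _sa * torch.nn.functional.silu(_sb), atol=1e-6)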
def process_tensors(c, c_strides, a, a_stride, b, b_stride, inplace):
"""
Rearrange the tensors if needed and apply the in-place config.
If in-place is requested but the output (i.e., c) would alias a
broadcasted input, c is reset to its original unbroadcasted strides.
"""
original_c_strides = c_strides if c_strides else c.stride()
def _rearrange(tensor, strides):
if strides and 0 in strides:
tensor.set_(tensor.untyped_storage(), 0, tensor.shape, strides)
return tensor
else:
return rearrange_if_needed(tensor, strides)
a, b, c = [
_rearrange(tensor, stride)
for tensor, stride in zip([a, b, c], [a_stride, b_stride, c_strides])
]
c = (
c
if inplace == Inplace.OUT_OF_PLACE
else (a if inplace == Inplace.INPLACE_A else b)
)
# if the in-place output ended up with broadcasted (zero) strides, reset them to the original unbroadcasted strides
if 0 in c.stride():
c.set_(c.untyped_storage(), 0, c.shape, original_c_strides)
return a, b, c
def test(
......@@ -98,18 +134,10 @@ def test(
a = torch.rand(shape, dtype=dtype).to(torch_device)
b = torch.rand(shape, dtype=dtype).to(torch_device)
c = torch.rand(shape, dtype=dtype).to(torch_device)
a, b, c = process_tensors(c, c_stride, a, a_stride, b, b_stride, inplace)
ans = swiglu(a, b)
a, b, c = [
rearrange_if_needed(tensor, stride)
for tensor, stride in zip([a, b, c], [a_stride, b_stride, c_stride])
]
c = (
c
if inplace == Inplace.OUT_OF_PLACE
else (a if inplace == Inplace.INPLACE_A else b)
)
a_tensor, b_tensor = [to_tensor(tensor, lib) for tensor in [a, b]]
c_tensor = (
to_tensor(c, lib)
......@@ -134,10 +162,19 @@ def test(
for tensor in [a_tensor, b_tensor, c_tensor]:
tensor.destroyDesc(lib)
workspace_size = c_uint64(0)
check_error(
lib.infiniopGetSwiGLUWorkspaceSize(descriptor, ctypes.byref(workspace_size))
)
workspace = create_workspace(workspace_size.value, c.device)
def lib_swiglu():
check_error(
lib.infiniopSwiGLU(
descriptor, c_tensor.data, a_tensor.data, b_tensor.data, None
descriptor,
workspace.data_ptr() if workspace is not None else None,
workspace_size.value,
c_tensor.data, a_tensor.data, b_tensor.data, None
)
)
......@@ -170,10 +207,18 @@ if __name__ == "__main__":
infiniopTensorDescriptor_t,
]
lib.infiniopGetSwiGLUWorkspaceSize.restype = c_int32
lib.infiniopGetSwiGLUWorkspaceSize.argtypes = [
infiniopSwiGLUDescriptor_t,
POINTER(c_uint64),
]
lib.infiniopSwiGLU.restype = c_int32
lib.infiniopSwiGLU.argtypes = [
infiniopSwiGLUDescriptor_t,
c_void_p,
c_uint64,
c_void_p,
c_void_p,
c_void_p,
c_void_p,
......
......@@ -4,9 +4,10 @@ local GREEN = '\27[0;32m'
local YELLOW = '\27[1;33m'
local NC = '\27[0m' -- No Color
add_includedirs("include")
set_encodings("utf-8")
add_includedirs("include")
if is_mode("debug") then
add_defines("DEBUG_MODE")
end
......@@ -117,6 +118,18 @@ if has_config("kunlun-xpu") then
includes("xmake/kunlun.lua")
end
-- InfiniCCL
option("ccl")
set_default(false)
set_showmenu(true)
set_description("Whether to compile implementations for InfiniCCL")
option_end()
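-- The option is toggled at configure time, e.g. `xmake f --ccl=y`.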
if has_config("ccl") then
add_defines("ENABLE_CCL")
end
target("infini-utils")
set_kind("static")
on_install(function (target) end)
......@@ -149,6 +162,9 @@ target("infinirt")
if has_config("nv-gpu") then
add_deps("infinirt-cuda")
end
if has_config("cambricon-mlu") then
add_deps("infinirt-cambricon")
end
if has_config("ascend-npu") then
add_deps("infinirt-ascend")
end
......@@ -219,10 +235,25 @@ target("infiniop")
add_installfiles("include/infinicore.h", {prefixdir = "include"})
target_end()
target("infiniccl")
set_kind("shared")
add_deps("infinirt")
if has_config("nv-gpu") then
add_deps("infiniccl-cuda")
end
set_languages("cxx17")
add_files("src/infiniccl/*.cc")
add_installfiles("include/infiniccl.h", {prefixdir = "include"})
set_installdir(os.getenv("INFINI_ROOT") or (os.getenv(is_host("windows") and "HOMEPATH" or "HOME") .. "/.infini"))
target_end()
target("all")
set_kind("phony")
add_deps("infiniop", "infinirt")
add_deps("infiniop", "infinirt", "infiniccl")
after_build(function (target) print(YELLOW .. "[Congratulations!] Now you can install the libraries with \"xmake install\"" .. NC) end)
target_end()
......
......@@ -50,9 +50,8 @@ target("infiniop-ascend")
add_files("$(projectdir)/src/infiniop/devices/ascend/*.cc", "$(projectdir)/src/infiniop/ops/*/ascend/*.cc")
-- Add operator
-- TODO: add it back after ascend-kernels is fixed
-- add_rules("ascend-kernels")
-- add_links(builddir.."/libascend_kernels.a")
add_rules("ascend-kernels")
add_links(builddir.."/libascend_kernels.a")
target_end()
target("infinirt-ascend")
......
......@@ -50,3 +50,13 @@ target("infiniop-cambricon")
add_files(mlu_files, {rule = "mlu"})
end
target_end()
target("infinirt-cambricon")
set_kind("static")
add_deps("infini-utils")
set_languages("cxx17")
on_install(function (target) end)
-- Add include dirs
add_files("../src/infinirt/bang/*.cc")
add_cxflags("-lstdc++ -Wall -Werror -fPIC")
target_end()
......@@ -28,6 +28,7 @@ target("infiniop-cuda")
else
add_cuflags("-Xcompiler=-Wall", "-Xcompiler=-Werror")
add_cuflags("-Xcompiler=-fPIC")
add_cuflags("--extended-lambda")
add_culdflags("-Xcompiler=-fPIC")
add_cxxflags("-fPIC")
end
......@@ -57,3 +58,34 @@ target("infinirt-cuda")
set_languages("cxx17")
add_files("../src/infinirt/cuda/*.cu")
target_end()
target("infiniccl-cuda")
set_kind("static")
add_deps("infinirt")
on_install(function (target) end)
if has_config("ccl") then
set_policy("build.cuda.devlink", true)
set_toolchains("cuda")
add_links("cudart")
if not is_plat("windows") then
add_cuflags("-Xcompiler=-fPIC")
add_culdflags("-Xcompiler=-fPIC")
add_cxflags("-fPIC")
local nccl_root = os.getenv("NCCL_ROOT")
if nccl_root then
add_includedirs(nccl_root .. "/include")
add_links(nccl_root .. "/lib/libnccl.so")
else
add_links("nccl") -- Fall back to default nccl linking
end
add_files("../src/infiniccl/cuda/*.cu")
else
print("[Warning] NCCL is not supported on Windows")
end
end
set_languages("cxx17")
target_end()
add_defines("ENABLE_KUNLUN_API")
local KUNLUN_HOME = os.getenv("KUNLUN_HOME")
local XTDK_DIR = path.join(KUNLUN_HOME, "XTDK")
-- Add include dirs
add_includedirs(path.join(KUNLUN_HOME, "include"), {public=true})
......@@ -7,6 +8,55 @@ add_linkdirs(path.join(KUNLUN_HOME, "lib64"))
add_links("xpurt")
add_links("xpuapi")
rule("xpu")
set_extensions(".xpu")
on_load(function (target)
target:add("includedirs", path.join(os.projectdir(), "include"))
end)
on_build_file(function (target, sourcefile)
local objectfile = target:objectfile(sourcefile)
local basename = objectfile:gsub("%.o$", "")
os.mkdir(path.directory(objectfile))
local cc = path.join(XTDK_DIR, "bin/clang++")
local includedirs = table.concat(target:get("includedirs"), " ")
local arch_map = {
["x86_64"] = "x86_64-linux-gnu",
["arm64"] = "aarch64-linux-gnu"
}
local args = {
"--sysroot=/",
"--target=" .. arch_map[os.arch()],
"-fPIC",
"-pie",
"--xpu-arch=xpu2",
"--basename", basename,
"-std=c++11",
"-O2",
"-fno-builtin",
"-g",
"-c", sourcefile,
"-v"
}
for _, includedir in ipairs(target:get("includedirs")) do
table.insert(args, "-I" .. includedir)
end
-- print(args)
os.execv(cc, args)
table.insert(target:objectfiles(), objectfile)
table.insert(target:objectfiles(), basename .. ".device.bin.o")
print(target:objectfiles())
end)
rule_end()
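-- .xpu sources opt into this rule via add_files(..., {rule = "xpu"}), as done
-- for the kunlun operator kernels below.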
local src_dir = path.join(os.projectdir(), "src", "infiniop")
target("infiniop-kunlun")
set_kind("static")
add_deps("infini-utils")
......@@ -17,6 +67,11 @@ target("infiniop-kunlun")
set_languages("cxx17")
add_files("$(projectdir)/src/infiniop/devices/kunlun/*.cc", "$(projectdir)/src/infiniop/ops/*/kunlun/*.cc")
-- compile handwriting kernel
local xpu_files = os.files(src_dir .. "/ops/*/kunlun/*.xpu")
if #xpu_files > 0 then
add_files(xpu_files, {rule = "xpu"})
end
target_end()
target("infinirt-kunlun")
......
local MACA_ROOT = os.getenv("MACA_PATH") or os.getenv("MACA_HOME") or os.getenv("MACA_ROOT")
add_includedirs(MACA_ROOT .. "/include")
add_linkdirs(MACA_ROOT .. "/lib")
add_links("libhcdnn.so")
add_links("libhcblas.so")
add_links("libhcruntime.so")
add_links("hcdnn", "hcblas", "hcruntime")
rule("maca")
set_extensions(".maca")
......@@ -34,13 +31,11 @@ rule_end()
target("infiniop-metax")
set_kind("static")
on_install(function (target) end)
add_cxflags("-lstdc++ -Wall -fPIC")
set_languages("cxx17")
set_warnings("all")
set_warnings("all", "error")
add_cxflags("-lstdc++", "-fPIC", "-Wno-defaulted-function-deleted", "-Wno-strict-aliasing")
add_files("../src/infiniop/devices/maca/*.cc", "../src/infiniop/ops/*/maca/*.cc")
add_files("../src/infiniop/ops/*/maca/*.maca", {rule = "maca"})
target_end()
target("infinirt-metax")
......@@ -48,7 +43,7 @@ target("infinirt-metax")
set_languages("cxx17")
on_install(function (target) end)
add_deps("infini-utils")
-- Add files
add_files("$(projectdir)/src/infinirt/maca/*.cc")
add_cxflags("-lstdc++ -Wall -Werror -fPIC")
set_warnings("all", "error")
add_cxflags("-lstdc++ -fPIC")
add_files("../src/infinirt/maca/*.cc")
target_end()
......@@ -34,3 +34,20 @@ target("infiniop-test")
set_installdir(INFINI_ROOT)
target_end()
target("infiniccl-test")
set_kind("binary")
add_deps("infini-utils")
set_default(false)
set_warnings("all", "error")
set_languages("cxx17")
local INFINI_ROOT = os.getenv("INFINI_ROOT") or (os.getenv(is_host("windows") and "HOMEPATH" or "HOME") .. "/.infini")
add_includedirs(INFINI_ROOT.."/include")
add_linkdirs(INFINI_ROOT.."/lib")
add_links("infinirt", "infiniccl")
add_files(os.projectdir().."/src/infiniccl-test/*.cpp")
set_installdir(os.getenv("INFINI_ROOT") or (os.getenv(is_host("windows") and "HOMEPATH" or "HOME") .. "/.infini"))
target_end()