Unverified Commit 4e4d3415 authored by Catheriany's avatar Catheriany Committed by GitHub
Browse files

Merge branch 'main' into issue/150

parents d1c46889 1a4cfb99
...@@ -44,46 +44,46 @@ __C infiniStatus_t infinirtGetDevice(infiniDevice_t *device_ptr, int *device_id_ ...@@ -44,46 +44,46 @@ __C infiniStatus_t infinirtGetDevice(infiniDevice_t *device_ptr, int *device_id_
return INFINI_STATUS_SUCCESS; return INFINI_STATUS_SUCCESS;
} }
#define INFINIRT_CALL_DEVICE_API_AND(API, PARAMS, ACTION) \ #define INFINIRT_CALL_DEVICE_API_AND(DEVICE_TYPE, API, PARAMS, ACTION) \
{ \ { \
infiniStatus_t _status; \ infiniStatus_t _status; \
switch (CURRENT_DEVICE_TYPE) { \ switch (DEVICE_TYPE) { \
case INFINI_DEVICE_CPU: \ case INFINI_DEVICE_CPU: \
_status = infinirt::cpu::API PARAMS; \ _status = infinirt::cpu::API PARAMS; \
break; \ break; \
case INFINI_DEVICE_NVIDIA: \ case INFINI_DEVICE_NVIDIA: \
_status = infinirt::cuda::API PARAMS; \ _status = infinirt::cuda::API PARAMS; \
break; \ break; \
case INFINI_DEVICE_CAMBRICON: \ case INFINI_DEVICE_CAMBRICON: \
_status = infinirt::bang::API PARAMS; \ _status = infinirt::bang::API PARAMS; \
break; \ break; \
case INFINI_DEVICE_ASCEND: \ case INFINI_DEVICE_ASCEND: \
_status = infinirt::ascend::API PARAMS; \ _status = infinirt::ascend::API PARAMS; \
break; \ break; \
case INFINI_DEVICE_METAX: \ case INFINI_DEVICE_METAX: \
_status = infinirt::maca::API PARAMS; \ _status = infinirt::maca::API PARAMS; \
break; \ break; \
case INFINI_DEVICE_MOORE: \ case INFINI_DEVICE_MOORE: \
_status = infinirt::musa::API PARAMS; \ _status = infinirt::musa::API PARAMS; \
break; \ break; \
default: \ default: \
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; \ return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; \
} \ } \
{ ACTION; } \ { ACTION; } \
return _status; \ return _status; \
} }
#define INFINIRT_CALL_DEVICE_API(API, PARAMS) INFINIRT_CALL_DEVICE_API_AND(API, PARAMS, ) #define INFINIRT_CALL_DEVICE_API(API, PARAMS) INFINIRT_CALL_DEVICE_API_AND(CURRENT_DEVICE_TYPE, API, PARAMS, )
__C infiniStatus_t infinirtGetDeviceCount(infiniDevice_t device, int *count) { __C infiniStatus_t infinirtGetDeviceCount(infiniDevice_t device, int *count) {
if (count == nullptr) { if (count == nullptr) {
return INFINI_STATUS_NULL_POINTER; return INFINI_STATUS_NULL_POINTER;
} }
INFINIRT_CALL_DEVICE_API(getDeviceCount, (count)); INFINIRT_CALL_DEVICE_API_AND(device, getDeviceCount, (count), {});
} }
__C infiniStatus_t infinirtSetDevice(infiniDevice_t device, int device_id) { __C infiniStatus_t infinirtSetDevice(infiniDevice_t device, int device_id) {
INFINIRT_CALL_DEVICE_API_AND(setDevice, (device_id), INFINIRT_CALL_DEVICE_API_AND(device, setDevice, (device_id),
{ CURRENT_DEVICE_TYPE = device; { CURRENT_DEVICE_TYPE = device;
CURRENT_DEVICE_ID = device_id; }); CURRENT_DEVICE_ID = device_id; });
} }
......
...@@ -138,4 +138,73 @@ void rearrange( ...@@ -138,4 +138,73 @@ void rearrange(
} }
} }
// Split the elementary copy unit of this layout into a smaller one so the
// rearrange kernel gains an extra innermost dimension to parallelize over.
//
// `candidates` is scanned in order and the first value that evenly divides
// the current unit is chosen. Returns INFINI_STATUS_BAD_PARAM when no
// candidate divides the unit, and a copy of this meta when the chosen unit
// equals the current one.
utils::Result<RearrangeMeta> RearrangeMeta::distributeUnit(const std::vector<size_t> &candidates) const {
    // Current unit size (first entry of the packed meta array).
    size_t current_unit = _meta[0];
    // Pick the first candidate that divides the current unit evenly.
    size_t new_unit = 0;
    for (size_t candidate : candidates) {
        if (current_unit % candidate == 0) {
            new_unit = candidate;
            break;
        }
    }
    // No suitable divisor found: the unit cannot be distributed.
    if (new_unit == 0) {
        return INFINI_STATUS_BAD_PARAM;
    }
    // Unit unchanged: hand back a copy of the existing meta.
    if (new_unit == current_unit) {
        return Result<RearrangeMeta>(_meta);
    }
    size_t ndim_value = this->ndim();
    // New packed layout, one extra dimension appended everywhere:
    // [unit][ndim+2 idx strides][ndim+1 dst strides][ndim+1 src strides].
    std::vector<ptrdiff_t> layout(2 + (ndim_value + 1) * 3, 0);
    // Install the smaller unit.
    layout[0] = new_unit;
    // Each old unit now spans `extra` new units.
    ptrdiff_t extra = current_unit / new_unit;
    // Offset of the idx-stride section inside _meta.
    ptrdiff_t idx_offset = 1;
    // Section pointers into the new packed layout.
    ptrdiff_t *new_idx = layout.data() + 1;
    ptrdiff_t *new_dst = layout.data() + 2 + (ndim_value + 1);
    ptrdiff_t *new_src = layout.data() + 2 + (ndim_value + 1) * 2;
    // Old idx strides are scaled by `extra`, since indices now count the
    // smaller unit.
    for (size_t i = 0; i < ndim_value + 1; ++i) {
        new_idx[i] = _meta[idx_offset + i] * extra;
    }
    // The appended innermost dimension advances one new unit per step.
    new_idx[ndim_value + 1] = 1;
    // dst strides are copied verbatim; the appended dimension's stride is
    // the new unit size (strides presumably in bytes — TODO confirm).
    for (size_t i = 0; i < ndim_value; ++i) {
        new_dst[i] = dst_strides()[i];
    }
    new_dst[ndim_value] = new_unit;
    // Same treatment for the src strides.
    for (size_t i = 0; i < ndim_value; ++i) {
        new_src[i] = src_strides()[i];
    }
    new_src[ndim_value] = new_unit;
    return Result<RearrangeMeta>(layout);
}
} // namespace utils } // namespace utils
...@@ -28,6 +28,9 @@ public: ...@@ -28,6 +28,9 @@ public:
const ptrdiff_t *src_strides() const; const ptrdiff_t *src_strides() const;
void launch(void *dst, const void *src) const; void launch(void *dst, const void *src) const;
// Split the unit into smaller pieces to improve parallelism
utils::Result<RearrangeMeta> distributeUnit(const std::vector<size_t> &candidates) const;
}; };
void rearrange( void rearrange(
......
import numpy as np
import gguf
from typing import List
from .. import InfiniopTestWriter, InfiniopTestCase, np_dtype_to_ggml, gguf_strides
def mul(
    a: np.ndarray,
    b: np.ndarray
):
    """Reference element-wise product used as the expected answer."""
    return a * b
def random_tensor(shape, dtype):
    """Uniform random tensor with tiny values centered at zero.

    Values lie in [-5e-4, 5e-4], which keeps element-wise products well
    inside the representable range of low-precision float types.
    """
    scale = 1e-3
    half = scale * 0.5
    return scale * np.random.rand(*shape).astype(dtype) - half
class MulTestCase(InfiniopTestCase):
    """One mul test case to be serialized into the gguf test file.

    Holds the two inputs, the (pre-filled) output buffer, and their
    optional strides; `write_test` emits the tensors plus reference
    answers in the working dtype and in float64.
    """

    def __init__(
        self,
        a: np.ndarray,
        stride_a: List[int] | None,
        b: np.ndarray,
        stride_b: List[int] | None,
        c: np.ndarray,
        stride_c: List[int] | None,
    ):
        super().__init__("mul")
        self.a = a
        self.stride_a = stride_a
        self.b = b
        self.stride_b = stride_b
        self.c = c
        self.stride_c = stride_c

    def write_test(self, test_writer: "InfiniopTestWriter"):
        super().write_test(test_writer)
        # Strides are optional metadata: omit the key when unset so the
        # reader presumably falls back to a contiguous layout — TODO confirm.
        if self.stride_a is not None:
            test_writer.add_array(test_writer.gguf_key("a.strides"), self.stride_a)
        if self.stride_b is not None:
            test_writer.add_array(test_writer.gguf_key("b.strides"), self.stride_b)
        if self.stride_c is not None:
            test_writer.add_array(test_writer.gguf_key("c.strides"), self.stride_c)
        test_writer.add_tensor(
            test_writer.gguf_key("a"), self.a, raw_dtype=np_dtype_to_ggml(self.a.dtype)
        )
        test_writer.add_tensor(
            test_writer.gguf_key("b"), self.b, raw_dtype=np_dtype_to_ggml(self.b.dtype)
        )
        test_writer.add_tensor(
            test_writer.gguf_key("c"), self.c, raw_dtype=np_dtype_to_ggml(self.c.dtype)
        )
        # Two reference answers: one computed in float64 and one in the
        # working dtype, so the runner can pick a tolerance-appropriate
        # baseline.
        a_fp64 = self.a.astype(np.float64)
        b_fp64 = self.b.astype(np.float64)
        ans_fp64 = np.multiply(a_fp64, b_fp64)
        ans = mul(self.a, self.b)
        test_writer.add_tensor(
            test_writer.gguf_key("ans"), ans, raw_dtype=np_dtype_to_ggml(ans.dtype)
        )
        test_writer.add_tensor(
            test_writer.gguf_key("ans_fp64"),
            ans_fp64,
            raw_dtype=np_dtype_to_ggml(ans_fp64.dtype),
        )
if __name__ == '__main__':
    test_writer = InfiniopTestWriter("mul.gguf")
    # Cases cover contiguous, transposed and default (None) stride layouts
    # in f16/f32/f64 at several sizes, including mismatched a/b/c strides.
    test_cases = [
        MulTestCase(
            random_tensor((2, 3), np.float32),
            gguf_strides(3, 1),
            random_tensor((2, 3), np.float32),
            gguf_strides(1, 2),
            random_tensor((2, 3), np.float32),
            gguf_strides(3, 1),
        ),
        MulTestCase(
            random_tensor((2, 3), np.float16),
            gguf_strides(1, 2),
            random_tensor((2, 3), np.float16),
            gguf_strides(3, 1),
            random_tensor((2, 3), np.float16),
            gguf_strides(1, 2),
        ),
        MulTestCase(
            random_tensor((2, 3), np.float64),
            gguf_strides(3, 1),
            random_tensor((2, 3), np.float64),
            gguf_strides(3, 1),
            random_tensor((2, 3), np.float64),
            gguf_strides(1, 2),
        ),
        MulTestCase(
            random_tensor((4, 6), np.float16),
            gguf_strides(1, 4),
            random_tensor((4, 6), np.float16),
            gguf_strides(1, 5),
            random_tensor((4, 6), np.float16),
            gguf_strides(6, 1),
        ),
        MulTestCase(
            random_tensor((1, 2048), np.float16),
            gguf_strides(1, 1),
            random_tensor((1, 2048), np.float16),
            gguf_strides(2048, 1),
            random_tensor((1, 2048), np.float16),
            gguf_strides(1, 1),
        ),
        MulTestCase(
            random_tensor((2048, 2048), np.float32),
            None,
            random_tensor((2048, 2048), np.float32),
            gguf_strides(1, 2048),
            random_tensor((2048, 2048), np.float32),
            None,
        ),
        MulTestCase(
            random_tensor((2, 4, 2048), np.float16),
            gguf_strides(4 * 2048, 2048, 1),
            random_tensor((2, 4, 2048), np.float16),
            gguf_strides(1, 2, 2 * 4),
            random_tensor((2, 4, 2048), np.float16),
            gguf_strides(4 * 2048, 2048, 1),
        ),
        MulTestCase(
            random_tensor((2, 4, 2048), np.float32),
            gguf_strides(1, 2, 2 * 4),
            random_tensor((2, 4, 2048), np.float32),
            None,
            random_tensor((2, 4, 2048), np.float32),
            gguf_strides(1, 2, 2 * 4),
        ),
        MulTestCase(
            random_tensor((2048, 2560), np.float32),
            gguf_strides(2560, 1),
            random_tensor((2048, 2560), np.float32),
            gguf_strides(1, 2048),
            random_tensor((2048, 2560), np.float32),
            gguf_strides(2560, 1),
        ),
        MulTestCase(
            random_tensor((4, 48, 64), np.float16),
            gguf_strides(64 * 48, 64, 1),
            random_tensor((4, 48, 64), np.float16),
            gguf_strides(1, 4, 4 * 48),
            random_tensor((4, 48, 64), np.float16),
            None
        ),
        MulTestCase(
            random_tensor((4, 48, 64), np.float32),
            None,
            random_tensor((4, 48, 64), np.float32),
            gguf_strides(1, 4, 4 * 48),
            random_tensor((4, 48, 64), np.float32),
            gguf_strides(48 * 64, 64, 1),
        )
    ]
    test_writer.add_tests(test_cases)
    test_writer.save()
from ctypes import POINTER, Structure, c_int32, c_void_p import torch
import ctypes import ctypes
import sys from ctypes import POINTER, Structure, c_int32, c_void_p, c_uint64
import os from libinfiniop import (
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")))
from operatorspy import (
open_lib,
to_tensor,
DeviceEnum,
infiniopHandle_t, infiniopHandle_t,
infiniopTensorDescriptor_t, infiniopTensorDescriptor_t,
create_handle, open_lib,
destroy_handle, to_tensor,
get_test_devices,
check_error, check_error,
rearrange_if_needed,
test_operator,
get_args,
debug,
get_tolerance,
profile_operation,
create_workspace,
) )
from operatorspy.tests.test_utils import get_args
from enum import Enum, auto from enum import Enum, auto
import torch
# ==============================================================================
# Configuration (Internal Use Only)
# ==============================================================================
# These are not meant to be imported from other modules
_TEST_CASES_ = [
# shape, a_stride, b_stride, c_stride
((13, 4), None, None, None),
((13, 4), (10, 1), (10, 1), (10, 1)),
((13, 4), (0, 1), None, None),
((13, 4, 4), None, None, None),
((13, 4, 4), (20, 4, 1), (20, 4, 1), (20, 4, 1)),
((13, 4, 4), (4, 0, 1), (0, 4, 1), None),
((16, 5632), None, None, None),
((16, 5632), (13312, 1), (13312, 1), (13312, 1)),
((4, 4, 5632), None, None, None),
((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1), (45056, 5632, 1)),
]
class Inplace(Enum): class Inplace(Enum):
...@@ -26,6 +43,35 @@ class Inplace(Enum): ...@@ -26,6 +43,35 @@ class Inplace(Enum):
INPLACE_B = auto() INPLACE_B = auto()
# Inplace options applied for each test case in _TEST_CASES_
_INPLACE = [
Inplace.OUT_OF_PLACE,
Inplace.INPLACE_A,
Inplace.INPLACE_B,
]
# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_
_TEST_CASES = [
test_case + (inplace_item,)
for test_case in _TEST_CASES_
for inplace_item in _INPLACE
]
# Data types used for testing
_TENSOR_DTYPES = [torch.float16, torch.float32]
# Tolerance map for different data types
_TOLERANCE_MAP = {
torch.float16: {"atol": 1e-3, "rtol": 1e-3},
torch.float32: {"atol": 1e-7, "rtol": 1e-7},
}
DEBUG = False
PROFILE = False
NUM_PRERUN = 10
NUM_ITERATIONS = 1000
class AddDescriptor(Structure): class AddDescriptor(Structure):
_fields_ = [("device", c_int32)] _fields_ = [("device", c_int32)]
...@@ -37,42 +83,71 @@ def add(x, y): ...@@ -37,42 +83,71 @@ def add(x, y):
return torch.add(x, y) return torch.add(x, y)
def process_tensors(c, c_strides, a, a_stride, b, b_stride, inplace):
"""
rearrange the tensors if needed and apply the inplace config.
if inplace is true and the output (i.e., c) is placed to the broadcasted input,
the inplace config is ignored and out-of-place is used
"""
original_c_strides = c_strides if c_strides else c.stride()
def _rearrange(tensor, strides):
if strides and 0 in strides:
tensor.set_(tensor.untyped_storage(), 0, tensor.shape, strides)
return tensor
else:
return rearrange_if_needed(tensor, strides)
a, b, c = [
_rearrange(tensor, stride)
for tensor, stride in zip([a, b, c], [a_stride, b_stride, c_strides])
]
c = (
c
if inplace == Inplace.OUT_OF_PLACE
else (a if inplace == Inplace.INPLACE_A else b)
)
# if inplace is true and c has broadcasted config, reset it to the original unbroadcasted strides
if 0 in c.stride():
c.set_(c.untyped_storage(), 0, c.shape, original_c_strides)
return a, b, c
def test( def test(
lib, lib,
handle, handle,
torch_device, torch_device,
c_shape, shape,
a_shape, a_stride=None,
b_shape, b_stride=None,
tensor_dtype=torch.float16, c_stride=None,
inplace=Inplace.OUT_OF_PLACE, inplace=Inplace.OUT_OF_PLACE,
dtype=torch.float16,
sync=None,
): ):
print( print(
f"Testing Add on {torch_device} with c_shape:{c_shape} a_shape:{a_shape} b_shape:{b_shape} dtype:{tensor_dtype} inplace: {inplace.name}" f"Testing Add on {torch_device} with shape:{shape} a_stride:{a_stride} b_stride:{b_stride} c_stride:{c_stride} "
f"dtype:{dtype} inplace:{inplace}"
) )
if a_shape != b_shape and inplace != Inplace.OUT_OF_PLACE:
print("Unsupported test: broadcasting does not support in-place")
return
a = torch.rand(a_shape, dtype=tensor_dtype).to(torch_device) a = torch.rand(shape, dtype=dtype).to(torch_device)
b = torch.rand(b_shape, dtype=tensor_dtype).to(torch_device) b = torch.rand(shape, dtype=dtype).to(torch_device)
c = ( c = torch.rand(shape, dtype=dtype).to(torch_device)
torch.rand(c_shape, dtype=tensor_dtype).to(torch_device) a, b, c = process_tensors(c, c_stride, a, a_stride, b, b_stride, inplace)
if inplace == Inplace.OUT_OF_PLACE
else (a if inplace == Inplace.INPLACE_A else b)
)
ans = add(a, b) ans = add(a, b)
a_tensor = to_tensor(a, lib) a_tensor, b_tensor = [to_tensor(tensor, lib) for tensor in [a, b]]
b_tensor = to_tensor(b, lib)
c_tensor = ( c_tensor = (
to_tensor(c, lib) to_tensor(c, lib)
if inplace == Inplace.OUT_OF_PLACE if inplace == Inplace.OUT_OF_PLACE
else (a_tensor if inplace == Inplace.INPLACE_A else b_tensor) else (a_tensor if inplace == Inplace.INPLACE_A else b_tensor)
) )
descriptor = infiniopAddDescriptor_t() if sync is not None:
sync()
descriptor = infiniopAddDescriptor_t()
check_error( check_error(
lib.infiniopCreateAddDescriptor( lib.infiniopCreateAddDescriptor(
handle, handle,
...@@ -84,74 +159,48 @@ def test( ...@@ -84,74 +159,48 @@ def test(
) )
# Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
c_tensor.descriptor.contents.invalidate() for tensor in [a_tensor, b_tensor, c_tensor]:
a_tensor.descriptor.contents.invalidate() tensor.destroyDesc(lib)
b_tensor.descriptor.contents.invalidate()
workspace_size = c_uint64(0)
check_error( check_error(
lib.infiniopAdd(descriptor, c_tensor.data, a_tensor.data, b_tensor.data, None) lib.infiniopGetAddWorkspaceSize(descriptor, ctypes.byref(workspace_size))
) )
assert torch.allclose(c, ans, atol=0, rtol=1e-3) workspace = create_workspace(workspace_size.value, c.device)
check_error(lib.infiniopDestroyAddDescriptor(descriptor))
def lib_add():
check_error(
def test_cpu(lib, test_cases): lib.infiniopAdd(
device = DeviceEnum.DEVICE_CPU descriptor,
handle = create_handle(lib, device) workspace.data_ptr() if workspace is not None else None,
for c_shape, a_shape, b_shape, inplace in test_cases: workspace_size.value,
# fmt: off c_tensor.data,
test(lib, handle, "cpu", c_shape, a_shape, b_shape, tensor_dtype=torch.float16, inplace=inplace) a_tensor.data,
test(lib, handle, "cpu", c_shape, a_shape, b_shape, tensor_dtype=torch.float32, inplace=inplace) b_tensor.data,
# fmt: on None,
destroy_handle(lib, handle) )
)
def test_cuda(lib, test_cases):
device = DeviceEnum.DEVICE_CUDA
handle = create_handle(lib, device)
for c_shape, a_shape, b_shape, inplace in test_cases:
# fmt: off
test(lib, handle, "cuda", c_shape, a_shape, b_shape, tensor_dtype=torch.float16, inplace=inplace)
test(lib, handle, "cuda", c_shape, a_shape, b_shape, tensor_dtype=torch.float32, inplace=inplace)
# fmt: on
destroy_handle(lib, handle)
lib_add()
def test_bang(lib, test_cases): atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
import torch_mlu if DEBUG:
debug(c, ans, atol=atol, rtol=rtol)
assert torch.allclose(c, ans, atol=atol, rtol=rtol)
device = DeviceEnum.DEVICE_BANG # Profiling workflow
handle = create_handle(lib, device) if PROFILE:
for c_shape, a_shape, b_shape, inplace in test_cases:
# fmt: off # fmt: off
test(lib, handle, "mlu", c_shape, a_shape, b_shape, tensor_dtype=torch.float16, inplace=inplace) profile_operation("PyTorch", lambda: add(a, b), torch_device, NUM_PRERUN, NUM_ITERATIONS)
test(lib, handle, "mlu", c_shape, a_shape, b_shape, tensor_dtype=torch.float32, inplace=inplace) profile_operation(" lib", lambda: lib_add(), torch_device, NUM_PRERUN, NUM_ITERATIONS)
# fmt: on # fmt: on
destroy_handle(lib, handle) check_error(lib.infiniopDestroyAddDescriptor(descriptor))
if __name__ == "__main__": if __name__ == "__main__":
test_cases = [
# fmt: off
# c_shape, a_shape, b_shape, inplace
# ((32, 150, 512000), (32, 150, 512000), (32, 150, 512000), Inplace.OUT_OF_PLACE),
# ((32, 150, 51200), (32, 150, 51200), (32, 150, 1), Inplace.OUT_OF_PLACE),
# ((32, 150, 51200), (32, 150, 51200), (32, 150, 51200), Inplace.OUT_OF_PLACE),
((1, 3), (1, 3), (1, 3), Inplace.OUT_OF_PLACE),
((), (), (), Inplace.OUT_OF_PLACE),
((3, 3), (3, 3), (3, 3), Inplace.OUT_OF_PLACE),
((2, 20, 3), (2, 1, 3), (2, 20, 3), Inplace.OUT_OF_PLACE),
((32, 20, 512), (32, 20, 512), (32, 20, 512), Inplace.INPLACE_A),
((32, 20, 512), (32, 20, 512), (32, 20, 512), Inplace.INPLACE_B),
((32, 256, 112, 112), (32, 256, 112, 1), (32, 256, 112, 112), Inplace.OUT_OF_PLACE),
((32, 256, 112, 112), (32, 256, 112, 112), (32, 256, 112, 112), Inplace.OUT_OF_PLACE),
((2, 4, 3), (2, 1, 3), (4, 3), Inplace.OUT_OF_PLACE),
((2, 3, 4, 5), (2, 3, 4, 5), (5,), Inplace.OUT_OF_PLACE),
((3, 2, 4, 5), (4, 5), (3, 2, 1, 1), Inplace.OUT_OF_PLACE),
# fmt: on
]
args = get_args() args = get_args()
lib = open_lib() lib = open_lib()
lib.infiniopCreateAddDescriptor.restype = c_int32 lib.infiniopCreateAddDescriptor.restype = c_int32
lib.infiniopCreateAddDescriptor.argtypes = [ lib.infiniopCreateAddDescriptor.argtypes = [
infiniopHandle_t, infiniopHandle_t,
...@@ -160,25 +209,36 @@ if __name__ == "__main__": ...@@ -160,25 +209,36 @@ if __name__ == "__main__":
infiniopTensorDescriptor_t, infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t, infiniopTensorDescriptor_t,
] ]
lib.infiniopGetAddWorkspaceSize.restype = c_int32
lib.infiniopGetAddWorkspaceSize.argtypes = [
infiniopAddDescriptor_t,
POINTER(c_uint64),
]
lib.infiniopAdd.restype = c_int32 lib.infiniopAdd.restype = c_int32
lib.infiniopAdd.argtypes = [ lib.infiniopAdd.argtypes = [
infiniopAddDescriptor_t, infiniopAddDescriptor_t,
c_void_p, c_void_p,
c_uint64,
c_void_p,
c_void_p, c_void_p,
c_void_p, c_void_p,
c_void_p, c_void_p,
] ]
lib.infiniopDestroyAddDescriptor.restype = c_int32 lib.infiniopDestroyAddDescriptor.restype = c_int32
lib.infiniopDestroyAddDescriptor.argtypes = [ lib.infiniopDestroyAddDescriptor.argtypes = [
infiniopAddDescriptor_t, infiniopAddDescriptor_t,
] ]
if args.cpu: # Configure testing options
test_cpu(lib, test_cases) DEBUG = args.debug
if args.cuda: PROFILE = args.profile
test_cuda(lib, test_cases) NUM_PRERUN = args.num_prerun
if args.bang: NUM_ITERATIONS = args.num_iterations
test_bang(lib, test_cases)
if not (args.cpu or args.cuda or args.bang): for device in get_test_devices(args):
test_cpu(lib, test_cases) test_operator(lib, device, test, _TEST_CASES, _TENSOR_DTYPES)
print("\033[92mTest passed!\033[0m") print("\033[92mTest passed!\033[0m")
import torch
import ctypes
from ctypes import POINTER, Structure, c_int32, c_void_p, c_uint64
from libinfiniop import (
infiniopHandle_t,
infiniopTensorDescriptor_t,
open_lib,
to_tensor,
get_test_devices,
check_error,
rearrange_if_needed,
test_operator,
get_args,
debug,
get_tolerance,
profile_operation,
create_workspace,
)
from enum import Enum, auto
# ==============================================================================
# Configuration (Internal Use Only)
# ==============================================================================
# These are not meant to be imported from other modules
_TEST_CASES_ = [
    # shape, a_stride, b_stride, c_stride
    # None means default contiguous layout; a 0 stride marks a broadcast dim.
    ((13, 4), None, None, None),
    ((13, 4), (10, 1), (10, 1), (10, 1)),
    ((13, 4), (0, 1), None, None),
    ((13, 4, 4), None, None, None),
    ((13, 4, 4), (20, 4, 1), (20, 4, 1), (20, 4, 1)),
    ((13, 4, 4), (4, 0, 1), (0, 4, 1), None),
    ((16, 5632), None, None, None),
    ((16, 5632), (13312, 1), (13312, 1), (13312, 1)),
    ((4, 4, 5632), None, None, None),
    ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1), (45056, 5632, 1)),
]
class Inplace(Enum):
    """Which operand, if any, the output tensor should alias."""
    OUT_OF_PLACE = auto()
    INPLACE_A = auto()
    INPLACE_B = auto()
# Inplace options applied for each test case in _TEST_CASES_
_INPLACE = [
    Inplace.OUT_OF_PLACE,
    Inplace.INPLACE_A,
    Inplace.INPLACE_B,
]

# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_
_TEST_CASES = [
    test_case + (inplace_item,)
    for test_case in _TEST_CASES_
    for inplace_item in _INPLACE
]

# Data types used for testing
_TENSOR_DTYPES = [torch.float16, torch.float32]

# Tolerance map for different data types
_TOLERANCE_MAP = {
    torch.float16: {"atol": 1e-3, "rtol": 1e-3},
    torch.float32: {"atol": 1e-7, "rtol": 1e-7},
}

# Runtime options; defaults are overridden from CLI args in __main__.
DEBUG = False
PROFILE = False
NUM_PRERUN = 10
NUM_ITERATIONS = 1000
class MulDescriptor(Structure):
    """ctypes mirror of the opaque C-side mul descriptor handle."""
    _fields_ = [("device", c_int32)]


# Pointer type passed across the infiniop C API.
infiniopMulDescriptor_t = POINTER(MulDescriptor)
def mul(x, y):
    """Element-wise product computed with PyTorch as the reference."""
    return x * y
def process_tensors(c, c_strides, a, a_stride, b, b_stride, inplace):
    """
    Rearrange the tensors if needed and apply the inplace config.
    If inplace is requested and the output (i.e., c) would alias a
    broadcasted input, the broadcasted strides are replaced with the
    original unbroadcasted c strides.
    """
    # Remember the unbroadcasted output strides before any rearranging.
    fallback_strides = c_strides if c_strides else c.stride()

    def _apply_strides(tensor, strides):
        # A zero stride marks a broadcast view; install it directly via
        # set_, since rearrange_if_needed cannot build overlapping views.
        if strides and 0 in strides:
            tensor.set_(tensor.untyped_storage(), 0, tensor.shape, strides)
            return tensor
        return rearrange_if_needed(tensor, strides)

    a = _apply_strides(a, a_stride)
    b = _apply_strides(b, b_stride)
    c = _apply_strides(c, c_strides)

    # Alias the output onto the requested input operand.
    if inplace != Inplace.OUT_OF_PLACE:
        c = a if inplace == Inplace.INPLACE_A else b

    # A broadcast view cannot be written in place; restore the original
    # unbroadcasted strides on the output.
    if 0 in c.stride():
        c.set_(c.untyped_storage(), 0, c.shape, fallback_strides)
    return a, b, c
def test(
    lib,
    handle,
    torch_device,
    shape,
    a_stride=None,
    b_stride=None,
    c_stride=None,
    inplace=Inplace.OUT_OF_PLACE,
    dtype=torch.float16,
    sync=None,
):
    """Run one mul test case against the infiniop library.

    Builds random inputs with the requested strides/inplace config,
    computes the PyTorch reference, invokes infiniopMul, and asserts the
    results match within the dtype-specific tolerance. Optionally
    profiles both implementations when PROFILE is set.
    """
    print(
        f"Testing Mul on {torch_device} with shape:{shape} a_stride:{a_stride} b_stride:{b_stride} c_stride:{c_stride} "
        f"dtype:{dtype} inplace:{inplace}"
    )

    a = torch.rand(shape, dtype=dtype).to(torch_device)
    b = torch.rand(shape, dtype=dtype).to(torch_device)
    c = torch.rand(shape, dtype=dtype).to(torch_device)
    # Apply strides and the inplace aliasing before computing the reference.
    a, b, c = process_tensors(c, c_stride, a, a_stride, b, b_stride, inplace)
    ans = mul(a, b)

    a_tensor, b_tensor = [to_tensor(tensor, lib) for tensor in [a, b]]
    # The output lib-tensor must alias the same lib-tensor as the torch
    # tensor it points at, mirroring the inplace config.
    c_tensor = (
        to_tensor(c, lib)
        if inplace == Inplace.OUT_OF_PLACE
        else (a_tensor if inplace == Inplace.INPLACE_A else b_tensor)
    )
    # Device synchronization hook supplied by the test harness, if any.
    if sync is not None:
        sync()

    descriptor = infiniopMulDescriptor_t()
    check_error(
        lib.infiniopCreateMulDescriptor(
            handle,
            ctypes.byref(descriptor),
            c_tensor.descriptor,
            a_tensor.descriptor,
            b_tensor.descriptor,
        )
    )

    # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
    for tensor in [a_tensor, b_tensor, c_tensor]:
        tensor.destroyDesc(lib)

    # Query and allocate the workspace the kernel needs.
    workspace_size = c_uint64(0)
    check_error(
        lib.infiniopGetMulWorkspaceSize(descriptor, ctypes.byref(workspace_size))
    )
    workspace = create_workspace(workspace_size.value, c.device)

    def lib_mul():
        # Invoke the library kernel; the trailing None is a stream/void* slot.
        check_error(
            lib.infiniopMul(
                descriptor,
                workspace.data_ptr() if workspace is not None else None,
                workspace_size.value,
                c_tensor.data,
                a_tensor.data,
                b_tensor.data,
                None,
            )
        )

    lib_mul()

    atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
    if DEBUG:
        debug(c, ans, atol=atol, rtol=rtol)
    assert torch.allclose(c, ans, atol=atol, rtol=rtol)

    # Profiling workflow
    if PROFILE:
        # fmt: off
        profile_operation("PyTorch", lambda: mul(a, b), torch_device, NUM_PRERUN, NUM_ITERATIONS)
        profile_operation(" lib", lambda: lib_mul(), torch_device, NUM_PRERUN, NUM_ITERATIONS)
        # fmt: on
    check_error(lib.infiniopDestroyMulDescriptor(descriptor))
if __name__ == "__main__":
args = get_args()
lib = open_lib()
lib.infiniopCreateMulDescriptor.restype = c_int32
lib.infiniopCreateMulDescriptor.argtypes = [
infiniopHandle_t,
POINTER(infiniopMulDescriptor_t),
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
]
lib.infiniopGetMulWorkspaceSize.restype = c_int32
lib.infiniopGetMulWorkspaceSize.argtypes = [
infiniopMulDescriptor_t,
POINTER(c_uint64),
]
lib.infiniopMul.restype = c_int32
lib.infiniopMul.argtypes = [
infiniopMulDescriptor_t,
c_void_p,
c_uint64,
c_void_p,
c_void_p,
c_void_p,
c_void_p,
]
lib.infiniopDestroyMulDescriptor.restype = c_int32
lib.infiniopDestroyMulDescriptor.argtypes = [
infiniopMulDescriptor_t,
]
# Configure testing options
DEBUG = args.debug
PROFILE = args.profile
NUM_PRERUN = args.num_prerun
NUM_ITERATIONS = args.num_iterations
for device in get_test_devices(args):
test_operator(lib, device, test, _TEST_CASES, _TENSOR_DTYPES)
print("\033[92mTest passed!\033[0m")
...@@ -17,19 +17,88 @@ from libinfiniop import ( ...@@ -17,19 +17,88 @@ from libinfiniop import (
profile_operation, profile_operation,
) )
def row_major_strides(shape):
    """Row-major (C-order) strides for a contiguous tensor.

    Args:
        shape: tensor shape (sequence of dimension extents)
    Returns:
        list of strides, innermost dimension having stride 1
    """
    # Build from the innermost dimension outward: each new stride is the
    # previous (inner) stride times that dimension's extent.
    strides = [1]
    for extent in reversed(shape[1:]):
        strides.insert(0, strides[0] * extent)
    return strides
def column_major_strides(shape):
    """Column-major (Fortran-order) strides for a contiguous tensor.

    Args:
        shape: tensor shape (sequence of dimension extents)
    Returns:
        list of strides, first dimension having stride 1
    """
    # Build from the first dimension onward: each new stride is the
    # previous stride times the preceding dimension's extent.
    strides = [1]
    for extent in shape[:-1]:
        strides.append(strides[-1] * extent)
    return strides
# ============================================================================== # ==============================================================================
# Configuration (Internal Use Only) # Configuration (Internal Use Only)
# ============================================================================== # ==============================================================================
# These are not meant to be imported from other modules # These are not meant to be imported from other modules
_TEST_CASES = [ _TEST_CASES = [
# ((src_shape, src_stride), (dst_shape, dst_stride)) # (shape, x_stride, y_stride)
(((2, 4, 32), None), ((2, 4, 32), (256, 64, 1))), (
(((32, 6, 64), (64, 2560, 1)), ((32, 6, 64), None)), (2, 4, 64), # shape
(((4, 6, 64), (64, 2560, 1)), ((4, 6, 64), (131072, 64, 1))), (2, 4, 8), # x_stride
(((1, 32, 64), (2048, 64, 1)), ((1, 32, 64), (2048, 64, 1))), (512, 128, 2) # y_stride
(((32, 1, 64), (64, 2560, 1)), ((32, 1, 64), (64, 64, 1))), ),
(((4, 1, 64), (64, 2560, 1)), ((4, 1, 64), (64, 11264, 1))), (
(((64,), (1,)), ((64,), (1,))), (100, 100), # shape
(1, 100), # x_stride
(100, 1) # y_stride
),
(
(4, 4), # shape
(1, 4), # x_stride
(4, 1) # y_stride
),
(
(4, 6, 64), # shape
(64, 4*64, 1), # x_stride
(6*64, 64, 1) # y_stride
),
(
(2000, 2000), # shape
(1, 2000), # x_stride
(2000, 1) # y_stride
),
(
(2001, 2001), # shape
(1, 2001), # x_stride
(2001, 1) # y_stride
),
(
(3, 4, 7, 53, 9), # shape
row_major_strides((3, 4, 7, 53, 9)), # x_stride
column_major_strides((3, 4, 7, 53, 9)) # y_stride
),
(
(3, 4, 50, 50, 5, 7), # shape
row_major_strides((3, 4, 50, 50, 5, 7)), # x_stride
column_major_strides((3, 4, 50, 50, 5, 7)) # y_stride
),
] ]
# Data types used for testing # Data types used for testing
...@@ -58,23 +127,23 @@ def test( ...@@ -58,23 +127,23 @@ def test(
lib, lib,
handle, handle,
torch_device, torch_device,
x_shape, shape,
x_stride, x_stride,
y_shape,
y_stride, y_stride,
dtype=torch.float16, dtype=torch.float16,
): ):
print( print(
f"Testing Rerrange on {torch_device} with x_shape:{x_shape} x_stride:{x_stride} y_shape:{y_shape} y_stride:{y_stride} dtype:{dtype}" f"Testing Rerrange on {torch_device} with shape:{shape} x_stride:{x_stride} y_stride:{y_stride} dtype:{dtype}"
) )
x = torch.rand(x_shape, dtype=dtype).to(torch_device) x = torch.rand(shape, dtype=dtype).to(torch_device)
y = torch.zeros(y_shape, dtype=dtype).to(torch_device) y = torch.zeros(shape, dtype=dtype).to(torch_device)
x, y = [ x, y = [
rearrange_if_needed(tensor, stride) rearrange_if_needed(tensor, stride)
for tensor, stride in zip([x, y], [x_stride, y_stride]) for tensor, stride in zip([x, y], [x_stride, y_stride])
] ]
x_tensor, y_tensor = [to_tensor(tensor, lib) for tensor in [x, y]] x_tensor, y_tensor = [to_tensor(tensor, lib) for tensor in [x, y]]
descriptor = infiniopRearrangeDescriptor_t() descriptor = infiniopRearrangeDescriptor_t()
...@@ -86,7 +155,7 @@ def test( ...@@ -86,7 +155,7 @@ def test(
# Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
for tensor in [x_tensor, y_tensor]: for tensor in [x_tensor, y_tensor]:
tensor.descriptor.contents.invalidate() tensor.destroyDesc(lib)
def lib_rearrange(): def lib_rearrange():
check_error( check_error(
......
...@@ -61,7 +61,8 @@ _TENSOR_DTYPES = [torch.float16, torch.float32] ...@@ -61,7 +61,8 @@ _TENSOR_DTYPES = [torch.float16, torch.float32]
# Tolerance map for different data types # Tolerance map for different data types
_TOLERANCE_MAP = { _TOLERANCE_MAP = {
torch.float16: {"atol": 1e-4, "rtol": 1e-2}, torch.float16: {"atol": 1e-3, "rtol": 1e-3},
torch.float32: {"atol": 2e-7, "rtol": 1e-7},
} }
DEBUG = False DEBUG = False
......
...@@ -118,6 +118,18 @@ if has_config("kunlun-xpu") then ...@@ -118,6 +118,18 @@ if has_config("kunlun-xpu") then
includes("xmake/kunlun.lua") includes("xmake/kunlun.lua")
end end
-- InfiniCCL build option: opt-in compilation of the collective
-- communication implementations.
option("ccl")
    set_default(false)
    set_showmenu(true)
    set_description("Whether to compile implementations for InfiniCCL")
option_end()

if has_config("ccl") then
    add_defines("ENABLE_CCL")
end
target("infini-utils") target("infini-utils")
set_kind("static") set_kind("static")
on_install(function (target) end) on_install(function (target) end)
...@@ -220,10 +232,25 @@ target("infiniop") ...@@ -220,10 +232,25 @@ target("infiniop")
add_installfiles("include/infinicore.h", {prefixdir = "include"}) add_installfiles("include/infinicore.h", {prefixdir = "include"})
target_end() target_end()
target("infiniccl")
set_kind("shared")
add_deps("infinirt")
if has_config("nv-gpu") then
add_deps("infiniccl-cuda")
end
set_languages("cxx17")
add_files("src/infiniccl/*.cc")
add_installfiles("include/infiniccl.h", {prefixdir = "include"})
set_installdir(os.getenv("INFINI_ROOT") or (os.getenv(is_host("windows") and "HOMEPATH" or "HOME") .. "/.infini"))
target_end()
target("all") target("all")
set_kind("phony") set_kind("phony")
add_deps("infiniop", "infinirt") add_deps("infiniop", "infinirt", "infiniccl")
after_build(function (target) print(YELLOW .. "[Congratulations!] Now you can install the libraries with \"xmake install\"" .. NC) end) after_build(function (target) print(YELLOW .. "[Congratulations!] Now you can install the libraries with \"xmake install\"" .. NC) end)
target_end() target_end()
......
...@@ -58,3 +58,34 @@ target("infinirt-cuda") ...@@ -58,3 +58,34 @@ target("infinirt-cuda")
set_languages("cxx17") set_languages("cxx17")
add_files("../src/infinirt/cuda/*.cu") add_files("../src/infinirt/cuda/*.cu")
target_end() target_end()
target("infiniccl-cuda")
set_kind("static")
add_deps("infinirt")
on_install(function (target) end)
if has_config("ccl") then
set_policy("build.cuda.devlink", true)
set_toolchains("cuda")
add_links("cudart")
if not is_plat("windows") then
add_cuflags("-Xcompiler=-fPIC")
add_culdflags("-Xcompiler=-fPIC")
add_cxflags("-fPIC")
local nccl_root = os.getenv("NCCL_ROOT")
if nccl_root then
add_includedirs(nccl_root .. "/include")
add_links(nccl_root .. "/lib/libnccl.so")
else
add_links("nccl") -- Fall back to default nccl linking
end
add_files("../src/infiniccl/cuda/*.cu")
else
print("[Warning] NCCL is not supported on Windows")
end
end
set_languages("cxx17")
target_end()
...@@ -34,3 +34,20 @@ target("infiniop-test") ...@@ -34,3 +34,20 @@ target("infiniop-test")
set_installdir(INFINI_ROOT) set_installdir(INFINI_ROOT)
target_end() target_end()
target("infiniccl-test")
set_kind("binary")
add_deps("infini-utils")
set_default(false)
set_warnings("all", "error")
set_languages("cxx17")
local INFINI_ROOT = os.getenv("INFINI_ROOT") or (os.getenv(is_host("windows") and "HOMEPATH" or "HOME") .. "/.infini")
add_includedirs(INFINI_ROOT.."/include")
add_linkdirs(INFINI_ROOT.."/lib")
add_links("infinirt", "infiniccl")
add_files(os.projectdir().."/src/infiniccl-test/*.cpp")
set_installdir(os.getenv("INFINI_ROOT") or (os.getenv(is_host("windows") and "HOMEPATH" or "HOME") .. "/.infini"))
target_end()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment