Commit c2e87202 authored by Catheriany

Merge remote-tracking branch 'origin/main' into issue/142

parents 41818f84 c203635b
......@@ -43,7 +43,7 @@ fp16_t _f32_to_f16(float val) {
int32_t exponent = ((f32 >> 23) & 0xFF) - 127; // Extract and de-bias the exponent
uint32_t mantissa = f32 & 0x7FFFFF; // Extract the mantissa (fraction part)
if (exponent >= 31) { // Special cases for Inf and NaN
if (exponent >= 16) { // Special cases for Inf and NaN
// NaN
if (exponent == 128 && mantissa != 0) {
return fp16_t{static_cast<uint16_t>(sign | 0x7E00)};
......
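For context on the threshold change above: the exponent has already been de-biased, and fp16's largest finite exponent is 15 (biased 30), so any value whose unbiased float32 exponent is >= 16 must map to Inf; the old comparison against 31 (the biased fp16 limit) missed overflow for exponents 16 through 30. A quick numpy check (illustrative only, not part of the patch):

```python
import numpy as np

for val in [65504.0, 131072.0, float("nan")]:
    bits = int(np.array(val, np.float32).view(np.uint32))
    exponent = ((bits >> 23) & 0xFF) - 127  # de-biased, as in _f32_to_f16
    print(val, "unbiased exponent:", exponent, "-> fp16:", np.float16(val))
# 65504.0 (exponent 15) is fp16's max finite value, 131072.0 (exponent 17)
# overflows to inf, and NaN carries exponent 128 with a nonzero mantissa.
```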
......@@ -138,4 +138,73 @@ void rearrange(
}
}
utils::Result<RearrangeMeta> RearrangeMeta::distributeUnit(const std::vector<size_t> &candidates) const {
// Get the current unit size
size_t current_unit = _meta[0];
// Find a candidate that evenly divides the current unit
size_t new_unit = 0;
for (size_t candidate : candidates) {
if (current_unit % candidate == 0) {
new_unit = candidate;
break;
}
}
// If no suitable candidate was found, return an error
if (new_unit == 0) {
return INFINI_STATUS_BAD_PARAM;
}
// If the chosen candidate equals the current unit, return a copy of this meta
if (new_unit == current_unit) {
return Result<RearrangeMeta>(_meta);
}
// Get the current number of dimensions
size_t ndim_value = this->ndim();
// Create the new layout array
std::vector<ptrdiff_t> layout(2 + (ndim_value + 1) * 3, 0);
// Set the new unit size
layout[0] = new_unit;
// Compute the expansion factor
ptrdiff_t extra = current_unit / new_unit;
// Offset of the index strides within _meta
ptrdiff_t idx_offset = 1;
// Set up the corresponding pointers into the new layout
ptrdiff_t *new_idx = layout.data() + 1;
ptrdiff_t *new_dst = layout.data() + 2 + (ndim_value + 1);
ptrdiff_t *new_src = layout.data() + 2 + (ndim_value + 1) * 2;
// Copy and rescale the index strides:
// each original index stride is multiplied by the expansion factor
for (size_t i = 0; i < ndim_value + 1; ++i) {
new_idx[i] = _meta[idx_offset + i] * extra;
}
// The stride of the new innermost dimension is 1
new_idx[ndim_value + 1] = 1;
// Copy the destination strides and append the new unit size
for (size_t i = 0; i < ndim_value; ++i) {
new_dst[i] = dst_strides()[i];
}
new_dst[ndim_value] = new_unit;
// Copy the source strides and append the new unit size
for (size_t i = 0; i < ndim_value; ++i) {
new_src[i] = src_strides()[i];
}
new_src[ndim_value] = new_unit;
return Result<RearrangeMeta>(layout);
}
} // namespace utils
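To make the new layout packing concrete, here is a minimal Python model of `distributeUnit` (a sketch under the assumed flat layout `[unit, idx_strides(ndim+2), dst_strides(ndim+1), src_strides(ndim+1)]`, not the library's API):

```python
def distribute_unit(unit, idx, dst, src, candidates):
    """Python model of RearrangeMeta::distributeUnit on flat lists."""
    new_unit = next((c for c in candidates if unit % c == 0), 0)
    if new_unit == 0:
        raise ValueError("no candidate divides the current unit")
    extra = unit // new_unit
    # Rescale the existing index strides and append 1 for the new innermost dim.
    new_idx = [s * extra for s in idx] + [1]
    # Append the new unit after the copied strides, mirroring the C++ code.
    new_dst = list(dst) + [new_unit]
    new_src = list(src) + [new_unit]
    return [new_unit] + new_idx + new_dst + new_src
```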
......@@ -28,6 +28,9 @@ public:
const ptrdiff_t *src_strides() const;
void launch(void *dst, const void *src) const;
// Split the unit into a smaller size to enable more parallelism
utils::Result<RearrangeMeta> distributeUnit(const std::vector<size_t> &candidates) const;
};
void rearrange(
......
from .infiniop_test import InfiniopTestCase, InfiniopTestWriter, np_dtype_to_ggml, gguf_strides
from .infiniop_test import InfiniopTestCase, InfiniopTestWriter, np_dtype_to_ggml, gguf_strides, contiguous_gguf_strides
......@@ -29,6 +29,14 @@ def gguf_strides(*args: int) -> list[int] | None:
return list(args)[::-1] if args else None
def contiguous_gguf_strides(shape: tuple[int, ...]) -> list[int]:
strides = []
acc = 1
for size in reversed(shape):
strides.append(acc)
acc *= size
return strides[::-1]
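For reference, `contiguous_gguf_strides` returns ordinary row-major element strides, which `gguf_strides` then flips into GGUF's reversed dimension order:

```python
assert contiguous_gguf_strides((2, 3, 4)) == [12, 4, 1]
assert gguf_strides(*contiguous_gguf_strides((2, 3, 4))) == [1, 4, 12]
```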
class InfiniopTestCase:
op_name: str
......
import numpy as np
import gguf
from typing import List
from numpy.lib.stride_tricks import as_strided
from .. import InfiniopTestWriter, InfiniopTestCase, np_dtype_to_ggml, gguf_strides, contiguous_gguf_strides
def add(
a: np.ndarray,
b: np.ndarray,
):
return a + b
def process_tensor(a, b, stride_a=None, stride_b=None):
def normalize_stride(tensor, stride):
if stride:
slices = tuple(slice(0, 1) if s == 0 else slice(None) for s in stride)
return tensor[slices]
else:
return tensor
a_unique = normalize_stride(a, stride_a)
b_unique = normalize_stride(b, stride_b)
return a_unique, b_unique
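The zero-stride handling above exists because a stride of 0 marks a broadcast dimension: the underlying buffer holds only one distinct slice along it, so slicing that axis to width 1 recovers the unique data. For example:

```python
import numpy as np

a = np.random.rand(13, 4).astype(np.float32)
b = np.random.rand(13, 4).astype(np.float32)
# a is declared with stride (0, 1): all 13 rows are broadcast copies of one row.
a_unique, b_unique = process_tensor(a, b, stride_a=(0, 1), stride_b=None)
assert a_unique.shape == (1, 4) and b_unique.shape == (13, 4)
```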
class AddTestCase(InfiniopTestCase):
def __init__(
self,
a: np.ndarray,
shape_a: List[int] | None,
stride_a: List[int] | None,
b: np.ndarray,
shape_b: List[int] | None,
stride_b: List[int] | None,
c: np.ndarray,
shape_c: List[int] | None,
stride_c: List[int] | None,
):
super().__init__("add")
self.a = a
self.shape_a = shape_a
self.stride_a = stride_a
self.b = b
self.shape_b = shape_b
self.stride_b = stride_b
self.c = c
self.shape_c = shape_c
self.stride_c = stride_c
def write_test(self, test_writer: "InfiniopTestWriter"):
super().write_test(test_writer)
if self.shape_a is not None:
test_writer.add_array(test_writer.gguf_key("a.shape"), self.shape_a)
if self.shape_b is not None:
test_writer.add_array(test_writer.gguf_key("b.shape"), self.shape_b)
if self.shape_c is not None:
test_writer.add_array(test_writer.gguf_key("c.shape"), self.shape_c)
if self.stride_a is not None:
test_writer.add_array(test_writer.gguf_key("a.strides"), gguf_strides(*self.stride_a))
if self.stride_b is not None:
test_writer.add_array(test_writer.gguf_key("b.strides"), gguf_strides(*self.stride_b))
test_writer.add_array(
test_writer.gguf_key("c.strides"),
gguf_strides(*self.stride_c if self.stride_c is not None else contiguous_gguf_strides(self.shape_c))
)
test_writer.add_tensor(
test_writer.gguf_key("a"), self.a, raw_dtype=np_dtype_to_ggml(self.a.dtype)
)
test_writer.add_tensor(
test_writer.gguf_key("b"), self.b, raw_dtype=np_dtype_to_ggml(self.b.dtype)
)
test_writer.add_tensor(
test_writer.gguf_key("c"), self.c, raw_dtype=np_dtype_to_ggml(self.c.dtype)
)
ans = add(
self.a.astype(np.float64),
self.b.astype(np.float64),
)
test_writer.add_tensor(
test_writer.gguf_key("ans"), ans, raw_dtype=gguf.GGMLQuantizationType.F64
)
if __name__ == "__main__":
test_writer = InfiniopTestWriter("add.gguf")
test_cases = []
# ==============================================================================
# Configuration (Internal Use Only)
# ==============================================================================
# These are not meant to be imported from other modules
_TEST_CASES_ = [
# shape, a_stride, b_stride, c_stride
((13, 4), None, None, None),
((13, 4), (10, 1), (10, 1), (10, 1)),
((13, 4), (0, 1), None, None),
((13, 4, 4), None, None, None),
((13, 4, 4), (20, 4, 1), (20, 4, 1), (20, 4, 1)),
((13, 4, 4), (4, 0, 1), (0, 4, 1), None),
((16, 5632), None, None, None),
((16, 5632), (13312, 1), (13312, 1), (13312, 1)),
((4, 4, 5632), None, None, None),
((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1), (45056, 5632, 1)),
]
_TENSOR_DTYPES_ = [np.float32, np.float16]
for dtype in _TENSOR_DTYPES_:
for shape, stride_a, stride_b, stride_c in _TEST_CASES_:
a = np.random.rand(*shape).astype(dtype)
b = np.random.rand(*shape).astype(dtype)
c = np.empty(tuple(0 for _ in shape), dtype=dtype)
a, b = process_tensor(a, b, stride_a, stride_b)
if stride_c is None:
stride_c = contiguous_gguf_strides(shape)
test_case = AddTestCase(
a=a,
shape_a=shape,
stride_a=stride_a,
b=b,
shape_b=shape,
stride_b=stride_b,
c=c,
shape_c=shape,
stride_c=stride_c,
)
test_cases.append(test_case)
test_writer.add_tests(test_cases)
test_writer.save()
\ No newline at end of file
import numpy as np
import gguf
from typing import List, Optional, Tuple
from .. import InfiniopTestWriter, InfiniopTestCase, np_dtype_to_ggml, gguf_strides
def clip(
x: np.ndarray,
min_val: np.ndarray,
max_val: np.ndarray,
) -> np.ndarray:
"""
Clip the values in input tensor x to the range [min_val, max_val].
Args:
x: Input tensor
min_val: Tensor with minimum values (same shape as x)
max_val: Tensor with maximum values (same shape as x)
Returns:
Clipped tensor with the same shape as x
"""
return np.maximum(np.minimum(x, max_val), min_val)
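As a sanity check, for elementwise bounds with `min_val <= max_val` this matches `np.clip`:

```python
import numpy as np

x = np.linspace(-3.0, 3.0, 7)
lo, hi = np.full_like(x, -1.0), np.full_like(x, 1.0)
assert np.array_equal(clip(x, lo, hi), np.clip(x, lo, hi))
```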
def random_tensor(shape, dtype):
"""
Generate a random tensor with values in the range [-2, 2].
Args:
shape: Shape of the tensor
dtype: Data type of the tensor
Returns:
Random tensor with the specified shape and dtype
"""
return (np.random.rand(*shape).astype(dtype) * 4.0 - 2.0)
class ClipTestCase(InfiniopTestCase):
"""
Test case for the Clip operator.
"""
def __init__(
self,
x: np.ndarray,
x_stride: Optional[List[int]],
min_val: np.ndarray,
min_stride: Optional[List[int]],
max_val: np.ndarray,
max_stride: Optional[List[int]],
y: np.ndarray,
y_stride: Optional[List[int]],
):
super().__init__("clip")
self.x = x
self.x_stride = x_stride
self.min_val = min_val
self.min_stride = min_stride
self.max_val = max_val
self.max_stride = max_stride
self.y = y
self.y_stride = y_stride
def write_test(self, test_writer: "InfiniopTestWriter"):
super().write_test(test_writer)
# Add strides as arrays if they exist
if self.x_stride is not None:
test_writer.add_array(test_writer.gguf_key("x.strides"), self.x_stride)
if self.min_stride is not None:
test_writer.add_array(test_writer.gguf_key("min_val.strides"), self.min_stride)
if self.max_stride is not None:
test_writer.add_array(test_writer.gguf_key("max_val.strides"), self.max_stride)
if self.y_stride is not None:
test_writer.add_array(test_writer.gguf_key("y.strides"), self.y_stride)
# Add tensors to the test
test_writer.add_tensor(
test_writer.gguf_key("x"),
self.x,
raw_dtype=np_dtype_to_ggml(self.x.dtype)
)
test_writer.add_tensor(
test_writer.gguf_key("min_val"),
self.min_val,
raw_dtype=np_dtype_to_ggml(self.min_val.dtype)
)
test_writer.add_tensor(
test_writer.gguf_key("max_val"),
self.max_val,
raw_dtype=np_dtype_to_ggml(self.max_val.dtype)
)
test_writer.add_tensor(
test_writer.gguf_key("y"),
self.y,
raw_dtype=np_dtype_to_ggml(self.y.dtype)
)
# Calculate the expected result
ans = clip(
self.x.astype(np.float64),
self.min_val.astype(np.float64),
self.max_val.astype(np.float64)
)
# Add the expected result to the test
test_writer.add_tensor(
test_writer.gguf_key("ans"),
ans,
raw_dtype=gguf.GGMLQuantizationType.F64
)
if __name__ == "__main__":
test_writer = InfiniopTestWriter("clip.gguf")
# Create test cases for different shapes, strides, and data types
test_cases = []
# Test case shapes
shapes = [
(10,), # 1D tensor
(5, 10), # 2D tensor
(2, 3, 4), # 3D tensor
(7, 13), # Prime dimensions
(1, 1), # Minimum shape
(100, 100), # Large shape
(16, 16, 16), # Large 3D
]
# Test case min/max values
min_max_values = [
(-1.0, 1.0), # Standard range
(0.0, 2.0), # Positive range
(-2.0, 0.0), # Negative range
(-1000.0, 1000.0), # Large range
(-0.001, 0.001), # Small range
(0.0, 0.0), # min=max
]
# Data types to test
dtypes = [np.float16, np.float32, np.float64]
# Generate test cases with contiguous tensors
for shape in shapes:
for min_val, max_val in min_max_values:
for dtype in dtypes:
x = random_tensor(shape, dtype)
min_tensor = np.full(shape, min_val, dtype=dtype)
max_tensor = np.full(shape, max_val, dtype=dtype)
y = np.zeros(shape, dtype=dtype)
test_cases.append(
ClipTestCase(
x=x,
x_stride=None,
min_val=min_tensor,
min_stride=None,
max_val=max_tensor,
max_stride=None,
y=y,
y_stride=None
)
)
# Generate test cases with strided tensors (for 2D shapes only)
for shape in [s for s in shapes if len(s) == 2]:
for dtype in dtypes:
# Row-major stride
row_stride = gguf_strides(shape[1], 1)
# Column-major stride
col_stride = gguf_strides(1, shape[0])
# Test case with row-major input and output
x = random_tensor(shape, dtype)
min_tensor = np.full(shape, -1.0, dtype=dtype)
max_tensor = np.full(shape, 1.0, dtype=dtype)
y = np.zeros(shape, dtype=dtype)
test_cases.append(
ClipTestCase(
x=x,
x_stride=row_stride,
min_val=min_tensor,
min_stride=row_stride,
max_val=max_tensor,
max_stride=row_stride,
y=y,
y_stride=row_stride
)
)
# Test case with column-major input and output
x = random_tensor(shape, dtype)
min_tensor = np.full(shape, -1.0, dtype=dtype)
max_tensor = np.full(shape, 1.0, dtype=dtype)
y = np.zeros(shape, dtype=dtype)
test_cases.append(
ClipTestCase(
x=x,
x_stride=col_stride,
min_val=min_tensor,
min_stride=col_stride,
max_val=max_tensor,
max_stride=col_stride,
y=y,
y_stride=col_stride
)
)
# Test case with different strides for input and output
x = random_tensor(shape, dtype)
min_tensor = np.full(shape, -1.0, dtype=dtype)
max_tensor = np.full(shape, 1.0, dtype=dtype)
y = np.zeros(shape, dtype=dtype)
test_cases.append(
ClipTestCase(
x=x,
x_stride=row_stride,
min_val=min_tensor,
min_stride=row_stride,
max_val=max_tensor,
max_stride=row_stride,
y=y,
y_stride=col_stride
)
)
# Add all test cases to the writer
test_writer.add_tests(test_cases)
# Save the test cases to a GGUF file
test_writer.save()
print(f"Generated {len(test_cases)} test cases for the Clip operator")
import numpy as np
import gguf
from typing import List
from .. import InfiniopTestWriter, InfiniopTestCase, np_dtype_to_ggml, gguf_strides, contiguous_gguf_strides
def mul(
a: np.ndarray,
b: np.ndarray
):
return np.multiply(a, b)
def random_tensor(shape, dtype):
rate = 1e-3
var = 0.5 * rate  # values fall in [-5e-4, 5e-4]
return rate * np.random.rand(*shape).astype(dtype) - var
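The narrow value range keeps fp16 products well away from overflow; a quick bound check:

```python
import numpy as np

t = random_tensor((4, 4), np.float32)
assert np.all(np.abs(t) <= 5e-4)
```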
class MulTestCase(InfiniopTestCase):
def __init__(
self,
a: np.ndarray,
shape_a: List[int] | None,
stride_a: List[int] | None,
b: np.ndarray,
shape_b: List[int] | None,
stride_b: List[int] | None,
c: np.ndarray,
shape_c: List[int] | None,
stride_c: List[int] | None,
):
super().__init__("mul")
self.a = a
self.shape_a = shape_a
self.stride_a = stride_a
self.b = b
self.shape_b = shape_b
self.stride_b = stride_b
self.c = c
self.shape_c = shape_c
self.stride_c = stride_c
def write_test(self, test_writer: "InfiniopTestWriter"):
super().write_test(test_writer)
if self.shape_a is not None:
test_writer.add_array(test_writer.gguf_key("a.shape"), self.shape_a)
if self.shape_b is not None:
test_writer.add_array(test_writer.gguf_key("b.shape"), self.shape_b)
if self.shape_c is not None:
test_writer.add_array(test_writer.gguf_key("c.shape"), self.shape_c)
if self.stride_a is not None:
test_writer.add_array(test_writer.gguf_key("a.strides"), gguf_strides(*self.stride_a))
if self.stride_b is not None:
test_writer.add_array(test_writer.gguf_key("b.strides"), gguf_strides(*self.stride_b))
test_writer.add_array(
test_writer.gguf_key("c.strides"),
gguf_strides(*self.stride_c if self.stride_c is not None else contiguous_gguf_strides(self.shape_c))
)
test_writer.add_tensor(
test_writer.gguf_key("a"), self.a, raw_dtype=np_dtype_to_ggml(self.a.dtype)
)
test_writer.add_tensor(
test_writer.gguf_key("b"), self.b, raw_dtype=np_dtype_to_ggml(self.b.dtype)
)
test_writer.add_tensor(
test_writer.gguf_key("c"), self.c, raw_dtype=np_dtype_to_ggml(self.c.dtype)
)
a_fp64 = self.a.astype(np.float64)
b_fp64 = self.b.astype(np.float64)
ans_fp64 = np.multiply(a_fp64, b_fp64)
ans = mul(self.a, self.b)
test_writer.add_tensor(
test_writer.gguf_key("ans"), ans, raw_dtype=np_dtype_to_ggml(ans.dtype)
)
test_writer.add_tensor(
test_writer.gguf_key("ans_fp64"),
ans_fp64,
raw_dtype=np_dtype_to_ggml(ans_fp64.dtype),
)
if __name__ == '__main__':
test_writer = InfiniopTestWriter("mul.gguf")
test_cases = []
_TEST_CASES_ = [
((2, 3), (3, 1), (1, 2), (3, 1)),
((2, 3), (1, 2), (3, 1), (1, 2)),
((2, 3), (3, 1), (3, 1), (1, 2)),
((4, 6), (1, 4), (1, 5), (6, 1)),
((1, 2048), (1, 1), (2048, 1), (1, 1)),
((2048, 2048), None, (1, 2048), None),
((2, 4, 2048), (4 * 2048, 2048, 1), (1, 2, 8), (4 * 2048, 2048, 1)),
((2, 4, 2048), (1, 2, 8), None, (1, 2, 8)),
((2048, 2560), (2560, 1), (1, 2048), (2560, 1)),
((4, 48, 64), (64 * 48, 64, 1), (1, 4, 192), None),
((4, 48, 64), None, (1, 4, 192), (48 * 64, 64, 1)),
]
_TENSOR_DTYPES_ = [np.float32, np.float16]
for dtype in _TENSOR_DTYPES_:
for shape, stride_a, stride_b, stride_c in _TEST_CASES_:
a = random_tensor(shape, dtype)
b = random_tensor(shape, dtype)
c = np.empty(tuple(0 for _ in shape), dtype=dtype)
test_cases.append(
MulTestCase(
a=a,
shape_a=shape,
stride_a=stride_a,
b=b,
shape_b=shape,
stride_b=stride_b,
c=c,
shape_c=shape,
stride_c=stride_c,
)
)
test_writer.add_tests(test_cases)
test_writer.save()
import numpy as np
import gguf
from typing import List
from .. import InfiniopTestWriter, InfiniopTestCase, np_dtype_to_ggml, gguf_strides, contiguous_gguf_strides
def swiglu(
a: np.ndarray,
b: np.ndarray,
):
c = a * b / (1.0 + np.exp(-b))
return c
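Note that `a * b / (1 + exp(-b))` is exactly `a * silu(b)`, since `silu(b) = b * sigmoid(b) = b / (1 + exp(-b))`; a quick numeric check:

```python
import numpy as np

def silu(x):
    return x / (1.0 + np.exp(-x))

a, b = np.random.rand(4, 8), np.random.rand(4, 8)
assert np.allclose(swiglu(a, b), a * silu(b))
```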
class SwiGLUTestCase(InfiniopTestCase):
def __init__(
self,
a: np.ndarray,
shape_a: List[int] | None,
stride_a: List[int] | None,
b: np.ndarray,
shape_b: List[int] | None,
stride_b: List[int] | None,
c: np.ndarray,
shape_c: List[int] | None,
stride_c: List[int] | None,
):
super().__init__("swiglu")
self.a = a
self.shape_a = shape_a
self.stride_a = stride_a
self.b = b
self.shape_b = shape_b
self.stride_b = stride_b
self.c = c
self.shape_c = shape_c
self.stride_c = stride_c
def write_test(self, test_writer: "InfiniopTestWriter"):
super().write_test(test_writer)
if self.shape_a is not None:
test_writer.add_array(test_writer.gguf_key("a.shape"), self.shape_a)
if self.shape_b is not None:
test_writer.add_array(test_writer.gguf_key("b.shape"), self.shape_b)
if self.shape_c is not None:
test_writer.add_array(test_writer.gguf_key("c.shape"), self.shape_c)
if self.stride_a is not None:
test_writer.add_array(test_writer.gguf_key("a.strides"), gguf_strides(*self.stride_a))
if self.stride_b is not None:
test_writer.add_array(test_writer.gguf_key("b.strides"), gguf_strides(*self.stride_b))
test_writer.add_array(
test_writer.gguf_key("c.strides"),
gguf_strides(*self.stride_c if self.stride_c is not None else contiguous_gguf_strides(self.shape_c))
)
test_writer.add_tensor(
test_writer.gguf_key("a"), self.a, raw_dtype=np_dtype_to_ggml(self.a.dtype)
)
test_writer.add_tensor(
test_writer.gguf_key("b"), self.b, raw_dtype=np_dtype_to_ggml(self.b.dtype)
)
test_writer.add_tensor(
test_writer.gguf_key("c"), self.c, raw_dtype=np_dtype_to_ggml(self.c.dtype)
)
ans = swiglu(
self.a.astype(np.float64),
self.b.astype(np.float64),
)
test_writer.add_tensor(
test_writer.gguf_key("ans"), ans, raw_dtype=gguf.GGMLQuantizationType.F64
)
if __name__ == "__main__":
test_writer = InfiniopTestWriter("swiglu.gguf")
test_cases = []
_TEST_CASES_ = [
((64, 128), None, None, None),
((64, 121), None, None, None),
((15, 512), None, None, None),
((13, 4), None, None, None),
((13, 4), (10, 1), (10, 1), (10, 1)),
((13, 4, 4), None, None, None),
((13, 4, 4), (20, 4, 1), (20, 4, 1), (20, 4, 1)),
((16, 5632), None, None, None),
((16, 5632), (13312, 1), (13312, 1), (13312, 1)),
((16, 5632), (5632, 1), (5632, 1), (1, 16)),
((2, 3, 400), (1200, 400, 1), (1200, 400, 1), (1, 2, 6)),
((4, 4, 5632), None, None, None),
((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1), (45056, 5632, 1)),
]
_TENSOR_DTYPES_ = [np.float32, np.float16]
for dtype in _TENSOR_DTYPES_:
for shape, stride_a, stride_b, stride_c in _TEST_CASES_:
a = np.random.rand(*shape).astype(dtype)
b = np.random.rand(*shape).astype(dtype)
c = np.empty(tuple(0 for _ in shape), dtype=dtype)
test_case = SwiGLUTestCase(
a=a,
shape_a=list(shape),
stride_a=stride_a,
b=b,
shape_b=list(shape),
stride_b=stride_b,
c=c,
shape_c=list(shape),
stride_c=stride_c,
)
test_cases.append(test_case)
test_writer.add_tests(test_cases)
test_writer.save()
from ctypes import POINTER, Structure, c_int32, c_void_p
import torch
import ctypes
import sys
import os
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")))
from operatorspy import (
open_lib,
to_tensor,
DeviceEnum,
from ctypes import POINTER, Structure, c_int32, c_void_p, c_uint64
from libinfiniop import (
infiniopHandle_t,
infiniopTensorDescriptor_t,
create_handle,
destroy_handle,
open_lib,
to_tensor,
get_test_devices,
check_error,
rearrange_if_needed,
test_operator,
get_args,
debug,
get_tolerance,
profile_operation,
create_workspace,
)
from operatorspy.tests.test_utils import get_args
from enum import Enum, auto
import torch
# ==============================================================================
# Configuration (Internal Use Only)
# ==============================================================================
# These are not meant to be imported from other modules
_TEST_CASES_ = [
# shape, a_stride, b_stride, c_stride
((13, 4), None, None, None),
((13, 4), (10, 1), (10, 1), (10, 1)),
((13, 4), (0, 1), None, None),
((13, 4, 4), None, None, None),
((13, 4, 4), (20, 4, 1), (20, 4, 1), (20, 4, 1)),
((13, 4, 4), (4, 0, 1), (0, 4, 1), None),
((16, 5632), None, None, None),
((16, 5632), (13312, 1), (13312, 1), (13312, 1)),
((4, 4, 5632), None, None, None),
((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1), (45056, 5632, 1)),
]
class Inplace(Enum):
......@@ -26,6 +43,35 @@ class Inplace(Enum):
INPLACE_B = auto()
# Inplace options applied for each test case in _TEST_CASES_
_INPLACE = [
Inplace.OUT_OF_PLACE,
Inplace.INPLACE_A,
Inplace.INPLACE_B,
]
# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_
_TEST_CASES = [
test_case + (inplace_item,)
for test_case in _TEST_CASES_
for inplace_item in _INPLACE
]
# Data types used for testing
_TENSOR_DTYPES = [torch.float16, torch.float32]
# Tolerance map for different data types
_TOLERANCE_MAP = {
torch.float16: {"atol": 1e-3, "rtol": 1e-3},
torch.float32: {"atol": 1e-7, "rtol": 1e-7},
}
DEBUG = False
PROFILE = False
NUM_PRERUN = 10
NUM_ITERATIONS = 1000
class AddDescriptor(Structure):
_fields_ = [("device", c_int32)]
......@@ -37,42 +83,71 @@ def add(x, y):
return torch.add(x, y)
def process_tensors(c, c_strides, a, a_stride, b, b_stride, inplace):
"""
Rearrange the tensors if needed and apply the inplace config.
If inplace is requested and the chosen output (i.e., c) ends up with a
broadcast (zero) stride, it is reset to the original non-broadcast strides.
"""
original_c_strides = c_strides if c_strides else c.stride()
def _rearrange(tensor, strides):
if strides and 0 in strides:
tensor.set_(tensor.untyped_storage(), 0, tensor.shape, strides)
return tensor
else:
return rearrange_if_needed(tensor, strides)
a, b, c = [
_rearrange(tensor, stride)
for tensor, stride in zip([a, b, c], [a_stride, b_stride, c_strides])
]
c = (
c
if inplace == Inplace.OUT_OF_PLACE
else (a if inplace == Inplace.INPLACE_A else b)
)
# if inplace is true and c has broadcasted config, reset it to the original unbroadcasted strides
if 0 in c.stride():
c.set_(c.untyped_storage(), 0, c.shape, original_c_strides)
return a, b, c
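For illustration, the zero-stride branch above materializes a broadcast view in place via `Tensor.set_` (a standalone sketch of the same PyTorch storage trick):

```python
import torch

t = torch.arange(4.0)                               # one physical row
view = torch.empty(3, 4)
view.set_(t.untyped_storage(), 0, (3, 4), (0, 1))   # stride 0 => rows alias
assert view.stride() == (0, 1)
assert torch.equal(view[0], view[2])                # all rows share storage
```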
def test(
lib,
handle,
torch_device,
c_shape,
a_shape,
b_shape,
tensor_dtype=torch.float16,
shape,
a_stride=None,
b_stride=None,
c_stride=None,
inplace=Inplace.OUT_OF_PLACE,
dtype=torch.float16,
sync=None,
):
print(
f"Testing Add on {torch_device} with c_shape:{c_shape} a_shape:{a_shape} b_shape:{b_shape} dtype:{tensor_dtype} inplace: {inplace.name}"
f"Testing Add on {torch_device} with shape:{shape} a_stride:{a_stride} b_stride:{b_stride} c_stride:{c_stride} "
f"dtype:{dtype} inplace:{inplace}"
)
if a_shape != b_shape and inplace != Inplace.OUT_OF_PLACE:
print("Unsupported test: broadcasting does not support in-place")
return
a = torch.rand(a_shape, dtype=tensor_dtype).to(torch_device)
b = torch.rand(b_shape, dtype=tensor_dtype).to(torch_device)
c = (
torch.rand(c_shape, dtype=tensor_dtype).to(torch_device)
if inplace == Inplace.OUT_OF_PLACE
else (a if inplace == Inplace.INPLACE_A else b)
)
a = torch.rand(shape, dtype=dtype).to(torch_device)
b = torch.rand(shape, dtype=dtype).to(torch_device)
c = torch.rand(shape, dtype=dtype).to(torch_device)
a, b, c = process_tensors(c, c_stride, a, a_stride, b, b_stride, inplace)
ans = add(a, b)
a_tensor = to_tensor(a, lib)
b_tensor = to_tensor(b, lib)
a_tensor, b_tensor = [to_tensor(tensor, lib) for tensor in [a, b]]
c_tensor = (
to_tensor(c, lib)
if inplace == Inplace.OUT_OF_PLACE
else (a_tensor if inplace == Inplace.INPLACE_A else b_tensor)
)
descriptor = infiniopAddDescriptor_t()
if sync is not None:
sync()
descriptor = infiniopAddDescriptor_t()
check_error(
lib.infiniopCreateAddDescriptor(
handle,
......@@ -84,74 +159,48 @@ def test(
)
# Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
c_tensor.descriptor.contents.invalidate()
a_tensor.descriptor.contents.invalidate()
b_tensor.descriptor.contents.invalidate()
for tensor in [a_tensor, b_tensor, c_tensor]:
tensor.destroyDesc(lib)
workspace_size = c_uint64(0)
check_error(
lib.infiniopAdd(descriptor, c_tensor.data, a_tensor.data, b_tensor.data, None)
lib.infiniopGetAddWorkspaceSize(descriptor, ctypes.byref(workspace_size))
)
assert torch.allclose(c, ans, atol=0, rtol=1e-3)
check_error(lib.infiniopDestroyAddDescriptor(descriptor))
def test_cpu(lib, test_cases):
device = DeviceEnum.DEVICE_CPU
handle = create_handle(lib, device)
for c_shape, a_shape, b_shape, inplace in test_cases:
# fmt: off
test(lib, handle, "cpu", c_shape, a_shape, b_shape, tensor_dtype=torch.float16, inplace=inplace)
test(lib, handle, "cpu", c_shape, a_shape, b_shape, tensor_dtype=torch.float32, inplace=inplace)
# fmt: on
destroy_handle(lib, handle)
def test_cuda(lib, test_cases):
device = DeviceEnum.DEVICE_CUDA
handle = create_handle(lib, device)
for c_shape, a_shape, b_shape, inplace in test_cases:
# fmt: off
test(lib, handle, "cuda", c_shape, a_shape, b_shape, tensor_dtype=torch.float16, inplace=inplace)
test(lib, handle, "cuda", c_shape, a_shape, b_shape, tensor_dtype=torch.float32, inplace=inplace)
# fmt: on
destroy_handle(lib, handle)
workspace = create_workspace(workspace_size.value, c.device)
def lib_add():
check_error(
lib.infiniopAdd(
descriptor,
workspace.data_ptr() if workspace is not None else None,
workspace_size.value,
c_tensor.data,
a_tensor.data,
b_tensor.data,
None,
)
)
lib_add()
def test_bang(lib, test_cases):
import torch_mlu
atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
if DEBUG:
debug(c, ans, atol=atol, rtol=rtol)
assert torch.allclose(c, ans, atol=atol, rtol=rtol)
device = DeviceEnum.DEVICE_BANG
handle = create_handle(lib, device)
for c_shape, a_shape, b_shape, inplace in test_cases:
# Profiling workflow
if PROFILE:
# fmt: off
test(lib, handle, "mlu", c_shape, a_shape, b_shape, tensor_dtype=torch.float16, inplace=inplace)
test(lib, handle, "mlu", c_shape, a_shape, b_shape, tensor_dtype=torch.float32, inplace=inplace)
profile_operation("PyTorch", lambda: add(a, b), torch_device, NUM_PRERUN, NUM_ITERATIONS)
profile_operation(" lib", lambda: lib_add(), torch_device, NUM_PRERUN, NUM_ITERATIONS)
# fmt: on
destroy_handle(lib, handle)
check_error(lib.infiniopDestroyAddDescriptor(descriptor))
if __name__ == "__main__":
test_cases = [
# fmt: off
# c_shape, a_shape, b_shape, inplace
# ((32, 150, 512000), (32, 150, 512000), (32, 150, 512000), Inplace.OUT_OF_PLACE),
# ((32, 150, 51200), (32, 150, 51200), (32, 150, 1), Inplace.OUT_OF_PLACE),
# ((32, 150, 51200), (32, 150, 51200), (32, 150, 51200), Inplace.OUT_OF_PLACE),
((1, 3), (1, 3), (1, 3), Inplace.OUT_OF_PLACE),
((), (), (), Inplace.OUT_OF_PLACE),
((3, 3), (3, 3), (3, 3), Inplace.OUT_OF_PLACE),
((2, 20, 3), (2, 1, 3), (2, 20, 3), Inplace.OUT_OF_PLACE),
((32, 20, 512), (32, 20, 512), (32, 20, 512), Inplace.INPLACE_A),
((32, 20, 512), (32, 20, 512), (32, 20, 512), Inplace.INPLACE_B),
((32, 256, 112, 112), (32, 256, 112, 1), (32, 256, 112, 112), Inplace.OUT_OF_PLACE),
((32, 256, 112, 112), (32, 256, 112, 112), (32, 256, 112, 112), Inplace.OUT_OF_PLACE),
((2, 4, 3), (2, 1, 3), (4, 3), Inplace.OUT_OF_PLACE),
((2, 3, 4, 5), (2, 3, 4, 5), (5,), Inplace.OUT_OF_PLACE),
((3, 2, 4, 5), (4, 5), (3, 2, 1, 1), Inplace.OUT_OF_PLACE),
# fmt: on
]
args = get_args()
lib = open_lib()
lib.infiniopCreateAddDescriptor.restype = c_int32
lib.infiniopCreateAddDescriptor.argtypes = [
infiniopHandle_t,
......@@ -160,25 +209,36 @@ if __name__ == "__main__":
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
]
lib.infiniopGetAddWorkspaceSize.restype = c_int32
lib.infiniopGetAddWorkspaceSize.argtypes = [
infiniopAddDescriptor_t,
POINTER(c_uint64),
]
lib.infiniopAdd.restype = c_int32
lib.infiniopAdd.argtypes = [
infiniopAddDescriptor_t,
c_void_p,
c_uint64,
c_void_p,
c_void_p,
c_void_p,
c_void_p,
]
lib.infiniopDestroyAddDescriptor.restype = c_int32
lib.infiniopDestroyAddDescriptor.argtypes = [
infiniopAddDescriptor_t,
]
if args.cpu:
test_cpu(lib, test_cases)
if args.cuda:
test_cuda(lib, test_cases)
if args.bang:
test_bang(lib, test_cases)
if not (args.cpu or args.cuda or args.bang):
test_cpu(lib, test_cases)
# Configure testing options
DEBUG = args.debug
PROFILE = args.profile
NUM_PRERUN = args.num_prerun
NUM_ITERATIONS = args.num_iterations
for device in get_test_devices(args):
test_operator(lib, device, test, _TEST_CASES, _TENSOR_DTYPES)
print("\033[92mTest passed!\033[0m")
from ctypes import POINTER, Structure, c_int32, c_uint64, c_void_p, c_float, c_bool
from ctypes import POINTER, Structure, c_int32, c_uint64, c_void_p
import ctypes
import sys
import os
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")))
from operatorspy import (
from libinfiniop import (
open_lib,
to_tensor,
CTensor,
DeviceEnum,
infiniopHandle_t,
infiniopTensorDescriptor_t,
create_handle,
destroy_handle,
check_error,
rearrange_tensor,
create_workspace,
get_args,
get_test_devices,
test_operator,
debug,
get_tolerance,
profile_operation,
)
from operatorspy.tests.test_utils import get_args
import torch
import torch.nn.functional as F
class AttentionDescriptor(Structure):
......@@ -95,12 +95,13 @@ def test(
pos,
k_cache_buf_len,
v_cache_buf_len,
dtype=torch.float16,
q_stride=None,
k_stride=None,
v_stride=None,
k_cache_stride=None,
v_cache_stride=None,
dtype=torch.float16,
sync=None,
):
print(
f"Testing Attention on {torch_device} with n_q_head:{n_q_head} n_kv_head:{n_kv_head} seq_len:{seq_len} head_dim:{head_dim} pos:{pos} "
......@@ -140,6 +141,9 @@ def test(
k_cache_tensor = to_tensor(k_cache, lib)
v_cache_tensor = to_tensor(v_cache, lib)
if sync is not None:
sync()
descriptor = infiniopAttentionDescriptor_t()
check_error(
lib.infiniopCreateAttentionDescriptor(
......@@ -156,12 +160,15 @@ def test(
)
# Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
out_tensor.descriptor.contents.invalidate()
q_tensor.descriptor.contents.invalidate()
k_tensor.descriptor.contents.invalidate()
v_tensor.descriptor.contents.invalidate()
k_cache_tensor.descriptor.contents.invalidate()
v_cache_tensor.descriptor.contents.invalidate()
for tensor in [
out_tensor,
q_tensor,
k_tensor,
v_tensor,
k_cache_tensor,
v_cache_tensor,
]:
tensor.destroyDesc(lib)
workspace_size = c_uint64(0)
check_error(
......@@ -169,152 +176,52 @@ def test(
)
workspace = create_workspace(workspace_size.value, out.device)
check_error(
lib.infiniopAttention(
descriptor,
workspace.data_ptr() if workspace is not None else None,
workspace_size.value,
out_tensor.data,
q_tensor.data,
k_tensor.data,
v_tensor.data,
k_cache_tensor.data,
v_cache_tensor.data,
None,
)
)
assert torch.allclose(out, ans, atol=1e-4, rtol=1e-2)
check_error(lib.infiniopDestroyAttentionDescriptor(descriptor))
def test_cpu(lib, test_cases):
device = DeviceEnum.DEVICE_CPU
handle = create_handle(lib, device)
for (
n_q_head,
n_kv_head,
seq_len,
head_dim,
pos,
k_cache_buf_len,
v_cache_buf_len,
dtype,
q_stride,
k_stride,
v_stride,
k_cache_stride,
v_cache_stride,
) in test_cases:
test(
lib,
handle,
"cpu",
n_q_head,
n_kv_head,
seq_len,
head_dim,
pos,
k_cache_buf_len,
v_cache_buf_len,
dtype,
q_stride,
k_stride,
v_stride,
k_cache_stride,
v_cache_stride,
def lib_attention():
check_error(
lib.infiniopAttention(
descriptor,
workspace.data_ptr() if workspace is not None else None,
workspace_size.value,
out_tensor.data,
q_tensor.data,
k_tensor.data,
v_tensor.data,
k_cache_tensor.data,
v_cache_tensor.data,
None,
)
)
destroy_handle(lib, handle)
def test_cuda(lib, test_cases):
device = DeviceEnum.DEVICE_CUDA
handle = create_handle(lib, device)
for (
n_q_head,
n_kv_head,
seq_len,
head_dim,
pos,
k_cache_buf_len,
v_cache_buf_len,
dtype,
q_stride,
k_stride,
v_stride,
k_cache_stride,
v_cache_stride,
) in test_cases:
test(
lib,
handle,
"cuda",
n_q_head,
n_kv_head,
seq_len,
head_dim,
pos,
k_cache_buf_len,
v_cache_buf_len,
dtype,
q_stride,
k_stride,
v_stride,
k_cache_stride,
v_cache_stride,
)
lib_attention()
destroy_handle(lib, handle)
def test_bang(lib, test_cases):
import torch_mlu
device = DeviceEnum.DEVICE_BANG
handle = create_handle(lib, device)
for (
n_q_head,
n_kv_head,
seq_len,
head_dim,
pos,
k_cache_buf_len,
v_cache_buf_len,
dtype,
q_stride,
k_stride,
v_stride,
k_cache_stride,
v_cache_stride,
) in test_cases:
test(
lib,
handle,
"mlu",
n_q_head,
n_kv_head,
seq_len,
head_dim,
pos,
k_cache_buf_len,
v_cache_buf_len,
dtype,
q_stride,
k_stride,
v_stride,
k_cache_stride,
v_cache_stride,
)
# Validate results
atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
if DEBUG:
debug(out, ans, atol=atol, rtol=rtol)
assert torch.allclose(out, ans, atol=atol, rtol=rtol)
destroy_handle(lib, handle)
# Profiling workflow
if PROFILE:
# fmt: off
profile_operation("PyTorch", lambda: attention(q, k, v, k_cache, v_cache, pos), torch_device, NUM_PRERUN, NUM_ITERATIONS)
profile_operation(" lib", lambda: lib_attention(), torch_device, NUM_PRERUN, NUM_ITERATIONS)
# fmt: on
check_error(lib.infiniopDestroyAttentionDescriptor(descriptor))
if __name__ == "__main__":
_TENSOR_DTYPES = [torch.float16, torch.float32]
# Tolerance map for different data types
_TOLERANCE_MAP = {
torch.float16: {"atol": 1e-4, "rtol": 1e-2},
torch.float32: {"atol": 1e-5, "rtol": 1e-3},
}
DEBUG = False
PROFILE = False
NUM_PRERUN = 10
NUM_ITERATIONS = 1000
test_cases = [
# prefill
(
......@@ -325,7 +232,6 @@ if __name__ == "__main__":
0, # pos
2048, # k_cache_buf_len
2048, # v_cache_buf_len
torch.float16, # dtype
[64, 2560, 1], # q_stride
[64, 2560, 1], # k_stride
[64, 2560, 1], # v_stride
......@@ -341,7 +247,6 @@ if __name__ == "__main__":
3, # pos
2048, # k_cache_buf_len
2048, # v_cache_buf_len
torch.float16, # dtype
[64, 2560, 1], # q_stride
[64, 2560, 1], # k_stride
[64, 2560, 1], # v_stride
......@@ -357,13 +262,26 @@ if __name__ == "__main__":
1, # pos
8, # k_cache_buf_len
8, # v_cache_buf_len
torch.float16, # dtype
None, # q_stride
None, # k_stride
None, # v_stride
None, # k_cache_stride
None, # v_cache_stride
),
(
28, # n_q_head
28, # n_kv_head
15, # seq_len
128, # head_dim
0, # pos
2048, # k_cache_buf_len
2048, # v_cache_buf_len
[128, 10752, 1], # q_stride
[128, 10752, 1], # k_stride
[128, 10752, 1], # v_stride
[128, 3584, 1], # k_cache_stride
[128, 3584, 1], # v_cache_stride
),
]
args = get_args()
lib = open_lib()
......@@ -406,12 +324,13 @@ if __name__ == "__main__":
infiniopAttentionDescriptor_t,
]
if args.cpu:
test_cpu(lib, test_cases)
if args.cuda:
test_cuda(lib, test_cases)
if args.bang:
test_bang(lib, test_cases)
if not (args.cpu or args.cuda or args.bang):
test_cpu(lib, test_cases)
# Configure testing options
DEBUG = args.debug
PROFILE = args.profile
NUM_PRERUN = args.num_prerun
NUM_ITERATIONS = args.num_iterations
# Execute tests
for device in get_test_devices(args):
test_operator(lib, device, test, test_cases, _TENSOR_DTYPES)
print("\033[92mTest passed!\033[0m")
......@@ -88,6 +88,7 @@ def test(
padding,
strides,
tensor_dtype=torch.float16,
sync=None
):
print(
f"Testing AvgPool on {torch_device} with x_shape:{x_shape} kernel_shape:{k_shape} padding:{padding} strides:{strides} dtype:{tensor_dtype}"
......@@ -109,6 +110,10 @@ def test(
x_tensor = to_tensor(x, lib)
y_tensor = to_tensor(y, lib)
if sync is not None:
sync()
descriptor = infiniopAvgPoolDescriptor_t()
check_error(
......
......@@ -16,18 +16,20 @@ from libinfiniop import (
get_tolerance,
profile_operation,
)
from enum import Enum, auto
# ==============================================================================
# Configuration (Internal Use Only)
# ==============================================================================
# These are not meant to be imported from other modules
_TEST_CASES = [
# x_shape, x_stride
((32, 512), None),
((32, 512), (1024, 1)),
((32, 5, 5), None),
((32, 20, 512), None),
((32, 20, 512), (20480, 512, 1)),  # Ascend does not yet support non-contiguous layouts
_TEST_CASES_ = [
# shape, x_stride, y_stride
((3, 3), None, None),
((32, 512), None, None),
((32, 512), (1024, 1), (1024, 1)),
((32, 5, 5), None, None),
((32, 20, 512), None, None),
((32, 20, 512), (20480, 512, 1), None),
]
# Data types used for testing
......@@ -35,9 +37,26 @@ _TENSOR_DTYPES = [torch.float16]
# Tolerance map for different data types
_TOLERANCE_MAP = {
torch.float16: {"atol": 0, "rtol": 1e-2},
torch.float16: {"atol": 1e-3, "rtol": 1e-2},
}
class Inplace(Enum):
OUT_OF_PLACE = auto()
INPLACE_X = auto()
_INPLACE = [
Inplace.INPLACE_X,
Inplace.OUT_OF_PLACE,
]
_TEST_CASES = [
test_case + (inplace_item,)
for test_case in _TEST_CASES_
for inplace_item in _INPLACE
]
DEBUG = False
PROFILE = False
NUM_PRERUN = 10
......@@ -59,12 +78,22 @@ def causal_softmax(x):
return torch.nn.functional.softmax(masked, dim=-1).to(type)
def test(lib, handle, torch_device, x_shape, x_stride=None, dtype=torch.float16):
def test(
lib,
handle,
torch_device,
shape,
x_stride=None,
y_stride=None,
inplace=Inplace.OUT_OF_PLACE,
dtype=torch.float16,
sync=None
):
print(
f"Testing CausalSoftmax on {torch_device} with x_shape:{x_shape} x_stride:{x_stride} dtype:{dtype}"
f"Testing CausalSoftmax on {torch_device} with shape:{shape} x_stride:{x_stride} y_stride:{y_stride} dtype:{dtype} inplace:{inplace}"
)
x = torch.rand(x_shape, dtype=dtype).to(torch_device)
x = torch.rand(shape, dtype=dtype).to(torch_device)
ans = causal_softmax(x)
......@@ -72,10 +101,21 @@ def test(lib, handle, torch_device, x_shape, x_stride=None, dtype=torch.float16)
x_tensor = to_tensor(x, lib)
if inplace == Inplace.INPLACE_X:
y = x
y_tensor = x_tensor
else:
y = torch.zeros(shape, dtype=dtype).to(torch_device)
y = rearrange_if_needed(y, y_stride)
y_tensor = to_tensor(y, lib)
if sync is not None:
sync()
descriptor = infiniopCausalSoftmaxDescriptor_t()
check_error(
lib.infiniopCreateCausalSoftmaxDescriptor(
handle, ctypes.byref(descriptor), x_tensor.descriptor
handle, ctypes.byref(descriptor), y_tensor.descriptor, x_tensor.descriptor
)
)
......@@ -96,17 +136,21 @@ def test(lib, handle, torch_device, x_shape, x_stride=None, dtype=torch.float16)
descriptor,
workspace.data_ptr() if workspace is not None else None,
workspace_size.value,
y_tensor.data,
x_tensor.data,
None,
)
)
lib_causal_softmax()
if sync is not None:
sync()
atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
if DEBUG:
debug(x, ans, atol=atol, rtol=rtol)
assert torch.allclose(x, ans, atol=atol, rtol=rtol)
debug(y, ans, atol=atol, rtol=rtol)
assert torch.allclose(y, ans, atol=atol, rtol=rtol)
# Profiling workflow
if PROFILE:
......
#!/usr/bin/env python3
import torch
import ctypes
from ctypes import POINTER, Structure, c_int32, c_size_t, c_uint64, c_void_p, c_float
from libinfiniop import (
infiniopHandle_t,
infiniopTensorDescriptor_t,
open_lib,
to_tensor,
get_test_devices,
check_error,
rearrange_if_needed,
create_workspace,
test_operator,
get_args,
debug,
get_tolerance,
profile_operation,
)
from enum import Enum, auto
# ==============================================================================
# Configuration (Internal Use Only)
# ==============================================================================
# These are not meant to be imported from other modules
_TEST_CASES_ = [
# shape, x_stride, y_stride, min_val, max_val
# Basic shapes
((10,), None, None, -1.0, 1.0),
((5, 10), None, None, -1.0, 1.0),
((2, 3, 4), None, None, -1.0, 1.0),
# Different min_val and max_val ranges
((10,), None, None, 0.0, 2.0),
((5, 10), None, None, 0.0, 2.0),
((2, 3, 4), None, None, 0.0, 2.0),
((10,), None, None, -2.0, 0.0),
((5, 10), None, None, -2.0, 0.0),
((2, 3, 4), None, None, -2.0, 0.0),
# Odd shapes
((7, 13), None, None, -1.0, 1.0),  # prime dimensions
((3, 5, 7), None, None, -1.0, 1.0),  # 3-D prime dimensions
# Non-standard shapes
((1, 1), None, None, -1.0, 1.0),  # minimal shape
((100, 100), None, None, -1.0, 1.0),  # large shape
((16, 16, 16), None, None, -1.0, 1.0),  # large 3-D
# Extreme ranges
((10,), None, None, -1000.0, 1000.0),  # wide range
((10,), None, None, -0.001, 0.001),  # narrow range
((10,), None, None, 0.0, 0.0),  # min == max
# Special shapes
((0,), None, None, -1.0, 1.0),  # empty tensor
((1, 0), None, None, -1.0, 1.0),  # empty dimension
]
_TENSOR_DTYPES = [torch.float16, torch.float32]
_TOLERANCE_MAP = {
torch.float16: {"atol": 1e-3, "rtol": 1e-3},
torch.float32: {"atol": 1e-7, "rtol": 1e-6},
}
class Inplace(Enum):
OUT_OF_PLACE = auto()
INPLACE_X = auto()
_INPLACE = [
Inplace.INPLACE_X,
Inplace.OUT_OF_PLACE,
]
_TEST_CASES = [
test_case + (inplace_item,)
for test_case in _TEST_CASES_
for inplace_item in _INPLACE
]
DEBUG = False
PROFILE = False
NUM_PRERUN = 10
NUM_ITERATIONS = 1000
class ClipDescriptor(Structure):
_fields_ = [("device_type", c_int32), ("device_id", c_int32)]
infiniopClipDescriptor_t = POINTER(ClipDescriptor)
def clip(x, min_val, max_val):
return torch.clamp(x, min_val, max_val)
def create_tensor_with_stride(shape, stride, dtype, device):
"""Create a tensor with specific stride without using view() that might cause errors."""
x = torch.rand(shape, dtype=dtype, device=device) * 4.0 - 2.0 # Range: [-2, 2]
if stride is None:
return x
if len(shape) == 2 and len(stride) == 2:
if stride == (shape[1], 1):
return x.contiguous()
elif stride == (1, shape[0]):
return x.transpose(0, 1).contiguous().transpose(0, 1)
else:
y = torch.zeros(shape, dtype=dtype, device=device)
for i in range(shape[0]):
for j in range(shape[1]):
y[i, j] = x[i, j]
return y.contiguous()
return x
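As an aside, a strided 2-D test tensor can also be produced more directly; a sketch of the column-major case (not what the script uses):

```python
import torch

base = torch.rand(5, 10)
col_major = base.t().contiguous().t()  # shape (5, 10), stride (1, 5)
assert col_major.stride() == (1, 5)
```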
def test(
lib,
handle,
torch_device,
shape,
x_stride=None,
y_stride=None,
min_val=-1.0,
max_val=1.0,
inplace=Inplace.OUT_OF_PLACE,
dtype=torch.float32,
):
print(
f"Testing Clip on {torch_device} with shape:{shape} x_stride:{x_stride} y_stride:{y_stride} "
f"min_val:{min_val} max_val:{max_val} dtype:{dtype} inplace:{inplace}"
)
x = create_tensor_with_stride(shape, x_stride, dtype, torch_device)
ans = clip(x, min_val, max_val)
x = rearrange_if_needed(x, x_stride)
x_tensor = to_tensor(x, lib)
if inplace == Inplace.INPLACE_X:
y = x
y_tensor = x_tensor
else:
y = torch.zeros(shape, dtype=dtype).to(torch_device)
y = rearrange_if_needed(y, y_stride)
y_tensor = to_tensor(y, lib)
descriptor = infiniopClipDescriptor_t()
check_error(
lib.infiniopCreateClipDescriptor(
handle, ctypes.byref(descriptor), y_tensor.descriptor, x_tensor.descriptor
)
)
workspace_size = c_uint64(0)
check_error(
lib.infiniopGetClipWorkspaceSize(
descriptor, ctypes.byref(workspace_size)
)
)
workspace = create_workspace(workspace_size.value, x.device)
def lib_clip():
check_error(
lib.infiniopClip(
descriptor,
workspace.data_ptr() if workspace is not None else None,
workspace_size.value,
y_tensor.data,
x_tensor.data,
c_float(min_val),
c_float(max_val),
None,
)
)
lib_clip()
# Now we can destroy the tensor descriptors
x_tensor.destroyDesc(lib)
if inplace != Inplace.INPLACE_X:
y_tensor.destroyDesc(lib)
atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
if DEBUG or not torch.allclose(y, ans, atol=atol, rtol=rtol):
print("\nExpected:")
print(ans)
print("\nActual:")
print(y)
print("\nDifference:")
print(torch.abs(y - ans))
print("\nMax difference:", torch.max(torch.abs(y - ans)).item())
debug(y, ans, atol=atol, rtol=rtol)
assert torch.allclose(y, ans, atol=atol, rtol=rtol)
# Profiling workflow
if PROFILE:
# fmt: off
profile_operation("PyTorch", lambda: clip(x, min_val, max_val), torch_device, NUM_PRERUN, NUM_ITERATIONS)
profile_operation(" lib", lambda: lib_clip(), torch_device, NUM_PRERUN, NUM_ITERATIONS)
# fmt: on
check_error(lib.infiniopDestroyClipDescriptor(descriptor))
if __name__ == "__main__":
args = get_args()
lib = open_lib()
lib.infiniopCreateClipDescriptor.restype = c_int32
lib.infiniopCreateClipDescriptor.argtypes = [
infiniopHandle_t,
POINTER(infiniopClipDescriptor_t),
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
]
lib.infiniopGetClipWorkspaceSize.restype = c_int32
lib.infiniopGetClipWorkspaceSize.argtypes = [
infiniopClipDescriptor_t,
POINTER(c_uint64),
]
lib.infiniopClip.restype = c_int32
lib.infiniopClip.argtypes = [
infiniopClipDescriptor_t,
c_void_p,
c_uint64,
c_void_p,
c_void_p,
c_float,
c_float,
c_void_p,
]
lib.infiniopDestroyClipDescriptor.restype = c_int32
lib.infiniopDestroyClipDescriptor.argtypes = [
infiniopClipDescriptor_t,
]
# Configure testing options
DEBUG = args.debug
PROFILE = args.profile
NUM_PRERUN = args.num_prerun
NUM_ITERATIONS = args.num_iterations
for device in get_test_devices(args):
test_operator(lib, device, test, _TEST_CASES, _TENSOR_DTYPES)
print("\033[92mTest passed!\033[0m")
......@@ -95,6 +95,7 @@ def test(
dilations,
tensor_stride=None,
tensor_dtype=torch.float16,
sync=None
):
assert len(pads) == len(strides) == len(dilations)
print(
......@@ -118,8 +119,11 @@ def test(
x_tensor = to_tensor(x, lib)
w_tensor = to_tensor(w, lib)
y_tensor = to_tensor(y, lib)
descriptor = infiniopConvDescriptor_t()
if sync is not None:
sync()
descriptor = infiniopConvDescriptor_t()
check_error(
lib.infiniopCreateConvDescriptor(
handle,
......
......@@ -52,6 +52,7 @@ def test(
y_stride=None,
x_stride=None,
tensor_dtype=torch.float16,
sync=None
):
print(
f"Testing Expand on {torch_device} with x_shape:{x_shape} y_shape:{y_shape} x_stride:{x_stride} y_stride:{y_stride} dtype:{tensor_dtype}"
......@@ -76,8 +77,11 @@ def test(
x_tensor = to_tensor(x, lib)
y_tensor = to_tensor(y, lib)
descriptor = infiniopExpandDescriptor_t()
if sync is not None:
sync()
descriptor = infiniopExpandDescriptor_t()
check_error(
lib.infiniopCreateExpandDescriptor(
handle,
......
......@@ -83,6 +83,7 @@ def test(
b_stride=None,
c_stride=None,
dtype=torch.float16,
sync=None
):
print(
f"Testing Gemm on {torch_device} with alpha:{alpha}, beta:{beta},"
......@@ -104,6 +105,9 @@ def test(
]
a_tensor, b_tensor, c_tensor = [to_tensor(tensor, lib) for tensor in [a, b, c]]
if sync is not None:
sync()
descriptor = infiniopGemmDescriptor_t()
check_error(
lib.infiniopCreateGemmDescriptor(
......
......@@ -51,6 +51,7 @@ def test(
torch_device,
x_shape,
tensor_dtype=torch.float16,
sync=None
):
print(
f"Testing GlobalAvgPool on {torch_device} with input tensor_shape: {x_shape} dtype: {tensor_dtype}"
......@@ -70,8 +71,11 @@ def test(
x_tensor = to_tensor(x, lib)
y_tensor = to_tensor(y, lib)
descriptor = infiniopGlobalAvgPoolDescriptor_t()
if sync is not None:
sync()
descriptor = infiniopGlobalAvgPoolDescriptor_t()
check_error(
lib.infiniopCreateGlobalAvgPoolDescriptor(
handle,
......
......@@ -10,7 +10,7 @@ def check_error(status):
raise Exception("Error code " + str(status))
def to_tensor(tensor, lib):
def to_tensor(tensor, lib, force_unsigned=False):
"""
Convert a PyTorch tensor to a library Tensor(descriptor, data).
"""
......@@ -37,6 +37,16 @@ def to_tensor(tensor, lib):
InfiniDtype.U64 if tensor.dtype == torch.uint64 else
None
)
if force_unsigned:
dt = (
InfiniDtype.U8 if dt == InfiniDtype.I8 else
InfiniDtype.U16 if dt == InfiniDtype.I16 else
InfiniDtype.U32 if dt == InfiniDtype.I32 else
InfiniDtype.U64 if dt == InfiniDtype.I64 else
dt
)
# fmt: on
assert dt is not None
# Create TensorDescriptor
......@@ -413,6 +423,7 @@ def test_operator(lib, device, test_func, test_cases, tensor_dtypes):
infiniDeviceEnum_str_map[device],
*test_case,
tensor_dtype,
get_sync_func(device)
)
finally:
destroy_handle(lib, handle)
......@@ -461,3 +472,15 @@ def get_test_devices(args):
devices_to_test = [InfiniDeviceEnum.CPU]
return devices_to_test
def get_sync_func(device):
import torch
device_str = infiniDeviceEnum_str_map[device]
if device == InfiniDeviceEnum.CPU:
sync = None
else:
sync = getattr(torch, device_str).synchronize
return sync
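Accelerator kernels launch asynchronously, so `test_operator` hands each test a per-device synchronize hook; for the CPU no hook is needed. A small usage sketch:

```python
assert get_sync_func(InfiniDeviceEnum.CPU) is None
# For a GPU device the hook resolves to e.g. torch.cuda.synchronize and
# should be called before validating results or stopping a timer.
```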
......@@ -83,6 +83,7 @@ def test(
padding,
strides,
tensor_dtype=torch.float16,
sync=None
):
print(
f"Testing MaxPool on {torch_device} with x_shape:{x_shape} kernel_shape:{k_shape} padding:{padding} strides:{strides} dtype:{tensor_dtype}"
......@@ -104,8 +105,11 @@ def test(
x_tensor = to_tensor(x, lib)
y_tensor = to_tensor(y, lib)
descriptor = infiniopMaxPoolDescriptor_t()
if sync is not None:
sync()
descriptor = infiniopMaxPoolDescriptor_t()
check_error(
lib.infiniopCreateMaxPoolDescriptor(
handle,
......