"dockerfile/cuda12.2.dockerfile" did not exist on "62a2913497a866754ae96d57ef445d8cec6e89b2"
Commit 8b59f4fe authored by Catheriany

Merge remote-tracking branch 'origin/main' into issue/204

parents 16506fc0 df1c6b5d
#ifndef __INFINIOP_REDUCE_KUNLUN_H__
#define __INFINIOP_REDUCE_KUNLUN_H__
#include "../../devices/kunlun/kunlun_common.h"
#include "../../devices/kunlun/kunlun_kernel_common.h"
namespace op::common_kunlun::reduce_op {
using namespace device::kunlun::kernel;
// Use the 16-float SIMD instruction to compute the reduction
// data_ptr points into local memory (LM)
static inline __device__ float sumSquaredF32(float *data_ptr, int count) {
......
......@@ -2,9 +2,19 @@
#define __INFINIOP_TENSOR_H__
#include "infiniop/tensor_descriptor.h"
#include "../utils.h"
#include <string>
#include <vector>
#define TRANSFORM_TENSOR_DESC(__TENSOR_DESC__, __OP__) \
do { \
auto __RESULT__ = __TENSOR_DESC__->__OP__; \
CHECK_RESULT(__RESULT__); \
__TENSOR_DESC__ = __RESULT__.take(); \
} while (0)
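A minimal usage sketch for the new macro, assuming it is expanded inside a function that returns infiniStatus_t so that CHECK_RESULT can return early on failure; the helper name mergeMiddleDims and the (1, 2) merge range are illustrative only:
// Hypothetical helper: collapse dims 1 and 2 of `desc` in place.
// dimMerge yields INFINI_STATUS_BAD_PARAM for an invalid or non-contiguous
// range, which CHECK_RESULT is assumed to propagate to the caller.
infiniStatus_t mergeMiddleDims(infiniopTensorDescriptor_t &desc) {
    TRANSFORM_TENSOR_DESC(desc, dimMerge(1, 2));
    return INFINI_STATUS_SUCCESS;
}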
struct InfiniopTensorDescriptor {
private:
// Datatype
......@@ -32,9 +42,9 @@ public:
bool hasBroadcastDim() const;
std::vector<size_t> getBroadcastDim() const;
infiniopTensorDescriptor_t dimMerge(size_t dim_start, size_t dim_end) const;
infiniopTensorDescriptor_t dimSplit(size_t axis, const std::vector<size_t> &dims) const;
infiniopTensorDescriptor_t dimPermute(const std::vector<size_t> &order) const;
utils::Result<infiniopTensorDescriptor_t> dimMerge(size_t dim_start, size_t dim_end) const;
utils::Result<infiniopTensorDescriptor_t> dimSplit(size_t axis, const std::vector<size_t> &dims) const;
utils::Result<infiniopTensorDescriptor_t> dimPermute(const std::vector<size_t> &order) const;
std::string toString() const;
};
......
......@@ -12,7 +12,7 @@ __C __export infiniStatus_t infiniopCreateTensorDescriptor(infiniopTensorDescrip
std::vector<ptrdiff_t> strides(ndim);
ptrdiff_t dsize = 1;
if (ndim > 0) {
for (size_t i = ndim - 1; i >= 0; i--) {
for (int i = (int)ndim - 1; i >= 0; i--) {
strides[i] = dsize;
dsize *= shape_[i];
}
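For context on the replaced loop: with an unsigned size_t index the condition i >= 0 is always true, so decrementing past zero wraps around to SIZE_MAX and indexes out of bounds; the new signed counter avoids that. An equivalent idiom that keeps the index unsigned, shown only as a sketch:
for (size_t i = ndim; i-- > 0;) {
    strides[i] = dsize; // same row-major stride computation as above
    dsize *= shape_[i];
}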
......@@ -104,10 +104,8 @@ std::vector<size_t> InfiniopTensorDescriptor::getBroadcastDim() const {
return res;
}
infiniopTensorDescriptor_t InfiniopTensorDescriptor::dimMerge(size_t dim_start, size_t dim_end) const {
if (dim_start > dim_end || dim_end >= ndim()) {
return nullptr;
}
utils::Result<infiniopTensorDescriptor_t> InfiniopTensorDescriptor::dimMerge(size_t dim_start, size_t dim_end) const {
CHECK_OR_RETURN(dim_start <= dim_end && dim_end < ndim(), INFINI_STATUS_BAD_PARAM);
size_t new_ndim = ndim() - (dim_end - dim_start);
std::vector<size_t> new_shape(new_ndim);
......@@ -120,9 +118,7 @@ infiniopTensorDescriptor_t InfiniopTensorDescriptor::dimMerge(size_t dim_start,
index++;
}
if (!isContiguous(dim_start, dim_end)) {
return nullptr;
}
CHECK_OR_RETURN(isContiguous(dim_start, dim_end), INFINI_STATUS_BAD_PARAM);
new_shape[index] = 1;
for (size_t i = dim_start; i <= dim_end; i++) {
......@@ -138,15 +134,15 @@ infiniopTensorDescriptor_t InfiniopTensorDescriptor::dimMerge(size_t dim_start,
index++;
}
return new InfiniopTensorDescriptor(_dtype, new_ndim, new_shape.data(), new_strides.data());
return utils::Result<infiniopTensorDescriptor_t>(
new InfiniopTensorDescriptor(_dtype, new_ndim, new_shape.data(), new_strides.data()));
}
infiniopTensorDescriptor_t InfiniopTensorDescriptor::dimSplit(size_t axis, const std::vector<size_t> &dims) const {
utils::Result<infiniopTensorDescriptor_t> InfiniopTensorDescriptor::dimSplit(size_t axis, const std::vector<size_t> &dims) const {
size_t ndim_ = ndim();
if (dim(axis) != std::accumulate(dims.begin(), dims.end(), (size_t)1, std::multiplies<size_t>())) {
return nullptr;
}
CHECK_OR_RETURN(dim(axis) == std::accumulate(dims.begin(), dims.end(), (size_t)1, std::multiplies<size_t>()),
INFINI_STATUS_BAD_PARAM);
size_t new_ndim = ndim_ + dims.size() - 1;
std::vector<size_t> new_shape(new_ndim);
......@@ -168,24 +164,22 @@ infiniopTensorDescriptor_t InfiniopTensorDescriptor::dimSplit(size_t axis, const
index++;
}
return new InfiniopTensorDescriptor(_dtype, new_ndim, new_shape.data(), new_strides.data());
return utils::Result<infiniopTensorDescriptor_t>(
new InfiniopTensorDescriptor(_dtype, new_ndim, new_shape.data(), new_strides.data()));
}
infiniopTensorDescriptor_t InfiniopTensorDescriptor::dimPermute(const std::vector<size_t> &order) const {
utils::Result<infiniopTensorDescriptor_t> InfiniopTensorDescriptor::dimPermute(const std::vector<size_t> &order) const {
auto ndim_ = ndim();
if (order.size() != ndim_) {
return nullptr;
}
CHECK_OR_RETURN(order.size() == ndim_, INFINI_STATUS_BAD_PARAM);
std::vector<size_t> new_shape(ndim_);
std::vector<ptrdiff_t> new_strides(ndim_);
for (size_t i = 0; i < ndim_; i++) {
if (std::find(order.begin(), order.end(), i) == order.end()) {
return nullptr;
}
CHECK_OR_RETURN(std::find(order.begin(), order.end(), i) != order.end(), INFINI_STATUS_BAD_PARAM);
new_shape[i] = dim(order[i]);
new_strides[i] = stride(order[i]);
}
return new InfiniopTensorDescriptor(_dtype, ndim_, new_shape.data(), new_strides.data());
return utils::Result<infiniopTensorDescriptor_t>(
new InfiniopTensorDescriptor(_dtype, ndim_, new_shape.data(), new_strides.data()));
}
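Callers that want a new descriptor rather than an in-place transform can consume the Result directly; in this sketch the helper name transposeLastTwo, the {1, 0} order, and the assumption that CHECK_RESULT returns the contained status from the enclosing function are illustrative:
infiniStatus_t transposeLastTwo(infiniopTensorDescriptor_t desc,
                                infiniopTensorDescriptor_t *out) {
    auto permuted = desc->dimPermute({1, 0}); // swap the two axes of a 2-D descriptor
    CHECK_RESULT(permuted);                   // propagate INFINI_STATUS_BAD_PARAM on failure
    *out = permuted.take();                   // take ownership of the new descriptor
    return INFINI_STATUS_SUCCESS;
}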
std::string InfiniopTensorDescriptor::toString() const {
......
......@@ -6,7 +6,8 @@
namespace infinirt::bang {
infiniStatus_t getDeviceCount(int *count) {
CHECK_BANGRT(cnrtGetDeviceCount(count));
unsigned int device_count = static_cast<unsigned int>(*count);
CHECK_BANGRT(cnrtGetDeviceCount(&device_count));
return INFINI_STATUS_SUCCESS;
}
......@@ -22,7 +23,7 @@ infiniStatus_t deviceSynchronize() {
infiniStatus_t streamCreate(infinirtStream_t *stream_ptr) {
cnrtQueue_t queue;
CHECK_BANGRT(cnrtQueueCreate(&stream));
CHECK_BANGRT(cnrtQueueCreate(&queue));
*stream_ptr = queue;
return INFINI_STATUS_SUCCESS;
}
......@@ -55,7 +56,7 @@ infiniStatus_t eventRecord(infinirtEvent_t event, infinirtStream_t stream) {
}
infiniStatus_t eventQuery(infinirtEvent_t event, infinirtEventStatus_t *status_ptr) {
auto status = cnrtQueryNotifier((cnrtQueue_t)stream);
auto status = cnrtQueryNotifier((cnrtNotifier_t)event);
if (status == cnrtSuccess) {
*status_ptr = INFINIRT_EVENT_COMPLETE;
} else if (status == cnrtErrorBusy) {
......@@ -112,12 +113,12 @@ cnrtMemTransDir_t toBangMemcpyKind(infinirtMemcpyKind_t kind) {
}
infiniStatus_t memcpy(void *dst, const void *src, size_t size, infinirtMemcpyKind_t kind) {
CHECK_BANGRT(cnrtMemcpy(dst, src, size, toBangMemcpyKind(kind)));
CHECK_BANGRT(cnrtMemcpy(dst, (void *)src, size, toBangMemcpyKind(kind)));
return INFINI_STATUS_SUCCESS;
}
infiniStatus_t memcpyAsync(void *dst, const void *src, size_t size, infinirtMemcpyKind_t kind, infinirtStream_t stream) {
CHECK_BANGRT(cnrtMemcpyAsync_V2(dst, src, size, (cnrtQueue_t)stream, toBangMemcpyKind(kind)));
CHECK_BANGRT(cnrtMemcpyAsync_V2(dst, (void *)src, size, (cnrtQueue_t)stream, toBangMemcpyKind(kind)));
return INFINI_STATUS_SUCCESS;
}
......
......@@ -3,7 +3,7 @@
#include "../infinirt_impl.h"
namespace infinirt::bang {
#ifdef ENABLE_BANG_API
#ifdef ENABLE_CAMBRICON_API
INFINIRT_DEVICE_API_IMPL
#else
INFINIRT_DEVICE_API_NOOP
......
......@@ -4,6 +4,7 @@
#include "bang/infinirt_bang.h"
#include "cpu/infinirt_cpu.h"
#include "cuda/infinirt_cuda.cuh"
#include "kunlun/infinirt_kunlun.h"
#include "maca/infinirt_maca.h"
#include "musa/infinirt_musa.h"
......@@ -66,8 +67,11 @@ __C infiniStatus_t infinirtGetDevice(infiniDevice_t *device_ptr, int *device_id_
case INFINI_DEVICE_MOORE: \
_status = infinirt::musa::API PARAMS; \
break; \
case INFINI_DEVICE_KUNLUN: \
_status = infinirt::kunlun::API PARAMS; \
break; \
default: \
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; \
_status = INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; \
} \
{ ACTION; } \
return _status; \
......
import numpy as np
import gguf
from typing import List
from .. import InfiniopTestWriter, InfiniopTestCase, np_dtype_to_ggml, gguf_strides
def swiglu(
a: np.ndarray,
b: np.ndarray,
):
c = a * b / (1.0 + np.exp(-b))
return c
class SwiGLUTestCase(InfiniopTestCase):
def __init__(
self,
a: np.ndarray,
stride_a: List[int] | None,
b: np.ndarray,
stride_b: List[int] | None,
c: np.ndarray,
stride_c: List[int] | None,
):
super().__init__("swiglu")
self.a = a
self.stride_a = stride_a
self.b = b
self.stride_b = stride_b
self.c = c
self.stride_c = stride_c
def write_test(self, test_writer: "InfiniopTestWriter"):
super().write_test(test_writer)
if self.stride_a is not None:
test_writer.add_array(test_writer.gguf_key("a.strides"), self.stride_a)
if self.stride_b is not None:
test_writer.add_array(test_writer.gguf_key("b.strides"), self.stride_b)
if self.stride_c is not None:
test_writer.add_array(test_writer.gguf_key("c.strides"), self.stride_c)
test_writer.add_tensor(
test_writer.gguf_key("a"), self.a, raw_dtype=np_dtype_to_ggml(self.a.dtype)
)
test_writer.add_tensor(
test_writer.gguf_key("b"), self.b, raw_dtype=np_dtype_to_ggml(self.b.dtype)
)
test_writer.add_tensor(
test_writer.gguf_key("c"), self.c, raw_dtype=np_dtype_to_ggml(self.c.dtype)
)
ans = swiglu(
self.a.astype(np.float64),
self.b.astype(np.float64),
)
test_writer.add_tensor(
test_writer.gguf_key("ans"), ans, raw_dtype=gguf.GGMLQuantizationType.F64
)
if __name__ == "__main__":
test_writer = InfiniopTestWriter("swiglu.gguf")
test_cases = [
SwiGLUTestCase(
np.random.rand(64, 128).astype(np.float32),
None,
np.random.rand(64, 128).astype(np.float32),
None,
np.random.rand(64, 128).astype(np.float32),
None,
),
SwiGLUTestCase(
np.random.rand(64, 121).astype(np.float32),
None,
np.random.rand(64, 121).astype(np.float32),
None,
np.random.rand(64, 121).astype(np.float32),
None,
),
SwiGLUTestCase(
np.random.rand(15, 512).astype(np.float32),
None,
np.random.rand(15, 512).astype(np.float32),
None,
np.random.rand(15, 512).astype(np.float32),
None,
),
SwiGLUTestCase(
np.random.rand(13, 4).astype(np.float32),
None,
np.random.rand(13, 4).astype(np.float32),
None,
np.random.rand(13, 4).astype(np.float32),
None,
),
SwiGLUTestCase(
np.random.rand(13, 4).astype(np.float16),
None,
np.random.rand(13, 4).astype(np.float16),
None,
np.random.rand(13, 4).astype(np.float16),
None,
),
SwiGLUTestCase(
np.random.rand(13, 4).astype(np.float32),
gguf_strides(10, 1),
np.random.rand(13, 4).astype(np.float32),
gguf_strides(10, 1),
np.random.rand(13, 4).astype(np.float32),
gguf_strides(10, 1),
),
SwiGLUTestCase(
np.random.rand(13, 4).astype(np.float16),
gguf_strides(10, 1),
np.random.rand(13, 4).astype(np.float16),
gguf_strides(10, 1),
np.random.rand(13, 4).astype(np.float16),
gguf_strides(10, 1),
),
SwiGLUTestCase(
np.random.rand(13, 4, 4).astype(np.float32),
None,
np.random.rand(13, 4, 4).astype(np.float32),
None,
np.random.rand(13, 4, 4).astype(np.float32),
None,
),
SwiGLUTestCase(
np.random.rand(13, 4, 4).astype(np.float16),
None,
np.random.rand(13, 4, 4).astype(np.float16),
None,
np.random.rand(13, 4, 4).astype(np.float16),
None,
),
SwiGLUTestCase(
np.random.rand(13, 4, 4).astype(np.float32),
gguf_strides(20, 4, 1),
np.random.rand(13, 4, 4).astype(np.float32),
gguf_strides(20, 4, 1),
np.random.rand(13, 4, 4).astype(np.float32),
gguf_strides(20, 4, 1),
),
SwiGLUTestCase(
np.random.rand(13, 4, 4).astype(np.float16),
gguf_strides(20, 4, 1),
np.random.rand(13, 4, 4).astype(np.float16),
gguf_strides(20, 4, 1),
np.random.rand(13, 4, 4).astype(np.float16),
gguf_strides(20, 4, 1),
),
SwiGLUTestCase(
np.random.rand(16, 5632).astype(np.float32),
None,
np.random.rand(16, 5632).astype(np.float32),
None,
np.random.rand(16, 5632).astype(np.float32),
None,
),
SwiGLUTestCase(
np.random.rand(16, 5632).astype(np.float16),
None,
np.random.rand(16, 5632).astype(np.float16),
None,
np.random.rand(16, 5632).astype(np.float16),
None,
),
SwiGLUTestCase(
np.random.rand(16, 5632).astype(np.float32),
gguf_strides(13312, 1),
np.random.rand(16, 5632).astype(np.float32),
gguf_strides(13312, 1),
np.random.rand(16, 5632).astype(np.float32),
gguf_strides(13312, 1),
),
SwiGLUTestCase(
np.random.rand(16, 5632).astype(np.float16),
gguf_strides(13312, 1),
np.random.rand(16, 5632).astype(np.float16),
gguf_strides(13312, 1),
np.random.rand(16, 5632).astype(np.float16),
gguf_strides(13312, 1),
),
SwiGLUTestCase(
np.random.rand(16, 5632).astype(np.float32),
gguf_strides(5632, 1),
np.random.rand(16, 5632).astype(np.float32),
gguf_strides(5632, 1),
np.random.rand(16, 5632).astype(np.float32),
gguf_strides(1, 16),
),
SwiGLUTestCase(
np.random.rand(16, 5632).astype(np.float16),
gguf_strides(5632, 1),
np.random.rand(16, 5632).astype(np.float16),
gguf_strides(5632, 1),
np.random.rand(16, 5632).astype(np.float16),
gguf_strides(1, 16),
),
SwiGLUTestCase(
np.random.rand(2, 3, 400).astype(np.float32),
gguf_strides(1200, 400, 1),
np.random.rand(2, 3, 400).astype(np.float32),
gguf_strides(1200, 400, 1),
np.random.rand(2, 3, 400).astype(np.float32),
gguf_strides(1, 2, 6),
),
SwiGLUTestCase(
np.random.rand(2, 3, 400).astype(np.float16),
gguf_strides(1200, 400, 1),
np.random.rand(2, 3, 400).astype(np.float16),
gguf_strides(1200, 400, 1),
np.random.rand(2, 3, 400).astype(np.float16),
gguf_strides(1, 2, 6),
),
SwiGLUTestCase(
np.random.rand(4, 4, 5632).astype(np.float32),
None,
np.random.rand(4, 4, 5632).astype(np.float32),
None,
np.random.rand(4, 4, 5632).astype(np.float32),
None,
),
SwiGLUTestCase(
np.random.rand(4, 4, 5632).astype(np.float16),
None,
np.random.rand(4, 4, 5632).astype(np.float16),
None,
np.random.rand(4, 4, 5632).astype(np.float16),
None,
),
SwiGLUTestCase(
np.random.rand(4, 4, 5632).astype(np.float32),
gguf_strides(45056, 5632, 1),
np.random.rand(4, 4, 5632).astype(np.float32),
gguf_strides(45056, 5632, 1),
np.random.rand(4, 4, 5632).astype(np.float32),
gguf_strides(45056, 5632, 1),
),
SwiGLUTestCase(
np.random.rand(4, 4, 5632).astype(np.float16),
gguf_strides(45056, 5632, 1),
np.random.rand(4, 4, 5632).astype(np.float16),
gguf_strides(45056, 5632, 1),
np.random.rand(4, 4, 5632).astype(np.float16),
gguf_strides(45056, 5632, 1),
),
]
test_writer.add_tests(test_cases)
test_writer.save()
from ctypes import POINTER, Structure, c_int32, c_uint64, c_void_p, c_float, c_bool
from ctypes import POINTER, Structure, c_int32, c_uint64, c_void_p
import ctypes
import sys
import os
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")))
from operatorspy import (
from libinfiniop import (
open_lib,
to_tensor,
CTensor,
DeviceEnum,
infiniopHandle_t,
infiniopTensorDescriptor_t,
create_handle,
destroy_handle,
check_error,
rearrange_tensor,
create_workspace,
get_args,
get_test_devices,
test_operator,
debug,
get_tolerance,
profile_operation,
)
from operatorspy.tests.test_utils import get_args
import torch
import torch.nn.functional as F
class AttentionDescriptor(Structure):
......@@ -95,12 +95,13 @@ def test(
pos,
k_cache_buf_len,
v_cache_buf_len,
dtype=torch.float16,
q_stride=None,
k_stride=None,
v_stride=None,
k_cache_stride=None,
v_cache_stride=None,
dtype=torch.float16,
sync=None,
):
print(
f"Testing Attention on {torch_device} with n_q_head:{n_q_head} n_kv_head:{n_kv_head} seq_len:{seq_len} head_dim:{head_dim} pos:{pos} "
......@@ -140,6 +141,9 @@ def test(
k_cache_tensor = to_tensor(k_cache, lib)
v_cache_tensor = to_tensor(v_cache, lib)
if sync is not None:
sync()
descriptor = infiniopAttentionDescriptor_t()
check_error(
lib.infiniopCreateAttentionDescriptor(
......@@ -156,12 +160,15 @@ def test(
)
# Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
out_tensor.descriptor.contents.invalidate()
q_tensor.descriptor.contents.invalidate()
k_tensor.descriptor.contents.invalidate()
v_tensor.descriptor.contents.invalidate()
k_cache_tensor.descriptor.contents.invalidate()
v_cache_tensor.descriptor.contents.invalidate()
for tensor in [
out_tensor,
q_tensor,
k_tensor,
v_tensor,
k_cache_tensor,
v_cache_tensor,
]:
tensor.destroyDesc(lib)
workspace_size = c_uint64(0)
check_error(
......@@ -169,152 +176,52 @@ def test(
)
workspace = create_workspace(workspace_size.value, out.device)
check_error(
lib.infiniopAttention(
descriptor,
workspace.data_ptr() if workspace is not None else None,
workspace_size.value,
out_tensor.data,
q_tensor.data,
k_tensor.data,
v_tensor.data,
k_cache_tensor.data,
v_cache_tensor.data,
None,
def lib_attention():
check_error(
lib.infiniopAttention(
descriptor,
workspace.data_ptr() if workspace is not None else None,
workspace_size.value,
out_tensor.data,
q_tensor.data,
k_tensor.data,
v_tensor.data,
k_cache_tensor.data,
v_cache_tensor.data,
None,
)
)
)
assert torch.allclose(out, ans, atol=1e-4, rtol=1e-2)
lib_attention()
check_error(lib.infiniopDestroyAttentionDescriptor(descriptor))
def test_cpu(lib, test_cases):
device = DeviceEnum.DEVICE_CPU
handle = create_handle(lib, device)
for (
n_q_head,
n_kv_head,
seq_len,
head_dim,
pos,
k_cache_buf_len,
v_cache_buf_len,
dtype,
q_stride,
k_stride,
v_stride,
k_cache_stride,
v_cache_stride,
) in test_cases:
test(
lib,
handle,
"cpu",
n_q_head,
n_kv_head,
seq_len,
head_dim,
pos,
k_cache_buf_len,
v_cache_buf_len,
dtype,
q_stride,
k_stride,
v_stride,
k_cache_stride,
v_cache_stride,
)
# Validate results
atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
if DEBUG:
debug(out, ans, atol=atol, rtol=rtol)
assert torch.allclose(out, ans, atol=atol, rtol=rtol)
destroy_handle(lib, handle)
def test_cuda(lib, test_cases):
device = DeviceEnum.DEVICE_CUDA
handle = create_handle(lib, device)
for (
n_q_head,
n_kv_head,
seq_len,
head_dim,
pos,
k_cache_buf_len,
v_cache_buf_len,
dtype,
q_stride,
k_stride,
v_stride,
k_cache_stride,
v_cache_stride,
) in test_cases:
test(
lib,
handle,
"cuda",
n_q_head,
n_kv_head,
seq_len,
head_dim,
pos,
k_cache_buf_len,
v_cache_buf_len,
dtype,
q_stride,
k_stride,
v_stride,
k_cache_stride,
v_cache_stride,
)
destroy_handle(lib, handle)
def test_bang(lib, test_cases):
import torch_mlu
device = DeviceEnum.DEVICE_BANG
handle = create_handle(lib, device)
for (
n_q_head,
n_kv_head,
seq_len,
head_dim,
pos,
k_cache_buf_len,
v_cache_buf_len,
dtype,
q_stride,
k_stride,
v_stride,
k_cache_stride,
v_cache_stride,
) in test_cases:
test(
lib,
handle,
"mlu",
n_q_head,
n_kv_head,
seq_len,
head_dim,
pos,
k_cache_buf_len,
v_cache_buf_len,
dtype,
q_stride,
k_stride,
v_stride,
k_cache_stride,
v_cache_stride,
)
destroy_handle(lib, handle)
# Profiling workflow
if PROFILE:
# fmt: off
profile_operation("PyTorch", lambda: attention(q, k, v, k_cache, v_cache, pos), torch_device, NUM_PRERUN, NUM_ITERATIONS)
profile_operation(" lib", lambda: lib_attention(), torch_device, NUM_PRERUN, NUM_ITERATIONS)
# fmt: on
check_error(lib.infiniopDestroyAttentionDescriptor(descriptor))
if __name__ == "__main__":
_TENSOR_DTYPES = [torch.float16, torch.float32]
# Tolerance map for different data types
_TOLERANCE_MAP = {
torch.float16: {"atol": 1e-4, "rtol": 1e-2},
torch.float32: {"atol": 1e-6, "rtol": 1e-4},
}
DEBUG = False
PROFILE = False
NUM_PRERUN = 10
NUM_ITERATIONS = 1000
test_cases = [
# prefill
(
......@@ -325,7 +232,6 @@ if __name__ == "__main__":
0, # pos
2048, # k_cache_buf_len
2048, # v_cache_buf_len
torch.float16, # dtype
[64, 2560, 1], # q_stride
[64, 2560, 1], # k_stride
[64, 2560, 1], # v_stride
......@@ -341,7 +247,6 @@ if __name__ == "__main__":
3, # pos
2048, # k_cache_buf_len
2048, # v_cache_buf_len
torch.float16, # dtype
[64, 2560, 1], # q_stride
[64, 2560, 1], # k_stride
[64, 2560, 1], # v_stride
......@@ -357,7 +262,6 @@ if __name__ == "__main__":
1, # pos
8, # k_cache_buf_len
8, # v_cache_buf_len
torch.float16, # dtype
None, # q_stride
None, # k_stride
None, # v_stride
......@@ -406,12 +310,13 @@ if __name__ == "__main__":
infiniopAttentionDescriptor_t,
]
if args.cpu:
test_cpu(lib, test_cases)
if args.cuda:
test_cuda(lib, test_cases)
if args.bang:
test_bang(lib, test_cases)
if not (args.cpu or args.cuda or args.bang):
test_cpu(lib, test_cases)
# Configure testing options
DEBUG = args.debug
PROFILE = args.profile
NUM_PRERUN = args.num_prerun
NUM_ITERATIONS = args.num_iterations
# Execute tests
for device in get_test_devices(args):
test_operator(lib, device, test, test_cases, _TENSOR_DTYPES)
print("\033[92mTest passed!\033[0m")
......@@ -88,6 +88,7 @@ def test(
padding,
strides,
tensor_dtype=torch.float16,
sync=None
):
print(
f"Testing AvgPool on {torch_device} with x_shape:{x_shape} kernel_shape:{k_shape} padding:{padding} strides:{strides} dtype:{tensor_dtype}"
......@@ -109,6 +110,10 @@ def test(
x_tensor = to_tensor(x, lib)
y_tensor = to_tensor(y, lib)
if sync is not None:
sync()
descriptor = infiniopAvgPoolDescriptor_t()
check_error(
......
......@@ -37,7 +37,7 @@ _TENSOR_DTYPES = [torch.float16]
# Tolerance map for different data types
_TOLERANCE_MAP = {
torch.float16: {"atol": 0, "rtol": 1e-2},
torch.float16: {"atol": 1e-3, "rtol": 1e-2},
}
......@@ -87,6 +87,7 @@ def test(
y_stride=None,
inplace=Inplace.OUT_OF_PLACE,
dtype=torch.float16,
sync=None
):
print(
f"Testing CausalSoftmax on {torch_device} with shape:{shape} x_stride:{x_stride} y_stride:{y_stride} dtype:{dtype} inplace:{inplace}"
......@@ -107,6 +108,9 @@ def test(
y = torch.zeros(shape, dtype=dtype).to(torch_device)
y = rearrange_if_needed(y, y_stride)
y_tensor = to_tensor(y, lib)
if sync is not None:
sync()
descriptor = infiniopCausalSoftmaxDescriptor_t()
check_error(
......@@ -139,6 +143,9 @@ def test(
)
lib_causal_softmax()
if sync is not None:
sync()
atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
if DEBUG:
......
......@@ -95,6 +95,7 @@ def test(
dilations,
tensor_stride=None,
tensor_dtype=torch.float16,
sync=None
):
assert len(pads) == len(strides) == len(dilations)
print(
......@@ -118,8 +119,11 @@ def test(
x_tensor = to_tensor(x, lib)
w_tensor = to_tensor(w, lib)
y_tensor = to_tensor(y, lib)
descriptor = infiniopConvDescriptor_t()
if sync is not None:
sync()
descriptor = infiniopConvDescriptor_t()
check_error(
lib.infiniopCreateConvDescriptor(
handle,
......
......@@ -52,6 +52,7 @@ def test(
y_stride=None,
x_stride=None,
tensor_dtype=torch.float16,
sync=None
):
print(
f"Testing Expand on {torch_device} with x_shape:{x_shape} y_shape:{y_shape} x_stride:{x_stride} y_stride:{y_stride} dtype:{tensor_dtype}"
......@@ -76,8 +77,11 @@ def test(
x_tensor = to_tensor(x, lib)
y_tensor = to_tensor(y, lib)
descriptor = infiniopExpandDescriptor_t()
if sync is not None:
sync()
descriptor = infiniopExpandDescriptor_t()
check_error(
lib.infiniopCreateExpandDescriptor(
handle,
......
......@@ -83,6 +83,7 @@ def test(
b_stride=None,
c_stride=None,
dtype=torch.float16,
sync=None
):
print(
f"Testing Gemm on {torch_device} with alpha:{alpha}, beta:{beta},"
......@@ -104,6 +105,9 @@ def test(
]
a_tensor, b_tensor, c_tensor = [to_tensor(tensor, lib) for tensor in [a, b, c]]
if sync is not None:
sync()
descriptor = infiniopGemmDescriptor_t()
check_error(
lib.infiniopCreateGemmDescriptor(
......
......@@ -51,6 +51,7 @@ def test(
torch_device,
x_shape,
tensor_dtype=torch.float16,
sync=None
):
print(
f"Testing GlobalAvgPool on {torch_device} with input tensor_shape: {x_shape} dtype: {tensor_dtype}"
......@@ -70,8 +71,11 @@ def test(
x_tensor = to_tensor(x, lib)
y_tensor = to_tensor(y, lib)
descriptor = infiniopGlobalAvgPoolDescriptor_t()
if sync is not None:
sync()
descriptor = infiniopGlobalAvgPoolDescriptor_t()
check_error(
lib.infiniopCreateGlobalAvgPoolDescriptor(
handle,
......
......@@ -423,6 +423,7 @@ def test_operator(lib, device, test_func, test_cases, tensor_dtypes):
infiniDeviceEnum_str_map[device],
*test_case,
tensor_dtype,
get_sync_func(device)
)
finally:
destroy_handle(lib, handle)
......@@ -471,3 +472,15 @@ def get_test_devices(args):
devices_to_test = [InfiniDeviceEnum.CPU]
return devices_to_test
def get_sync_func(device):
import torch
device_str = infiniDeviceEnum_str_map[device]
if device == InfiniDeviceEnum.CPU:
sync = None
else:
sync = getattr(torch, device_str).synchronize
return sync
......@@ -83,6 +83,7 @@ def test(
padding,
strides,
tensor_dtype=torch.float16,
sync=None
):
print(
f"Testing MaxPool on {torch_device} with x_shape:{x_shape} kernel_shape:{k_shape} padding:{padding} strides:{strides} dtype:{tensor_dtype}"
......@@ -104,8 +105,11 @@ def test(
x_tensor = to_tensor(x, lib)
y_tensor = to_tensor(y, lib)
descriptor = infiniopMaxPoolDescriptor_t()
if sync is not None:
sync()
descriptor = infiniopMaxPoolDescriptor_t()
check_error(
lib.infiniopCreateMaxPoolDescriptor(
handle,
......
......@@ -65,6 +65,7 @@ def test(
y_stride=None,
w12_stride=None,
w3_stride=None,
sync=None
):
print(
f"Testing MLP on {torch_device} with num_tokens:{num_tokens} hidden_size:{hidden_size} intermediate_size:{intermediate_size}"
......@@ -97,6 +98,10 @@ def test(
x_tensor = to_tensor(x, lib)
w12_tensor = to_tensor(w12, lib)
w3_tensor = to_tensor(w3, lib)
if sync is not None:
sync()
descriptor = infiniopMLPDescriptor_t()
check_error(
lib.infiniopCreateMLPDescriptor(
......
......@@ -103,6 +103,7 @@ def test(
topk,
temperature,
dtype=torch.float16,
sync=None
):
print(
f"Testing RandomSample on {torch_device} with voc:{voc} random_val:{random_val} topp:{topp} topk:{topk} temperature:{temperature} dtype:{dtype}"
......@@ -122,6 +123,9 @@ def test(
indices_tensor.descriptor.contents.dt = InfiniDtype.U64 # treat int64 as uint64
if sync is not None:
sync()
descriptor = infiniopRandomSampleDescriptor_t()
check_error(
lib.infiniopCreateRandomSampleDescriptor(
......
......@@ -131,6 +131,7 @@ def test(
x_stride,
y_stride,
dtype=torch.float16,
sync=None
):
print(
f"Testing Rerrange on {torch_device} with shape:{shape} x_stride:{x_stride} y_stride:{y_stride} dtype:{dtype}"
......@@ -145,6 +146,9 @@ def test(
]
x_tensor, y_tensor = [to_tensor(tensor, lib) for tensor in [x, y]]
if sync is not None:
sync()
descriptor = infiniopRearrangeDescriptor_t()
check_error(
......
......@@ -55,6 +55,7 @@ def test(
tensor_shape,
tensor_dtype=torch.float16,
inplace=Inplace.OUT_OF_PLACE,
sync=None
):
print(
f"Testing Relu on {torch_device} with tensor_shape:{tensor_shape} dtype:{tensor_dtype} inplace: {inplace.name}"
......@@ -78,8 +79,11 @@ def test(
x_tensor = to_tensor(x, lib)
y_tensor = to_tensor(y, lib) if inplace == Inplace.OUT_OF_PLACE else x_tensor
descriptor = infiniopReluDescriptor_t()
if sync is not None:
sync()
descriptor = infiniopReluDescriptor_t()
check_error(
lib.infiniopCreateReluDescriptor(
handle,
......