Unverified Commit c98e68be authored by goldenfox2025, committed by GitHub

Merge branch 'main' into issue180

parents d7c12d52 125afeb5
#ifndef __SWIGLU_KUNLUN_H__
#define __SWIGLU_KUNLUN_H__

#include "../../../devices/kunlun/kunlun_kernel_common.h"
#include "../../../elementwise/kunlun/elementwise_kunlun_kernel.h"

/// @brief SwiGLU elementwise op, computed in local memory (LM)
typedef struct SwiGLUOp {
private:
    // Standard logistic sigmoid: 1 / (1 + e^(-x))
    template <typename T>
    inline __device__ T sigmoid(T x) const {
        return 1.0f / (1.0f + exp(-x));
    }

public:
    // Every elementwise op must define this static member:
    // the number of input tensors the op consumes.
    static constexpr size_t num_inputs = 2;

    template <typename T>
    inline __device__ T operator()(const T *inputs) const {
        T up = inputs[0];
        T gate = inputs[1];
        // SwiGLU: out = up * gate * sigmoid(gate), i.e. up * SiLU(gate)
        T out = gate * sigmoid(gate) * up;
        return out;
    }
} SwiGLUOp;

// Define the SwiGLU kernel launch interface
LAUNCH_ELEMENTWISE_KERNEL_IMPL(SwiGLU, SwiGLUOp)
// Explicit template instantiation (float)
LAUNCH_ELEMENTWISE_KERNEL_INSTANTIATE(SwiGLU, float)

#endif // __SWIGLU_KUNLUN_H__
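For reference, the functor above and the NumPy generator later in this diff compute the same function:

$$\mathrm{SwiGLU}(\mathit{up},\, \mathit{gate}) \;=\; \mathit{up} \cdot \mathit{gate} \cdot \sigma(\mathit{gate}), \qquad \sigma(x) = \frac{1}{1 + e^{-x}}$$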
@@ -8,6 +8,9 @@
#ifdef ENABLE_CUDA_API
#include "cuda/swiglu_cuda.cuh"
#endif
#ifdef ENABLE_KUNLUN_API
#include "kunlun/swiglu_kunlun.h"
#endif
__C infiniStatus_t infiniopCreateSwiGLUDescriptor(
infiniopHandle_t handle,
@@ -33,6 +36,9 @@ __C infiniStatus_t infiniopCreateSwiGLUDescriptor(
#ifdef ENABLE_CUDA_API
CREATE(INFINI_DEVICE_NVIDIA, cuda);
#endif
#ifdef ENABLE_KUNLUN_API
CREATE(INFINI_DEVICE_KUNLUN, kunlun);
#endif
#ifdef ENABLE_CAMBRICON_MLU
case DevCambriconMlu: {
return bangCreateSwiGLUDescriptor((BangHandle_t)handle,
@@ -80,6 +86,9 @@ __C infiniStatus_t infiniopGetSwiGLUWorkspaceSize(infiniopSwiGLUDescriptor_t des
#ifdef ENABLE_CUDA_API
GET(INFINI_DEVICE_NVIDIA, cuda)
#endif
#ifdef ENABLE_KUNLUN_API
GET(INFINI_DEVICE_KUNLUN, kunlun)
#endif
#ifdef ENABLE_CAMBRICON_MLU
case DevCambriconMlu: {
return bangGetSwiGLUWorkspaceSize((SwiGLUBangDescriptor_t)desc, size);
@@ -127,6 +136,9 @@ __C infiniStatus_t infiniopSwiGLU(
#ifdef ENABLE_CUDA_API
CALCULATE(INFINI_DEVICE_NVIDIA, cuda);
#endif
#ifdef ENABLE_KUNLUN_API
CALCULATE(INFINI_DEVICE_KUNLUN, kunlun);
#endif
#ifdef ENABLE_CAMBRICON_MLU
case DevCambriconMlu: {
return bangSwiGLU((SwiGLUBangDescriptor_t)desc, c, a, b, stream);
@@ -168,6 +180,9 @@ infiniopDestroySwiGLUDescriptor(infiniopSwiGLUDescriptor_t desc) {
#ifdef ENABLE_CUDA_API
DELETE(INFINI_DEVICE_NVIDIA, cuda);
#endif
#ifdef ENABLE_KUNLUN_API
DELETE(INFINI_DEVICE_KUNLUN, kunlun);
#endif
#ifdef ENABLE_CAMBRICON_MLU
case DevCambriconMlu: {
return bangDestroySwiGLUDescriptor((SwiGLUBangDescriptor_t)desc);
......
#ifndef __INFINIOP_REDUCE_KUNLUN_H__
#define __INFINIOP_REDUCE_KUNLUN_H__

#include "../../devices/kunlun/kunlun_common.h"
#include "../../devices/kunlun/kunlun_kernel_common.h"

namespace op::common_kunlun::reduce_op {

using namespace device::kunlun::kernel;

// Compute a sum-of-squares reduction using 16-float vector instructions.
// data_ptr points into local memory (LM).
static inline __device__ float sumSquaredF32(float *data_ptr, int count) {
......
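The elided body reduces `count` floats held in local memory to one scalar; whatever vector width the hardware instruction covers, the returned value should equal the plain sum of squares:

$$\texttt{sumSquaredF32}(x,\, n) \;=\; \sum_{i=0}^{n-1} x_i^{2}$$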
@@ -4,6 +4,7 @@
#include "bang/infinirt_bang.h"
#include "cpu/infinirt_cpu.h"
#include "cuda/infinirt_cuda.cuh"
#include "kunlun/infinirt_kunlun.h"
#include "maca/infinirt_maca.h"
#include "musa/infinirt_musa.h"
@@ -66,8 +67,11 @@ __C infiniStatus_t infinirtGetDevice(infiniDevice_t *device_ptr, int *device_id_
case INFINI_DEVICE_MOORE: \
_status = infinirt::musa::API PARAMS; \
break; \
case INFINI_DEVICE_KUNLUN: \
_status = infinirt::kunlun::API PARAMS; \
break; \
default: \
/* Record the error instead of returning early so the ACTION block below still runs */ \
_status = INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; \
} \
{ ACTION; } \
return _status; \
......
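A rough Python sketch of the control-flow change in the dispatch macro above (all names here are invented for illustration): the unsupported-device branch now records a status instead of returning early, so the trailing `{ ACTION; }` block always executes before the status is returned.

```python
# Hypothetical sketch of the dispatch macro's control flow; not the real API.
BACKENDS = {"cuda": lambda: "INFINI_STATUS_SUCCESS"}

def dispatch(device, action):
    if device in BACKENDS:
        status = BACKENDS[device]()
    else:
        # The old macro returned here, skipping `action`; the new one records
        # the status so the trailing ACTION block still runs.
        status = "INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED"
    action()  # cleanup/bookkeeping now runs on every path
    return status

print(dispatch("unknown", lambda: None))  # INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
```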
import numpy as np
import gguf
from typing import List

from .. import InfiniopTestWriter, InfiniopTestCase, np_dtype_to_ggml, gguf_strides


def swiglu(
    a: np.ndarray,
    b: np.ndarray,
):
    # SwiGLU reference: c = a * b * sigmoid(b), written in division form
    c = a * b / (1.0 + np.exp(-b))
    return c
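As a quick sanity check (hypothetical, not part of the generator), the division form above is algebraically the same as the `up * gate * sigmoid(gate)` form used by the device functor:

```python
import numpy as np

rng = np.random.default_rng(0)
a = rng.random((4, 8))  # "up" operand
b = rng.random((4, 8))  # "gate" operand
# b / (1 + exp(-b)) == b * sigmoid(b), so both forms agree
assert np.allclose(swiglu(a, b), a * b * (1.0 / (1.0 + np.exp(-b))))
```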
class SwiGLUTestCase(InfiniopTestCase):
    def __init__(
        self,
        a: np.ndarray,
        stride_a: List[int] | None,
        b: np.ndarray,
        stride_b: List[int] | None,
        c: np.ndarray,
        stride_c: List[int] | None,
    ):
        super().__init__("swiglu")
        self.a = a
        self.stride_a = stride_a
        self.b = b
        self.stride_b = stride_b
        self.c = c
        self.stride_c = stride_c

    def write_test(self, test_writer: "InfiniopTestWriter"):
        super().write_test(test_writer)
        if self.stride_a is not None:
            test_writer.add_array(test_writer.gguf_key("a.strides"), self.stride_a)
        if self.stride_b is not None:
            test_writer.add_array(test_writer.gguf_key("b.strides"), self.stride_b)
        if self.stride_c is not None:
            test_writer.add_array(test_writer.gguf_key("c.strides"), self.stride_c)
        test_writer.add_tensor(
            test_writer.gguf_key("a"), self.a, raw_dtype=np_dtype_to_ggml(self.a.dtype)
        )
        test_writer.add_tensor(
            test_writer.gguf_key("b"), self.b, raw_dtype=np_dtype_to_ggml(self.b.dtype)
        )
        test_writer.add_tensor(
            test_writer.gguf_key("c"), self.c, raw_dtype=np_dtype_to_ggml(self.c.dtype)
        )
        # Reference answer computed in float64 for accuracy
        ans = swiglu(
            self.a.astype(np.float64),
            self.b.astype(np.float64),
        )
        test_writer.add_tensor(
            test_writer.gguf_key("ans"), ans, raw_dtype=gguf.GGMLQuantizationType.F64
        )
if __name__ == "__main__":
test_writer = InfiniopTestWriter("swiglu.gguf")
test_cases = [
SwiGLUTestCase(
np.random.rand(64, 128).astype(np.float32),
None,
np.random.rand(64, 128).astype(np.float32),
None,
np.random.rand(64, 128).astype(np.float32),
None,
),
SwiGLUTestCase(
np.random.rand(64, 121).astype(np.float32),
None,
np.random.rand(64, 121).astype(np.float32),
None,
np.random.rand(64, 121).astype(np.float32),
None,
),
SwiGLUTestCase(
np.random.rand(15, 512).astype(np.float32),
None,
np.random.rand(15, 512).astype(np.float32),
None,
np.random.rand(15, 512).astype(np.float32),
None,
),
SwiGLUTestCase(
np.random.rand(13, 4).astype(np.float32),
None,
np.random.rand(13, 4).astype(np.float32),
None,
np.random.rand(13, 4).astype(np.float32),
None,
),
SwiGLUTestCase(
np.random.rand(13, 4).astype(np.float16),
None,
np.random.rand(13, 4).astype(np.float16),
None,
np.random.rand(13, 4).astype(np.float16),
None,
),
SwiGLUTestCase(
np.random.rand(13, 4).astype(np.float32),
gguf_strides(10, 1),
np.random.rand(13, 4).astype(np.float32),
gguf_strides(10, 1),
np.random.rand(13, 4).astype(np.float32),
gguf_strides(10, 1),
),
SwiGLUTestCase(
np.random.rand(13, 4).astype(np.float16),
gguf_strides(10, 1),
np.random.rand(13, 4).astype(np.float16),
gguf_strides(10, 1),
np.random.rand(13, 4).astype(np.float16),
gguf_strides(10, 1),
),
SwiGLUTestCase(
np.random.rand(13, 4, 4).astype(np.float32),
None,
np.random.rand(13, 4, 4).astype(np.float32),
None,
np.random.rand(13, 4, 4).astype(np.float32),
None,
),
SwiGLUTestCase(
np.random.rand(13, 4, 4).astype(np.float16),
None,
np.random.rand(13, 4, 4).astype(np.float16),
None,
np.random.rand(13, 4, 4).astype(np.float16),
None,
),
SwiGLUTestCase(
np.random.rand(13, 4, 4).astype(np.float32),
gguf_strides(20, 4, 1),
np.random.rand(13, 4, 4).astype(np.float32),
gguf_strides(20, 4, 1),
np.random.rand(13, 4, 4).astype(np.float32),
gguf_strides(20, 4, 1),
),
SwiGLUTestCase(
np.random.rand(13, 4, 4).astype(np.float16),
gguf_strides(20, 4, 1),
np.random.rand(13, 4, 4).astype(np.float16),
gguf_strides(20, 4, 1),
np.random.rand(13, 4, 4).astype(np.float16),
gguf_strides(20, 4, 1),
),
SwiGLUTestCase(
np.random.rand(16, 5632).astype(np.float32),
None,
np.random.rand(16, 5632).astype(np.float32),
None,
np.random.rand(16, 5632).astype(np.float32),
None,
),
SwiGLUTestCase(
np.random.rand(16, 5632).astype(np.float16),
None,
np.random.rand(16, 5632).astype(np.float16),
None,
np.random.rand(16, 5632).astype(np.float16),
None,
),
SwiGLUTestCase(
np.random.rand(16, 5632).astype(np.float32),
gguf_strides(13312, 1),
np.random.rand(16, 5632).astype(np.float32),
gguf_strides(13312, 1),
np.random.rand(16, 5632).astype(np.float32),
gguf_strides(13312, 1),
),
SwiGLUTestCase(
np.random.rand(16, 5632).astype(np.float16),
gguf_strides(13312, 1),
np.random.rand(16, 5632).astype(np.float16),
gguf_strides(13312, 1),
np.random.rand(16, 5632).astype(np.float16),
gguf_strides(13312, 1),
),
SwiGLUTestCase(
np.random.rand(16, 5632).astype(np.float32),
gguf_strides(5632, 1),
np.random.rand(16, 5632).astype(np.float32),
gguf_strides(5632, 1),
np.random.rand(16, 5632).astype(np.float32),
gguf_strides(1, 16),
),
SwiGLUTestCase(
np.random.rand(16, 5632).astype(np.float16),
gguf_strides(5632, 1),
np.random.rand(16, 5632).astype(np.float16),
gguf_strides(5632, 1),
np.random.rand(16, 5632).astype(np.float16),
gguf_strides(1, 16),
),
SwiGLUTestCase(
np.random.rand(2, 3, 400).astype(np.float32),
gguf_strides(1200, 400, 1),
np.random.rand(2, 3, 400).astype(np.float32),
gguf_strides(1200, 400, 1),
np.random.rand(2, 3, 400).astype(np.float32),
gguf_strides(1, 2, 6),
),
SwiGLUTestCase(
np.random.rand(2, 3, 400).astype(np.float16),
gguf_strides(1200, 400, 1),
np.random.rand(2, 3, 400).astype(np.float16),
gguf_strides(1200, 400, 1),
np.random.rand(2, 3, 400).astype(np.float16),
gguf_strides(1, 2, 6),
),
SwiGLUTestCase(
np.random.rand(4, 4, 5632).astype(np.float32),
None,
np.random.rand(4, 4, 5632).astype(np.float32),
None,
np.random.rand(4, 4, 5632).astype(np.float32),
None,
),
SwiGLUTestCase(
np.random.rand(4, 4, 5632).astype(np.float16),
None,
np.random.rand(4, 4, 5632).astype(np.float16),
None,
np.random.rand(4, 4, 5632).astype(np.float16),
None,
),
SwiGLUTestCase(
np.random.rand(4, 4, 5632).astype(np.float32),
gguf_strides(45056, 5632, 1),
np.random.rand(4, 4, 5632).astype(np.float32),
gguf_strides(45056, 5632, 1),
np.random.rand(4, 4, 5632).astype(np.float32),
gguf_strides(45056, 5632, 1),
),
SwiGLUTestCase(
np.random.rand(4, 4, 5632).astype(np.float16),
gguf_strides(45056, 5632, 1),
np.random.rand(4, 4, 5632).astype(np.float16),
gguf_strides(45056, 5632, 1),
np.random.rand(4, 4, 5632).astype(np.float16),
gguf_strides(45056, 5632, 1),
),
]
test_writer.add_tests(test_cases)
test_writer.save()
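The strided cases above describe non-contiguous layouts. Assuming `gguf_strides` simply records per-dimension element strides (an assumption; see its definition in the test framework), a `(13, 4)` tensor with strides `(10, 1)` has rows that start 10 elements apart, which NumPy can emulate:

```python
import numpy as np
from numpy.lib.stride_tricks import as_strided

buf = np.arange(13 * 10, dtype=np.float32)  # backing buffer with row padding
# as_strided takes strides in bytes: 10 elements * 4 bytes, 1 element * 4 bytes
view = as_strided(buf, shape=(13, 4), strides=(40, 4))
print(view[1, 0])  # 10.0 -- the second row begins at element 10
```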
@@ -101,6 +101,7 @@ def test(
    v_stride=None,
    k_cache_stride=None,
    v_cache_stride=None,
    sync=None,
):
    print(
        f"Testing Attention on {torch_device} with n_q_head:{n_q_head} n_kv_head:{n_kv_head} seq_len:{seq_len} head_dim:{head_dim} pos:{pos} "
@@ -139,6 +140,9 @@ def test(
    v_tensor = to_tensor(v, lib)
    k_cache_tensor = to_tensor(k_cache, lib)
    v_cache_tensor = to_tensor(v_cache, lib)

    if sync is not None:
        sync()

    descriptor = infiniopAttentionDescriptor_t()
    check_error(
......
@@ -88,6 +88,7 @@ def test(
    padding,
    strides,
    tensor_dtype=torch.float16,
    sync=None,
):
    print(
        f"Testing AvgPool on {torch_device} with x_shape:{x_shape} kernel_shape:{k_shape} padding:{padding} strides:{strides} dtype:{tensor_dtype}"
@@ -109,6 +110,10 @@ def test(
    x_tensor = to_tensor(x, lib)
    y_tensor = to_tensor(y, lib)

    if sync is not None:
        sync()

    descriptor = infiniopAvgPoolDescriptor_t()
    check_error(
......
@@ -87,6 +87,7 @@ def test(
    y_stride=None,
    inplace=Inplace.OUT_OF_PLACE,
    dtype=torch.float16,
    sync=None,
):
    print(
        f"Testing CausalSoftmax on {torch_device} with shape:{shape} x_stride:{x_stride} y_stride:{y_stride} dtype:{dtype} inplace:{inplace}"
@@ -107,6 +108,9 @@ def test(
    y = torch.zeros(shape, dtype=dtype).to(torch_device)
    y = rearrange_if_needed(y, y_stride)
    y_tensor = to_tensor(y, lib)

    if sync is not None:
        sync()

    descriptor = infiniopCausalSoftmaxDescriptor_t()
    check_error(
......
@@ -95,6 +95,7 @@ def test(
    dilations,
    tensor_stride=None,
    tensor_dtype=torch.float16,
    sync=None,
):
    assert len(pads) == len(strides) == len(dilations)
    print(
@@ -118,8 +119,11 @@ def test(
    x_tensor = to_tensor(x, lib)
    w_tensor = to_tensor(w, lib)
    y_tensor = to_tensor(y, lib)

    if sync is not None:
        sync()

    descriptor = infiniopConvDescriptor_t()
    check_error(
        lib.infiniopCreateConvDescriptor(
            handle,
......
@@ -52,6 +52,7 @@ def test(
    y_stride=None,
    x_stride=None,
    tensor_dtype=torch.float16,
    sync=None,
):
    print(
        f"Testing Expand on {torch_device} with x_shape:{x_shape} y_shape:{y_shape} x_stride:{x_stride} y_stride:{y_stride} dtype:{tensor_dtype}"
@@ -76,8 +77,11 @@ def test(
    x_tensor = to_tensor(x, lib)
    y_tensor = to_tensor(y, lib)

    if sync is not None:
        sync()

    descriptor = infiniopExpandDescriptor_t()
    check_error(
        lib.infiniopCreateExpandDescriptor(
            handle,
......
@@ -83,6 +83,7 @@ def test(
    b_stride=None,
    c_stride=None,
    dtype=torch.float16,
    sync=None,
):
    print(
        f"Testing Gemm on {torch_device} with alpha:{alpha}, beta:{beta},"
@@ -104,6 +105,9 @@ def test(
    ]
    a_tensor, b_tensor, c_tensor = [to_tensor(tensor, lib) for tensor in [a, b, c]]

    if sync is not None:
        sync()

    descriptor = infiniopGemmDescriptor_t()
    check_error(
        lib.infiniopCreateGemmDescriptor(
......
@@ -51,6 +51,7 @@ def test(
    torch_device,
    x_shape,
    tensor_dtype=torch.float16,
    sync=None,
):
    print(
        f"Testing GlobalAvgPool on {torch_device} with input tensor_shape: {x_shape} dtype: {tensor_dtype}"
@@ -70,8 +71,11 @@ def test(
    x_tensor = to_tensor(x, lib)
    y_tensor = to_tensor(y, lib)

    if sync is not None:
        sync()

    descriptor = infiniopGlobalAvgPoolDescriptor_t()
    check_error(
        lib.infiniopCreateGlobalAvgPoolDescriptor(
            handle,
......
@@ -423,6 +423,7 @@ def test_operator(lib, device, test_func, test_cases, tensor_dtypes):
                    infiniDeviceEnum_str_map[device],
                    *test_case,
                    tensor_dtype,
                    get_sync_func(device),
                )
            finally:
                destroy_handle(lib, handle)
@@ -471,3 +472,14 @@ def get_test_devices(args):
        devices_to_test = [InfiniDeviceEnum.CPU]
    return devices_to_test


def get_sync_func(device):
    import torch

    # CPU needs no device synchronization; every other backend uses the
    # matching torch module's synchronize (e.g., torch.cuda.synchronize).
    if infiniDeviceEnum_str_map[device] == "cpu":
        sync = None
    else:
        sync = getattr(torch, infiniDeviceEnum_str_map[device]).synchronize
    return sync
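A minimal sketch of how the returned callable is used by the per-op tests in this diff (assuming a CUDA build of torch; other backends swap in their own `synchronize`):

```python
import torch

sync = torch.cuda.synchronize if torch.cuda.is_available() else None

# Same pattern as the test functions above: flush pending device work
# before creating an infiniop descriptor from the freshly built tensors.
if sync is not None:
    sync()
```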
@@ -83,6 +83,7 @@ def test(
    padding,
    strides,
    tensor_dtype=torch.float16,
    sync=None,
):
    print(
        f"Testing MaxPool on {torch_device} with x_shape:{x_shape} kernel_shape:{k_shape} padding:{padding} strides:{strides} dtype:{tensor_dtype}"
@@ -104,8 +105,11 @@ def test(
    x_tensor = to_tensor(x, lib)
    y_tensor = to_tensor(y, lib)

    if sync is not None:
        sync()

    descriptor = infiniopMaxPoolDescriptor_t()
    check_error(
        lib.infiniopCreateMaxPoolDescriptor(
            handle,
......
@@ -65,6 +65,7 @@ def test(
    y_stride=None,
    w12_stride=None,
    w3_stride=None,
    sync=None,
):
    print(
        f"Testing MLP on {torch_device} with num_tokens:{num_tokens} hidden_size:{hidden_size} intermediate_size:{intermediate_size}"
@@ -97,6 +98,10 @@ def test(
    x_tensor = to_tensor(x, lib)
    w12_tensor = to_tensor(w12, lib)
    w3_tensor = to_tensor(w3, lib)

    if sync is not None:
        sync()

    descriptor = infiniopMLPDescriptor_t()
    check_error(
        lib.infiniopCreateMLPDescriptor(
......
@@ -103,6 +103,7 @@ def test(
    topk,
    temperature,
    dtype=torch.float16,
    sync=None,
):
    print(
        f"Testing RandomSample on {torch_device} with voc:{voc} random_val:{random_val} topp:{topp} topk:{topk} temperature:{temperature} dtype:{dtype}"
@@ -122,6 +123,9 @@ def test(
    indices_tensor.descriptor.contents.dt = InfiniDtype.U64  # treat int64 as uint64

    if sync is not None:
        sync()

    descriptor = infiniopRandomSampleDescriptor_t()
    check_error(
        lib.infiniopCreateRandomSampleDescriptor(
......
@@ -131,6 +131,7 @@ def test(
    x_stride,
    y_stride,
    dtype=torch.float16,
    sync=None,
):
    print(
        f"Testing Rearrange on {torch_device} with shape:{shape} x_stride:{x_stride} y_stride:{y_stride} dtype:{dtype}"
@@ -145,6 +146,9 @@ def test(
    ]
    x_tensor, y_tensor = [to_tensor(tensor, lib) for tensor in [x, y]]

    if sync is not None:
        sync()

    descriptor = infiniopRearrangeDescriptor_t()
    check_error(
......
@@ -55,6 +55,7 @@ def test(
    tensor_shape,
    tensor_dtype=torch.float16,
    inplace=Inplace.OUT_OF_PLACE,
    sync=None,
):
    print(
        f"Testing Relu on {torch_device} with tensor_shape:{tensor_shape} dtype:{tensor_dtype} inplace: {inplace.name}"
@@ -78,8 +79,11 @@ def test(
    x_tensor = to_tensor(x, lib)
    y_tensor = to_tensor(y, lib) if inplace == Inplace.OUT_OF_PLACE else x_tensor

    if sync is not None:
        sync()

    descriptor = infiniopReluDescriptor_t()
    check_error(
        lib.infiniopCreateReluDescriptor(
            handle,
......
@@ -72,6 +72,7 @@ def test(
    x_stride,
    w_dtype=torch.float16,
    dtype=torch.float16,
    sync=None,
):
    print(
        f"Testing RMS_Norm on {torch_device} with y_shape:{y_shape} x_shape:{x_shape} w_shape:{w_shape}"
@@ -89,9 +90,11 @@ def test(
        rearrange_if_needed(tensor, stride)
        for tensor, stride in zip([x, y], [x_stride, y_stride])
    ]
    x_tensor, y_tensor, w_tensor = [to_tensor(tensor, lib) for tensor in [x, y, w]]

    if sync is not None:
        sync()

    descriptor = infiniopRMSNormDescriptor_t()
    check_error(
......
@@ -117,6 +117,7 @@ def test(
    y_strides=None,
    inplace=Inplace.OUT_OF_PLACE,
    dtype=torch.float32,
    sync=None,
):
    if inplace == Inplace.INPLACE_X:
        y_strides = x_strides
@@ -147,8 +148,8 @@ def test(
    else:
        y_tensor = to_tensor(y, lib)

    if sync is not None:
        sync()

    check_error(
        lib.infiniopCreateRoPEDescriptor(
......