Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
jerrrrry
infinicore
Commits
2f2a74b6
Commit
2f2a74b6
authored
Mar 24, 2025
by
Zimin Li
Browse files
Merge remote-tracking branch 'upstream/main'
parents
1d95ddf3
70806eed
Changes
80
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
755 additions
and
560 deletions
+755
-560
src/infiniop/ops/random_sample/random_sample.h
src/infiniop/ops/random_sample/random_sample.h
+56
-0
src/infiniop/ops/rms_norm/cuda/rms_norm_cuda.cu
src/infiniop/ops/rms_norm/cuda/rms_norm_cuda.cu
+95
-0
src/infiniop/ops/rms_norm/cuda/rms_norm_cuda.cuh
src/infiniop/ops/rms_norm/cuda/rms_norm_cuda.cuh
+8
-0
src/infiniop/ops/rms_norm/cuda/rms_norm_kernel.cuh
src/infiniop/ops/rms_norm/cuda/rms_norm_kernel.cuh
+37
-0
src/infiniop/ops/rms_norm/operator.cc
src/infiniop/ops/rms_norm/operator.cc
+11
-19
src/infiniop/ops/rms_norm/rms_norm.h
src/infiniop/ops/rms_norm/rms_norm.h
+4
-0
src/infiniop/reduce/cuda/reduce.cuh
src/infiniop/reduce/cuda/reduce.cuh
+26
-0
src/utils-test/test_rearrange.cc
src/utils-test/test_rearrange.cc
+10
-12
src/utils/check.h
src/utils/check.h
+13
-0
src/utils/rearrange.cc
src/utils/rearrange.cc
+1
-0
test/infiniop-test/README.md
test/infiniop-test/README.md
+67
-0
test/infiniop-test/test_generate/__init__.py
test/infiniop-test/test_generate/__init__.py
+1
-0
test/infiniop-test/test_generate/infiniop_test.py
test/infiniop-test/test_generate/infiniop_test.py
+68
-0
test/infiniop-test/test_generate/testcases/__init__.py
test/infiniop-test/test_generate/testcases/__init__.py
+0
-0
test/infiniop-test/test_generate/testcases/gemm.py
test/infiniop-test/test_generate/testcases/gemm.py
+196
-0
test/infiniop/gemm.py
test/infiniop/gemm.py
+122
-293
test/infiniop/matmul.py
test/infiniop/matmul.py
+0
-212
test/infiniop/random_sample.py
test/infiniop/random_sample.py
+12
-23
xmake.lua
xmake.lua
+5
-1
xmake/test.lua
xmake/test.lua
+23
-0
No files found.
src/infiniop/ops/random_sample/random_sample.h
0 → 100644
View file @
2f2a74b6
#ifndef __RANDOM_SAMPLE_H__
#define __RANDOM_SAMPLE_H__
#include "../../../utils.h"
#include "../../operator.h"
// Declares a device-specific random_sample Descriptor class inside
// op::random_sample::<NAMESPACE>. Each backend (cpu, cuda, ...) expands this
// macro once and then provides definitions for the declared members.
// The Descriptor holds an opaque, backend-owned state blob plus the dtype and
// size information validated at create() time.
#define DESCRIPTOR(NAMESPACE)                                 \
                                                              \
    namespace op::random_sample::NAMESPACE {                  \
    class Descriptor final : public InfiniopDescriptor {      \
        struct Opaque; /* backend-private state */            \
        Opaque *_opaque;                                      \
                                                              \
        infiniDtype_t _dt_i, _dt_p; /* index & prob dtypes */ \
        size_t _n, _min_workspace_size;                       \
                                                              \
        Descriptor(                                           \
            infiniDtype_t dt_i,                               \
            infiniDtype_t dt_p,                               \
            size_t n,                                         \
            size_t min_workspace_size,                        \
            Opaque *opaque,                                   \
            infiniDevice_t device_type,                       \
            int device_id)                                    \
            : InfiniopDescriptor{device_type, device_id},     \
              _opaque(opaque),                                \
              _dt_i(dt_i),                                    \
              _dt_p(dt_p),                                    \
              _n(n),                                          \
              _min_workspace_size(min_workspace_size) {}      \
                                                              \
    public:                                                   \
        ~Descriptor();                                        \
                                                              \
        static infiniStatus_t create(                         \
            infiniopHandle_t handle,                          \
            Descriptor **desc_ptr,                            \
            infiniopTensorDescriptor_t result_desc,           \
            infiniopTensorDescriptor_t probs_desc);           \
                                                              \
        size_t minWorkspaceSize() const;                      \
                                                              \
        infiniStatus_t calculate(                             \
            void *workspace,                                  \
            size_t workspace_size,                            \
            void *result,                                     \
            const void *probs,                                \
            float random_val,                                 \
            float topp,                                       \
            int topk,                                         \
            float temperature,                                \
            void *stream) const;                              \
    };                                                        \
    }
#endif // __RANDOM_SAMPLE_H__
src/infiniop/ops/rms_norm/cuda/rms_norm_cuda.cu
0 → 100644
View file @
2f2a74b6
#include "../../../devices/cuda/cuda_common.cuh"
#include "rms_norm_cuda.cuh"
#include "rms_norm_kernel.cuh"
#include <memory>
#include <stdint.h>

namespace op::rms_norm::cuda {

// Backend-private state: a shared handle to the device internals
// (capabilities such as max threads per block).
struct Descriptor::Opaque {
    std::shared_ptr<device::cuda::Handle::Internal> internal;
};

Descriptor::~Descriptor() {
    delete _opaque;
}

/// Validate the tensor descriptors and build a CUDA rms_norm descriptor.
/// Fails with INFINI_STATUS_BAD_TENSOR_STRIDES unless the last dimension
/// of both x and y is contiguous (stride 1).
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t y_desc,
    infiniopTensorDescriptor_t x_desc,
    infiniopTensorDescriptor_t w_desc,
    float epsilon) {
    RMSNormInfo info;
    CHECK_STATUS(createRMSNormInfo(&info, y_desc, x_desc, w_desc, epsilon));

    // Only a contiguous last dimension is supported by the kernel.
    if (info.x_strides[1] != 1 || info.y_strides[1] != 1) {
        return INFINI_STATUS_BAD_TENSOR_STRIDES;
    }

    auto *cuda_handle = reinterpret_cast<device::cuda::Handle *>(handle);
    *desc_ptr = new Descriptor(
        new Opaque{cuda_handle->internal()},
        info,
        0, // no workspace required
        handle->device,
        handle->device_id);
    return INFINI_STATUS_SUCCESS;
}

/// Dispatch rmsnormBlock on the concrete (data, weight) dtype pair.
/// Supported combinations: (F16,F16), (F16,F32), (F32,F32); the
/// accumulator type is always float.
template <unsigned int BLOCK_SIZE>
infiniStatus_t launchKernel(
    uint32_t batch_size,
    size_t dim,
    void *y,
    infiniDtype_t atype,
    ptrdiff_t stride_y,
    const void *x,
    ptrdiff_t stride_x,
    const void *w,
    infiniDtype_t wtype,
    float epsilon,
    cudaStream_t cuda_stream) {
#define LAUNCH_KERNEL(Tdata, Tweight, Tcompute)                                                   \
    rmsnormBlock<BLOCK_SIZE, Tdata, Tweight, Tcompute><<<batch_size, BLOCK_SIZE, 0, cuda_stream>>>( \
        reinterpret_cast<Tdata *>(y),                                                             \
        stride_y,                                                                                 \
        reinterpret_cast<const Tdata *>(x),                                                       \
        stride_x,                                                                                 \
        reinterpret_cast<const Tweight *>(w),                                                     \
        dim,                                                                                      \
        epsilon)

    if (atype == INFINI_DTYPE_F16 && wtype == INFINI_DTYPE_F16) {
        LAUNCH_KERNEL(half, half, float);
    } else if (atype == INFINI_DTYPE_F16 && wtype == INFINI_DTYPE_F32) {
        LAUNCH_KERNEL(half, float, float);
    } else if (atype == INFINI_DTYPE_F32 && wtype == INFINI_DTYPE_F32) {
        LAUNCH_KERNEL(float, float, float);
    } else {
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
#undef LAUNCH_KERNEL
    return INFINI_STATUS_SUCCESS;
}

/// Run rms_norm: one CUDA block per row, block size chosen from the
/// device's max-threads-per-block capability.
infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    void *y,
    const void *x,
    const void *w,
    void *stream) {
    if (workspace_size < _workspace_size) {
        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
    }

    auto row_stride_x = _info.x_strides[0];
    auto row_stride_y = _info.y_strides[0];
    auto row_len = _info.dim();
    uint32_t num_rows = static_cast<uint32_t>(_info.shape[0]);
    auto cuda_stream = reinterpret_cast<cudaStream_t>(stream);

    // Pick the template instantiation matching the device's block size.
    auto max_threads = _opaque->internal->maxThreadsPerBlock();
    if (max_threads == CUDA_BLOCK_SIZE_1024) {
        CHECK_STATUS(launchKernel<CUDA_BLOCK_SIZE_1024>(
            num_rows, row_len, y, _info.atype, row_stride_y,
            x, row_stride_x, w, _info.wtype, _info.epsilon, cuda_stream));
    } else if (max_threads == CUDA_BLOCK_SIZE_512) {
        CHECK_STATUS(launchKernel<CUDA_BLOCK_SIZE_512>(
            num_rows, row_len, y, _info.atype, row_stride_y,
            x, row_stride_x, w, _info.wtype, _info.epsilon, cuda_stream));
    } else {
        return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED;
    }
    return INFINI_STATUS_SUCCESS;
}

} // namespace op::rms_norm::cuda
src/infiniop/ops/rms_norm/cuda/rms_norm_cuda.cuh
0 → 100644
View file @
2f2a74b6
#ifndef __RMS_NORM_CUDA_H__
#define __RMS_NORM_CUDA_H__

#include "../rms_norm.h"

// Declare the rms_norm Descriptor for the CUDA backend
// (expands inside op::rms_norm::cuda).
DESCRIPTOR(cuda)

#endif // __RMS_NORM_CUDA_H__
src/infiniop/ops/rms_norm/cuda/rms_norm_kernel.cuh
0 → 100644
View file @
2f2a74b6
#ifndef __RMS_NORM_CUDA_KERNEL_H__
#define __RMS_NORM_CUDA_KERNEL_H__

#include "../../../devices/cuda/cuda_common.cuh"
#include <cub/block/block_reduce.cuh>

/// RMS-norm of one row per CUDA block:
///   y[i] = x[i] * w[i] / sqrt(mean(x^2) + epsilon)
/// Tdata   — element type of x/y (e.g. half, float)
/// Tweight — element type of w
/// Tcompute — accumulation type (typically float)
template <unsigned int BLOCK_SIZE, typename Tdata, typename Tweight, typename Tcompute>
INFINIOP_CUDA_KERNEL rmsnormBlock(
    Tdata *__restrict__ y,
    ptrdiff_t stride_y,
    const Tdata *__restrict__ x,
    ptrdiff_t stride_x,
    const Tweight *__restrict__ w,
    size_t dim,
    float epsilon) {
    // Each block takes care of a row of continuous data of length dim.
    // Each thread deals with every BLOCK_SIZE-th element in the row.
    auto y_ptr = y + blockIdx.x * stride_y;
    auto x_ptr = x + blockIdx.x * stride_x;
    auto w_ptr = w;

    // Block-reduce sum of x^2.
    Tcompute ss = op::common_cuda::reduce_op::sumSquared<BLOCK_SIZE, Tdata, Tcompute>(x_ptr, dim);

    // Thread 0 computes RMS = 1/sqrt(ss/dim + epsilon) and stores it in
    // shared memory. FIX: keep the value in Tcompute precision — the previous
    // code narrowed the result through Tdata (e.g. half) before storing it in
    // the Tcompute shared variable, needlessly discarding precision.
    __shared__ Tcompute rms;
    if (threadIdx.x == 0) {
        rms = Tcompute(rsqrtf(ss / Tcompute(dim) + epsilon));
    }
    __syncthreads();

    // Scale every element of the row by the shared reciprocal RMS.
    for (size_t i = threadIdx.x; i < dim; i += BLOCK_SIZE) {
        y_ptr[i] = Tdata(Tcompute(x_ptr[i]) * Tcompute(w_ptr[i]) * rms);
    }
}

#endif
src/infiniop/ops/rms_norm/operator.cc
View file @
2f2a74b6
...
...
@@ -5,6 +5,9 @@
#ifdef ENABLE_CPU_API
#include "cpu/rms_norm_cpu.h"
#endif
#ifdef ENABLE_CUDA_API
#include "cuda/rms_norm_cuda.cuh"
#endif
__C
infiniStatus_t
infiniopCreateRMSNormDescriptor
(
infiniopHandle_t
handle
,
...
...
@@ -28,10 +31,8 @@ __C infiniStatus_t infiniopCreateRMSNormDescriptor(
#ifdef ENABLE_CPU_API
CREATE
(
INFINI_DEVICE_CPU
,
cpu
)
#endif
#ifdef ENABLE_NV_GPU
case
DevNvGpu
:
{
return
cudaCreateRMSNormDescriptor
((
CudaHandle_t
)
handle
,
(
RMSNormCudaDescriptor_t
*
)
desc_ptr
,
y_desc
,
x_desc
,
w_desc
,
epsilon
);
}
#ifdef ENABLE_CUDA_API
CREATE
(
INFINI_DEVICE_NVIDIA
,
cuda
)
#endif
#ifdef ENABLE_CAMBRICON_MLU
case
DevCambriconMlu
:
{
...
...
@@ -76,11 +77,8 @@ __C infiniStatus_t infiniopGetRMSNormWorkspaceSize(infiniopRMSNormDescriptor_t d
#ifdef ENABLE_CPU_API
GET
(
INFINI_DEVICE_CPU
,
cpu
)
#endif
#ifdef ENABLE_NV_GPU
case
DevNvGpu
:
{
return
cudaGetRMSNormWorkspaceSize
((
RMSNormCudaDescriptor_t
)
desc
,
size
);
}
#ifdef ENABLE_CUDA_API
GET
(
INFINI_DEVICE_NVIDIA
,
cuda
)
#endif
#ifdef ENABLE_CAMBRICON_MLU
case
DevCambriconMlu
:
{
...
...
@@ -122,11 +120,8 @@ __C infiniStatus_t infiniopRMSNorm(infiniopRMSNormDescriptor_t desc, void *works
#ifdef ENABLE_CPU_API
CALCULATE
(
INFINI_DEVICE_CPU
,
cpu
)
#endif
#ifdef ENABLE_NV_GPU
case
DevNvGpu
:
{
return
cudaRMSNorm
((
RMSNormCudaDescriptor_t
)
desc
,
workspace
,
workspace_size
,
y
,
x
,
w
,
stream
);
}
#ifdef ENABLE_CUDA_API
CALCULATE
(
INFINI_DEVICE_NVIDIA
,
cuda
)
#endif
#ifdef ENABLE_CAMBRICON_MLU
case
DevCambriconMlu
:
{
...
...
@@ -172,11 +167,8 @@ __C infiniStatus_t infiniopDestroyRMSNormDescriptor(infiniopRMSNormDescriptor_t
#ifdef ENABLE_CPU_API
DESTROY
(
INFINI_DEVICE_CPU
,
cpu
)
#endif
#ifdef ENABLE_NV_GPU
case
DevNvGpu
:
{
return
cudaDestroyRMSNormDescriptor
((
RMSNormCudaDescriptor_t
)
desc
);
}
#ifdef ENABLE_CUDA_API
DESTROY
(
INFINI_DEVICE_NVIDIA
,
cuda
)
#endif
#ifdef ENABLE_CAMBRICON_MLU
case
DevCambriconMlu
:
{
...
...
src/infiniop/ops/rms_norm/rms_norm.h
View file @
2f2a74b6
...
...
@@ -51,6 +51,10 @@ inline infiniStatus_t createRMSNormInfo(RMSNormInfo *info, infiniopTensorDescrip
return
INFINI_STATUS_BAD_TENSOR_SHAPE
;
}
if
(
w_desc
->
stride
(
0
)
!=
1
)
{
return
INFINI_STATUS_BAD_TENSOR_STRIDES
;
}
info
->
shape
=
std
::
move
(
y_desc
->
shape
());
info
->
y_strides
=
std
::
move
(
y_desc
->
strides
());
info
->
x_strides
=
std
::
move
(
x_desc
->
strides
());
...
...
src/infiniop/reduce/cuda/reduce.cuh
0 → 100644
View file @
2f2a74b6
#ifndef __INFINIOP_REDUCE_CUDA_H__
#define __INFINIOP_REDUCE_CUDA_H__

#include <cub/block/block_reduce.cuh>

namespace op::common_cuda::reduce_op {

/// Block-level sum of squares of count elements starting at data_ptr.
/// Each thread accumulates a strided partial sum, then CUB reduces across
/// the block. Only thread 0 receives the meaningful return value (CUB
/// BlockReduce semantics); all BLOCK_SIZE threads must call this.
template <unsigned int BLOCK_SIZE, typename Tdata, typename Tcompute>
__device__ __forceinline__ Tcompute sumSquared(const Tdata *data_ptr, size_t count) {
    Tcompute ss = 0;
    // Each thread computes its partial sum.
    // FIX: promote each operand to Tcompute BEFORE multiplying — squaring in
    // Tdata (e.g. half) underflows/loses precision for small magnitudes
    // (|x| < ~2.5e-2 squares into the half subnormal range) before the
    // conversion the old code applied to the finished product.
    for (size_t i = threadIdx.x; i < count; i += BLOCK_SIZE) {
        Tcompute v = Tcompute(data_ptr[i]);
        ss += v * v;
    }
    // Use CUB block-level reduction.
    using BlockReduce = cub::BlockReduce<Tcompute, BLOCK_SIZE>;
    __shared__ typename BlockReduce::TempStorage temp_storage;
    return BlockReduce(temp_storage).Sum(ss);
}

} // namespace op::common_cuda::reduce_op

#endif
src/utils-test/test_rearrange.cc
View file @
2f2a74b6
...
...
@@ -45,28 +45,26 @@ size_t check_equal(
return
fails
;
}
int
test_transpose_2d
()
{
std
::
vector
<
size_t
>
shape
=
{
3
,
5
};
std
::
vector
<
ptrdiff_t
>
strides_a
=
{
5
,
1
};
std
::
vector
<
ptrdiff_t
>
strides_b
=
{
1
,
3
};
int
test_transpose_any
(
size_t
index
,
std
::
vector
<
size_t
>
shape
,
std
::
vector
<
ptrdiff_t
>
strides_a
,
std
::
vector
<
ptrdiff_t
>
strides_b
)
{
auto
numel
=
std
::
accumulate
(
shape
.
begin
(),
shape
.
end
(),
(
size_t
)
1
,
std
::
multiplies
<
size_t
>
());
std
::
vector
<
float
>
a
(
numel
);
std
::
vector
<
float
>
b
(
numel
);
for
(
size_t
i
=
0
;
i
<
numel
;
i
++
)
{
a
[
i
]
=
i
/
numel
;
a
[
i
]
=
(
float
)
i
/
numel
;
}
utils
::
rearrange
(
b
.
data
(),
a
.
data
(),
shape
.
data
(),
strides_b
.
data
(),
strides_a
.
data
(),
2
,
sizeof
(
float
));
if
(
check_equal
<
float
>
(
a
.
data
(),
b
.
data
(),
shape
,
strides_a
,
strides_b
))
{
utils
::
rearrange
(
b
.
data
(),
a
.
data
(),
shape
.
data
(),
strides_b
.
data
(),
strides_a
.
data
(),
shape
.
size
(),
sizeof
(
float
));
auto
fails
=
check_equal
<
float
>
(
a
.
data
(),
b
.
data
(),
shape
,
strides_a
,
strides_b
);
if
(
fails
>
0
)
{
std
::
cout
<<
"test_transpose "
<<
index
<<
" failed"
<<
std
::
endl
;
return
1
;
}
else
{
std
::
cout
<<
"test_transpose_2d passed"
<<
std
::
endl
;
std
::
cout
<<
"test_transpose "
<<
index
<<
" passed"
<<
std
::
endl
;
return
0
;
}
return
0
;
}
int
test_rearrange
()
{
return
test_transpose_2d
();
return
test_transpose_any
(
1
,
{
3
,
5
},
{
5
,
1
},
{
1
,
3
})
+
test_transpose_any
(
2
,
{
1
,
2048
},
{
2048
,
1
},
{
2048
,
1
});
}
src/utils/check.h
View file @
2f2a74b6
...
...
@@ -17,4 +17,17 @@
// Propagate any non-success status from API to the caller.
#define CHECK_STATUS(API) CHECK_API_OR(API, INFINI_STATUS_SUCCESS, return api_result_)
// Return INFINI_STATUS_BAD_TENSOR_DTYPE unless DT is one of the dtypes
// listed in the variadic arguments. Wrapped in do/while(0) so it behaves
// like a single statement at the call site.
#define CHECK_DTYPE(DT, ...) \
    do { \
        auto found_supported_dtype = false; \
        for (auto dt : {__VA_ARGS__}) { \
            if (dt == DT) { \
                found_supported_dtype = true; \
                break; \
            } \
        } \
        CHECK_API_OR(found_supported_dtype, true, \
                     return INFINI_STATUS_BAD_TENSOR_DTYPE); \
    } while (0)
#endif // INFINIUTILS_CHECK_H
src/utils/rearrange.cc
View file @
2f2a74b6
...
...
@@ -46,6 +46,7 @@ std::optional<RearrangeMeta> RearrangeMeta::create(
}
return
std
::
abs
(
a
.
dst
)
>
std
::
abs
(
b
.
dst
);
});
ndim
=
dims
.
size
();
// # 合并连续维度
// ## 合并末尾连续维度到 unit
for
(
auto
it
=
dims
.
rbegin
();
it
!=
dims
.
rend
();
++
it
)
{
...
...
test/infiniop-test/README.md
0 → 100644
View file @
2f2a74b6
# InfiniOP 测例生成
## 介绍
使用 python 脚本生成包含测例的
`.gguf`
文件,并使用
`infiniop-test`
程序进行测试。
## 运行方式
-
编译
`infiniop-test`
程序
```
bash
xmake build infiniop-test
```
-
生成测例
在
`/test/infiniop-test/`
目录执行矩阵乘测例生成脚本,执行结束以后会在
`/test/infiniop-test/`
目录生成
`gemm.gguf`
测例文件。
```
bash
cd
/test/infiniop-test/
python
-m
test_generate.testcases.gemm
```
-
测试测例
打印测试程序用法
```
bash
infiniop-test
--help
```
示例:在CPU上测试
`gemm.gguf`
测例文件,预热20次,测试1000次。
```
bash
infiniop-test gemm.gguf
--cpu
--warmup
20
--run
1000
```
## 自定义测例
### GGUF文件格式
```
text
GGUF File Contents:
Version: 3
Number of Meta KVs: 8
Number of Tensors: 4
Meta KVs:
Key: general.architecture, Type: GGUF_TYPE_STRING, Value: infiniop-test
Key: test_count, Type: GGUF_TYPE_UINT64, Value: 1
Key: test.0.op_name, Type: GGUF_TYPE_STRING, Value: matmul
Key: test.0.a.strides, Type: GGUF_TYPE_INT32, Value: [1, 5]
Key: test.0.b.strides, Type: GGUF_TYPE_INT32, Value: [1, 6]
Key: test.0.c.strides, Type: GGUF_TYPE_INT32, Value: [1, 6]
Key: test.0.alpha, Type: GGUF_TYPE_FLOAT32, Value: 1.000000
Key: test.0.beta, Type: GGUF_TYPE_FLOAT32, Value: 0.000000
Tensor INFOs:
Name: test.0.a, NDims: 2, Shape: [5, 4], DataType: F32, DataOffset: 0
Name: test.0.b, NDims: 2, Shape: [6, 5], DataType: F32, DataOffset: 96
Name: test.0.c, NDims: 2, Shape: [6, 4], DataType: F32, DataOffset: 224
Name: test.0.ans, NDims: 2, Shape: [6, 4], DataType: F64, DataOffset: 320
```
-
`Meta`
中必须包含
`test_count`
,表示测例数量。
-
每个测例的
`Meta`
和
`Tensor`
名字以
`test.[id].`
开头,后接具体信息名称。数字
`[id]`
表示测例编号。编号必须为 0 到 `test_count - 1`。
-
`Tensor`
名字接
`.strides`
表示步长,若没有则默认为连续。
test/infiniop-test/test_generate/__init__.py
0 → 100644
View file @
2f2a74b6
from
.infiniop_test
import
InfiniopTestCase
,
InfiniopTestWriter
,
np_dtype_to_ggml
,
gguf_strides
test/infiniop-test/test_generate/infiniop_test.py
0 → 100644
View file @
2f2a74b6
import
gguf
from
typing
import
List
import
numpy
as
np
from
gguf
import
GGMLQuantizationType
def np_dtype_to_ggml(tensor_dtype: np.dtype):
    """Map a numpy dtype to the matching GGML quantization type.

    Raises ValueError for any dtype outside the supported set
    (F16, F32, F64, I8, I16, I32, I64).
    """
    dtype_table = (
        (np.float16, GGMLQuantizationType.F16),
        (np.float32, GGMLQuantizationType.F32),
        (np.float64, GGMLQuantizationType.F64),
        (np.int8, GGMLQuantizationType.I8),
        (np.int16, GGMLQuantizationType.I16),
        (np.int32, GGMLQuantizationType.I32),
        (np.int64, GGMLQuantizationType.I64),
    )
    for np_type, ggml_type in dtype_table:
        if tensor_dtype == np_type:
            return ggml_type
    raise ValueError("Only F16, F32, F64, I8, I16, I32, I64 tensors are supported for now")
def
gguf_strides
(
*
args
:
int
)
->
list
[
int
]
|
None
:
return
list
(
args
)[::
-
1
]
if
args
else
None
class InfiniopTestCase:
    """Base class for a single operator test case.

    Subclasses override write_test() to serialize their tensors and
    attributes, calling super().write_test() first so the op name is
    always recorded.
    """

    # Name of the operator under test (e.g. "gemm").
    op_name: str

    def __init__(self, op_name: str):
        self.op_name = op_name

    def write_test(self, test_writer: "InfiniopTestWriter"):
        """Write this case's metadata into the gguf test file."""
        key = test_writer.gguf_key("op_name")
        test_writer.add_string(key, self.op_name)
class InfiniopTestWriter(gguf.GGUFWriter):
    """gguf writer that collects InfiniopTestCase objects and serializes
    them as an `infiniop-test` architecture file.

    Keys for each case are namespaced as "test.<index>.<name>", where the
    index is the position of the case in registration order.
    """

    # Registered but not-yet-written test cases.
    _test_cases: List["InfiniopTestCase"]
    # Index of the case currently being written; used by gguf_key().
    _written_tests = 0

    def __init__(self, filepath):
        super().__init__(filepath, "infiniop-test")
        self._test_cases = []
        self._written_tests = 0

    def add_test(self, test_case: "InfiniopTestCase"):
        """Register one test case."""
        self._test_cases.append(test_case)

    def add_tests(self, test_cases: List["InfiniopTestCase"]):
        """Register several test cases at once."""
        self._test_cases.extend(test_cases)

    def gguf_key(self, name: str) -> str:
        """Namespace a key under the test case currently being written."""
        return f"test.{self._written_tests}.{name}"

    def save(self):
        """Serialize every registered case and finalize the gguf file."""
        super().add_uint64("test_count", len(self._test_cases))
        for test_case in self._test_cases:
            test_case.write_test(self)
            self._written_tests += 1
        super().write_header_to_file()
        super().write_kv_data_to_file()
        super().write_tensors_to_file()
        super().close()
test/infiniop-test/test_generate/testcases/__init__.py
0 → 100644
View file @
2f2a74b6
test/infiniop-test/test_generate/testcases/gemm.py
0 → 100644
View file @
2f2a74b6
from
ast
import
List
import
numpy
as
np
import
gguf
from
typing
import
List
from
..
import
InfiniopTestWriter
,
InfiniopTestCase
,
np_dtype_to_ggml
,
gguf_strides
def gemm(
    a: np.ndarray,
    b: np.ndarray,
    alpha: float = 1.0,
    c: np.ndarray = None,
    beta: float = 0.0,
):
    """Reference GEMM: alpha * (a @ b) + beta * c.

    When c is None the beta term is omitted entirely.
    """
    scaled_product = alpha * np.matmul(a, b)
    if c is None:
        return scaled_product
    return scaled_product + beta * c
def random_tensor(shape, dtype):
    """Uniform random tensor of the given shape/dtype.

    Values lie in [-5e-4, 5e-4): rand() in [0, 1) is scaled by 1e-3 and
    shifted down by half that scale. Kept deliberately small so fp16
    accumulation in the tested operators stays accurate.
    """
    scale = 1e-3
    half_range = 0.5 * scale
    samples = np.random.rand(*shape).astype(dtype)
    return scale * samples - half_range
class GemmTestCase(InfiniopTestCase):
    """One GEMM test case: operands a/b/c with optional explicit strides,
    plus alpha/beta scalars. The reference answer is computed in float64
    and stored alongside the inputs.
    """

    def __init__(
        self,
        a: np.ndarray,
        stride_a: List[int] | None,
        b: np.ndarray,
        stride_b: List[int] | None,
        c: np.ndarray,
        stride_c: List[int] | None,
        alpha: float,
        beta: float,
    ):
        super().__init__("gemm")
        self.a = a
        self.stride_a = stride_a
        self.b = b
        self.stride_b = stride_b
        self.c = c
        self.stride_c = stride_c
        self.alpha = alpha
        self.beta = beta

    def write_test(self, test_writer: "InfiniopTestWriter"):
        """Serialize strides, scalars, operands, and the f64 reference answer."""
        super().write_test(test_writer)

        # Strides are optional; absence means "contiguous".
        for label, strides in (
            ("a.strides", self.stride_a),
            ("b.strides", self.stride_b),
            ("c.strides", self.stride_c),
        ):
            if strides is not None:
                test_writer.add_array(test_writer.gguf_key(label), strides)

        test_writer.add_float32(test_writer.gguf_key("alpha"), self.alpha)
        test_writer.add_float32(test_writer.gguf_key("beta"), self.beta)

        for label, tensor in (("a", self.a), ("b", self.b), ("c", self.c)):
            test_writer.add_tensor(
                test_writer.gguf_key(label),
                tensor,
                raw_dtype=np_dtype_to_ggml(tensor.dtype),
            )

        # Reference answer computed at full float64 precision.
        ans = gemm(
            self.a.astype(np.float64),
            self.b.astype(np.float64),
            self.alpha,
            self.c.astype(np.float64),
            self.beta,
        )
        test_writer.add_tensor(
            test_writer.gguf_key("ans"),
            ans,
            raw_dtype=gguf.GGMLQuantizationType.F64,
        )
if __name__ == "__main__":
    test_writer = InfiniopTestWriter("gemm.gguf")
    # Each entry: a, stride_a, b, stride_b, c, stride_c, alpha, beta.
    # Cases cover f16/f32, contiguous and column-major strides, batched
    # shapes, and non-trivial alpha/beta.
    test_cases = [
        GemmTestCase(
            random_tensor((4, 5), np.float32), None,
            random_tensor((5, 6), np.float32), None,
            random_tensor((4, 6), np.float32), None,
            1.0, 0.0,
        ),
        GemmTestCase(
            random_tensor((4, 5), np.float32), gguf_strides(1, 4),
            random_tensor((5, 6), np.float32), gguf_strides(1, 5),
            random_tensor((4, 6), np.float32), gguf_strides(1, 4),
            1.0, 1.0,
        ),
        GemmTestCase(
            random_tensor((4, 5), np.float16), None,
            random_tensor((5, 6), np.float16), None,
            random_tensor((4, 6), np.float16), None,
            1.0, 0.0,
        ),
        GemmTestCase(
            random_tensor((4, 5), np.float16), gguf_strides(1, 4),
            random_tensor((5, 6), np.float16), gguf_strides(1, 5),
            random_tensor((4, 6), np.float16), gguf_strides(1, 4),
            1.0, 1.0,
        ),
        GemmTestCase(
            random_tensor((1, 2048), np.float16), gguf_strides(1, 2048),
            random_tensor((2048, 2048), np.float16), gguf_strides(1, 2048),
            random_tensor((1, 2048), np.float16), gguf_strides(1, 2048),
            1.0, 0.0,
        ),
        GemmTestCase(
            random_tensor((1, 2048), np.float32), None,
            random_tensor((2048, 2048), np.float32), None,
            random_tensor((1, 2048), np.float32), None,
            1.0, 0.0,
        ),
        GemmTestCase(
            random_tensor((2, 4, 2048), np.float16), None,
            random_tensor((2, 2048, 2048), np.float16), None,
            random_tensor((2, 4, 2048), np.float16), None,
            1.0, 0.0,
        ),
        GemmTestCase(
            random_tensor((2, 4, 2048), np.float32), None,
            random_tensor((2, 2048, 2048), np.float32), None,
            random_tensor((2, 4, 2048), np.float32), None,
            1.0, 0.0,
        ),
        GemmTestCase(
            random_tensor((6, 2048), np.float32), gguf_strides(1, 2048),
            random_tensor((2048, 2560), np.float32), gguf_strides(1, 2560),
            random_tensor((6, 2560), np.float32), gguf_strides(1, 2560),
            1.0, 1.0,
        ),
        GemmTestCase(
            random_tensor((4, 48, 64), np.float16), None,
            random_tensor((4, 64, 6), np.float16), None,
            random_tensor((4, 48, 6), np.float16), None,
            1.0 / 8, 1.0,
        ),
        GemmTestCase(
            random_tensor((4, 48, 64), np.float32), None,
            random_tensor((4, 64, 6), np.float32), None,
            random_tensor((4, 48, 6), np.float32), None,
            1.0 / 8, 1.0,
        ),
    ]
    test_writer.add_tests(test_cases)
    test_writer.save()
test/infiniop/gemm.py
View file @
2f2a74b6
from
ctypes
import
POINTER
,
Structure
,
c_int32
,
c_uint64
,
c_void_p
,
c_float
,
c_bool
import
torch
import
ctypes
import
sys
import
os
import
time
sys
.
path
.
insert
(
0
,
os
.
path
.
abspath
(
os
.
path
.
join
(
os
.
path
.
dirname
(
__file__
),
".."
,
".."
)))
from
operatorspy
import
(
open_lib
,
to_tensor
,
DeviceEnum
,
from
ctypes
import
POINTER
,
Structure
,
c_int32
,
c_size_t
,
c_uint64
,
c_void_p
,
c_float
from
libinfiniop
import
(
infiniopHandle_t
,
infiniopTensorDescriptor_t
,
create_handle
,
destroy_handle
,
open_lib
,
to_tensor
,
get_test_devices
,
check_error
,
rearrange_tensor
,
rearrange_if_needed
,
create_workspace
,
test_operator
,
get_args
,
debug
,
get_tolerance
,
profile_operation
,
)
from
operatorspy.tests.test_utils
import
get_args
import
torch
# constant for control whether profile the pytorch and lib functions
# NOTE: need to manually add synchronization function to the lib function,
# e.g., cudaDeviceSynchronize() for CUDA
# ==============================================================================
# Configuration (Internal Use Only)
# ==============================================================================
# These are not meant to be imported from other modules
_TEST_CASES
=
[
# alpha, beta, a_shape, b_shape, c_shape, a_stride, b_stride, c_stride
(
1.0
,
0.0
,
(
1
,
2048
),
(
2048
,
2048
),
(
1
,
2048
),
None
,
None
,
None
),
(
1.0
,
0.0
,
(
1
,
2048
),
(
2048
,
2048
),
(
1
,
2048
),
None
,
None
,
None
),
(
1.0
,
0.0
,
(
2
,
4
,
2048
),
(
2
,
2048
,
2048
),
(
2
,
4
,
2048
),
None
,
None
,
None
),
(
1.0
,
0.0
,
(
2
,
4
,
2048
),
(
2
,
2048
,
2048
),
(
2
,
4
,
2048
),
None
,
None
,
None
),
(
1.0
,
0.0
,
(
1
,
2048
),
(
2048
,
2048
),
(
1
,
2048
),
(
4096
,
1
),
(
4096
,
1
),
(
4096
,
1
)),
(
1.0
,
0.0
,
(
1
,
2048
),
(
2048
,
2048
),
(
1
,
2048
),
(
4096
,
1
),
(
4096
,
1
),
(
4096
,
1
)),
(
1.0
,
1.0
,
(
6
,
2048
),
(
2048
,
2560
),
(
6
,
2560
),
(
2048
,
1
),
(
1
,
2048
),
(
2560
,
1
)),
(
1.0
,
1.0
,
(
6
,
2048
),
(
2048
,
2560
),
(
6
,
2560
),
(
2048
,
1
),
(
1
,
2048
),
(
2560
,
1
)),
(
1.0
/
8.0
,
0.0
,
(
4
,
8
*
6
,
64
),
(
4
,
64
,
6
),
(
4
,
8
*
6
,
6
),
None
,
None
,
None
),
(
1.0
/
8.0
,
0.0
,
(
4
,
8
*
6
,
64
),
(
4
,
64
,
6
),
(
4
,
8
*
6
,
6
),
None
,
None
,
None
),
]
# Data types used for testing
_TENSOR_DTYPES
=
[
torch
.
float16
,
torch
.
float32
]
# Tolerance map for different data types
_TOLERANCE_MAP
=
{
torch
.
float16
:
{
"atol"
:
0
,
"rtol"
:
1e-2
},
torch
.
float32
:
{
"atol"
:
0
,
"rtol"
:
1e-3
},
}
DEBUG
=
False
PROFILE
=
False
NUM_PRERUN
=
10
NUM_ITERATIONS
=
1000
class
GEMMDescriptor
(
Structure
):
# ==============================================================================
# Definitions
# ==============================================================================
class
GemmDescriptor
(
Structure
):
_fields_
=
[(
"device"
,
c_int32
)]
infiniopG
EMM
Descriptor_t
=
POINTER
(
G
EMM
Descriptor
)
infiniopG
emm
Descriptor_t
=
POINTER
(
G
emm
Descriptor
)
def
gemm
(
A
,
B
,
C
=
None
,
transA
=
False
,
transB
=
False
,
alpha
=
1.0
,
beta
=
0.0
,
dtype
=
torch
.
float32
):
A
=
A
.
T
if
transA
else
A
B
=
B
.
T
if
transB
else
B
result
=
alpha
*
torch
.
matmul
(
A
if
dtype
!=
torch
.
float16
else
A
.
to
(
torch
.
float32
),
B
if
dtype
!=
torch
.
float16
else
B
.
to
(
torch
.
float32
),
).
to
(
dtype
)
if
C
is
not
None
:
result
+=
beta
*
C
if
dtype
!=
torch
.
float16
else
C
.
to
(
torch
.
float32
)
if
PROFILE
:
torch
.
cuda
.
synchronize
()
return
result
# PyTorch implementation for matrix multiplication
def
gemm
(
_c
,
beta
,
_a
,
_b
,
alpha
):
a
,
b
,
c
=
_a
.
clone
(),
_b
.
clone
(),
_c
.
clone
()
result_dtype
=
c
.
dtype
fp32_result
=
torch
.
matmul
(
a
.
to
(
torch
.
float32
),
b
.
to
(
torch
.
float32
))
return
alpha
*
fp32_result
.
to
(
result_dtype
)
+
beta
*
c
# The argument list should be (lib, handle, torch_device, <param list>, dtype)
# The <param list> should keep the same order as the one specified in _TEST_CASES
def
test
(
lib
,
handle
,
torch_device
,
alpha
,
beta
,
transA
,
transB
,
a_shape
,
b_shape
,
c_shape
,
y_shape
,
a_stride
=
None
,
b_stride
=
None
,
c_stride
=
None
,
y_stride
=
None
,
dtype
=
torch
.
float16
,
):
print
(
f
"Testing G
EMM
on
{
torch_device
}
with
transA:
{
transA
}
transB:
{
transB
}
"
f
"a_shape:
{
a_shape
}
b_shape:
{
b_shape
}
c_shape:
{
c_shape
}
y_shape:
{
y_shape
}
"
f
"a_stride:
{
a_stride
}
b_stride:
{
b_stride
}
c_stride:
{
c_stride
}
y_stride:
{
y_stride
}
dtype:
{
dtype
}
"
f
"Testing G
emm
on
{
torch_device
}
with
alpha:
{
alpha
}
, beta:
{
beta
}
,
"
f
"
a_shape:
{
a_shape
}
,
b_shape:
{
b_shape
}
,
c_shape:
{
c_shape
}
,
"
f
"
a_stride:
{
a_stride
}
,
b_stride:
{
b_stride
}
,
c_stride:
{
c_stride
}
,
dtype:
{
dtype
}
"
)
# Initialize tensors
a
=
torch
.
rand
(
a_shape
,
dtype
=
dtype
).
to
(
torch_device
)
b
=
torch
.
rand
(
b_shape
,
dtype
=
dtype
).
to
(
torch_device
)
c
=
torch
.
rand
(
c_shape
,
dtype
=
dtype
).
to
(
torch_device
)
if
c_shape
else
None
y
=
torch
.
rand
(
y_shape
,
dtype
=
dtype
).
to
(
torch_device
)
c
=
torch
.
ones
(
c_shape
,
dtype
=
dtype
).
to
(
torch_device
)
if
a_stride
is
not
None
:
a
=
rearrange_tensor
(
a
,
a_stride
)
if
b_stride
is
not
None
:
b
=
rearrange_tensor
(
b
,
b_stride
)
if
c_stride
is
not
None
and
c
is
not
None
:
c
=
rearrange_tensor
(
c
,
c_stride
)
if
y_stride
is
not
None
:
y
=
rearrange_tensor
(
y
,
y_stride
)
# Compute the PyTorch reference result
ans
=
gemm
(
c
,
beta
,
a
,
b
,
alpha
)
for
i
in
range
(
NUM_PRERUN
if
PROFILE
else
1
):
ans
=
gemm
(
a
,
b
,
c
,
transA
,
transB
,
alpha
,
beta
,
dtype
)
if
PROFILE
:
start_time
=
time
.
time
()
for
i
in
range
(
NUM_ITERATIONS
):
_
=
gemm
(
a
,
b
,
c
,
transA
,
transB
,
alpha
,
beta
,
dtype
)
elapsed
=
(
time
.
time
()
-
start_time
)
/
NUM_ITERATIONS
print
(
f
"pytorch time:
{
elapsed
:
6
f
}
"
)
a
,
b
,
c
=
[
rearrange_if_needed
(
tensor
,
stride
)
for
tensor
,
stride
in
zip
([
a
,
b
,
c
],
[
a_stride
,
b_stride
,
c_stride
])
]
a_tensor
,
b_tensor
,
c_tensor
=
[
to_tensor
(
tensor
,
lib
)
for
tensor
in
[
a
,
b
,
c
]]
a_tensor
=
to_tensor
(
a
,
lib
)
b_tensor
=
to_tensor
(
b
,
lib
)
c_tensor
=
to_tensor
(
c
,
lib
)
if
c
is
not
None
else
None
y_tensor
=
to_tensor
(
y
,
lib
)
descriptor
=
infiniopGEMMDescriptor_t
()
descriptor
=
infiniopGemmDescriptor_t
()
check_error
(
lib
.
infiniopCreateG
EMM
Descriptor
(
lib
.
infiniopCreateG
emm
Descriptor
(
handle
,
ctypes
.
byref
(
descriptor
),
y
_tensor
.
descriptor
,
c
_tensor
.
descriptor
,
a_tensor
.
descriptor
,
b_tensor
.
descriptor
,
c_tensor
.
descriptor
if
c_tensor
else
None
,
alpha
,
beta
,
transA
,
transB
,
)
)
# Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
a_tensor
.
descriptor
.
contents
.
invalidate
()
b_tensor
.
descriptor
.
contents
.
invalidate
()
if
c_tensor
is
not
None
:
c_tensor
.
descriptor
.
contents
.
invalidate
()
y_tensor
.
descriptor
.
contents
.
invalidate
()
for
tensor
in
[
a_tensor
,
b_tensor
,
c_tensor
]:
tensor
.
destroyDesc
(
lib
)
workspace_size
=
ctypes
.
c_uint64
(
0
)
# Get workspace size and create workspace
workspace_size
=
c_uint64
(
0
)
check_error
(
lib
.
infiniopGetG
EMM
WorkspaceSize
(
descriptor
,
ctypes
.
byref
(
workspace_size
))
lib
.
infiniopGetG
emm
WorkspaceSize
(
descriptor
,
ctypes
.
byref
(
workspace_size
))
)
workspace
=
torch
.
zeros
(
int
(
workspace_size
.
value
),
dtype
=
torch
.
uint8
).
to
(
torch_device
)
workspace_ptr
=
ctypes
.
cast
(
workspace
.
data_ptr
(),
ctypes
.
POINTER
(
ctypes
.
c_uint8
))
workspace
=
create_workspace
(
workspace_size
.
value
,
a
.
device
)
for
i
in
range
(
NUM_PRERUN
if
PROFILE
else
1
):
# Execute infiniop gemm operator
def
lib_gemm
():
check_error
(
lib
.
infiniopG
EMM
(
lib
.
infiniopG
emm
(
descriptor
,
workspace
_ptr
,
workspace_size
,
y
_tensor
.
data
,
workspace
.
data_ptr
()
if
workspace
is
not
None
else
None
,
workspace_size
.
value
,
c
_tensor
.
data
,
a_tensor
.
data
,
b_tensor
.
data
,
c_tensor
.
data
if
c_tensor
else
None
,
alpha
,
beta
,
None
,
)
)
if
PROFILE
:
start_time
=
time
.
time
()
for
i
in
range
(
NUM_ITERATIONS
):
check_error
(
lib
.
infiniopGEMM
(
descriptor
,
workspace_ptr
,
workspace_size
,
y_tensor
.
data
,
a_tensor
.
data
,
b_tensor
.
data
,
c_tensor
.
data
if
c_tensor
else
None
,
None
,
)
)
elapsed
=
(
time
.
time
()
-
start_time
)
/
NUM_ITERATIONS
print
(
f
" lib time:
{
elapsed
:
6
f
}
"
)
assert
torch
.
allclose
(
y
,
ans
,
atol
=
0
,
rtol
=
1e-2
)
check_error
(
lib
.
infiniopDestroyGEMMDescriptor
(
descriptor
))
lib_gemm
()
def
test_cpu
(
lib
,
test_cases
):
device
=
DeviceEnum
.
DEVICE_CPU
handle
=
create_handle
(
lib
,
device
)
for
(
alpha
,
beta
,
transA
,
transB
,
a_shape
,
b_shape
,
c_shape
,
y_shape
,
a_stride
,
b_stride
,
c_stride
,
y_stride
,
)
in
test_cases
:
# fmt: off
test
(
lib
,
handle
,
"cpu"
,
alpha
,
beta
,
transA
,
transB
,
a_shape
,
b_shape
,
c_shape
,
y_shape
,
a_stride
,
b_stride
,
c_stride
,
y_stride
,
dtype
=
torch
.
float16
)
test
(
lib
,
handle
,
"cpu"
,
alpha
,
beta
,
transA
,
transB
,
a_shape
,
b_shape
,
c_shape
,
y_shape
,
a_stride
,
b_stride
,
c_stride
,
y_stride
,
dtype
=
torch
.
float32
)
# fmt: on
destroy_handle
(
lib
,
handle
)
# Validate results
atol
,
rtol
=
get_tolerance
(
_TOLERANCE_MAP
,
dtype
)
if
DEBUG
:
debug
(
c
,
ans
,
atol
=
atol
,
rtol
=
rtol
)
assert
torch
.
allclose
(
c
,
ans
,
atol
=
atol
,
rtol
=
rtol
)
def test_cuda(lib, test_cases):
    """Run every GEMM test case on the CUDA backend, in fp16 and fp32.

    Mirrors test_cpu: one handle per run, each case tested at both dtypes.
    """
    device = DeviceEnum.DEVICE_CUDA
    handle = create_handle(lib, device)
    for (
        alpha,
        beta,
        transA,
        transB,
        a_shape,
        b_shape,
        c_shape,
        y_shape,
        a_stride,
        b_stride,
        c_stride,
        y_stride,
    ) in test_cases:
        # fmt: off
        test(lib, handle, "cuda", alpha, beta, transA, transB, a_shape, b_shape, c_shape, y_shape, a_stride, b_stride, c_stride, y_stride, dtype=torch.float16)
        test(lib, handle, "cuda", alpha, beta, transA, transB, a_shape, b_shape, c_shape, y_shape, a_stride, b_stride, c_stride, y_stride, dtype=torch.float32)
        # fmt: on
    destroy_handle(lib, handle)
def test_bang(lib, test_cases):
    """Run every GEMM test case on the Cambricon BANG backend, in fp16 and fp32.

    Fix: the diff/merge residue previously interleaved into this function
    (profile_operation calls referencing undefined locals `c`, `a`, `b`,
    `lib_gemm`, and a stray infiniopDestroyGemmDescriptor(descriptor) on an
    undefined `descriptor`) has been removed -- it would raise NameError at
    runtime. Profiling and descriptor teardown belong to test(), which each
    iteration calls.
    """
    # torch_mlu is imported for its side effect of enabling the "mlu"
    # device in torch -- presumably; confirm against the torch_mlu docs.
    import torch_mlu  # noqa: F401

    device = DeviceEnum.DEVICE_BANG
    handle = create_handle(lib, device)
    for (
        alpha,
        beta,
        transA,
        transB,
        a_shape,
        b_shape,
        c_shape,
        y_shape,
        a_stride,
        b_stride,
        c_stride,
        y_stride,
    ) in test_cases:
        # fmt: off
        test(lib, handle, "mlu", alpha, beta, transA, transB, a_shape, b_shape, c_shape, y_shape, a_stride, b_stride, c_stride, y_stride, dtype=torch.float16)
        test(lib, handle, "mlu", alpha, beta, transA, transB, a_shape, b_shape, c_shape, y_shape, a_stride, b_stride, c_stride, y_stride, dtype=torch.float32)
        # fmt: on
    destroy_handle(lib, handle)
# ==============================================================================
# Main Execution
# ==============================================================================
# ==============================================================================
# Main Execution
# ==============================================================================
if __name__ == "__main__":
    test_cases = [
        # alpha, beta, transA, transB, a_shape, b_shape, c_shape, y_shape, a_stride, b_stride, c_stride, y_stride
        (1.0, 1.0, False, False, (1, 2048), (2048, 2048), (1, 2048), (1, 2048), None, None, None, None),
        (1.0, 1.0, True, True, (2048, 4), (2048, 2048), (4, 2048), (4, 2048), None, None, None, None),
        # NOTE(review): (1000) and (2048) below are plain ints, not 1-tuples
        # like (2,) -- preserved as-is; confirm this is intentional.
        (1.0, 1.0, False, True, (1, 2048), (1000, 2048), (1000), (1, 1000), None, None, None, None),
        (1.0, 1.0, True, False, (2048, 4), (2048, 2048), (2048), (4, 2048), (4096, 1), (4096, 1), (2,), (4096, 1)),
        (1.0, 1.0, False, False, (3, 1, 2048), (3, 2048, 2048), (1,), (3, 1, 2048), None, None, None, None),
        (1.0, 1.0, True, False, (2048, 4), (2048, 2048), None, (4, 2048), (4096, 1), (4096, 1), (2,), (4096, 1)),
    ]
args
=
get_args
()
lib
=
open_lib
()
lib
.
infiniopCreateG
EMM
Descriptor
.
restype
=
c_int32
lib
.
infiniopCreateG
EMM
Descriptor
.
argtypes
=
[
lib
.
infiniopCreateG
emm
Descriptor
.
restype
=
c_int32
lib
.
infiniopCreateG
emm
Descriptor
.
argtypes
=
[
infiniopHandle_t
,
POINTER
(
infiniopG
EMM
Descriptor_t
),
POINTER
(
infiniopG
emm
Descriptor_t
),
infiniopTensorDescriptor_t
,
infiniopTensorDescriptor_t
,
infiniopTensorDescriptor_t
,
infiniopTensorDescriptor_t
,
c_float
,
c_float
,
c_bool
,
c_bool
,
]
lib
.
infiniopGetG
EMM
WorkspaceSize
.
restype
=
c_int32
lib
.
infiniopGetG
EMM
WorkspaceSize
.
argtypes
=
[
infiniopG
EMM
Descriptor_t
,
POINTER
(
c_
uint64
),
lib
.
infiniopGetG
emm
WorkspaceSize
.
restype
=
c_int32
lib
.
infiniopGetG
emm
WorkspaceSize
.
argtypes
=
[
infiniopG
emm
Descriptor_t
,
POINTER
(
c_
size_t
),
]
lib
.
infiniopG
EMM
.
restype
=
c_int32
lib
.
infiniopG
EMM
.
argtypes
=
[
infiniopG
EMM
Descriptor_t
,
lib
.
infiniopG
emm
.
restype
=
c_int32
lib
.
infiniopG
emm
.
argtypes
=
[
infiniopG
emm
Descriptor_t
,
c_void_p
,
c_uint64
,
c_void_p
,
c_void_p
,
c_void_p
,
c_void_p
,
c_float
,
c_float
,
c_void_p
,
]
lib
.
infiniopDestroyG
EMM
Descriptor
.
restype
=
c_int32
lib
.
infiniopDestroyG
EMM
Descriptor
.
argtypes
=
[
infiniopG
EMM
Descriptor_t
,
lib
.
infiniopDestroyG
emm
Descriptor
.
restype
=
c_int32
lib
.
infiniopDestroyG
emm
Descriptor
.
argtypes
=
[
infiniopG
emm
Descriptor_t
,
]
if
args
.
cpu
:
test_cpu
(
lib
,
test_cases
)
if
args
.
cuda
:
test_cuda
(
lib
,
test_cases
)
if
args
.
bang
:
test_bang
(
lib
,
test_cases
)
if
not
(
args
.
cpu
or
args
.
cuda
or
args
.
bang
):
test_cpu
(
lib
,
test_cases
)
# Configure testing options
DEBUG
=
args
.
debug
PROFILE
=
args
.
profile
NUM_PRERUN
=
args
.
num_prerun
NUM_ITERATIONS
=
args
.
num_iterations
# Execute tests
for
device
in
get_test_devices
(
args
):
test_operator
(
lib
,
device
,
test
,
_TEST_CASES
,
_TENSOR_DTYPES
)
print
(
"
\033
[92mTest passed!
\033
[0m"
)
test/infiniop/matmul.py
deleted
100644 → 0
View file @
1d95ddf3
import
torch
import
ctypes
from
ctypes
import
POINTER
,
Structure
,
c_int32
,
c_size_t
,
c_uint64
,
c_void_p
,
c_float
from
libinfiniop
import
(
infiniopHandle_t
,
infiniopTensorDescriptor_t
,
open_lib
,
to_tensor
,
get_test_devices
,
check_error
,
rearrange_if_needed
,
create_workspace
,
test_operator
,
get_args
,
debug
,
get_tolerance
,
profile_operation
,
)
# ==============================================================================
# Configuration (Internal Use Only)
# ==============================================================================
# These are not meant to be imported from other modules
_TEST_CASES
=
[
# alpha, beta, a_shape, b_shape, c_shape, a_stride, b_stride, c_stride
(
1.0
,
0.0
,
(
1
,
2048
),
(
2048
,
2048
),
(
1
,
2048
),
None
,
None
,
None
),
(
1.0
,
0.0
,
(
1
,
2048
),
(
2048
,
2048
),
(
1
,
2048
),
None
,
None
,
None
),
(
1.0
,
0.0
,
(
2
,
4
,
2048
),
(
2
,
2048
,
2048
),
(
2
,
4
,
2048
),
None
,
None
,
None
),
(
1.0
,
0.0
,
(
2
,
4
,
2048
),
(
2
,
2048
,
2048
),
(
2
,
4
,
2048
),
None
,
None
,
None
),
(
1.0
,
0.0
,
(
1
,
2048
),
(
2048
,
2048
),
(
1
,
2048
),
(
4096
,
1
),
(
4096
,
1
),
(
4096
,
1
)),
(
1.0
,
0.0
,
(
1
,
2048
),
(
2048
,
2048
),
(
1
,
2048
),
(
4096
,
1
),
(
4096
,
1
),
(
4096
,
1
)),
(
1.0
,
1.0
,
(
6
,
2048
),
(
2048
,
2560
),
(
6
,
2560
),
(
2048
,
1
),
(
1
,
2048
),
(
2560
,
1
)),
(
1.0
,
1.0
,
(
6
,
2048
),
(
2048
,
2560
),
(
6
,
2560
),
(
2048
,
1
),
(
1
,
2048
),
(
2560
,
1
)),
(
1.0
/
8.0
,
0.0
,
(
4
,
8
*
6
,
64
),
(
4
,
64
,
6
),
(
4
,
8
*
6
,
6
),
None
,
None
,
None
),
(
1.0
/
8.0
,
0.0
,
(
4
,
8
*
6
,
64
),
(
4
,
64
,
6
),
(
4
,
8
*
6
,
6
),
None
,
None
,
None
),
]
# Data types used for testing
_TENSOR_DTYPES
=
[
torch
.
float16
,
torch
.
float32
]
# Tolerance map for different data types
_TOLERANCE_MAP
=
{
torch
.
float16
:
{
"atol"
:
0
,
"rtol"
:
1e-2
},
torch
.
float32
:
{
"atol"
:
0
,
"rtol"
:
1e-3
},
}
DEBUG
=
False
PROFILE
=
False
NUM_PRERUN
=
10
NUM_ITERATIONS
=
1000
# ==============================================================================
# Definitions
# ==============================================================================
class MatmulDescriptor(Structure):
    # Opaque mirror of the C-side descriptor; only the device id field
    # is declared here so ctypes can size the struct pointer.
    _fields_ = [("device", c_int32)]


# Handle type passed across the C ABI: a pointer to the descriptor struct.
infiniopMatmulDescriptor_t = POINTER(MatmulDescriptor)
# PyTorch implementation for matrix multiplication
def matmul(_c, beta, _a, _b, alpha):
    """Reference GEMM: alpha * (_a @ _b) + beta * _c.

    The product is accumulated in fp32 and cast back to _c's dtype
    before blending, so low-precision inputs get a high-precision
    accumulate. All three inputs are cloned, so the caller's tensors
    are never mutated.
    """
    lhs, rhs, acc = _a.clone(), _b.clone(), _c.clone()
    out_dtype = acc.dtype
    product_fp32 = torch.matmul(lhs.to(torch.float32), rhs.to(torch.float32))
    return alpha * product_fp32.to(out_dtype) + beta * acc
# The argument list should be (lib, handle, torch_device, <param list>, dtype)
# The <param list> should keep the same order as the one specified in _TEST_CASES
def test(
    lib,
    handle,
    torch_device,
    alpha,
    beta,
    a_shape,
    b_shape,
    c_shape,
    a_stride=None,
    b_stride=None,
    c_stride=None,
    dtype=torch.float16,
):
    """Run one matmul case against the infiniop library and check it
    against the PyTorch reference; optionally profiles both paths."""
    print(
        f"Testing Matmul on {torch_device} with alpha:{alpha}, beta:{beta},"
        f" a_shape:{a_shape}, b_shape:{b_shape}, c_shape:{c_shape},"
        f" a_stride:{a_stride}, b_stride:{b_stride}, c_stride:{c_stride}, dtype:{dtype}"
    )

    # Initialize tensors on the target device; c starts at ones so the
    # beta term is observable.
    a = torch.rand(a_shape, dtype=dtype).to(torch_device)
    b = torch.rand(b_shape, dtype=dtype).to(torch_device)
    c = torch.ones(c_shape, dtype=dtype).to(torch_device)

    # Compute the PyTorch reference result before any re-striding.
    ans = matmul(c, beta, a, b, alpha)

    # Apply the requested custom strides (no-op when stride is None).
    a, b, c = [
        rearrange_if_needed(tensor, stride)
        for tensor, stride in zip([a, b, c], [a_stride, b_stride, c_stride])
    ]
    a_tensor, b_tensor, c_tensor = [to_tensor(tensor, lib) for tensor in [a, b, c]]

    descriptor = infiniopMatmulDescriptor_t()
    check_error(
        lib.infiniopCreateMatmulDescriptor(
            handle,
            ctypes.byref(descriptor),
            c_tensor.descriptor,
            a_tensor.descriptor,
            b_tensor.descriptor,
        )
    )

    # Invalidate the shape and strides in the descriptor to prevent them
    # from being directly used by the kernel
    for tensor in [a_tensor, b_tensor, c_tensor]:
        tensor.destroyDesc(lib)

    # Get workspace size and create workspace
    workspace_size = c_uint64(0)
    check_error(
        lib.infiniopGetMatmulWorkspaceSize(descriptor, ctypes.byref(workspace_size))
    )
    workspace = create_workspace(workspace_size.value, a.device)

    # Execute infiniop matmul operator; c is the in-place output buffer.
    def lib_matmul():
        check_error(
            lib.infiniopMatmul(
                descriptor,
                workspace.data_ptr() if workspace is not None else None,
                workspace_size.value,
                c_tensor.data,
                a_tensor.data,
                b_tensor.data,
                alpha,
                beta,
                None,
            )
        )

    lib_matmul()

    # Validate results
    atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
    if DEBUG:
        debug(c, ans, atol=atol, rtol=rtol)
    assert torch.allclose(c, ans, atol=atol, rtol=rtol)

    # Profiling workflow
    if PROFILE:
        # fmt: off
        profile_operation("PyTorch", lambda: matmul(c, beta, a, b, alpha), torch_device, NUM_PRERUN, NUM_ITERATIONS)
        profile_operation("    lib", lambda: lib_matmul(), torch_device, NUM_PRERUN, NUM_ITERATIONS)
        # fmt: on

    check_error(lib.infiniopDestroyMatmulDescriptor(descriptor))
# ==============================================================================
# Main Execution
# ==============================================================================
if __name__ == "__main__":
    args = get_args()
    lib = open_lib()

    # Declare C ABI signatures so ctypes marshals arguments correctly.
    lib.infiniopCreateMatmulDescriptor.restype = c_int32
    lib.infiniopCreateMatmulDescriptor.argtypes = [
        infiniopHandle_t,
        POINTER(infiniopMatmulDescriptor_t),
        infiniopTensorDescriptor_t,
        infiniopTensorDescriptor_t,
        infiniopTensorDescriptor_t,
    ]
    lib.infiniopGetMatmulWorkspaceSize.restype = c_int32
    lib.infiniopGetMatmulWorkspaceSize.argtypes = [
        infiniopMatmulDescriptor_t,
        POINTER(c_size_t),
    ]
    lib.infiniopMatmul.restype = c_int32
    lib.infiniopMatmul.argtypes = [
        infiniopMatmulDescriptor_t,
        c_void_p,
        c_uint64,
        c_void_p,
        c_void_p,
        c_void_p,
        c_float,
        c_float,
        c_void_p,
    ]
    lib.infiniopDestroyMatmulDescriptor.restype = c_int32
    lib.infiniopDestroyMatmulDescriptor.argtypes = [
        infiniopMatmulDescriptor_t,
    ]

    # Configure testing options
    DEBUG = args.debug
    PROFILE = args.profile
    NUM_PRERUN = args.num_prerun
    NUM_ITERATIONS = args.num_iterations

    # Execute tests
    for device in get_test_devices(args):
        test_operator(lib, device, test, _TEST_CASES, _TENSOR_DTYPES)

    print("\033[92mTest passed!\033[0m")
test/infiniop/random_sample.py
View file @
2f2a74b6
...
...
@@ -82,25 +82,14 @@ def random_sample(data, random_val, topp, topk, voc, temperature):
globalM
=
dataNp
[
0
]
dataNp
=
(
dataNp
-
globalM
)
/
temperature
dataNp
=
torch
.
softmax
(
dataNp
.
float
(),
dim
=
0
)
sum_s
=
0
for
end
in
range
(
topk
):
sum_s
+=
dataNp
[
end
]
if
sum_s
>=
topp
:
break
if
end
<
topk
-
1
:
end
+=
1
else
:
end
=
topk
sum_s
=
0
for
i
in
range
(
end
):
sum_s
+=
dataNp
[
i
]
random_val
*=
sum_s
sum_s
=
0
for
i
in
range
(
end
):
sum_s
+=
dataNp
[
i
]
if
random_val
<
sum_s
:
for
i
in
range
(
1
,
voc
):
dataNp
[
i
]
+=
dataNp
[
i
-
1
]
limit_k
=
dataNp
[
min
(
topk
,
voc
)
-
1
]
limit_p
=
dataNp
[
voc
-
1
]
*
topp
limit
=
min
(
limit_k
,
limit_p
)
*
random_val
for
i
in
range
(
voc
):
if
limit
<
dataNp
[
i
]:
return
indices
[
i
]
else
:
return
torch
.
argmax
(
data
)
...
...
@@ -129,7 +118,7 @@ def test(
data
,
random_val
,
topp
,
topk
,
voc
,
temperature
)
# 这个函数在device速度可能会很慢,可以通过data.to("cpu")方式加快计算过程
indices
=
torch
.
zeros
([
1
],
dtype
=
torch
.
int64
).
to
(
torch_device
)
indices
=
torch
.
zeros
([],
dtype
=
torch
.
int64
).
to
(
torch_device
)
x_tensor
,
indices_tensor
=
[
to_tensor
(
tensor
,
lib
)
for
tensor
in
[
data
,
indices
]]
...
...
@@ -147,7 +136,7 @@ def test(
# Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
for
tensor
in
[
x_tensor
,
indices_tensor
]:
tensor
.
des
criptor
.
contents
.
invalidate
(
)
tensor
.
des
troyDesc
(
lib
)
workspace_size
=
c_uint64
(
0
)
check_error
(
...
...
@@ -181,13 +170,13 @@ def test(
atol
,
rtol
=
get_tolerance
(
_TOLERANCE_MAP
,
dtype
)
if
DEBUG
:
debug_all
(
(
indices
[
0
]
.
type
(
ans
.
dtype
),
data
[
indices
[
0
]
]),
(
indices
.
type
(
ans
.
dtype
),
data
[
indices
]),
(
ans
,
data
[
ans
]),
"or"
,
atol
=
atol
,
rtol
=
rtol
,
)
assert
indices
[
0
]
.
type
(
ans
.
dtype
)
==
ans
or
data
[
ans
]
==
data
[
indices
[
0
]
]
assert
indices
.
type
(
ans
.
dtype
)
==
ans
or
data
[
ans
]
==
data
[
indices
]
# Profiling workflow
if
PROFILE
:
...
...
xmake.lua
View file @
2f2a74b6
...
...
@@ -145,7 +145,7 @@ target("infinirt")
set_languages
(
"cxx17"
)
set_installdir
(
os.getenv
(
"INFINI_ROOT"
)
or
(
os.getenv
(
is_host
(
"windows"
)
and
"HOMEPATH"
or
"HOME"
)
..
"/.infini"
))
add_files
(
"src/infinirt/*.cc"
)
add_installfiles
(
"include/infinirt.h"
)
add_installfiles
(
"include/infinirt.h"
,
{
prefixdir
=
"include"
}
)
target_end
()
target
(
"infiniop"
)
...
...
@@ -197,8 +197,12 @@ target("infiniop")
add_installfiles
(
"include/infinicore.h"
,
{
prefixdir
=
"include"
})
target_end
()
target
(
"all"
)
set_kind
(
"phony"
)
add_deps
(
"infiniop"
,
"infinirt"
)
after_build
(
function
(
target
)
print
(
YELLOW
..
"[Congratulations!] Now you can install the libraries with \"
xmake
install
\
""
..
NC
)
end
)
target_end
()
-- Tests
includes
(
"xmake/test.lua"
)
xmake/test.lua
View file @
2f2a74b6
...
...
@@ -8,4 +8,27 @@ target("infiniutils-test")
add_files
(
os
.
projectdir
()
..
"/src/utils-test/*.cc"
)
-- Test binary for infiniop operators; links against the installed
-- infiniop/infinirt libraries under INFINI_ROOT.
target("infiniop-test")
    set_kind("binary")
    add_deps("infini-utils")
    -- no-op installer: the test binary is never installed
    on_install(function (target) end)
    set_default(false)

    local INFINI_ROOT = os.getenv("INFINI_ROOT")
        or (os.getenv(is_host("windows") and "HOMEPATH" or "HOME") .. "/.infini")

    set_languages("cxx17")
    set_warnings("all", "error")
    add_includedirs(INFINI_ROOT .. "/include")
    add_linkdirs(INFINI_ROOT .. "/lib")
    add_links("infiniop", "infinirt")

    if has_config("omp") then
        add_cxflags("-fopenmp")
        add_ldflags("-fopenmp")
    end

    add_includedirs(os.projectdir() .. "/src/infiniop-test/include")
    add_files(os.projectdir() .. "/src/infiniop-test/src/*.cpp")
    add_files(os.projectdir() .. "/src/infiniop-test/src/ops/*.cpp")
target_end()
Prev
1
2
3
4
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment