Commit 63d9c4a1 authored by YdrMaster's avatar YdrMaster
Browse files

issue/87/refactor: 修改所有 handle 和 matmul op 命名空间,并重构 cuda handle


Signed-off-by: default avatarYdrMaster <ydrml@hotmail.com>
parent 601defcb
#include "cpu_handle.h" #include "cpu_handle.h"
namespace infiniop::cpu { namespace device::cpu {
Handle::Handle() : InfiniopHandle{INFINI_DEVICE_CPU, 0} {} Handle::Handle() : InfiniopHandle{INFINI_DEVICE_CPU, 0} {}
infiniStatus_t Handle::create(InfiniopHandle **handle_ptr) { infiniStatus_t Handle::create(InfiniopHandle **handle_ptr, int) {
*handle_ptr = new Handle{}; *handle_ptr = new Handle{};
return INFINI_STATUS_SUCCESS; return INFINI_STATUS_SUCCESS;
} }
} // namespace infiniop::cpu } // namespace device::cpu
...@@ -3,13 +3,15 @@ ...@@ -3,13 +3,15 @@
#include "../../handle.h" #include "../../handle.h"
namespace infiniop::cpu { namespace device::cpu {
class Handle : public InfiniopHandle { class Handle : public InfiniopHandle {
Handle(); Handle();
public: public:
static infiniStatus_t create(InfiniopHandle **handle_ptr); static infiniStatus_t create(InfiniopHandle **handle_ptr, int);
}; };
} // namespace infiniop::cpu
} // namespace device::cpu
#endif #endif
#ifndef __INFINIOP_COMMON_CUDA_H__
#define __INFINIOP_COMMON_CUDA_H__
// Hardware launch limits used when sizing kernel launches.
// NOTE(review): 1024 threads/block and warp size 32 hold for all current
// NVIDIA GPUs, but are device properties — confirm against cudaDeviceProp.
#define MAX_THREADS_PER_BLOCK 1024
#define MAX_WARP_PER_BLOCK 32
#define WARP_SIZE 32
#include "../../../utils.h"
#include <iostream>
// Error-checking wrappers built on the project-wide CHECK_* macros in utils.h:
// CHECK_CUDA_OR_RETURN returns ERROR from the enclosing function on failure;
// CHECK_CUDA / CHECK_CUDNN compare against the library's success status.
#define CHECK_CUDA_OR_RETURN(API, ERROR) CHECK_API_OR(API, cudaSuccess, return ERROR)
#define CHECK_CUDA(API) CHECK_INTERNAL(API, cudaSuccess)
#define CHECK_CUDNN(API) CHECK_INTERNAL(API, CUDNN_STATUS_SUCCESS)
#include "../pool.h"
#include "cuda_handle.h"
#include "infinicore.h"
#include <cublas_v2.h>
#include <cuda_fp16.h>
#include <cudnn.h>
#include <memory>
// Per-device CUDA state shared by operator implementations: identity of the
// device plus pooled cuBLAS/cuDNN handles so descriptors can borrow handles
// instead of creating one per call.
struct InfiniopCudaHandle {
infiniDevice_t device;  // backend type this handle was created for (e.g. NVIDIA)
int device_id;          // CUDA device ordinal
std::shared_ptr<Pool<cublasHandle_t>> cublas_handle_pool;  // reusable cuBLAS handles
std::shared_ptr<Pool<cudnnHandle_t>> cudnn_handle_pool;    // reusable cuDNN handles
cudaDeviceProp prop;    // cached device properties (filled at handle creation)
int compute_capability_major;
int compute_capability_minor;
};
// Borrow a cuBLAS handle from `pool` (creating one if the pool is empty),
// bind it to `stream`, invoke `f(handle)`, then return the handle to the
// pool for reuse.
//
// T: callable taking a cublasHandle_t.
// NOTE(review): cublasCreate's status is not checked here, matching the
// surrounding code's style — a failed creation passes a null handle to `f`.
template <typename T>
void use_cublas(std::shared_ptr<Pool<cublasHandle_t>> &pool, cudaStream_t stream, const T &f) {
    auto handle = pool->pop();
    if (!handle) {
        // Pool exhausted: create a fresh handle and assign it into the
        // wrapper. The previous code wrote through *handle while the wrapper
        // was empty, which is undefined behavior for std::optional-like types.
        cublasHandle_t new_handle = nullptr;
        cublasCreate(&new_handle);
        handle = new_handle;
    }
    cublasSetStream(*handle, stream);
    f(*handle);
    pool->push(std::move(*handle));
}
// Borrow a cuDNN handle from `pool` (creating one if the pool is empty),
// bind it to `stream`, invoke `f(handle)`, then return the handle to the
// pool for reuse.
//
// T: callable taking a cudnnHandle_t.
// NOTE(review): cudnnCreate's status is not checked here, matching the
// surrounding code's style — a failed creation passes a null handle to `f`.
template <typename T>
void use_cudnn(std::shared_ptr<Pool<cudnnHandle_t>> &pool, cudaStream_t stream, const T &f) {
    auto handle = pool->pop();
    if (!handle) {
        // Pool exhausted: create a fresh handle and assign it into the
        // wrapper. The previous code wrote through *handle while the wrapper
        // was empty, which is undefined behavior for std::optional-like types.
        cudnnHandle_t new_handle = nullptr;
        cudnnCreate(&new_handle);
        handle = new_handle;
    }
    cudnnSetStream(*handle, stream);
    f(*handle);
    pool->push(std::move(*handle));
}
// Translate an infiniDtype_t into the matching cuDNN data type.
// Dtypes with no cuDNN counterpart silently map to CUDNN_DATA_FLOAT,
// preserving the original fallback behavior.
inline cudnnDataType_t getCudnnDtype(infiniDtype_t dt) {
    if (dt == INFINI_DTYPE_F16) {
        return CUDNN_DATA_HALF;
    }
    if (dt == INFINI_DTYPE_F64) {
        return CUDNN_DATA_DOUBLE;
    }
    if (dt == INFINI_DTYPE_BF16) {
        return CUDNN_DATA_BFLOAT16;
    }
    if (dt == INFINI_DTYPE_I8) {
        return CUDNN_DATA_INT8;
    }
    if (dt == INFINI_DTYPE_I32) {
        return CUDNN_DATA_INT32;
    }
    if (dt == INFINI_DTYPE_I64) {
        return CUDNN_DATA_INT64;
    }
    if (dt == INFINI_DTYPE_U8) {
        return CUDNN_DATA_UINT8;
    }
    // INFINI_DTYPE_F32 and any unrecognized dtype both resolve to float.
    return CUDNN_DATA_FLOAT;
}
// Given the flat index of an element in a broadcasted tensor, compute the
// memory offset of the corresponding element in the original (pre-broadcast)
// tensor. `broadcasted_strides` are the contiguous strides of the broadcasted
// shape; `target_strides` are the original tensor's strides (0 on broadcast
// dims maps all coordinates of that dim to the same element).
inline __device__ __host__ size_t indexToReducedOffset(
    size_t flat_index,
    size_t ndim,
    const ptrdiff_t *broadcasted_strides,
    const ptrdiff_t *target_strides) {
    size_t offset = 0;
    size_t remainder = flat_index;
    for (size_t dim = 0; dim < ndim; ++dim) {
        // Coordinate along this dimension, then strip it from the remainder.
        const size_t coord = remainder / broadcasted_strides[dim];
        remainder -= coord * broadcasted_strides[dim];
        offset += coord * target_strides[dim];
    }
    return offset;
}
// Convert a flat (row-major) element index into a memory offset for a tensor
// with the given shape and (possibly non-contiguous) strides.
inline __device__ __host__ size_t indexToOffset(
    size_t flat_index,
    size_t ndim,
    const size_t *shape,
    const ptrdiff_t *strides) {
    size_t offset = 0;
    size_t remaining = flat_index;
    // Peel coordinates from the innermost dimension outward.
    for (size_t dim = ndim; dim > 0; --dim) {
        const size_t i = dim - 1;
        offset += (remaining % shape[i]) * strides[i];
        remaining /= shape[i];
    }
    return offset;
}
#endif // __INFINIOP_COMMON_CUDA_H__
#include "common_cuda.cuh" #include "cuda_handle.cuh"
infiniStatus_t createCudaHandle(infiniopCudaHandle_t *handle_ptr, infiniDevice_t cuda_device_type) {
// Create a new cublas handle pool
int device_id = 0;
CHECK_CUDA_OR_RETURN(cudaGetDevice(&device_id), INFINI_STATUS_DEVICE_NOT_INITIALIZED);
auto pool = std::make_shared<Pool<cublasHandle_t>>();
cublasHandle_t handle;
cublasCreate(&handle);
pool->push(std::move(handle));
// create a cudnn handle pool
auto cudnn_pool = std::make_shared<Pool<cudnnHandle_t>>();
cudnnHandle_t cudnn_handle;
CHECK_CUDNN(cudnnCreate(&cudnn_handle));
cudnn_pool->push(std::move(cudnn_handle));
// set CUDA device property
cudaDeviceProp prop;
cudaGetDeviceProperties(&prop, device_id);
// set device compute capability numbers
int capability_major;
int capability_minor;
cudaDeviceGetAttribute(&capability_major, cudaDevAttrComputeCapabilityMajor, device_id);
cudaDeviceGetAttribute(&capability_minor, cudaDevAttrComputeCapabilityMinor, device_id);
*handle_ptr = new InfiniopCudaHandle{
cuda_device_type,
device_id,
std::move(pool),
std::move(cudnn_pool),
std::move(prop),
capability_major,
capability_minor,
};
return INFINI_STATUS_SUCCESS; namespace device::cuda {
Handle::Handle(infiniDevice_t device, int device_id)
: InfiniopHandle{device, device_id},
_internal(std::make_shared<Handle::Internal>()) {}
auto Handle::internal() const -> const std::shared_ptr<Internal> & {
return _internal;
} }
infiniStatus_t destroyCudaHandle(infiniopCudaHandle_t handle_ptr) { template <typename T>
handle_ptr->cublas_handle_pool = nullptr; using Fn = std::function<void(T)>;
handle_ptr->cudnn_handle_pool = nullptr;
delete handle_ptr; void Handle::Internal::use_cublas(cudaStream_t stream, const Fn<cublasHandle_t> &f) const {
auto handle = blas_handles.pop();
if (!handle) {
cublasCreate(&(*handle));
}
cublasSetStream(*handle, stream);
f(*handle);
blas_handles.push(std::move(*handle));
}
void Handle::Internal::use_cudnn(cudaStream_t stream, const Fn<cudnnHandle_t> &f) const {
auto handle = dnn_handles.pop();
if (!handle) {
cudnnCreate(&(*handle));
}
cudnnSetStream(*handle, stream);
f(*handle);
dnn_handles.push(std::move(*handle));
}
cudnnDataType_t getCudnnDtype(infiniDtype_t dt) {
switch (dt) {
case INFINI_DTYPE_F16:
return CUDNN_DATA_HALF;
case INFINI_DTYPE_F32:
return CUDNN_DATA_FLOAT;
case INFINI_DTYPE_F64:
return CUDNN_DATA_DOUBLE;
case INFINI_DTYPE_BF16:
return CUDNN_DATA_BFLOAT16;
case INFINI_DTYPE_I8:
return CUDNN_DATA_INT8;
case INFINI_DTYPE_I32:
return CUDNN_DATA_INT32;
case INFINI_DTYPE_I64:
return CUDNN_DATA_INT64;
case INFINI_DTYPE_U8:
return CUDNN_DATA_UINT8;
default:
return CUDNN_DATA_FLOAT;
}
}
namespace nvidia {
Handle::Handle(int device_id)
: cuda::Handle(INFINI_DEVICE_NVIDIA, device_id) {}
infiniStatus_t Handle::create(InfiniopHandle **handle_ptr, int device_id) {
*handle_ptr = new Handle(device_id);
return INFINI_STATUS_SUCCESS; return INFINI_STATUS_SUCCESS;
} }
} // namespace nvidia
} // namespace device::cuda
#ifndef __INFINIOP_CUDA_INTERNAL_H__
#define __INFINIOP_CUDA_INTERNAL_H__
#include "../pool.h"
#include "cuda_handle.h"
#include <cublas_v2.h>
#include <cudnn.h>
#include <functional>
namespace device::cuda {
// Shared, lazily-populated pools of cuBLAS and cuDNN handles owned by a
// device::cuda::Handle; operator descriptors borrow handles through the
// use_* methods instead of creating their own.
class Handle::Internal {
Pool<cublasHandle_t> blas_handles;
Pool<cudnnHandle_t> dnn_handles;
public:
// Borrow a cuBLAS handle (creating one if the pool is empty), bind it to
// `stream`, invoke `f`, then return the handle to the pool.
void use_cublas(cudaStream_t stream, const std::function<void(cublasHandle_t)> &f) const;
// Same borrow/use/return pattern for a cuDNN handle.
void use_cudnn(cudaStream_t stream, const std::function<void(cudnnHandle_t)> &f) const;
};
cudnnDataType_t getCudnnDtype(infiniDtype_t dt);
// Given the flat index of an element in a broadcasted tensor, compute the
// memory offset of the corresponding element in the original (pre-broadcast)
// tensor. `broadcasted_strides` are the contiguous strides of the broadcasted
// shape; `target_strides` are the original tensor's strides (0 on broadcast
// dims maps all coordinates of that dim to the same element).
__forceinline__ __device__ __host__ size_t
indexToReducedOffset(
    size_t flat_index,
    size_t ndim,
    const ptrdiff_t *broadcasted_strides,
    const ptrdiff_t *target_strides) {
    size_t offset = 0;
    size_t remainder = flat_index;
    for (size_t dim = 0; dim < ndim; ++dim) {
        // Coordinate along this dimension, then strip it from the remainder.
        const size_t coord = remainder / broadcasted_strides[dim];
        remainder -= coord * broadcasted_strides[dim];
        offset += coord * target_strides[dim];
    }
    return offset;
}
// Convert a flat (row-major) element index into a memory offset for a tensor
// with the given shape and (possibly non-contiguous) strides.
__forceinline__ __device__ __host__ size_t
indexToOffset(
    size_t flat_index,
    size_t ndim,
    const size_t *shape,
    const ptrdiff_t *strides) {
    size_t offset = 0;
    size_t remaining = flat_index;
    // Peel coordinates from the innermost dimension outward.
    for (size_t dim = ndim; dim > 0; --dim) {
        const size_t i = dim - 1;
        offset += (remaining % shape[i]) * strides[i];
        remaining /= shape[i];
    }
    return offset;
}
} // namespace device::cuda
#endif // __INFINIOP_CUDA_INTERNAL_H__
...@@ -2,12 +2,32 @@ ...@@ -2,12 +2,32 @@
#define __INFINIOP_CUDA_HANDLE_H__ #define __INFINIOP_CUDA_HANDLE_H__
#include "../../handle.h" #include "../../handle.h"
#include <memory>
struct InfiniopCudaHandle; namespace device::cuda {
typedef struct InfiniopCudaHandle *infiniopCudaHandle_t;
infiniStatus_t createCudaHandle(infiniopCudaHandle_t *handle_ptr, infiniDevice_t cuda_device_type); struct Handle : public InfiniopHandle {
class Internal;
auto internal() const -> const std::shared_ptr<Internal> &;
infiniStatus_t destroyCudaHandle(infiniopCudaHandle_t handle_ptr); protected:
Handle(infiniDevice_t device, int device_id);
#endif private:
std::shared_ptr<Internal> _internal;
};
namespace nvidia {
class Handle : public cuda::Handle {
Handle(int device_id);
public:
static infiniStatus_t create(InfiniopHandle **handle_ptr, int device_id);
};
} // namespace nvidia
} // namespace device::cuda
#endif // __INFINIOP_CUDA_HANDLE_H__
...@@ -27,15 +27,16 @@ __C infiniStatus_t infiniopCreateHandle(infiniopHandle_t *handle_ptr) { ...@@ -27,15 +27,16 @@ __C infiniStatus_t infiniopCreateHandle(infiniopHandle_t *handle_ptr) {
int device_id; int device_id;
CHECK_STATUS(infinirtGetDevice(&device, &device_id)); CHECK_STATUS(infinirtGetDevice(&device, &device_id));
#define CREATE(CASE, NAMESPACE) \
case CASE: \
return device::NAMESPACE::Handle::create(handle_ptr, device_id)
switch (device) { switch (device) {
#ifdef ENABLE_CPU_API #ifdef ENABLE_CPU_API
case INFINI_DEVICE_CPU: CREATE(INFINI_DEVICE_CPU, cpu);
return infiniop::cpu::Handle::create(handle_ptr);
#endif #endif
#ifdef ENABLE_CUDA_API #ifdef ENABLE_CUDA_API
case INFINI_DEVICE_NVIDIA: { CREATE(INFINI_DEVICE_NVIDIA, cuda::nvidia);
return createCudaHandle((infiniopCudaHandle_t *)handle_ptr, device);
}
#endif #endif
#ifdef ENABLE_CAMBRICON_API #ifdef ENABLE_CAMBRICON_API
case INFINI_DEVICE_CAMBRICON: { case INFINI_DEVICE_CAMBRICON: {
...@@ -52,25 +53,27 @@ __C infiniStatus_t infiniopCreateHandle(infiniopHandle_t *handle_ptr) { ...@@ -52,25 +53,27 @@ __C infiniStatus_t infiniopCreateHandle(infiniopHandle_t *handle_ptr) {
return createKunlunHandle((infiniopKunlunHandle_t *)handle_ptr); return createKunlunHandle((infiniopKunlunHandle_t *)handle_ptr);
} }
#endif #endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
} }
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
#undef CREATE
} }
__C infiniStatus_t infiniopDestroyHandle(infiniopHandle_t handle) { __C infiniStatus_t infiniopDestroyHandle(infiniopHandle_t handle) {
#define DELETE(CASE, NAMESPACE) \ #define DELETE(CASE, NAMESPACE) \
case CASE: \ case CASE: \
delete reinterpret_cast<infiniop::NAMESPACE::Handle *>(handle); \ delete reinterpret_cast<device::NAMESPACE::Handle *>(handle); \
return INFINI_STATUS_SUCCESS; return INFINI_STATUS_SUCCESS
switch (handle->device) { switch (handle->device) {
#ifdef ENABLE_CPU_API #ifdef ENABLE_CPU_API
DELETE(INFINI_DEVICE_CPU, cpu) DELETE(INFINI_DEVICE_CPU, cpu);
#endif #endif
#ifdef ENABLE_CUDA_API #ifdef ENABLE_CUDA_API
case INFINI_DEVICE_NVIDIA: { DELETE(INFINI_DEVICE_NVIDIA, cuda::nvidia);
return destroyCudaHandle((infiniopCudaHandle_t)handle);
}
#endif #endif
#ifdef ENABLE_CAMBRICON_API #ifdef ENABLE_CAMBRICON_API
case INFINI_DEVICE_CAMBRICON: { case INFINI_DEVICE_CAMBRICON: {
...@@ -90,5 +93,6 @@ __C infiniStatus_t infiniopDestroyHandle(infiniopHandle_t handle) { ...@@ -90,5 +93,6 @@ __C infiniStatus_t infiniopDestroyHandle(infiniopHandle_t handle) {
default: default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
} }
#undef DELETE #undef DELETE
} }
...@@ -6,7 +6,7 @@ ...@@ -6,7 +6,7 @@
#include <aclnnop/aclnn_matmul.h> #include <aclnnop/aclnn_matmul.h>
#include <aclnnop/level2/aclnn_gemm.h> #include <aclnnop/level2/aclnn_gemm.h>
namespace matmul::ascend { namespace op::matmul::ascend {
struct Descriptor::Opaque { struct Descriptor::Opaque {
mutable aclOpExecutor *executor; mutable aclOpExecutor *executor;
...@@ -135,4 +135,4 @@ infiniStatus_t Descriptor::calculate( ...@@ -135,4 +135,4 @@ infiniStatus_t Descriptor::calculate(
return INFINI_STATUS_SUCCESS; return INFINI_STATUS_SUCCESS;
} }
} // namespace matmul::ascend } // namespace op::matmul::ascend
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
#include "../../../devices/bang/common_bang.h" #include "../../../devices/bang/common_bang.h"
#include <cnnl_extra.h> #include <cnnl_extra.h>
namespace matmul::bang { namespace op::matmul::bang {
struct Descriptor::Opaque { struct Descriptor::Opaque {
cnnlMatMulDescriptor_t op; cnnlMatMulDescriptor_t op;
...@@ -157,4 +157,4 @@ infiniStatus_t Descriptor::calculate( ...@@ -157,4 +157,4 @@ infiniStatus_t Descriptor::calculate(
return INFINI_STATUS_SUCCESS; return INFINI_STATUS_SUCCESS;
} }
} // namespace matmul::bang } // namespace op::matmul::bang
...@@ -5,7 +5,8 @@ ...@@ -5,7 +5,8 @@
#include "../../tensor.h" #include "../../tensor.h"
#include <algorithm> #include <algorithm>
namespace matmul { namespace op::matmul {
struct BlasMatrix { struct BlasMatrix {
size_t ndim; size_t ndim;
size_t batch; size_t batch;
...@@ -118,6 +119,7 @@ struct MatmulInfo { ...@@ -118,6 +119,7 @@ struct MatmulInfo {
k = a_matrix.cols; k = a_matrix.cols;
} }
}; };
} // namespace matmul
} // namespace op::matmul
#endif // __BLAS_H__ #endif // __BLAS_H__
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
#include "../../../devices/cpu/common_cpu.h" #include "../../../devices/cpu/common_cpu.h"
#include "../../../devices/cpu/cpu_handle.h" #include "../../../devices/cpu/cpu_handle.h"
namespace matmul::cpu { namespace op::matmul::cpu {
Descriptor::~Descriptor() = default; Descriptor::~Descriptor() = default;
...@@ -12,7 +12,7 @@ infiniStatus_t Descriptor::create( ...@@ -12,7 +12,7 @@ infiniStatus_t Descriptor::create(
infiniopTensorDescriptor_t c_desc, infiniopTensorDescriptor_t c_desc,
infiniopTensorDescriptor_t a_desc, infiniopTensorDescriptor_t a_desc,
infiniopTensorDescriptor_t b_desc) { infiniopTensorDescriptor_t b_desc) {
auto handle = reinterpret_cast<infiniop::cpu::Handle *>(handle_); auto handle = reinterpret_cast<device::cpu::Handle *>(handle_);
auto dtype = c_desc->dtype(); auto dtype = c_desc->dtype();
if (dtype != INFINI_DTYPE_F16 && dtype != INFINI_DTYPE_F32) { if (dtype != INFINI_DTYPE_F16 && dtype != INFINI_DTYPE_F32) {
...@@ -96,4 +96,4 @@ infiniStatus_t Descriptor::calculate( ...@@ -96,4 +96,4 @@ infiniStatus_t Descriptor::calculate(
} }
} }
} // namespace matmul::cpu } // namespace op::matmul::cpu
#include "../../../devices/cuda/common_cuda.cuh" #include "../../../devices/cuda/cuda_handle.cuh"
#include "matmul_cuda.cuh" #include "matmul_cuda.cuh"
namespace matmul::cuda { namespace op::matmul::cuda {
struct Descriptor::Opaque { struct Descriptor::Opaque {
std::shared_ptr<Pool<cublasHandle_t>> cublas_handle_pool; std::shared_ptr<device::cuda::Handle::Internal> internal;
}; };
Descriptor::~Descriptor() { Descriptor::~Descriptor() {
...@@ -17,7 +17,7 @@ infiniStatus_t Descriptor::create( ...@@ -17,7 +17,7 @@ infiniStatus_t Descriptor::create(
infiniopTensorDescriptor_t c_desc, infiniopTensorDescriptor_t c_desc,
infiniopTensorDescriptor_t a_desc, infiniopTensorDescriptor_t a_desc,
infiniopTensorDescriptor_t b_desc) { infiniopTensorDescriptor_t b_desc) {
auto handle = reinterpret_cast<infiniopCudaHandle_t>(handle_); auto handle = reinterpret_cast<device::cuda::nvidia::Handle *>(handle_);
auto dtype = c_desc->dtype(); auto dtype = c_desc->dtype();
if (dtype != INFINI_DTYPE_F16 && dtype != INFINI_DTYPE_F32) { if (dtype != INFINI_DTYPE_F16 && dtype != INFINI_DTYPE_F32) {
...@@ -32,7 +32,7 @@ infiniStatus_t Descriptor::create( ...@@ -32,7 +32,7 @@ infiniStatus_t Descriptor::create(
*desc_ptr = new Descriptor( *desc_ptr = new Descriptor(
dtype, info, 0, dtype, info, 0,
new Opaque{handle->cublas_handle_pool}, new Opaque{handle->internal()},
handle->device, handle->device_id); handle->device, handle->device_id);
return INFINI_STATUS_SUCCESS; return INFINI_STATUS_SUCCESS;
} }
...@@ -76,35 +76,35 @@ infiniStatus_t Descriptor::calculate( ...@@ -76,35 +76,35 @@ infiniStatus_t Descriptor::calculate(
auto op_a = _info.a_matrix.row_stride == 1 ? CUBLAS_OP_N : CUBLAS_OP_T; auto op_a = _info.a_matrix.row_stride == 1 ? CUBLAS_OP_N : CUBLAS_OP_T;
auto op_b = _info.b_matrix.row_stride == 1 ? CUBLAS_OP_N : CUBLAS_OP_T; auto op_b = _info.b_matrix.row_stride == 1 ? CUBLAS_OP_N : CUBLAS_OP_T;
use_cublas(_opaque->cublas_handle_pool, _opaque->internal->use_cublas(
(cudaStream_t)stream, (cudaStream_t)stream,
[&](cublasHandle_t handle) { [&](cublasHandle_t handle) {
cublasGemmStridedBatchedEx( cublasGemmStridedBatchedEx(
handle, handle,
op_a, op_a,
op_b, op_b,
static_cast<int>(_info.m), static_cast<int>(_info.m),
static_cast<int>(_info.n), static_cast<int>(_info.n),
static_cast<int>(_info.k), static_cast<int>(_info.k),
&alpha, &alpha,
a, a,
a_type, a_type,
static_cast<int>(_info.a_matrix.ld()), static_cast<int>(_info.a_matrix.ld()),
_info.a_matrix.stride, _info.a_matrix.stride,
b, b,
b_type, b_type,
static_cast<int>(_info.b_matrix.ld()), static_cast<int>(_info.b_matrix.ld()),
_info.b_matrix.stride, _info.b_matrix.stride,
&beta, &beta,
c, c,
c_type, c_type,
static_cast<int>(_info.c_matrix.ld()), static_cast<int>(_info.c_matrix.ld()),
_info.c_matrix.stride, _info.c_matrix.stride,
static_cast<int>(_info.batch), static_cast<int>(_info.batch),
compute_type, compute_type,
CUBLAS_GEMM_DEFAULT_TENSOR_OP); CUBLAS_GEMM_DEFAULT_TENSOR_OP);
}); });
return INFINI_STATUS_SUCCESS; return INFINI_STATUS_SUCCESS;
} }
} // namespace matmul::cuda } // namespace op::matmul::cuda
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
#include "../../../devices/kunlun/common_kunlun.h" #include "../../../devices/kunlun/common_kunlun.h"
#include "../../utils.h" #include "../../utils.h"
namespace matmul::kunlun { namespace op::matmul::kunlun {
struct Descriptor::Opaque { struct Descriptor::Opaque {
std::shared_ptr<Pool<xdnnHandle_t>> xdnn_handle_pool; std::shared_ptr<Pool<xdnnHandle_t>> xdnn_handle_pool;
...@@ -110,4 +110,4 @@ infiniStatus_t Descriptor::calculate( ...@@ -110,4 +110,4 @@ infiniStatus_t Descriptor::calculate(
} }
} }
} // namespace matmul::kunlun } // namespace op::matmul::kunlun
...@@ -46,7 +46,7 @@ ...@@ -46,7 +46,7 @@
#define DESCRIPTOR(NAMESPACE) \ #define DESCRIPTOR(NAMESPACE) \
\ \
namespace matmul::NAMESPACE { \ namespace op::matmul::NAMESPACE { \
class Descriptor final : public InfiniopDescriptor { \ class Descriptor final : public InfiniopDescriptor { \
struct Opaque; \ struct Opaque; \
Opaque *_opaque; \ Opaque *_opaque; \
......
...@@ -25,13 +25,13 @@ __C infiniStatus_t infiniopCreateMatmulDescriptor( ...@@ -25,13 +25,13 @@ __C infiniStatus_t infiniopCreateMatmulDescriptor(
infiniopTensorDescriptor_t a_desc, infiniopTensorDescriptor_t a_desc,
infiniopTensorDescriptor_t b_desc) { infiniopTensorDescriptor_t b_desc) {
#define CREATE(CASE, NAMESPACE) \ #define CREATE(CASE, NAMESPACE) \
case CASE: \ case CASE: \
return matmul::NAMESPACE::Descriptor::create( \ return op::matmul::NAMESPACE::Descriptor::create( \
handle, \ handle, \
reinterpret_cast<matmul::NAMESPACE::Descriptor **>(desc_ptr), \ reinterpret_cast<op::matmul::NAMESPACE::Descriptor **>(desc_ptr), \
c_desc, \ c_desc, \
a_desc, \ a_desc, \
b_desc) b_desc)
switch (handle->device) { switch (handle->device) {
...@@ -64,9 +64,9 @@ infiniopGetMatmulWorkspaceSize( ...@@ -64,9 +64,9 @@ infiniopGetMatmulWorkspaceSize(
infiniopMatmulDescriptor_t desc, infiniopMatmulDescriptor_t desc,
size_t *size) { size_t *size) {
#define GET(CASE, NAMESPACE) \ #define GET(CASE, NAMESPACE) \
case CASE: \ case CASE: \
*size = reinterpret_cast<const matmul::NAMESPACE::Descriptor *>(desc)->workspace_size; \ *size = reinterpret_cast<const op::matmul::NAMESPACE::Descriptor *>(desc)->workspace_size; \
return INFINI_STATUS_SUCCESS return INFINI_STATUS_SUCCESS
switch (desc->device_type) { switch (desc->device_type) {
...@@ -104,12 +104,12 @@ __C infiniStatus_t infiniopMatmul( ...@@ -104,12 +104,12 @@ __C infiniStatus_t infiniopMatmul(
float beta, float beta,
void *stream) { void *stream) {
#define CALCULATE(CASE, NAMESPACE) \ #define CALCULATE(CASE, NAMESPACE) \
case CASE: \ case CASE: \
return reinterpret_cast<const matmul::NAMESPACE::Descriptor *>(desc) \ return reinterpret_cast<const op::matmul::NAMESPACE::Descriptor *>(desc) \
->calculate(workspace, workspace_size, \ ->calculate(workspace, workspace_size, \
c, beta, \ c, beta, \
a, b, alpha, \ a, b, alpha, \
stream) stream)
switch (desc->device_type) { switch (desc->device_type) {
...@@ -140,9 +140,9 @@ __C infiniStatus_t infiniopMatmul( ...@@ -140,9 +140,9 @@ __C infiniStatus_t infiniopMatmul(
__C infiniStatus_t __C infiniStatus_t
infiniopDestroyMatmulDescriptor(infiniopMatmulDescriptor_t desc) { infiniopDestroyMatmulDescriptor(infiniopMatmulDescriptor_t desc) {
#define DELETE(CASE, NAMESPACE) \ #define DELETE(CASE, NAMESPACE) \
case CASE: \ case CASE: \
delete reinterpret_cast<const matmul::NAMESPACE::Descriptor *>(desc); \ delete reinterpret_cast<const op::matmul::NAMESPACE::Descriptor *>(desc); \
return INFINI_STATUS_SUCCESS; return INFINI_STATUS_SUCCESS;
switch (desc->device_type) { switch (desc->device_type) {
......
...@@ -24,7 +24,7 @@ class CTensor: ...@@ -24,7 +24,7 @@ class CTensor:
self.descriptor = desc self.descriptor = desc
self.torch_tensor_ = torch_tensor self.torch_tensor_ = torch_tensor
self.data = torch_tensor.data_ptr() self.data = torch_tensor.data_ptr()
def destroyDesc(self, lib_): def destroyDesc(self, lib_):
lib_.infiniopDestroyTensorDescriptor(self.descriptor) lib_.infiniopDestroyTensorDescriptor(self.descriptor)
self.descriptor = None self.descriptor = None
...@@ -37,6 +37,20 @@ class Handle(Structure): ...@@ -37,6 +37,20 @@ class Handle(Structure):
infiniopHandle_t = POINTER(Handle) infiniopHandle_t = POINTER(Handle)
class InfiniLib:
    """Facade merging the runtime (librt) and operator (libop) shared
    libraries into one object.

    Attribute lookups that are not instance attributes are forwarded to
    ``libop`` first, then ``librt`` — ``libop`` wins on name conflicts.
    """

    def __init__(self, librt, libop):
        self.librt = librt
        self.libop = libop

    def __getattr__(self, name):
        # Only called when normal attribute lookup fails. Use EAFP getattr
        # instead of hasattr()+getattr(): the original resolved the symbol
        # twice per access, which doubles dlsym lookups on ctypes CDLLs.
        for lib in (self.libop, self.librt):
            try:
                return getattr(lib, name)
            except AttributeError:
                continue
        raise AttributeError(f"Attribute {name} not found in library")
# Open operators library # Open operators library
def open_lib(): def open_lib():
def find_library_in_ld_path(subdir, library_name): def find_library_in_ld_path(subdir, library_name):
...@@ -51,14 +65,22 @@ def open_lib(): ...@@ -51,14 +65,22 @@ def open_lib():
system_name = platform.system() system_name = platform.system()
# Load the library # Load the library
if system_name == "Windows": if system_name == "Windows":
library_path = find_library_in_ld_path("bin", "infiniop.dll") libop_path = find_library_in_ld_path("bin", "infiniop.dll")
librt_path = find_library_in_ld_path("bin", "infinirt.dll")
elif system_name == "Linux": elif system_name == "Linux":
library_path = find_library_in_ld_path("lib", "libinfiniop.so") libop_path = find_library_in_ld_path("lib", "libinfiniop.so")
librt_path = find_library_in_ld_path("lib", "libinfinirt.so")
assert ( assert (
library_path is not None libop_path is not None
), f"Cannot find infiniop.dll or libinfiniop.so. Check if INFINI_ROOT is set correctly." ), f"Cannot find infiniop.dll or libinfiniop.so. Check if INFINI_ROOT is set correctly."
lib = ctypes.CDLL(library_path) assert (
librt_path is not None
), f"Cannot find infinirt.dll or libinfinirt.so. Check if INFINI_ROOT is set correctly."
librt = ctypes.CDLL(librt_path)
libop = ctypes.CDLL(libop_path)
lib = InfiniLib(librt, libop)
lib.infiniopCreateTensorDescriptor.argtypes = [ lib.infiniopCreateTensorDescriptor.argtypes = [
POINTER(infiniopTensorDescriptor_t), POINTER(infiniopTensorDescriptor_t),
c_uint64, c_uint64,
...@@ -69,9 +91,11 @@ def open_lib(): ...@@ -69,9 +91,11 @@ def open_lib():
lib.infiniopCreateTensorDescriptor.restype = c_int lib.infiniopCreateTensorDescriptor.restype = c_int
lib.infiniopDestroyTensorDescriptor.argtypes = [infiniopTensorDescriptor_t] lib.infiniopDestroyTensorDescriptor.argtypes = [infiniopTensorDescriptor_t]
lib.infiniopDestroyTensorDescriptor.restype = c_int lib.infiniopDestroyTensorDescriptor.restype = c_int
lib.infiniopCreateHandle.argtypes = [POINTER(infiniopHandle_t), c_int, c_int] lib.infiniopCreateHandle.argtypes = [POINTER(infiniopHandle_t)]
lib.infiniopCreateHandle.restype = c_int lib.infiniopCreateHandle.restype = c_int
lib.infiniopDestroyHandle.argtypes = [infiniopHandle_t] lib.infiniopDestroyHandle.argtypes = [infiniopHandle_t]
lib.infiniopDestroyHandle.restype = c_int lib.infiniopDestroyHandle.restype = c_int
lib.infinirtSetDevice.argtypes = [c_int, c_int]
lib.infinirtSetDevice.restype = c_int
return lib return lib
...@@ -30,7 +30,7 @@ def to_tensor(tensor, lib): ...@@ -30,7 +30,7 @@ def to_tensor(tensor, lib):
InfiniDtype.BF16 if tensor.dtype == torch.bfloat16 else InfiniDtype.BF16 if tensor.dtype == torch.bfloat16 else
InfiniDtype.F32 if tensor.dtype == torch.float32 else InfiniDtype.F32 if tensor.dtype == torch.float32 else
InfiniDtype.F64 if tensor.dtype == torch.float64 else InfiniDtype.F64 if tensor.dtype == torch.float64 else
# TODO: These following types may not be supported by older # TODO: These following types may not be supported by older
# versions of PyTorch. # versions of PyTorch.
InfiniDtype.U16 if tensor.dtype == torch.uint16 else InfiniDtype.U16 if tensor.dtype == torch.uint16 else
InfiniDtype.U32 if tensor.dtype == torch.uint32 else InfiniDtype.U32 if tensor.dtype == torch.uint32 else
...@@ -57,9 +57,9 @@ def create_workspace(size, torch_device): ...@@ -57,9 +57,9 @@ def create_workspace(size, torch_device):
return torch.zeros(size=(size,), dtype=torch.uint8, device=torch_device) return torch.zeros(size=(size,), dtype=torch.uint8, device=torch_device)
def create_handle(lib, device, id=0): def create_handle(lib):
handle = infiniopHandle_t() handle = infiniopHandle_t()
check_error(lib.infiniopCreateHandle(ctypes.byref(handle), device, id)) check_error(lib.infiniopCreateHandle(ctypes.byref(handle)))
return handle return handle
...@@ -392,7 +392,8 @@ def test_operator(lib, device, test_func, test_cases, tensor_dtypes): ...@@ -392,7 +392,8 @@ def test_operator(lib, device, test_func, test_cases, tensor_dtypes):
to be passed to `test_func`. to be passed to `test_func`.
- tensor_dtypes (list): A list of tensor data types (e.g., `torch.float32`) to test. - tensor_dtypes (list): A list of tensor data types (e.g., `torch.float32`) to test.
""" """
handle = create_handle(lib, device) lib.infinirtSetDevice(device, ctypes.c_int(0))
handle = create_handle(lib)
try: try:
for test_case in test_cases: for test_case in test_cases:
for tensor_dtype in tensor_dtypes: for tensor_dtype in tensor_dtypes:
...@@ -435,6 +436,7 @@ def get_test_devices(args): ...@@ -435,6 +436,7 @@ def get_test_devices(args):
devices_to_test.append(InfiniDeviceEnum.ASCEND) devices_to_test.append(InfiniDeviceEnum.ASCEND)
if args.kunlun: if args.kunlun:
import torch_xmlir import torch_xmlir
devices_to_test.append(InfiniDeviceEnum.KUNLUN) devices_to_test.append(InfiniDeviceEnum.KUNLUN)
if not devices_to_test: if not devices_to_test:
devices_to_test = [InfiniDeviceEnum.CPU] devices_to_test = [InfiniDeviceEnum.CPU]
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment