Commit 9bc9ca91 authored by Pan Zezhong's avatar Pan Zezhong
Browse files

issue/42: 创建infiniop handle不再传入device_id (creating an infiniop handle no longer takes an explicit device_id)

parent 643fdd2b
......@@ -11,7 +11,7 @@ typedef struct InfiniopHandle {
typedef InfiniopHandle *infiniopHandle_t;
// Creates an infiniop handle for `device`. Since issue/42 the device index is
// taken from the runtime's current device instead of an explicit parameter.
// (Diff residue removed: the pre-change overload taking `int device_id`.)
__C __export infiniopStatus_t infiniopCreateHandle(infiniopHandle_t *handle_ptr, infiniDevice_t device);
__C __export infiniopStatus_t infiniopDestroyHandle(infiniopHandle_t handle);
......
#include "common_ascend.h"
// NOTE(review): this span is diff residue — the scrape dropped the +/- markers,
// so the pre-change and post-change implementations appear back to back.
// Pre-change version: caller supplied device_id, validated against the device
// count, then aclrtSetDevice was called.
infiniopStatus_t createAscendHandle(infiniopAscendHandle_t *handle_ptr,
int device_id) {
uint32_t device_count;
aclrtGetDeviceCount(&device_count);
if (device_id >= static_cast<int>(device_count)) {
return INFINIOP_STATUS_BAD_DEVICE;
}
auto ret = aclrtSetDevice(device_id);
// Post-change version: queries the runtime's current device via aclrtGetDevice
// instead of receiving device_id from the caller (issue/42).
infiniopStatus_t createAscendHandle(infiniopAscendHandle_t *handle_ptr) {
int device_id = 0;
auto ret = aclrtGetDevice(&device_id);
CHECK_RET(ret == ACL_SUCCESS,
LOG_PRINT("aclrtSetDevice failed. ERROR: %d\n", ret)); // pre-change failure path
LOG_ERROR("aclrtGetDevice failed. ERROR: %d\n", ret)); // post-change: LOG_ERROR also returns INTERNAL_ERROR
// Allocate the handle recording the device kind and the resolved device index.
// (Function continues past this hunk; remainder not visible here.)
*handle_ptr = new InfiniopAscendHandle{INFINI_DEVICE_ASCEND, device_id};
......
......@@ -7,8 +7,7 @@
struct InfiniopAscendHandle;
typedef struct InfiniopAscendHandle *infiniopAscendHandle_t;
// Creates an Ascend handle bound to the runtime's current device (issue/42:
// device_id is no longer passed by the caller). Diff residue removed: the
// pre-change overload taking `int device_id`.
infiniopStatus_t createAscendHandle(infiniopAscendHandle_t *handle_ptr);
infiniopStatus_t destroyAscendHandle(infiniopAscendHandle_t handle_ptr);
......
......@@ -28,6 +28,13 @@ extern "C" {
printf(message, ##__VA_ARGS__); \
} while (0)
// Prints `message` (printf-style) and then RETURNS from the enclosing function
// with INFINIOP_STATUS_INTERNAL_ERROR. Because of the hidden `return`, this
// macro may only be used inside functions returning infiniopStatus_t.
#define LOG_ERROR(message, ...) \
do { \
printf(message, ##__VA_ARGS__); \
return INFINIOP_STATUS_INTERNAL_ERROR; \
} while (0)
#ifdef __cplusplus
};
#endif
......
......@@ -2,18 +2,13 @@
#include "common_bang.h"
#include <memory>
// NOTE(review): diff residue — pre-change and post-change implementations are
// interleaved below because the scrape dropped the +/- markers.
// Pre-change version: caller supplied device_id, validated against the count.
infiniopStatus_t createBangHandle(infiniopBangHandle_t *handle_ptr,
int device_id) {
unsigned int device_count;
cnrtGetDeviceCount(&device_count);
if (device_id >= static_cast<int>(device_count)) {
// Post-change version: asks the CNRT runtime for the current device instead
// of receiving device_id from the caller (issue/42).
infiniopStatus_t createBangHandle(infiniopBangHandle_t *handle_ptr) {
int device_id = 0;
if (cnrtGetDevice(&device_id) != cnrtSuccess) {
return INFINIOP_STATUS_BAD_DEVICE;
}
// Create the shared pool and seed it with one cnnl handle.
auto pool = std::make_shared<Pool<cnnlHandle_t>>();
// Pre-change residue: explicit cnrtSetDevice on the caller-chosen device.
if (cnrtSetDevice(device_id) != cnrtSuccess) {
return INFINIOP_STATUS_BAD_DEVICE;
}
cnnlHandle_t handle;
cnnlCreate(&handle);
// (Function continues past this hunk; remainder not visible here.)
pool->push(std::move(handle));
......
......@@ -6,8 +6,7 @@
struct InfiniopBangHandle;
typedef struct InfiniopBangHandle *infiniopBangHandle_t;
// Creates a Cambricon BANG handle bound to the runtime's current device
// (issue/42: device_id is no longer passed by the caller). Diff residue
// removed: the pre-change overload taking `int device_id`.
infiniopStatus_t createBangHandle(infiniopBangHandle_t *handle_ptr);
infiniopStatus_t destroyBangHandle(infiniopBangHandle_t handle);
#endif
......@@ -17,7 +17,7 @@
// Per-device state for the Cambricon BANG backend: the device identity plus a
// shared pool of cnnl handles. Diff residue removed: the pre-rename member
// `cnnl_handles` duplicated the renamed `cnnl_handle_pool`.
struct InfiniopBangHandle {
    infiniDevice_t device;
    int device_id;
    std::shared_ptr<Pool<cnnlHandle_t>> cnnl_handle_pool;
};
inline cnnlDataType_t cnnlDataTypeConvert(infiniDtype_t dataType) {
......
......@@ -7,91 +7,89 @@
#include <iostream>
// Checks a CUDA runtime call; on failure prints the error with its source
// location to stderr and RETURNS `errorCode` from the enclosing function.
// Diff residue removed: the pre-rename `checkCudaErrorWithCode` macro was left
// unterminated (its trailing backslash would swallow the next #define).
#define CHECK_CUDA_OR_RETURN(call, errorCode) \
    do { \
        if (auto status = call; status != cudaSuccess) { \
            std::cerr << "CUDA error: " << cudaGetErrorString(status) \
                      << " in file " << __FILE__ << ", function " << __func__ \
                      << ", line " << __LINE__ << std::endl; \
            return errorCode; \
        } \
    } while (0)
// Common case: fail with INFINIOP_STATUS_INTERNAL_ERROR.
#define CHECK_CUDA(call) CHECK_CUDA_OR_RETURN(call, INFINIOP_STATUS_INTERNAL_ERROR)
// Checks a cuDNN call; on failure prints the error with its source location to
// stderr and RETURNS INFINIOP_STATUS_INTERNAL_ERROR from the enclosing
// function. Diff residue removed: the pre-rename `checkCudnnError` macro was
// left unterminated by the scrape.
#define CHECK_CUDNN(call) \
    do { \
        if (auto status = call; status != CUDNN_STATUS_SUCCESS) { \
            std::cerr << "CUDNN error: " << cudnnGetErrorString(status) \
                      << " in file " << __FILE__ << ", function " << __func__ \
                      << ", line " << __LINE__ << std::endl; \
            return INFINIOP_STATUS_INTERNAL_ERROR; \
        } \
    } while (0)
#include "infinicore.h"
#include <cudnn.h>
#include <cublas_v2.h>
#include <memory>
#include "../pool.h"
#include "cuda_handle.h"
#include "infinicore.h"
#include <cublas_v2.h>
#include <cuda_fp16.h>
#include <cudnn.h>
#include <memory>
// Per-device state for the CUDA backend: handle pools shared across operator
// descriptors plus cached device properties. Diff residue removed: the
// pre-rename members `cublas_handles_t`/`cudnn_handles_t` duplicated the
// renamed `*_handle_pool` members.
struct InfiniopCudaHandle {
    infiniDevice_t device;
    int device_id;
    std::shared_ptr<Pool<cublasHandle_t>> cublas_handle_pool;
    std::shared_ptr<Pool<cudnnHandle_t>> cudnn_handle_pool;
    cudaDeviceProp prop;
    int compute_capability_major;
    int compute_capability_minor;
};
template<typename T>
void use_cublas(std::shared_ptr<Pool<cublasHandle_t>> cublas_handles_t, int device_id, cudaStream_t stream, T const &f) {
auto handle = cublas_handles_t->pop();
void use_cublas(std::shared_ptr<Pool<cublasHandle_t>> cublas_handle_pool, int device_id, cudaStream_t stream, T const &f) {
auto handle = cublas_handle_pool->pop();
if (!handle) {
cudaSetDevice(device_id);
cublasCreate(&(*handle));
}
cublasSetStream(*handle, (cudaStream_t) stream);
f(*handle);
cublas_handles_t->push(std::move(*handle));
cublas_handle_pool->push(std::move(*handle));
}
// Borrows a cuDNN handle from the pool, binds it to `stream`, invokes `f`,
// returns the handle to the pool, and propagates f's cudnnStatus_t. If the
// pool is empty, a new handle is created on `device_id` first.
// NOTE(review): assumes Pool::pop() yields an optional-like object whose
// payload may be created in place when disengaged — verify against Pool.
// Diff residue removed: the pre-rename signature taking `cudnn_handles_t`.
template<typename T>
cudnnStatus_t use_cudnn(std::shared_ptr<Pool<cudnnHandle_t>> cudnn_handle_pool, int device_id, cudaStream_t stream, T const &f) {
    auto handle = cudnn_handle_pool->pop();
    if (!handle) {
        cudaSetDevice(device_id);
        cudnnCreate(&(*handle));
    }
    cudnnSetStream(*handle, stream);
    cudnnStatus_t status = f(*handle);
    cudnn_handle_pool->push(std::move(*handle));
    return status;
}
// Maps an infiniop dtype to the corresponding cuDNN data type. Unknown dtypes
// fall back to CUDNN_DATA_FLOAT. Diff residue removed: the scrape left the
// whole case list twice (pre-change unindented + post-change reindented).
inline cudnnDataType_t getCudnnDtype(infiniDtype_t dt) {
    switch (dt) {
        case INFINI_DTYPE_F16:
            return CUDNN_DATA_HALF;
        case INFINI_DTYPE_F32:
            return CUDNN_DATA_FLOAT;
        case INFINI_DTYPE_F64:
            return CUDNN_DATA_DOUBLE;
        case INFINI_DTYPE_BF16:
            return CUDNN_DATA_BFLOAT16;
        case INFINI_DTYPE_I8:
            return CUDNN_DATA_INT8;
        case INFINI_DTYPE_I32:
            return CUDNN_DATA_INT32;
        case INFINI_DTYPE_I64:
            return CUDNN_DATA_INT64;
        case INFINI_DTYPE_U8:
            return CUDNN_DATA_UINT8;
        default:
            return CUDNN_DATA_FLOAT;
    }
}
......@@ -120,4 +118,4 @@ inline __device__ __host__ size_t indexToOffset(size_t flat_index, size_t ndim,
return res;
}
#endif // __INFINIOP_COMMON_CUDA_H__
#endif// __INFINIOP_COMMON_CUDA_H__
#include "./common_cuda.cuh"
// NOTE(review): diff residue — pre-change and post-change implementations are
// interleaved below because the scrape dropped the +/- markers.
// Pre-change version: caller supplied device_id, validated against the count.
infiniopStatus_t createCudaHandle(infiniopCudaHandle_t *handle_ptr, int device_id, infiniDevice_t cuda_device_type) {
// Check if device_id is valid
int device_count;
cudaGetDeviceCount(&device_count);
if (device_id >= device_count) {
return INFINIOP_STATUS_BAD_DEVICE;
}
// Post-change version: asks the CUDA runtime for the current device instead
// of receiving device_id from the caller (issue/42).
infiniopStatus_t createCudaHandle(infiniopCudaHandle_t *handle_ptr, infiniDevice_t cuda_device_type) {
// Create a new cublas handle pool
int device_id = 0;
CHECK_CUDA(cudaGetDevice(&device_id));
auto pool = std::make_shared<Pool<cublasHandle_t>>();
// Pre-change residue: explicit cudaSetDevice on the caller-chosen device.
if (cudaSetDevice(device_id) != cudaSuccess) {
return INFINIOP_STATUS_BAD_DEVICE;
}
// Seed the pool with one cuBLAS handle.
// (Function continues past this hunk; remainder not visible here.)
cublasHandle_t handle;
cublasCreate(&handle);
pool->push(std::move(handle));
......@@ -20,7 +12,7 @@ infiniopStatus_t createCudaHandle(infiniopCudaHandle_t *handle_ptr, int device_i
// create a cudnn handle pool
auto cudnn_pool = std::make_shared<Pool<cudnnHandle_t>>();
cudnnHandle_t cudnn_handle;
checkCudnnError(cudnnCreate(&cudnn_handle));
CHECK_CUDNN(cudnnCreate(&cudnn_handle));
cudnn_pool->push(std::move(cudnn_handle));
// set CUDA device property
......@@ -47,8 +39,8 @@ infiniopStatus_t createCudaHandle(infiniopCudaHandle_t *handle_ptr, int device_i
}
// Releases the handle's pool references and frees the handle struct itself.
// Diff residue removed: the pre-rename assignments to `cublas_handles_t` /
// `cudnn_handles_t` duplicated the renamed members. The closing brace was cut
// by the diff hunk boundary and is restored here.
infiniopStatus_t destroyCudaHandle(infiniopCudaHandle_t handle_ptr) {
    handle_ptr->cublas_handle_pool = nullptr;
    handle_ptr->cudnn_handle_pool = nullptr;
    delete handle_ptr;
    return INFINIOP_STATUS_SUCCESS;
}
......
......@@ -6,7 +6,7 @@
struct InfiniopCudaHandle;
typedef struct InfiniopCudaHandle *infiniopCudaHandle_t;
// Creates a CUDA handle bound to the runtime's current device (issue/42:
// device_id is no longer passed by the caller). Diff residue removed: the
// pre-change overload taking `int device_id`.
infiniopStatus_t createCudaHandle(infiniopCudaHandle_t *handle_ptr, infiniDevice_t cuda_device_type);
infiniopStatus_t destroyCudaHandle(infiniopCudaHandle_t handle_ptr);
......
......@@ -13,14 +13,10 @@
#endif
// Dispatches handle creation to the per-backend factory for `device`.
// NOTE(review): diff residue — pre-change and post-change lines are
// interleaved below because the scrape dropped the +/- markers.
__C infiniopStatus_t infiniopCreateHandle(infiniopHandle_t *handle_ptr,
infiniDevice_t device,
int device_id) { // pre-change signature (took device_id)
infiniDevice_t device) { // post-change signature (issue/42)
if (handle_ptr == nullptr) {
return INFINIOP_STATUS_NULL_POINTER;
}
// Pre-change residue: device_id range check, removed with the parameter.
if (device_id < 0) {
return INFINIOP_STATUS_BAD_DEVICE;
}
switch (device) {
#ifdef ENABLE_CPU_API
// (Diff hunk separator from the scraped page; CPU case elided by the diff.)
......@@ -29,19 +25,17 @@ __C infiniopStatus_t infiniopCreateHandle(infiniopHandle_t *handle_ptr,
#endif
#ifdef ENABLE_CUDA_API
case INFINI_DEVICE_NVIDIA: {
// Pre-change call (passed device_id) followed by the post-change call.
return createCudaHandle((infiniopCudaHandle_t *)handle_ptr, device_id,
device);
return createCudaHandle((infiniopCudaHandle_t *)handle_ptr, device);
}
#endif
#ifdef ENABLE_CAMBRICON_API
case INFINI_DEVICE_CAMBRICON: {
// Pre-change call followed by the post-change call.
return createBangHandle((infiniopBangHandle_t *)handle_ptr, device_id);
return createBangHandle((infiniopBangHandle_t *)handle_ptr);
}
#endif
#ifdef ENABLE_ASCEND_API
case INFINI_DEVICE_ASCEND: {
// Pre-change call followed by the post-change call.
return createAscendHandle((infiniopAscendHandle_t *)handle_ptr,
device_id);
return createAscendHandle((infiniopAscendHandle_t *)handle_ptr);
}
#endif
}
......
......@@ -31,7 +31,7 @@ infiniopStatus_t bangCreateMatmulDescriptor(
cnnlSetMatMulDescAttr(opDesc, CNNL_MATMUL_USE_STRIDE, &use_stride,
sizeof(int32_t));
int count = 0;
use_cnnl(handle->cnnl_handles, [&](cnnlHandle_t _handle) {
use_cnnl(handle->cnnl_handle_pool, [&](cnnlHandle_t _handle) {
cnnlGetBatchMatMulAlgoHeuristic(_handle, opDesc, aDesc, bDesc, cDesc,
NULL, 1, &algoResult, &count);
});
......@@ -42,7 +42,7 @@ infiniopStatus_t bangCreateMatmulDescriptor(
handle->device_id,
info,
c_desc->dtype,
handle->cnnl_handles,
handle->cnnl_handle_pool,
aDesc,
bDesc,
cDesc,
......@@ -61,7 +61,7 @@ infiniopStatus_t bangGetMatmulWorkspaceSize(infiniopMatmulBangDescriptor_t desc,
infiniopStatus_t
bangDestroyMatmulDescriptor(infiniopMatmulBangDescriptor_t desc) {
desc->cnnl_handles = nullptr;
desc->cnnl_handle_pool = nullptr;
cnnlDestroyTensorDescriptor(desc->aDesc);
cnnlDestroyTensorDescriptor(desc->bDesc);
cnnlDestroyTensorDescriptor(desc->cDesc);
......@@ -80,7 +80,7 @@ void bangMatmulCnnl(infiniopMatmulBangDescriptor_t desc, void *workspace, void *
std::swap(a, b);
}
use_cnnl(desc->cnnl_handles, (cnrtQueue_t)stream, [&](cnnlHandle_t handle) {
use_cnnl(desc->cnnl_handle_pool, (cnrtQueue_t)stream, [&](cnnlHandle_t handle) {
cnnlBatchMatMulBCast_v2(handle, desc->opDesc, desc->algo, &alpha,
desc->aDesc, a, desc->bDesc, b, &beta,
desc->cDesc, c, workspace,
......
......@@ -9,7 +9,7 @@ struct InfiniopMatmulBangDescriptor {
int device_id;
MatmulInfo info;
infiniDtype_t dtype;
std::shared_ptr<Pool<cnnlHandle_t>> cnnl_handles;
std::shared_ptr<Pool<cnnlHandle_t>> cnnl_handle_pool;
cnnlTensorDescriptor_t aDesc;
cnnlTensorDescriptor_t bDesc;
cnnlTensorDescriptor_t cDesc;
......
......@@ -23,7 +23,7 @@ infiniopStatus_t cudaCreateMatmulDescriptor(infiniopCudaHandle_t handle,
dtype,
handle->device_id,
info,
handle->cublas_handles_t};
handle->cublas_handle_pool};
return INFINIOP_STATUS_SUCCESS;
}
......@@ -33,7 +33,7 @@ infiniopStatus_t cudaGetMatmulWorkspaceSize(infiniopMatmulCudaDescriptor_t desc,
}
// Releases the descriptor's cuBLAS pool reference and frees the descriptor.
// Diff residue removed: the pre-rename assignment to `cublas_handles_t`
// duplicated the renamed `cublas_handle_pool` member.
infiniopStatus_t cudaDestroyMatmulDescriptor(infiniopMatmulCudaDescriptor_t desc) {
    desc->cublas_handle_pool = nullptr;
    delete desc;
    return INFINIOP_STATUS_SUCCESS;
}
......@@ -11,7 +11,7 @@ typedef struct InfiniopMatmulCudaDescriptor {
infiniDtype_t dtype;
int device_id;
MatmulInfo info;
// Renamed from `cublas_handles_t`; diff residue (the old member) removed so
// the struct no longer declares two pool members.
std::shared_ptr<Pool<cublasHandle_t>> cublas_handle_pool;
} InfiniopMatmulCudaDescriptor;
#endif// __INFINIOP_MATMUL_CUDA_H__
......@@ -26,7 +26,7 @@ infiniopStatus_t cudaMatmulCublas(infiniopMatmulCudaDescriptor_t desc, void *c,
auto op_a = info.a_matrix.row_stride == 1 ? CUBLAS_OP_N : CUBLAS_OP_T;
auto op_b = info.b_matrix.row_stride == 1 ? CUBLAS_OP_N : CUBLAS_OP_T;
use_cublas(desc->cublas_handles_t, desc->device_id, (cudaStream_t) stream,
use_cublas(desc->cublas_handle_pool, desc->device_id, (cudaStream_t) stream,
[&](cublasHandle_t handle) { cublasGemmStridedBatchedEx(
handle,
op_a,
......
......@@ -378,7 +378,9 @@ def get_test_devices(args):
import torch_mlu
devices_to_test.append(InfiniDeviceEnum.CAMBRICON)
if args.ascend:
import torch
import torch_npu
torch.npu.set_device(0) # Ascend NPU needs explicit device initialization
devices_to_test.append(InfiniDeviceEnum.ASCEND)
if not devices_to_test:
devices_to_test = [InfiniDeviceEnum.CPU]
......
......@@ -112,7 +112,7 @@ def test(
def lib_matmul():
check_error(lib.infiniopMatmul(
descriptor,
workspace.data_ptr() if workspace else None,
workspace.data_ptr() if workspace is not None else None,
workspace_size.value,
c_tensor.data,
a_tensor.data,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment