Commit 39bf75c5 authored by YdrMaster's avatar YdrMaster
Browse files

issue/87/feat: cublas 和 cudnn 检查并返回错误信息


Signed-off-by: default avatarYdrMaster <ydrml@hotmail.com>
parent 01dae971
#ifndef __INFINIOP_CUDA_FUNCTIONS_CUH__
#define __INFINIOP_CUDA_FUNCTIONS_CUH__
#include "infinicore.h"
#include <cudnn.h>
namespace device::cuda {
cudnnDataType_t getCudnnDtype(infiniDtype_t dt);
// return the memory offset of original tensor, given the flattened index of broadcasted tensor
// Map the flat index of an element in a broadcasted tensor back to the
// element offset inside the original (pre-broadcast) tensor.
//
// broadcasted_strides: contiguous strides of the broadcast shape, used as
//                      divisors to peel off one coordinate per dimension.
// target_strides:      strides of the original tensor (0 along broadcast
//                      dimensions, so those coordinates contribute nothing).
// Returns the accumulated offset in the original tensor.
__forceinline__ __device__ __host__ size_t
indexToReducedOffset(
size_t flat_index,
size_t ndim,
const ptrdiff_t *broadcasted_strides,
const ptrdiff_t *target_strides) {
size_t offset = 0;
size_t remaining = flat_index;
for (size_t dim = 0; dim < ndim; ++dim) {
// Coordinate along this dimension in the broadcast shape.
const size_t coord = remaining / broadcasted_strides[dim];
remaining -= coord * broadcasted_strides[dim]; // same as remaining %= stride
offset += coord * target_strides[dim];
}
return offset;
}
// get the memory offset of the given element in a tensor given its flat index
// Convert the flat (row-major) index of a tensor element into its memory
// offset, given the tensor's shape and (possibly non-contiguous) strides.
//
// shape:   extent of each dimension, length ndim.
// strides: element stride of each dimension, length ndim.
// Walks dimensions from innermost (last) to outermost, extracting one
// coordinate per step via mod/div by the dimension extent.
__forceinline__ __device__ __host__ size_t
indexToOffset(
size_t flat_index,
size_t ndim,
const size_t *shape,
const ptrdiff_t *strides) {
size_t offset = 0;
size_t remaining = flat_index;
size_t dim = ndim;
while (dim > 0) {
--dim;
offset += (remaining % shape[dim]) * strides[dim];
remaining /= shape[dim];
}
return offset;
}
} // namespace device::cuda
#endif // __INFINIOP_CUDA_FUNCTIONS_CUH__
...@@ -4,35 +4,56 @@ namespace device::cuda { ...@@ -4,35 +4,56 @@ namespace device::cuda {
Handle::Handle(infiniDevice_t device, int device_id) Handle::Handle(infiniDevice_t device, int device_id)
: InfiniopHandle{device, device_id}, : InfiniopHandle{device, device_id},
_internal(std::make_shared<Handle::Internal>()) {} _internal(std::make_shared<Handle::Internal>(device_id)) {}
auto Handle::internal() const -> const std::shared_ptr<Internal> & { auto Handle::internal() const -> const std::shared_ptr<Internal> & {
return _internal; return _internal;
} }
template <typename T> Handle::Internal::Internal(int device_id) {
using Fn = std::function<void(T)>; cudaDeviceProp prop;
cudaGetDeviceProperties(&prop, device_id);
_warp_size = prop.warpSize;
_max_threads_per_block = prop.maxThreadsPerBlock;
_block_size[0] = prop.maxThreadsDim[0];
_block_size[1] = prop.maxThreadsDim[1];
_block_size[2] = prop.maxThreadsDim[2];
_grid_size[0] = prop.maxGridSize[0];
_grid_size[1] = prop.maxGridSize[1];
_grid_size[2] = prop.maxGridSize[2];
}
void Handle::Internal::use_cublas(cudaStream_t stream, const Fn<cublasHandle_t> &f) const { infiniStatus_t Handle::Internal::useCublas(cudaStream_t stream, const Fn<cublasHandle_t> &f) const {
auto handle = blas_handles.pop(); auto handle = blas_handles.pop();
if (!handle) { if (!handle) {
cublasCreate(&(*handle)); CHECK_CUBLAS(cublasCreate(&(*handle)));
} }
cublasSetStream(*handle, stream); CHECK_CUBLAS(cublasSetStream(*handle, stream));
f(*handle); CHECK_STATUS(f(*handle));
blas_handles.push(std::move(*handle)); blas_handles.push(std::move(*handle));
return INFINI_STATUS_SUCCESS;
} }
void Handle::Internal::use_cudnn(cudaStream_t stream, const Fn<cudnnHandle_t> &f) const { infiniStatus_t Handle::Internal::useCudnn(cudaStream_t stream, const Fn<cudnnHandle_t> &f) const {
auto handle = dnn_handles.pop(); auto handle = dnn_handles.pop();
if (!handle) { if (!handle) {
cudnnCreate(&(*handle)); CHECK_CUDNN(cudnnCreate(&(*handle)));
} }
cudnnSetStream(*handle, stream); CHECK_CUDNN(cudnnSetStream(*handle, stream));
f(*handle); CHECK_STATUS(f(*handle));
dnn_handles.push(std::move(*handle)); dnn_handles.push(std::move(*handle));
return INFINI_STATUS_SUCCESS;
} }
int Handle::Internal::warpSize() const { return _warp_size; }
int Handle::Internal::maxThreadsPerBlock() const { return _max_threads_per_block; }
int Handle::Internal::blockSizeX() const { return _block_size[0]; }
int Handle::Internal::blockSizeY() const { return _block_size[1]; }
int Handle::Internal::blockSizeZ() const { return _block_size[2]; }
int Handle::Internal::gridSizeX() const { return _grid_size[0]; }
int Handle::Internal::gridSizeY() const { return _grid_size[1]; }
int Handle::Internal::gridSizeZ() const { return _grid_size[2]; }
cudnnDataType_t getCudnnDtype(infiniDtype_t dt) { cudnnDataType_t getCudnnDtype(infiniDtype_t dt) {
switch (dt) { switch (dt) {
case INFINI_DTYPE_F16: case INFINI_DTYPE_F16:
......
#ifndef __INFINIOP_CUDA_INTERNAL_H__ #ifndef __INFINIOP_CUDA_HANDLE_CUH__
#define __INFINIOP_CUDA_INTERNAL_H__ #define __INFINIOP_CUDA_HANDLE_CUH__
#include "../../../utils.h"
#include "../pool.h" #include "../pool.h"
#include "cuda_functions.cuh"
#include "cuda_handle.h" #include "cuda_handle.h"
#include <cublas_v2.h> #include <cublas_v2.h>
#include <cudnn.h>
#include <functional> #include <functional>
#define CHECK_CUBLAS(API) CHECK_INTERNAL(API, CUBLAS_STATUS_SUCCESS)
#define CHECK_CUDNN(API) CHECK_INTERNAL(API, CUDNN_STATUS_SUCCESS)
namespace device::cuda { namespace device::cuda {
class Handle::Internal { class Handle::Internal {
Pool<cublasHandle_t> blas_handles; Pool<cublasHandle_t> blas_handles;
Pool<cudnnHandle_t> dnn_handles; Pool<cudnnHandle_t> dnn_handles;
int _warp_size,
_max_threads_per_block,
_block_size[3],
_grid_size[3];
template <typename T>
using Fn = std::function<infiniStatus_t(T)>;
public: public:
void use_cublas(cudaStream_t stream, const std::function<void(cublasHandle_t)> &f) const; Internal(int);
void use_cudnn(cudaStream_t stream, const std::function<void(cudnnHandle_t)> &f) const;
infiniStatus_t useCublas(cudaStream_t stream, const Fn<cublasHandle_t> &f) const;
infiniStatus_t useCudnn(cudaStream_t stream, const Fn<cudnnHandle_t> &f) const;
int warpSize() const;
int maxThreadsPerBlock() const;
int blockSizeX() const;
int blockSizeY() const;
int blockSizeZ() const;
int gridSizeX() const;
int gridSizeY() const;
int gridSizeZ() const;
}; };
cudnnDataType_t getCudnnDtype(infiniDtype_t dt);
// return the memory offset of original tensor, given the flattened index of broadcasted tensor
__forceinline__ __device__ __host__ size_t
indexToReducedOffset(
size_t flat_index,
size_t ndim,
const ptrdiff_t *broadcasted_strides,
const ptrdiff_t *target_strides) {
size_t res = 0;
for (size_t i = 0; i < ndim; ++i) {
res += flat_index / broadcasted_strides[i] * target_strides[i];
flat_index %= broadcasted_strides[i];
}
return res;
}
// get the memory offset of the given element in a tensor given its flat index
__forceinline__ __device__ __host__ size_t
indexToOffset(
size_t flat_index,
size_t ndim,
const size_t *shape,
const ptrdiff_t *strides) {
size_t res = 0;
for (size_t i = ndim; i-- > 0;) {
res += (flat_index % shape[i]) * strides[i];
flat_index /= shape[i];
}
return res;
}
} // namespace device::cuda } // namespace device::cuda
#endif // __INFINIOP_CUDA_INTERNAL_H__ #endif // __INFINIOP_CUDA_HANDLE_CUH__
...@@ -76,9 +76,10 @@ infiniStatus_t Descriptor::calculate( ...@@ -76,9 +76,10 @@ infiniStatus_t Descriptor::calculate(
auto op_a = _info.a_matrix.row_stride == 1 ? CUBLAS_OP_N : CUBLAS_OP_T; auto op_a = _info.a_matrix.row_stride == 1 ? CUBLAS_OP_N : CUBLAS_OP_T;
auto op_b = _info.b_matrix.row_stride == 1 ? CUBLAS_OP_N : CUBLAS_OP_T; auto op_b = _info.b_matrix.row_stride == 1 ? CUBLAS_OP_N : CUBLAS_OP_T;
_opaque->internal->use_cublas( _opaque->internal->useCublas(
(cudaStream_t)stream, (cudaStream_t)stream,
[&](cublasHandle_t handle) { [&](cublasHandle_t handle) {
CHECK_CUBLAS(
cublasGemmStridedBatchedEx( cublasGemmStridedBatchedEx(
handle, handle,
op_a, op_a,
...@@ -102,7 +103,8 @@ infiniStatus_t Descriptor::calculate( ...@@ -102,7 +103,8 @@ infiniStatus_t Descriptor::calculate(
_info.c_matrix.stride, _info.c_matrix.stride,
static_cast<int>(_info.batch), static_cast<int>(_info.batch),
compute_type, compute_type,
CUBLAS_GEMM_DEFAULT_TENSOR_OP); CUBLAS_GEMM_DEFAULT_TENSOR_OP));
return INFINI_STATUS_SUCCESS;
}); });
return INFINI_STATUS_SUCCESS; return INFINI_STATUS_SUCCESS;
} }
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment