Unverified Commit 92ad2426 authored by PanZezhong1725's avatar PanZezhong1725 Committed by GitHub
Browse files

Merge pull request #95 from YdrMaster/main

issue/87/feat: cublas 和 cudnn 检查并返回错误信息
parents d5422e5b 911115fb
......@@ -4,35 +4,56 @@ namespace device::cuda {
// Construct a handle for one CUDA device. The shared Internal is built from
// device_id so it can cache that device's properties and pool library handles.
Handle::Handle(infiniDevice_t device, int device_id)
    : InfiniopHandle{device, device_id},
      _internal(std::make_shared<Handle::Internal>(device_id)) {}

// Accessor for the shared per-device internal state.
auto Handle::internal() const -> const std::shared_ptr<Internal> & {
    return _internal;
}
// Callback type handed to useCublas/useCudnn: receives the library handle and
// returns a status so failures inside the callback propagate to the caller.
// (The old void(T) form predates error propagation; keep this in sync with the
// class-scope Fn alias in cuda_handle.cuh.)
template <typename T>
using Fn = std::function<infiniStatus_t(T)>;

// Query the device properties once at construction and cache the limits so
// hot paths can read them without another driver round-trip.
Handle::Internal::Internal(int device_id) {
    cudaDeviceProp prop;
    // NOTE(review): the return status is ignored; on failure `prop` is left
    // unspecified — consider checking cudaGetDeviceProperties' result.
    cudaGetDeviceProperties(&prop, device_id);
    _warp_size = prop.warpSize;
    _max_threads_per_block = prop.maxThreadsPerBlock;
    for (int i = 0; i < 3; ++i) {
        _block_size[i] = prop.maxThreadsDim[i];
        _grid_size[i] = prop.maxGridSize[i];
    }
}
// Run `f` with a stream-bound cuBLAS handle borrowed from the pool.
// The handle is created lazily when the pool is empty, rebound to `stream`
// on every call, and pushed back afterwards for reuse.
// Returns the first failing cuBLAS status, or `f`'s own status.
infiniStatus_t Handle::Internal::useCublas(cudaStream_t stream, const Fn<cublasHandle_t> &f) const {
    auto handle = blas_handles.pop();
    if (!handle) {
        // Pool empty: create a fresh handle in-place.
        // NOTE(review): this writes through `*handle` while it tests false —
        // confirm Pool::pop's return type makes that storage writable here.
        CHECK_CUBLAS(cublasCreate(&(*handle)));
    }
    CHECK_CUBLAS(cublasSetStream(*handle, stream));
    // NOTE(review): if a CHECK_* above or below returns early, the handle is
    // not pushed back and is dropped — verify that is acceptable on error.
    CHECK_STATUS(f(*handle));
    blas_handles.push(std::move(*handle));
    return INFINI_STATUS_SUCCESS;
}
// Run `f` with a stream-bound cuDNN handle borrowed from the pool.
// Mirrors useCublas: lazy creation, per-call stream binding, handle returned
// to the pool on success. Returns the first failing cuDNN status or `f`'s status.
infiniStatus_t Handle::Internal::useCudnn(cudaStream_t stream, const Fn<cudnnHandle_t> &f) const {
    auto handle = dnn_handles.pop();
    if (!handle) {
        // Pool empty: create a fresh handle in-place.
        // NOTE(review): writes through `*handle` while it tests false —
        // confirm Pool::pop's return type makes that storage writable here.
        CHECK_CUDNN(cudnnCreate(&(*handle)));
    }
    CHECK_CUDNN(cudnnSetStream(*handle, stream));
    // NOTE(review): an early CHECK_* return drops the handle instead of
    // repooling it — verify that is acceptable on error.
    CHECK_STATUS(f(*handle));
    dnn_handles.push(std::move(*handle));
    return INFINI_STATUS_SUCCESS;
}
// Read-only accessors for the device limits cached by the constructor.
int Handle::Internal::warpSize() const { return _warp_size; }
int Handle::Internal::maxThreadsPerBlock() const { return _max_threads_per_block; }
int Handle::Internal::blockSizeX() const { return _block_size[0]; }
int Handle::Internal::blockSizeY() const { return _block_size[1]; }
int Handle::Internal::blockSizeZ() const { return _block_size[2]; }
int Handle::Internal::gridSizeX() const { return _grid_size[0]; }
int Handle::Internal::gridSizeY() const { return _grid_size[1]; }
int Handle::Internal::gridSizeZ() const { return _grid_size[2]; }
cudnnDataType_t getCudnnDtype(infiniDtype_t dt) {
switch (dt) {
case INFINI_DTYPE_F16:
......
#ifndef __INFINIOP_CUDA_COMMON_CUH__
#define __INFINIOP_CUDA_COMMON_CUH__
#include "cuda_handle.cuh"
#include "infinicore.h"
namespace device::cuda {
cudnnDataType_t getCudnnDtype(infiniDtype_t dt);
// return the memory offset of original tensor, given the flattened index of broadcasted tensor
// Map a flat index into a broadcasted view back to the memory offset in the
// original tensor. The div/mod decode assumes `broadcasted_strides` are the
// contiguous (row-major) strides of the broadcasted shape; `target_strides`
// are the original tensor's strides for the corresponding dims.
__forceinline__ __device__ __host__ size_t
indexToReducedOffset(
    size_t flat_index,
    size_t ndim,
    const ptrdiff_t *broadcasted_strides,
    const ptrdiff_t *target_strides) {
    size_t offset = 0;
    size_t rem = flat_index;
    for (size_t dim = 0; dim < ndim; ++dim) {
        // Coordinate along this dim, then advance by the original stride.
        const size_t coord = rem / broadcasted_strides[dim];
        rem %= broadcasted_strides[dim];
        offset += coord * target_strides[dim];
    }
    return offset;
}
// Compute the memory offset of an element from its flat (row-major) index,
// a shape of `ndim` dims, and per-dim strides.
__forceinline__ __device__ __host__ size_t
indexToOffset(
    size_t flat_index,
    size_t ndim,
    const size_t *shape,
    const ptrdiff_t *strides) {
    size_t offset = 0;
    size_t rem = flat_index;
    size_t dim = ndim;
    // Peel coordinates from innermost to outermost with mod/div.
    while (dim > 0) {
        --dim;
        offset += (rem % shape[dim]) * strides[dim];
        rem /= shape[dim];
    }
    return offset;
}
} // namespace device::cuda
#endif // __INFINIOP_CUDA_COMMON_CUH__
#ifndef __INFINIOP_CUDA_INTERNAL_H__
#define __INFINIOP_CUDA_INTERNAL_H__
#ifndef __INFINIOP_CUDA_HANDLE_CUH__
#define __INFINIOP_CUDA_HANDLE_CUH__
#include "../../../utils.h"
#include "../pool.h"
#include "cuda_handle.h"
#include <cublas_v2.h>
#include <cudnn.h>
#include <functional>
#define CHECK_CUBLAS(API) CHECK_INTERNAL(API, CUBLAS_STATUS_SUCCESS)
#define CHECK_CUDNN(API) CHECK_INTERNAL(API, CUDNN_STATUS_SUCCESS)
namespace device::cuda {
// Per-device state shared by operator descriptors on one CUDA device:
// pools of lazily-created cuBLAS/cuDNN handles (reused across calls) and
// device limits cached once at construction.
// (Drops the stale pre-merge use_cublas/use_cudnn declarations, which have
// no matching definitions after the rename to useCublas/useCudnn.)
class Handle::Internal {
    Pool<cublasHandle_t> blas_handles;
    Pool<cudnnHandle_t> dnn_handles;

    // Limits copied from cudaDeviceProp in the constructor.
    int _warp_size,
        _max_threads_per_block,
        _block_size[3],
        _grid_size[3];

    // Callback passed to useCublas/useCudnn: receives the library handle and
    // returns a status so failures inside the callback propagate out.
    template <typename T>
    using Fn = std::function<infiniStatus_t(T)>;

public:
    // Queries device properties for the given device id and caches them.
    Internal(int);

    // Borrow a pooled handle, bind it to `stream`, run `f`, repool the handle.
    // Returns the first failing library status or `f`'s own status.
    infiniStatus_t useCublas(cudaStream_t stream, const Fn<cublasHandle_t> &f) const;
    infiniStatus_t useCudnn(cudaStream_t stream, const Fn<cudnnHandle_t> &f) const;

    // Cached device limits.
    int warpSize() const;
    int maxThreadsPerBlock() const;
    int blockSizeX() const;
    int blockSizeY() const;
    int blockSizeZ() const;
    int gridSizeX() const;
    int gridSizeY() const;
    int gridSizeZ() const;
};
cudnnDataType_t getCudnnDtype(infiniDtype_t dt);
// return the memory offset of original tensor, given the flattened index of broadcasted tensor
// Given the flattened index of an element in a broadcasted tensor, return the
// memory offset of the corresponding element in the original tensor.
// The div/mod decode assumes `broadcasted_strides` are the contiguous
// (row-major) strides of the broadcasted shape — confirm at call sites.
__forceinline__ __device__ __host__ size_t
indexToReducedOffset(
    size_t flat_index,
    size_t ndim,
    const ptrdiff_t *broadcasted_strides,
    const ptrdiff_t *target_strides) {
    size_t res = 0;
    for (size_t i = 0; i < ndim; ++i) {
        // Coordinate along dim i, advanced by the original tensor's stride.
        res += flat_index / broadcasted_strides[i] * target_strides[i];
        // Remainder carries the coordinates of the inner dims.
        flat_index %= broadcasted_strides[i];
    }
    return res;
}
// get the memory offset of the given element in a tensor given its flat index
__forceinline__ __device__ __host__ size_t
indexToOffset(
    size_t flat_index,
    size_t ndim,
    const size_t *shape,
    const ptrdiff_t *strides) {
    size_t res = 0;
    // Walk dims innermost-to-outermost: peel each coordinate off the
    // (row-major) flat index with mod/div and apply that dim's stride.
    for (size_t i = ndim; i-- > 0;) {
        res += (flat_index % shape[i]) * strides[i];
        flat_index /= shape[i];
    }
    return res;
}
} // namespace device::cuda
#endif // __INFINIOP_CUDA_INTERNAL_H__
#endif // __INFINIOP_CUDA_HANDLE_CUH__
......@@ -76,9 +76,10 @@ infiniStatus_t Descriptor::calculate(
auto op_a = _info.a_matrix.row_stride == 1 ? CUBLAS_OP_N : CUBLAS_OP_T;
auto op_b = _info.b_matrix.row_stride == 1 ? CUBLAS_OP_N : CUBLAS_OP_T;
_opaque->internal->use_cublas(
CHECK_STATUS(_opaque->internal->useCublas(
(cudaStream_t)stream,
[&](cublasHandle_t handle) {
CHECK_CUBLAS(
cublasGemmStridedBatchedEx(
handle,
op_a,
......@@ -102,8 +103,9 @@ infiniStatus_t Descriptor::calculate(
_info.c_matrix.stride,
static_cast<int>(_info.batch),
compute_type,
CUBLAS_GEMM_DEFAULT_TENSOR_OP);
});
CUBLAS_GEMM_DEFAULT_TENSOR_OP));
return INFINI_STATUS_SUCCESS;
}));
return INFINI_STATUS_SUCCESS;
}
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment