Commit 39bf75c5 authored by YdrMaster's avatar YdrMaster
Browse files

issue/87/feat: cublas 和 cudnn 检查并返回错误信息


Signed-off-by: default avatarYdrMaster <ydrml@hotmail.com>
parent 01dae971
#ifndef __INFINIOP_CUDA_FUNCTIONS_CUH__
#define __INFINIOP_CUDA_FUNCTIONS_CUH__
#include "infinicore.h"
#include <cudnn.h>
namespace device::cuda {
cudnnDataType_t getCudnnDtype(infiniDtype_t dt);
// return the memory offset of original tensor, given the flattened index of broadcasted tensor
// Map the flat index of an element in a broadcasted tensor back to the
// element offset inside the original (pre-broadcast) tensor.
//
// broadcasted_strides: contiguous strides of the broadcast shape, used as
//                      divisors to peel off one coordinate per dimension.
// target_strides:      strides of the original tensor (0 along broadcast
//                      dimensions, so those coordinates contribute nothing).
// Returns the accumulated offset in the original tensor.
__forceinline__ __device__ __host__ size_t
indexToReducedOffset(
size_t flat_index,
size_t ndim,
const ptrdiff_t *broadcasted_strides,
const ptrdiff_t *target_strides) {
size_t offset = 0;
size_t remaining = flat_index;
for (size_t dim = 0; dim < ndim; ++dim) {
// Coordinate along this dimension in the broadcast shape.
const size_t coord = remaining / broadcasted_strides[dim];
remaining -= coord * broadcasted_strides[dim]; // same as remaining %= stride
offset += coord * target_strides[dim];
}
return offset;
}
// get the memory offset of the given element in a tensor given its flat index
// Convert the flat (row-major) index of a tensor element into its memory
// offset, given the tensor's shape and (possibly non-contiguous) strides.
//
// shape:   extent of each dimension, length ndim.
// strides: element stride of each dimension, length ndim.
// Walks dimensions from innermost (last) to outermost, extracting one
// coordinate per step via mod/div by the dimension extent.
__forceinline__ __device__ __host__ size_t
indexToOffset(
size_t flat_index,
size_t ndim,
const size_t *shape,
const ptrdiff_t *strides) {
size_t offset = 0;
size_t remaining = flat_index;
size_t dim = ndim;
while (dim > 0) {
--dim;
offset += (remaining % shape[dim]) * strides[dim];
remaining /= shape[dim];
}
return offset;
}
} // namespace device::cuda
#endif // __INFINIOP_CUDA_FUNCTIONS_CUH__
...@@ -4,35 +4,56 @@ namespace device::cuda { ...@@ -4,35 +4,56 @@ namespace device::cuda {
Handle::Handle(infiniDevice_t device, int device_id) Handle::Handle(infiniDevice_t device, int device_id)
: InfiniopHandle{device, device_id}, : InfiniopHandle{device, device_id},
_internal(std::make_shared<Handle::Internal>()) {} _internal(std::make_shared<Handle::Internal>(device_id)) {}
auto Handle::internal() const -> const std::shared_ptr<Internal> & { auto Handle::internal() const -> const std::shared_ptr<Internal> & {
return _internal; return _internal;
} }
template <typename T> Handle::Internal::Internal(int device_id) {
using Fn = std::function<void(T)>; cudaDeviceProp prop;
cudaGetDeviceProperties(&prop, device_id);
_warp_size = prop.warpSize;
_max_threads_per_block = prop.maxThreadsPerBlock;
_block_size[0] = prop.maxThreadsDim[0];
_block_size[1] = prop.maxThreadsDim[1];
_block_size[2] = prop.maxThreadsDim[2];
_grid_size[0] = prop.maxGridSize[0];
_grid_size[1] = prop.maxGridSize[1];
_grid_size[2] = prop.maxGridSize[2];
}
void Handle::Internal::use_cublas(cudaStream_t stream, const Fn<cublasHandle_t> &f) const { infiniStatus_t Handle::Internal::useCublas(cudaStream_t stream, const Fn<cublasHandle_t> &f) const {
auto handle = blas_handles.pop(); auto handle = blas_handles.pop();
if (!handle) { if (!handle) {
cublasCreate(&(*handle)); CHECK_CUBLAS(cublasCreate(&(*handle)));
} }
cublasSetStream(*handle, stream); CHECK_CUBLAS(cublasSetStream(*handle, stream));
f(*handle); CHECK_STATUS(f(*handle));
blas_handles.push(std::move(*handle)); blas_handles.push(std::move(*handle));
return INFINI_STATUS_SUCCESS;
} }
void Handle::Internal::use_cudnn(cudaStream_t stream, const Fn<cudnnHandle_t> &f) const { infiniStatus_t Handle::Internal::useCudnn(cudaStream_t stream, const Fn<cudnnHandle_t> &f) const {
auto handle = dnn_handles.pop(); auto handle = dnn_handles.pop();
if (!handle) { if (!handle) {
cudnnCreate(&(*handle)); CHECK_CUDNN(cudnnCreate(&(*handle)));
} }
cudnnSetStream(*handle, stream); CHECK_CUDNN(cudnnSetStream(*handle, stream));
f(*handle); CHECK_STATUS(f(*handle));
dnn_handles.push(std::move(*handle)); dnn_handles.push(std::move(*handle));
return INFINI_STATUS_SUCCESS;
} }
int Handle::Internal::warpSize() const { return _warp_size; }
int Handle::Internal::maxThreadsPerBlock() const { return _max_threads_per_block; }
int Handle::Internal::blockSizeX() const { return _block_size[0]; }
int Handle::Internal::blockSizeY() const { return _block_size[1]; }
int Handle::Internal::blockSizeZ() const { return _block_size[2]; }
int Handle::Internal::gridSizeX() const { return _grid_size[0]; }
int Handle::Internal::gridSizeY() const { return _grid_size[1]; }
int Handle::Internal::gridSizeZ() const { return _grid_size[2]; }
cudnnDataType_t getCudnnDtype(infiniDtype_t dt) { cudnnDataType_t getCudnnDtype(infiniDtype_t dt) {
switch (dt) { switch (dt) {
case INFINI_DTYPE_F16: case INFINI_DTYPE_F16:
......
#ifndef __INFINIOP_CUDA_INTERNAL_H__ #ifndef __INFINIOP_CUDA_HANDLE_CUH__
#define __INFINIOP_CUDA_INTERNAL_H__ #define __INFINIOP_CUDA_HANDLE_CUH__
#include "../../../utils.h"
#include "../pool.h" #include "../pool.h"
#include "cuda_functions.cuh"
#include "cuda_handle.h" #include "cuda_handle.h"
#include <cublas_v2.h> #include <cublas_v2.h>
#include <cudnn.h>
#include <functional> #include <functional>
#define CHECK_CUBLAS(API) CHECK_INTERNAL(API, CUBLAS_STATUS_SUCCESS)
#define CHECK_CUDNN(API) CHECK_INTERNAL(API, CUDNN_STATUS_SUCCESS)
namespace device::cuda { namespace device::cuda {
class Handle::Internal { class Handle::Internal {
Pool<cublasHandle_t> blas_handles; Pool<cublasHandle_t> blas_handles;
Pool<cudnnHandle_t> dnn_handles; Pool<cudnnHandle_t> dnn_handles;
int _warp_size,
_max_threads_per_block,
_block_size[3],
_grid_size[3];
template <typename T>
using Fn = std::function<infiniStatus_t(T)>;
public: public:
void use_cublas(cudaStream_t stream, const std::function<void(cublasHandle_t)> &f) const; Internal(int);
void use_cudnn(cudaStream_t stream, const std::function<void(cudnnHandle_t)> &f) const;
infiniStatus_t useCublas(cudaStream_t stream, const Fn<cublasHandle_t> &f) const;
infiniStatus_t useCudnn(cudaStream_t stream, const Fn<cudnnHandle_t> &f) const;
int warpSize() const;
int maxThreadsPerBlock() const;
int blockSizeX() const;
int blockSizeY() const;
int blockSizeZ() const;
int gridSizeX() const;
int gridSizeY() const;
int gridSizeZ() const;
}; };
cudnnDataType_t getCudnnDtype(infiniDtype_t dt);
// return the memory offset of original tensor, given the flattened index of broadcasted tensor
__forceinline__ __device__ __host__ size_t
indexToReducedOffset(
size_t flat_index,
size_t ndim,
const ptrdiff_t *broadcasted_strides,
const ptrdiff_t *target_strides) {
size_t res = 0;
for (size_t i = 0; i < ndim; ++i) {
res += flat_index / broadcasted_strides[i] * target_strides[i];
flat_index %= broadcasted_strides[i];
}
return res;
}
// get the memory offset of the given element in a tensor given its flat index
__forceinline__ __device__ __host__ size_t
indexToOffset(
size_t flat_index,
size_t ndim,
const size_t *shape,
const ptrdiff_t *strides) {
size_t res = 0;
for (size_t i = ndim; i-- > 0;) {
res += (flat_index % shape[i]) * strides[i];
flat_index /= shape[i];
}
return res;
}
} // namespace device::cuda } // namespace device::cuda
#endif // __INFINIOP_CUDA_INTERNAL_H__ #endif // __INFINIOP_CUDA_HANDLE_CUH__
...@@ -76,9 +76,10 @@ infiniStatus_t Descriptor::calculate( ...@@ -76,9 +76,10 @@ infiniStatus_t Descriptor::calculate(
auto op_a = _info.a_matrix.row_stride == 1 ? CUBLAS_OP_N : CUBLAS_OP_T; auto op_a = _info.a_matrix.row_stride == 1 ? CUBLAS_OP_N : CUBLAS_OP_T;
auto op_b = _info.b_matrix.row_stride == 1 ? CUBLAS_OP_N : CUBLAS_OP_T; auto op_b = _info.b_matrix.row_stride == 1 ? CUBLAS_OP_N : CUBLAS_OP_T;
_opaque->internal->use_cublas( _opaque->internal->useCublas(
(cudaStream_t)stream, (cudaStream_t)stream,
[&](cublasHandle_t handle) { [&](cublasHandle_t handle) {
CHECK_CUBLAS(
cublasGemmStridedBatchedEx( cublasGemmStridedBatchedEx(
handle, handle,
op_a, op_a,
...@@ -102,7 +103,8 @@ infiniStatus_t Descriptor::calculate( ...@@ -102,7 +103,8 @@ infiniStatus_t Descriptor::calculate(
_info.c_matrix.stride, _info.c_matrix.stride,
static_cast<int>(_info.batch), static_cast<int>(_info.batch),
compute_type, compute_type,
CUBLAS_GEMM_DEFAULT_TENSOR_OP); CUBLAS_GEMM_DEFAULT_TENSOR_OP));
return INFINI_STATUS_SUCCESS;
}); });
return INFINI_STATUS_SUCCESS; return INFINI_STATUS_SUCCESS;
} }
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment