Unverified Commit 92ad2426 authored by PanZezhong1725's avatar PanZezhong1725 Committed by GitHub
Browse files

Merge pull request #95 from YdrMaster/main

issue/87/feat: cublas 和 cudnn 检查并返回错误信息 (check cuBLAS and cuDNN calls and return error status)
parents d5422e5b 911115fb
...@@ -4,35 +4,56 @@ namespace device::cuda { ...@@ -4,35 +4,56 @@ namespace device::cuda {
// Construct a CUDA handle for `device_id`; Internal caches device
// properties (queried once) and pools cuBLAS/cuDNN library handles.
Handle::Handle(infiniDevice_t device, int device_id)
    : InfiniopHandle{device, device_id},
      _internal(std::make_shared<Handle::Internal>(device_id)) {}
// Accessor for the shared internal state; returned by const reference so
// callers can copy the shared_ptr only when they need ownership.
auto Handle::internal() const -> const std::shared_ptr<Internal> & {
    return _internal;
}
// Query and cache device launch limits once, so operators can read them
// without re-entering the CUDA runtime on every call.
Handle::Internal::Internal(int device_id) {
    // Zero-initialize so the cached limits are 0 rather than garbage if the
    // property query fails; a constructor cannot propagate the CUDA status.
    // NOTE(review): consider surfacing the cudaGetDeviceProperties error via
    // a factory function if callers need to distinguish this case.
    cudaDeviceProp prop{};
    cudaGetDeviceProperties(&prop, device_id);
    _warp_size = prop.warpSize;
    _max_threads_per_block = prop.maxThreadsPerBlock;
    _block_size[0] = prop.maxThreadsDim[0];
    _block_size[1] = prop.maxThreadsDim[1];
    _block_size[2] = prop.maxThreadsDim[2];
    _grid_size[0] = prop.maxGridSize[0];
    _grid_size[1] = prop.maxGridSize[1];
    _grid_size[2] = prop.maxGridSize[2];
}
// Run `f` with a pooled cuBLAS handle bound to `stream`.
// Returns INFINI_STATUS_SUCCESS, or the first failing status from cuBLAS or
// from the callback itself.
infiniStatus_t Handle::Internal::useCublas(cudaStream_t stream, const Fn<cublasHandle_t> &f) const {
    auto handle = blas_handles.pop();
    if (!handle) {
        // Pool empty: create into a local, then engage the holder. Writing
        // through `*handle` while it is still empty (as before) is UB for a
        // std::optional-like pool element.
        // NOTE(review): assumes Pool's element holder is optional-like and
        // assignable — confirm against Pool<T>.
        cublasHandle_t raw;
        CHECK_CUBLAS(cublasCreate(&raw));
        handle = raw;
    }
    CHECK_CUBLAS(cublasSetStream(*handle, stream));
    // Push the handle back even when the callback fails, so a failing
    // operator does not leak the cuBLAS handle out of the pool.
    infiniStatus_t status = f(*handle);
    blas_handles.push(std::move(*handle));
    CHECK_STATUS(status);
    return INFINI_STATUS_SUCCESS;
}
// Run `f` with a pooled cuDNN handle bound to `stream`.
// Returns INFINI_STATUS_SUCCESS, or the first failing status from cuDNN or
// from the callback itself.
infiniStatus_t Handle::Internal::useCudnn(cudaStream_t stream, const Fn<cudnnHandle_t> &f) const {
    auto handle = dnn_handles.pop();
    if (!handle) {
        // Pool empty: create into a local, then engage the holder. Writing
        // through `*handle` while it is still empty (as before) is UB for a
        // std::optional-like pool element.
        // NOTE(review): assumes Pool's element holder is optional-like and
        // assignable — confirm against Pool<T>.
        cudnnHandle_t raw;
        CHECK_CUDNN(cudnnCreate(&raw));
        handle = raw;
    }
    CHECK_CUDNN(cudnnSetStream(*handle, stream));
    // Push the handle back even when the callback fails, so a failing
    // operator does not leak the cuDNN handle out of the pool.
    infiniStatus_t status = f(*handle);
    dnn_handles.push(std::move(*handle));
    CHECK_STATUS(status);
    return INFINI_STATUS_SUCCESS;
}
// Accessors for the device properties cached by the constructor.
int Handle::Internal::warpSize() const {
    return _warp_size;
}
int Handle::Internal::maxThreadsPerBlock() const {
    return _max_threads_per_block;
}
int Handle::Internal::blockSizeX() const {
    return _block_size[0];
}
int Handle::Internal::blockSizeY() const {
    return _block_size[1];
}
int Handle::Internal::blockSizeZ() const {
    return _block_size[2];
}
int Handle::Internal::gridSizeX() const {
    return _grid_size[0];
}
int Handle::Internal::gridSizeY() const {
    return _grid_size[1];
}
int Handle::Internal::gridSizeZ() const {
    return _grid_size[2];
}
cudnnDataType_t getCudnnDtype(infiniDtype_t dt) { cudnnDataType_t getCudnnDtype(infiniDtype_t dt) {
switch (dt) { switch (dt) {
case INFINI_DTYPE_F16: case INFINI_DTYPE_F16:
......
#ifndef __INFINIOP_CUDA_COMMON_CUH__
#define __INFINIOP_CUDA_COMMON_CUH__
#include "cuda_handle.cuh"
#include "infinicore.h"
namespace device::cuda {
cudnnDataType_t getCudnnDtype(infiniDtype_t dt);
// Return the memory offset into the ORIGINAL tensor for a given flattened
// index of the broadcast view. `broadcasted_strides` are the contiguous
// strides of the broadcast shape; `target_strides` are the original tensor's
// strides for each of the `ndim` dimensions.
__forceinline__ __device__ __host__ size_t
indexToReducedOffset(
    size_t flat_index,
    size_t ndim,
    const ptrdiff_t *broadcasted_strides,
    const ptrdiff_t *target_strides) {
    size_t offset = 0;
    size_t rem = flat_index;
    // Peel off one coordinate per dimension, outermost first.
    for (size_t dim = 0; dim < ndim; ++dim) {
        offset += rem / broadcasted_strides[dim] * target_strides[dim];
        rem %= broadcasted_strides[dim];
    }
    return offset;
}
// Return the memory offset of the element addressed by `flat_index` (the
// row-major flattened index over `shape`) in a tensor with the given
// per-dimension `strides`.
__forceinline__ __device__ __host__ size_t
indexToOffset(
    size_t flat_index,
    size_t ndim,
    const size_t *shape,
    const ptrdiff_t *strides) {
    size_t offset = 0;
    size_t rem = flat_index;
    // Extract coordinates from the innermost (last) dimension outward; the
    // remainder must be consumed in this order.
    for (size_t dim = ndim; dim > 0; --dim) {
        const size_t i = dim - 1;
        offset += (rem % shape[i]) * strides[i];
        rem /= shape[i];
    }
    return offset;
}
} // namespace device::cuda
#endif // __INFINIOP_CUDA_COMMON_CUH__
#ifndef __INFINIOP_CUDA_HANDLE_CUH__
#define __INFINIOP_CUDA_HANDLE_CUH__

#include "../../../utils.h"
#include "../pool.h"
#include "cuda_handle.h"
#include <cublas_v2.h>
#include <cudnn.h>
#include <functional>

// Map cuBLAS/cuDNN status codes onto the common CHECK_INTERNAL error path.
#define CHECK_CUBLAS(API) CHECK_INTERNAL(API, CUBLAS_STATUS_SUCCESS)
#define CHECK_CUDNN(API) CHECK_INTERNAL(API, CUDNN_STATUS_SUCCESS)

namespace device::cuda {

// Per-device state shared by the operators created from one Handle:
// pooled cuBLAS/cuDNN library handles plus device properties cached at
// construction time.
class Handle::Internal {
    Pool<cublasHandle_t> blas_handles;
    Pool<cudnnHandle_t> dnn_handles;

    // Launch limits cached from cudaGetDeviceProperties in the constructor.
    int _warp_size,
        _max_threads_per_block,
        _block_size[3],
        _grid_size[3];

    // Callbacks handed to useCublas/useCudnn report errors via infiniStatus_t.
    template <typename T>
    using Fn = std::function<infiniStatus_t(T)>;

public:
    // `explicit` guards against accidental int -> Internal conversions.
    explicit Internal(int device_id);

    // Borrow a pooled handle bound to `stream`, run `f`, and return the
    // handle to the pool; propagates the first failing status.
    infiniStatus_t useCublas(cudaStream_t stream, const Fn<cublasHandle_t> &f) const;
    infiniStatus_t useCudnn(cudaStream_t stream, const Fn<cudnnHandle_t> &f) const;

    // Accessors for the cached device properties.
    int warpSize() const;
    int maxThreadsPerBlock() const;
    int blockSizeX() const;
    int blockSizeY() const;
    int blockSizeZ() const;
    int gridSizeX() const;
    int gridSizeY() const;
    int gridSizeZ() const;
};

} // namespace device::cuda

#endif // __INFINIOP_CUDA_HANDLE_CUH__
...@@ -76,34 +76,36 @@ infiniStatus_t Descriptor::calculate( ...@@ -76,34 +76,36 @@ infiniStatus_t Descriptor::calculate(
auto op_a = _info.a_matrix.row_stride == 1 ? CUBLAS_OP_N : CUBLAS_OP_T; auto op_a = _info.a_matrix.row_stride == 1 ? CUBLAS_OP_N : CUBLAS_OP_T;
auto op_b = _info.b_matrix.row_stride == 1 ? CUBLAS_OP_N : CUBLAS_OP_T; auto op_b = _info.b_matrix.row_stride == 1 ? CUBLAS_OP_N : CUBLAS_OP_T;
_opaque->internal->use_cublas( CHECK_STATUS(_opaque->internal->useCublas(
(cudaStream_t)stream, (cudaStream_t)stream,
[&](cublasHandle_t handle) { [&](cublasHandle_t handle) {
cublasGemmStridedBatchedEx( CHECK_CUBLAS(
handle, cublasGemmStridedBatchedEx(
op_a, handle,
op_b, op_a,
static_cast<int>(_info.m), op_b,
static_cast<int>(_info.n), static_cast<int>(_info.m),
static_cast<int>(_info.k), static_cast<int>(_info.n),
&alpha, static_cast<int>(_info.k),
a, &alpha,
a_type, a,
static_cast<int>(_info.a_matrix.ld()), a_type,
_info.a_matrix.stride, static_cast<int>(_info.a_matrix.ld()),
b, _info.a_matrix.stride,
b_type, b,
static_cast<int>(_info.b_matrix.ld()), b_type,
_info.b_matrix.stride, static_cast<int>(_info.b_matrix.ld()),
&beta, _info.b_matrix.stride,
c, &beta,
c_type, c,
static_cast<int>(_info.c_matrix.ld()), c_type,
_info.c_matrix.stride, static_cast<int>(_info.c_matrix.ld()),
static_cast<int>(_info.batch), _info.c_matrix.stride,
compute_type, static_cast<int>(_info.batch),
CUBLAS_GEMM_DEFAULT_TENSOR_OP); compute_type,
}); CUBLAS_GEMM_DEFAULT_TENSOR_OP));
return INFINI_STATUS_SUCCESS;
}));
return INFINI_STATUS_SUCCESS; return INFINI_STATUS_SUCCESS;
} }
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment