Unverified Commit 92ad2426 authored by PanZezhong1725's avatar PanZezhong1725 Committed by GitHub
Browse files

Merge pull request #95 from YdrMaster/main

issue/87/feat: cublas 和 cudnn 检查并返回错误信息
parents d5422e5b 911115fb
......@@ -4,35 +4,56 @@ namespace device::cuda {
// Construct a handle for one CUDA device. The shared Internal is built from
// device_id so it can cache that device's properties and pool library handles.
Handle::Handle(infiniDevice_t device, int device_id)
    : InfiniopHandle{device, device_id},
      _internal(std::make_shared<Handle::Internal>(device_id)) {}

// Accessor for the shared per-device internal state.
auto Handle::internal() const -> const std::shared_ptr<Internal> & {
    return _internal;
}
// Callback type handed to useCublas/useCudnn: receives the library handle and
// returns a status so failures inside the callback propagate to the caller.
// (The old void(T) form predates error propagation; keep this in sync with the
// class-scope Fn alias in cuda_handle.cuh.)
template <typename T>
using Fn = std::function<infiniStatus_t(T)>;

// Query the device properties once at construction and cache the limits so
// hot paths can read them without another driver round-trip.
Handle::Internal::Internal(int device_id) {
    cudaDeviceProp prop;
    // NOTE(review): the return status is ignored; on failure `prop` is left
    // unspecified — consider checking cudaGetDeviceProperties' result.
    cudaGetDeviceProperties(&prop, device_id);
    _warp_size = prop.warpSize;
    _max_threads_per_block = prop.maxThreadsPerBlock;
    for (int i = 0; i < 3; ++i) {
        _block_size[i] = prop.maxThreadsDim[i];
        _grid_size[i] = prop.maxGridSize[i];
    }
}
// Run `f` with a stream-bound cuBLAS handle borrowed from the pool.
// The handle is created lazily when the pool is empty, rebound to `stream`
// on every call, and pushed back afterwards for reuse.
// Returns the first failing cuBLAS status, or `f`'s own status.
infiniStatus_t Handle::Internal::useCublas(cudaStream_t stream, const Fn<cublasHandle_t> &f) const {
    auto handle = blas_handles.pop();
    if (!handle) {
        // Pool empty: create a fresh handle in-place.
        // NOTE(review): this writes through `*handle` while it tests false —
        // confirm Pool::pop's return type makes that storage writable here.
        CHECK_CUBLAS(cublasCreate(&(*handle)));
    }
    CHECK_CUBLAS(cublasSetStream(*handle, stream));
    // NOTE(review): if a CHECK_* above or below returns early, the handle is
    // not pushed back and is dropped — verify that is acceptable on error.
    CHECK_STATUS(f(*handle));
    blas_handles.push(std::move(*handle));
    return INFINI_STATUS_SUCCESS;
}
// Run `f` with a stream-bound cuDNN handle borrowed from the pool.
// Mirrors useCublas: lazy creation, per-call stream binding, handle returned
// to the pool on success. Returns the first failing cuDNN status or `f`'s status.
infiniStatus_t Handle::Internal::useCudnn(cudaStream_t stream, const Fn<cudnnHandle_t> &f) const {
    auto handle = dnn_handles.pop();
    if (!handle) {
        // Pool empty: create a fresh handle in-place.
        // NOTE(review): writes through `*handle` while it tests false —
        // confirm Pool::pop's return type makes that storage writable here.
        CHECK_CUDNN(cudnnCreate(&(*handle)));
    }
    CHECK_CUDNN(cudnnSetStream(*handle, stream));
    // NOTE(review): an early CHECK_* return drops the handle instead of
    // repooling it — verify that is acceptable on error.
    CHECK_STATUS(f(*handle));
    dnn_handles.push(std::move(*handle));
    return INFINI_STATUS_SUCCESS;
}
// Read-only accessors for the device limits cached by the constructor.
int Handle::Internal::warpSize() const { return _warp_size; }
int Handle::Internal::maxThreadsPerBlock() const { return _max_threads_per_block; }
int Handle::Internal::blockSizeX() const { return _block_size[0]; }
int Handle::Internal::blockSizeY() const { return _block_size[1]; }
int Handle::Internal::blockSizeZ() const { return _block_size[2]; }
int Handle::Internal::gridSizeX() const { return _grid_size[0]; }
int Handle::Internal::gridSizeY() const { return _grid_size[1]; }
int Handle::Internal::gridSizeZ() const { return _grid_size[2]; }
cudnnDataType_t getCudnnDtype(infiniDtype_t dt) {
switch (dt) {
case INFINI_DTYPE_F16:
......
#ifndef __INFINIOP_CUDA_COMMON_CUH__
#define __INFINIOP_CUDA_COMMON_CUH__
#include "cuda_handle.cuh"
#include "infinicore.h"
namespace device::cuda {
cudnnDataType_t getCudnnDtype(infiniDtype_t dt);
// return the memory offset of original tensor, given the flattened index of broadcasted tensor
// Map a flat index into a broadcasted view back to the memory offset in the
// original tensor. The div/mod decode assumes `broadcasted_strides` are the
// contiguous (row-major) strides of the broadcasted shape; `target_strides`
// are the original tensor's strides for the corresponding dims.
__forceinline__ __device__ __host__ size_t
indexToReducedOffset(
    size_t flat_index,
    size_t ndim,
    const ptrdiff_t *broadcasted_strides,
    const ptrdiff_t *target_strides) {
    size_t offset = 0;
    size_t rem = flat_index;
    for (size_t dim = 0; dim < ndim; ++dim) {
        // Coordinate along this dim, then advance by the original stride.
        const size_t coord = rem / broadcasted_strides[dim];
        rem %= broadcasted_strides[dim];
        offset += coord * target_strides[dim];
    }
    return offset;
}
// Compute the memory offset of an element from its flat (row-major) index,
// a shape of `ndim` dims, and per-dim strides.
__forceinline__ __device__ __host__ size_t
indexToOffset(
    size_t flat_index,
    size_t ndim,
    const size_t *shape,
    const ptrdiff_t *strides) {
    size_t offset = 0;
    size_t rem = flat_index;
    size_t dim = ndim;
    // Peel coordinates from innermost to outermost with mod/div.
    while (dim > 0) {
        --dim;
        offset += (rem % shape[dim]) * strides[dim];
        rem /= shape[dim];
    }
    return offset;
}
} // namespace device::cuda
#endif // __INFINIOP_CUDA_COMMON_CUH__
#ifndef __INFINIOP_CUDA_INTERNAL_H__
#define __INFINIOP_CUDA_INTERNAL_H__
#ifndef __INFINIOP_CUDA_HANDLE_CUH__
#define __INFINIOP_CUDA_HANDLE_CUH__
#include "../../../utils.h"
#include "../pool.h"
#include "cuda_handle.h"
#include <cublas_v2.h>
#include <cudnn.h>
#include <functional>
#define CHECK_CUBLAS(API) CHECK_INTERNAL(API, CUBLAS_STATUS_SUCCESS)
#define CHECK_CUDNN(API) CHECK_INTERNAL(API, CUDNN_STATUS_SUCCESS)
namespace device::cuda {
// Per-device state shared by operator descriptors on one CUDA device:
// pools of lazily-created cuBLAS/cuDNN handles (reused across calls) and
// device limits cached once at construction.
// (Drops the stale pre-merge use_cublas/use_cudnn declarations, which have
// no matching definitions after the rename to useCublas/useCudnn.)
class Handle::Internal {
    Pool<cublasHandle_t> blas_handles;
    Pool<cudnnHandle_t> dnn_handles;

    // Limits copied from cudaDeviceProp in the constructor.
    int _warp_size,
        _max_threads_per_block,
        _block_size[3],
        _grid_size[3];

    // Callback passed to useCublas/useCudnn: receives the library handle and
    // returns a status so failures inside the callback propagate out.
    template <typename T>
    using Fn = std::function<infiniStatus_t(T)>;

public:
    // Queries device properties for the given device id and caches them.
    Internal(int);

    // Borrow a pooled handle, bind it to `stream`, run `f`, repool the handle.
    // Returns the first failing library status or `f`'s own status.
    infiniStatus_t useCublas(cudaStream_t stream, const Fn<cublasHandle_t> &f) const;
    infiniStatus_t useCudnn(cudaStream_t stream, const Fn<cudnnHandle_t> &f) const;

    // Cached device limits.
    int warpSize() const;
    int maxThreadsPerBlock() const;
    int blockSizeX() const;
    int blockSizeY() const;
    int blockSizeZ() const;
    int gridSizeX() const;
    int gridSizeY() const;
    int gridSizeZ() const;
};
cudnnDataType_t getCudnnDtype(infiniDtype_t dt);
// return the memory offset of original tensor, given the flattened index of broadcasted tensor
// Given the flattened index of an element in a broadcasted tensor, return the
// memory offset of the corresponding element in the original tensor.
// The div/mod decode assumes `broadcasted_strides` are the contiguous
// (row-major) strides of the broadcasted shape — confirm at call sites.
__forceinline__ __device__ __host__ size_t
indexToReducedOffset(
    size_t flat_index,
    size_t ndim,
    const ptrdiff_t *broadcasted_strides,
    const ptrdiff_t *target_strides) {
    size_t res = 0;
    for (size_t i = 0; i < ndim; ++i) {
        // Coordinate along dim i, advanced by the original tensor's stride.
        res += flat_index / broadcasted_strides[i] * target_strides[i];
        // Remainder carries the coordinates of the inner dims.
        flat_index %= broadcasted_strides[i];
    }
    return res;
}
// get the memory offset of the given element in a tensor given its flat index
__forceinline__ __device__ __host__ size_t
indexToOffset(
    size_t flat_index,
    size_t ndim,
    const size_t *shape,
    const ptrdiff_t *strides) {
    size_t res = 0;
    // Walk dims innermost-to-outermost: peel each coordinate off the
    // (row-major) flat index with mod/div and apply that dim's stride.
    for (size_t i = ndim; i-- > 0;) {
        res += (flat_index % shape[i]) * strides[i];
        flat_index /= shape[i];
    }
    return res;
}
} // namespace device::cuda
#endif // __INFINIOP_CUDA_INTERNAL_H__
#endif // __INFINIOP_CUDA_HANDLE_CUH__
......@@ -76,9 +76,10 @@ infiniStatus_t Descriptor::calculate(
auto op_a = _info.a_matrix.row_stride == 1 ? CUBLAS_OP_N : CUBLAS_OP_T;
auto op_b = _info.b_matrix.row_stride == 1 ? CUBLAS_OP_N : CUBLAS_OP_T;
_opaque->internal->use_cublas(
CHECK_STATUS(_opaque->internal->useCublas(
(cudaStream_t)stream,
[&](cublasHandle_t handle) {
CHECK_CUBLAS(
cublasGemmStridedBatchedEx(
handle,
op_a,
......@@ -102,8 +103,9 @@ infiniStatus_t Descriptor::calculate(
_info.c_matrix.stride,
static_cast<int>(_info.batch),
compute_type,
CUBLAS_GEMM_DEFAULT_TENSOR_OP);
});
CUBLAS_GEMM_DEFAULT_TENSOR_OP));
return INFINI_STATUS_SUCCESS;
}));
return INFINI_STATUS_SUCCESS;
}
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment