Commit ceb57c2a authored by YdrMaster's avatar YdrMaster
Browse files

issue/291/style: 根据实际情况将 cuda 改为 nvidia


Signed-off-by: YdrMaster <ydrml@hotmail.com>
parent d76a2607
...@@ -6,7 +6,7 @@ ...@@ -6,7 +6,7 @@
#include "cpu/cpu_handle.h" #include "cpu/cpu_handle.h"
#endif #endif
#ifdef ENABLE_NVIDIA_API #ifdef ENABLE_NVIDIA_API
#include "cuda/cuda_handle.h" #include "nvidia/nvidia_handle.h"
#endif #endif
#ifdef ENABLE_CAMBRICON_API #ifdef ENABLE_CAMBRICON_API
#include "bang/bang_handle.h" #include "bang/bang_handle.h"
...@@ -42,7 +42,7 @@ __C infiniStatus_t infiniopCreateHandle(infiniopHandle_t *handle_ptr) { ...@@ -42,7 +42,7 @@ __C infiniStatus_t infiniopCreateHandle(infiniopHandle_t *handle_ptr) {
CREATE(INFINI_DEVICE_CPU, cpu); CREATE(INFINI_DEVICE_CPU, cpu);
#endif #endif
#ifdef ENABLE_NVIDIA_API #ifdef ENABLE_NVIDIA_API
CREATE(INFINI_DEVICE_NVIDIA, cuda::nvidia); CREATE(INFINI_DEVICE_NVIDIA, nvidia);
#endif #endif
#ifdef ENABLE_CAMBRICON_API #ifdef ENABLE_CAMBRICON_API
CREATE(INFINI_DEVICE_CAMBRICON, bang::cambricon); CREATE(INFINI_DEVICE_CAMBRICON, bang::cambricon);
...@@ -79,7 +79,7 @@ __C infiniStatus_t infiniopDestroyHandle(infiniopHandle_t handle) { ...@@ -79,7 +79,7 @@ __C infiniStatus_t infiniopDestroyHandle(infiniopHandle_t handle) {
DELETE(INFINI_DEVICE_CPU, cpu); DELETE(INFINI_DEVICE_CPU, cpu);
#endif #endif
#ifdef ENABLE_NVIDIA_API #ifdef ENABLE_NVIDIA_API
DELETE(INFINI_DEVICE_NVIDIA, cuda::nvidia); DELETE(INFINI_DEVICE_NVIDIA, nvidia);
#endif #endif
#ifdef ENABLE_CAMBRICON_API #ifdef ENABLE_CAMBRICON_API
DELETE(INFINI_DEVICE_CAMBRICON, bang::cambricon); DELETE(INFINI_DEVICE_CAMBRICON, bang::cambricon);
......
#include "cuda_handle.cuh" #include "nvidia_handle.cuh"
namespace device::cuda { namespace device::nvidia {
Handle::Handle(infiniDevice_t device, int device_id) Handle::Handle(int device_id)
: InfiniopHandle{device, device_id}, : InfiniopHandle{INFINI_DEVICE_NVIDIA, device_id},
_internal(std::make_shared<Handle::Internal>(device_id)) {} _internal(std::make_shared<Handle::Internal>(device_id)) {}
auto Handle::internal() const -> const std::shared_ptr<Internal> & { auto Handle::internal() const -> const std::shared_ptr<Internal> & {
...@@ -83,16 +83,9 @@ cudnnDataType_t getCudnnDtype(infiniDtype_t dt) { ...@@ -83,16 +83,9 @@ cudnnDataType_t getCudnnDtype(infiniDtype_t dt) {
} }
#endif #endif
namespace nvidia {
Handle::Handle(int device_id)
: cuda::Handle(INFINI_DEVICE_NVIDIA, device_id) {}
infiniStatus_t Handle::create(InfiniopHandle **handle_ptr, int device_id) { infiniStatus_t Handle::create(InfiniopHandle **handle_ptr, int device_id) {
*handle_ptr = new Handle(device_id); *handle_ptr = new Handle(device_id);
return INFINI_STATUS_SUCCESS; return INFINI_STATUS_SUCCESS;
} }
} // namespace nvidia } // namespace device::nvidia
} // namespace device::cuda
#ifndef __INFINIOP_CUDA_COMMON_CUH__ #ifndef __INFINIOP_CUDA_COMMON_CUH__
#define __INFINIOP_CUDA_COMMON_CUH__ #define __INFINIOP_CUDA_COMMON_CUH__
#include "cuda_handle.cuh"
#include "infinicore.h" #include "infinicore.h"
#include "nvidia_handle.cuh"
namespace device::cuda { namespace device::nvidia {
#ifdef ENABLE_CUDNN_API #ifdef ENABLE_CUDNN_API
cudnnDataType_t getCudnnDtype(infiniDtype_t dt); cudnnDataType_t getCudnnDtype(infiniDtype_t dt);
#endif #endif
} // namespace device::cuda } // namespace device::nvidia
#endif // __INFINIOP_CUDA_COMMON_CUH__ #endif // __INFINIOP_CUDA_COMMON_CUH__
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
#include "../../../utils.h" #include "../../../utils.h"
#include "../pool.h" #include "../pool.h"
#include "cuda_handle.h" #include "nvidia_handle.h"
#include <cublas_v2.h> #include <cublas_v2.h>
#include <functional> #include <functional>
...@@ -14,7 +14,7 @@ ...@@ -14,7 +14,7 @@
#define CHECK_CUBLAS(API) CHECK_INTERNAL(API, CUBLAS_STATUS_SUCCESS) #define CHECK_CUBLAS(API) CHECK_INTERNAL(API, CUBLAS_STATUS_SUCCESS)
#define CHECK_CUDNN(API) CHECK_INTERNAL(API, CUDNN_STATUS_SUCCESS) #define CHECK_CUDNN(API) CHECK_INTERNAL(API, CUDNN_STATUS_SUCCESS)
namespace device::cuda { namespace device::nvidia {
class Handle::Internal { class Handle::Internal {
Pool<cublasHandle_t> blas_handles; Pool<cublasHandle_t> blas_handles;
...@@ -48,6 +48,6 @@ public: ...@@ -48,6 +48,6 @@ public:
int gridSizeZ() const; int gridSizeZ() const;
}; };
} // namespace device::cuda } // namespace device::nvidia
#endif // __INFINIOP_CUDA_HANDLE_CUH__ #endif // __INFINIOP_CUDA_HANDLE_CUH__
...@@ -4,30 +4,20 @@ ...@@ -4,30 +4,20 @@
#include "../../handle.h" #include "../../handle.h"
#include <memory> #include <memory>
namespace device::cuda { namespace device::nvidia {
struct Handle : public InfiniopHandle { struct Handle : public InfiniopHandle {
Handle(int device_id);
class Internal; class Internal;
auto internal() const -> const std::shared_ptr<Internal> &; auto internal() const -> const std::shared_ptr<Internal> &;
protected: public:
Handle(infiniDevice_t device, int device_id); static infiniStatus_t create(InfiniopHandle **handle_ptr, int device_id);
private: private:
std::shared_ptr<Internal> _internal; std::shared_ptr<Internal> _internal;
}; };
namespace nvidia { } // namespace device::nvidia
class Handle : public cuda::Handle {
Handle(int device_id);
public:
static infiniStatus_t create(InfiniopHandle **handle_ptr, int device_id);
};
} // namespace nvidia
} // namespace device::cuda
#endif // __INFINIOP_CUDA_HANDLE_H__ #endif // __INFINIOP_CUDA_HANDLE_H__
...@@ -18,7 +18,7 @@ ...@@ -18,7 +18,7 @@
using cuda_bfloat16 = nv_bfloat16; using cuda_bfloat16 = nv_bfloat16;
using cuda_bfloat162 = nv_bfloat162; using cuda_bfloat162 = nv_bfloat162;
namespace device::cuda { namespace device::nvidia {
// return the memory offset of original tensor, given the flattened index of broadcasted tensor // return the memory offset of original tensor, given the flattened index of broadcasted tensor
__forceinline__ __device__ __host__ size_t __forceinline__ __device__ __host__ size_t
indexToReducedOffset( indexToReducedOffset(
...@@ -48,7 +48,7 @@ indexToOffset( ...@@ -48,7 +48,7 @@ indexToOffset(
} }
return res; return res;
} }
} // namespace device::cuda } // namespace device::nvidia
__forceinline__ __device__ float __forceinline__ __device__ float
exp_(const float val) { exp_(const float val) {
......
...@@ -12,45 +12,45 @@ ...@@ -12,45 +12,45 @@
#include <numeric> #include <numeric>
#include <vector> #include <vector>
#define ELEMENTWISE_DESCRIPTOR(OP, NAMESPACE, KERNEL_COMMON) \ #define ELEMENTWISE_DESCRIPTOR(OP, NAMESPACE) \
\ \
namespace op::OP::NAMESPACE { \ namespace op::OP::NAMESPACE { \
class Descriptor final : public InfiniopDescriptor { \ class Descriptor final : public InfiniopDescriptor { \
infiniDtype_t _dtype; \ infiniDtype_t _dtype; \
op::elementwise::ElementwiseInfo _info; \ op::elementwise::ElementwiseInfo _info; \
std::unique_ptr<op::elementwise::KERNEL_COMMON::DeviceImpl> _device_info; \ std::unique_ptr<op::elementwise::NAMESPACE::DeviceImpl> _device_info; \
size_t _workspace_size; \ size_t _workspace_size; \
\ \
Descriptor( \ Descriptor( \
infiniDtype_t dtype, \ infiniDtype_t dtype, \
op::elementwise::ElementwiseInfo info, \ op::elementwise::ElementwiseInfo info, \
op::elementwise::KERNEL_COMMON::DeviceImpl *device_info, \ op::elementwise::NAMESPACE::DeviceImpl *device_info, \
size_t workspace_size, \ size_t workspace_size, \
infiniDevice_t device_type, \ infiniDevice_t device_type, \
int device_id) \ int device_id) \
: InfiniopDescriptor{device_type, device_id}, \ : InfiniopDescriptor{device_type, device_id}, \
_dtype(dtype), \ _dtype(dtype), \
_info(std::move(info)), \ _info(std::move(info)), \
_device_info(std::move(device_info)), \ _device_info(std::move(device_info)), \
_workspace_size(workspace_size) {} \ _workspace_size(workspace_size) {} \
\ \
public: \ public: \
~Descriptor(); \ ~Descriptor(); \
\ \
size_t workspaceSize() const { return _workspace_size; } \ size_t workspaceSize() const { return _workspace_size; } \
\ \
static infiniStatus_t create( \ static infiniStatus_t create( \
infiniopHandle_t handle, \ infiniopHandle_t handle, \
Descriptor **desc_ptr, \ Descriptor **desc_ptr, \
infiniopTensorDescriptor_t output_desc, \ infiniopTensorDescriptor_t output_desc, \
std::vector<infiniopTensorDescriptor_t> input_descs); \ std::vector<infiniopTensorDescriptor_t> input_descs); \
\ \
infiniStatus_t calculate( \ infiniStatus_t calculate( \
void *workspace, size_t workspace_size, \ void *workspace, size_t workspace_size, \
void *output, \ void *output, \
std::vector<const void *> inputs, \ std::vector<const void *> inputs, \
void *stream) const; \ void *stream) const; \
}; \ }; \
} }
namespace op::elementwise { namespace op::elementwise {
......
...@@ -2,11 +2,11 @@ ...@@ -2,11 +2,11 @@
#define __INFINIOP_ELEMENTWISE_CUDA_H__ #define __INFINIOP_ELEMENTWISE_CUDA_H__
#include "../../../utils.h" #include "../../../utils.h"
#include "../../devices/cuda/cuda_common.cuh" #include "../../devices/nvidia/nvidia_common.cuh"
#include "../../devices/cuda/cuda_kernel_common.cuh" #include "../../devices/nvidia/nvidia_kernel_common.cuh"
#include "elementwise_cuda_api.cuh" #include "elementwise_nvidia_api.cuh"
namespace op::elementwise::cuda { namespace op::elementwise::nvidia {
/** /**
* @brief Casts an untyped device pointer to a typed pointer of type T. * @brief Casts an untyped device pointer to a typed pointer of type T.
...@@ -33,7 +33,7 @@ __device__ __forceinline__ const T *typedInputPtr(const void *ptr) { ...@@ -33,7 +33,7 @@ __device__ __forceinline__ const T *typedInputPtr(const void *ptr) {
*/ */
__device__ __forceinline__ size_t getOutputIndex(size_t idx, bool is_contiguous, size_t ndim, __device__ __forceinline__ size_t getOutputIndex(size_t idx, bool is_contiguous, size_t ndim,
const size_t *shape, const ptrdiff_t *strides) { const size_t *shape, const ptrdiff_t *strides) {
return is_contiguous ? idx : device::cuda::indexToOffset(idx, ndim, shape, strides); return is_contiguous ? idx : device::nvidia::indexToOffset(idx, ndim, shape, strides);
} }
/** /**
...@@ -61,8 +61,8 @@ struct InputIndexer { ...@@ -61,8 +61,8 @@ struct InputIndexer {
return input_contiguous[input_id] return input_contiguous[input_id]
? idx ? idx
: (input_broadcasted[input_id] : (input_broadcasted[input_id]
? device::cuda::indexToReducedOffset(idx, ndim, output_strides, input_strides + input_id * ndim) ? device::nvidia::indexToReducedOffset(idx, ndim, output_strides, input_strides + input_id * ndim)
: device::cuda::indexToOffset(idx, ndim, input_shapes + input_id * ndim, input_strides + input_id * ndim)); : device::nvidia::indexToOffset(idx, ndim, input_shapes + input_id * ndim, input_strides + input_id * ndim));
} }
}; };
...@@ -186,9 +186,9 @@ INFINIOP_CUDA_KERNEL elementwiseKernel( ...@@ -186,9 +186,9 @@ INFINIOP_CUDA_KERNEL elementwiseKernel(
} }
struct DeviceImpl::Opaque { struct DeviceImpl::Opaque {
std::shared_ptr<device::cuda::Handle::Internal> internal; std::shared_ptr<device::nvidia::Handle::Internal> internal;
Opaque(const std::shared_ptr<device::cuda::Handle::Internal> &internal) Opaque(const std::shared_ptr<device::nvidia::Handle::Internal> &internal)
: internal(internal) {} : internal(internal) {}
/** /**
...@@ -414,6 +414,6 @@ infiniStatus_t DeviceImpl::calculate(const op::elementwise::ElementwiseInfo &inf ...@@ -414,6 +414,6 @@ infiniStatus_t DeviceImpl::calculate(const op::elementwise::ElementwiseInfo &inf
std::forward<Args>(args)...); std::forward<Args>(args)...);
} }
} // namespace op::elementwise::cuda } // namespace op::elementwise::nvidia
#endif // __INFINIOP_ELEMENTWISE_CUDA_H__ #endif // __INFINIOP_ELEMENTWISE_CUDA_H__
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
#include "../elementwise.h" #include "../elementwise.h"
namespace op::elementwise::cuda { namespace op::elementwise::nvidia {
/** /**
* @brief Define the methods and info needed by CUDA to perform elementwise operation * @brief Define the methods and info needed by CUDA to perform elementwise operation
...@@ -77,7 +77,7 @@ public: ...@@ -77,7 +77,7 @@ public:
void *stream, void *stream,
Args &&...args); Args &&...args);
}; };
} // namespace op::elementwise::cuda } // namespace op::elementwise::nvidia
/** /**
* @brief Define the process for initializing a Descriptor of an elementwise operation * @brief Define the process for initializing a Descriptor of an elementwise operation
...@@ -88,22 +88,22 @@ public: ...@@ -88,22 +88,22 @@ public:
* @param OUT_DESC The output tensor descriptor. * @param OUT_DESC The output tensor descriptor.
* @param INPUT_DESC_VEC A vector containing input tensor descriptors. * @param INPUT_DESC_VEC A vector containing input tensor descriptors.
*/ */
#define CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(HANDLE, DTYPE, OUT_DESC, INPUT_DESC_VEC) \ #define CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(HANDLE, DTYPE, OUT_DESC, INPUT_DESC_VEC) \
\ \
auto info_result = op::elementwise::ElementwiseInfo::create(OUT_DESC, INPUT_DESC_VEC); \ auto info_result = op::elementwise::ElementwiseInfo::create(OUT_DESC, INPUT_DESC_VEC); \
CHECK_RESULT(info_result); \ CHECK_RESULT(info_result); \
auto info = info_result.take(); \ auto info = info_result.take(); \
auto workspace_size = info.getMetaMemSize() + info.getInputSize() * sizeof(void *); \ auto workspace_size = info.getMetaMemSize() + info.getInputSize() * sizeof(void *); \
\ \
auto device_impl_result = op::elementwise::cuda::DeviceImpl::create(HANDLE->internal()); \ auto device_impl_result = op::elementwise::nvidia::DeviceImpl::create(HANDLE->internal()); \
CHECK_RESULT(device_impl_result); \ CHECK_RESULT(device_impl_result); \
\ \
*desc_ptr = new Descriptor( \ *desc_ptr = new Descriptor( \
DTYPE, \ DTYPE, \
std::move(info), \ std::move(info), \
std::move(device_impl_result.take()), \ std::move(device_impl_result.take()), \
workspace_size, \ workspace_size, \
HANDLE->device, \ HANDLE->device, \
HANDLE->device_id); HANDLE->device_id);
#endif // __INFINIOP_ELEMENTWISE_CUDA_API_H__ #endif // __INFINIOP_ELEMENTWISE_CUDA_API_H__
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
#include "../../../elementwise/cpu/elementwise_cpu.h" #include "../../../elementwise/cpu/elementwise_cpu.h"
ELEMENTWISE_DESCRIPTOR(add, cpu, cpu) ELEMENTWISE_DESCRIPTOR(add, cpu)
namespace op::add::cpu { namespace op::add::cpu {
typedef struct AddOp { typedef struct AddOp {
......
...@@ -3,6 +3,6 @@ ...@@ -3,6 +3,6 @@
#include "../../../elementwise/metax/elementwise_metax_api.h" #include "../../../elementwise/metax/elementwise_metax_api.h"
ELEMENTWISE_DESCRIPTOR(add, metax, metax) ELEMENTWISE_DESCRIPTOR(add, metax)
#endif // __ADD_METAX_API_H__ #endif // __ADD_METAX_API_H__
#include "../../../elementwise/cuda/elementwise_cuda.cuh" #include "../../../elementwise/nvidia/elementwise_nvidia.cuh"
#include "../cuda/kernel.cuh" #include "../cuda/kernel.cuh"
#include "add_nvidia.cuh" #include "add_nvidia.cuh"
...@@ -13,7 +13,7 @@ infiniStatus_t Descriptor::create( ...@@ -13,7 +13,7 @@ infiniStatus_t Descriptor::create(
infiniopTensorDescriptor_t out_desc, infiniopTensorDescriptor_t out_desc,
std::vector<infiniopTensorDescriptor_t> input_desc_vec) { std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
auto handle = reinterpret_cast<device::cuda::Handle *>(handle_); auto handle = reinterpret_cast<device::nvidia::Handle *>(handle_);
auto dtype = out_desc->dtype(); auto dtype = out_desc->dtype();
const auto &a_desc = input_desc_vec.at(0); const auto &a_desc = input_desc_vec.at(0);
......
#ifndef __ADD_CUDA_API_H__ #ifndef __ADD_CUDA_API_H__
#define __ADD_CUDA_API_H__ #define __ADD_CUDA_API_H__
#include "../../../elementwise/cuda/elementwise_cuda_api.cuh" #include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh"
ELEMENTWISE_DESCRIPTOR(add, nvidia, cuda) ELEMENTWISE_DESCRIPTOR(add, nvidia)
#endif // __ADD_CUDA_API_H__ #endif // __ADD_CUDA_API_H__
#include "../../../devices/cuda/cuda_common.cuh" #include "../../../devices/nvidia/nvidia_common.cuh"
#include "causal_softmax_nvidia.cuh" #include "causal_softmax_nvidia.cuh"
#include "../../../devices/cuda/cuda_kernel_common.cuh" #include "../../../devices/nvidia/nvidia_kernel_common.cuh"
#include <cub/block/block_reduce.cuh> #include <cub/block/block_reduce.cuh>
#include "../../../reduce/cuda/reduce.cuh" #include "../../../reduce/cuda/reduce.cuh"
...@@ -20,7 +20,7 @@ INFINIOP_CUDA_KERNEL causalSoftmax( ...@@ -20,7 +20,7 @@ INFINIOP_CUDA_KERNEL causalSoftmax(
namespace op::causal_softmax::nvidia { namespace op::causal_softmax::nvidia {
struct Descriptor::Opaque { struct Descriptor::Opaque {
std::shared_ptr<device::cuda::Handle::Internal> internal; std::shared_ptr<device::nvidia::Handle::Internal> internal;
}; };
Descriptor::~Descriptor() { Descriptor::~Descriptor() {
...@@ -35,7 +35,7 @@ infiniStatus_t Descriptor::create( ...@@ -35,7 +35,7 @@ infiniStatus_t Descriptor::create(
auto info = CausalSoftmaxInfo::create(y_desc, x_desc); auto info = CausalSoftmaxInfo::create(y_desc, x_desc);
CHECK_RESULT(info); CHECK_RESULT(info);
*desc_ptr = new Descriptor( *desc_ptr = new Descriptor(
new Opaque{reinterpret_cast<device::cuda::Handle *>(handle)->internal()}, new Opaque{reinterpret_cast<device::nvidia::Handle *>(handle)->internal()},
info.take(), 0, handle->device, handle->device_id); info.take(), 0, handle->device, handle->device_id);
return INFINI_STATUS_SUCCESS; return INFINI_STATUS_SUCCESS;
} }
......
...@@ -4,7 +4,7 @@ ...@@ -4,7 +4,7 @@
#include "../../../elementwise/cpu/elementwise_cpu.h" #include "../../../elementwise/cpu/elementwise_cpu.h"
#include "infiniop/ops/clip.h" #include "infiniop/ops/clip.h"
ELEMENTWISE_DESCRIPTOR(clip, cpu, cpu) ELEMENTWISE_DESCRIPTOR(clip, cpu)
namespace op::clip::cpu { namespace op::clip::cpu {
......
...@@ -3,6 +3,6 @@ ...@@ -3,6 +3,6 @@
#include "../../../elementwise/metax/elementwise_metax_api.h" #include "../../../elementwise/metax/elementwise_metax_api.h"
ELEMENTWISE_DESCRIPTOR(clip, metax, metax) ELEMENTWISE_DESCRIPTOR(clip, metax)
#endif // __CLIP_METAX_API_H__ #endif // __CLIP_METAX_API_H__
#include "../../../elementwise/cuda/elementwise_cuda.cuh" #include "../../../elementwise/nvidia/elementwise_nvidia.cuh"
#include "../cuda/kernel.cuh" #include "../cuda/kernel.cuh"
#include "clip_nvidia.cuh" #include "clip_nvidia.cuh"
...@@ -13,7 +13,7 @@ infiniStatus_t Descriptor::create( ...@@ -13,7 +13,7 @@ infiniStatus_t Descriptor::create(
infiniopTensorDescriptor_t out_desc, infiniopTensorDescriptor_t out_desc,
std::vector<infiniopTensorDescriptor_t> input_desc_vec) { std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
auto handle = reinterpret_cast<device::cuda::Handle *>(handle_); auto handle = reinterpret_cast<device::nvidia::Handle *>(handle_);
auto dtype = out_desc->dtype(); auto dtype = out_desc->dtype();
const auto &in_desc = input_desc_vec.at(0); const auto &in_desc = input_desc_vec.at(0);
......
#ifndef __CLIP_CUDA_API_H__ #ifndef __CLIP_CUDA_API_H__
#define __CLIP_CUDA_API_H__ #define __CLIP_CUDA_API_H__
#include "../../../elementwise/cuda/elementwise_cuda_api.cuh" #include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh"
ELEMENTWISE_DESCRIPTOR(clip, nvidia, cuda) ELEMENTWISE_DESCRIPTOR(clip, nvidia)
#endif // __CLIP_CUDA_API_H__ #endif // __CLIP_CUDA_API_H__
...@@ -6,7 +6,7 @@ ...@@ -6,7 +6,7 @@
#include "../../tensor.h" #include "../../tensor.h"
#ifdef ENABLE_CUDA_API #ifdef ENABLE_CUDA_API
#include "../../devices/cuda/cuda_handle.cuh" #include "../../devices/nvidia/nvidia_handle.cuh"
#endif #endif
namespace op::conv { namespace op::conv {
......
#include "../../../devices/cuda/cuda_common.cuh" #include "../../../devices/nvidia/nvidia_common.cuh"
#include "../../../devices/cuda/cuda_handle.cuh" #include "../../../devices/nvidia/nvidia_handle.cuh"
#include "conv_cuda.cuh" #include "conv_nvidia.cuh"
#ifdef ENABLE_CUDNN_API
#define DESTROY_CUDNN_DESCRIPTOR(desc_ptr, destroy_func) \ #define DESTROY_CUDNN_DESCRIPTOR(desc_ptr, destroy_func) \
do { \ do { \
...@@ -22,11 +20,13 @@ ...@@ -22,11 +20,13 @@
DESTROY_CUDNN_DESCRIPTOR(conv_desc, cudnnDestroyConvolutionDescriptor); \ DESTROY_CUDNN_DESCRIPTOR(conv_desc, cudnnDestroyConvolutionDescriptor); \
} while (0) } while (0)
namespace op::conv::cuda { namespace op::conv::nvidia {
struct Descriptor::Opaque { struct Descriptor::Opaque {
std::shared_ptr<device::cuda::Handle::Internal> internal; std::shared_ptr<device::nvidia::Handle::Internal> internal;
size_t workspace_size = 0;
#ifdef ENABLE_CUDNN_API
cudnnTensorDescriptor_t x_desc = nullptr; cudnnTensorDescriptor_t x_desc = nullptr;
cudnnTensorDescriptor_t y_desc = nullptr; cudnnTensorDescriptor_t y_desc = nullptr;
cudnnFilterDescriptor_t w_desc = nullptr; cudnnFilterDescriptor_t w_desc = nullptr;
...@@ -34,12 +34,13 @@ struct Descriptor::Opaque { ...@@ -34,12 +34,13 @@ struct Descriptor::Opaque {
cudnnActivationDescriptor_t act_desc = nullptr; cudnnActivationDescriptor_t act_desc = nullptr;
cudnnConvolutionDescriptor_t conv_desc = nullptr; cudnnConvolutionDescriptor_t conv_desc = nullptr;
cudnnConvolutionFwdAlgo_t algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM; cudnnConvolutionFwdAlgo_t algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
size_t workspace_size = 0; #endif
private: private:
Opaque(std::shared_ptr<device::cuda::Handle::Internal> internal_ptr) Opaque(std::shared_ptr<device::nvidia::Handle::Internal> internal_ptr)
: internal(internal_ptr) {} : internal(internal_ptr) {}
#ifdef ENABLE_CUDNN_API
void initializeDimensionArrays(const ConvInfo &info, void initializeDimensionArrays(const ConvInfo &info,
std::vector<int> &input_dims, std::vector<int> &input_dims,
std::vector<int> &output_dims, std::vector<int> &output_dims,
...@@ -108,11 +109,11 @@ private: ...@@ -108,11 +109,11 @@ private:
infiniStatus_t getCudnnDataType(infiniDtype_t data_type, infiniStatus_t getCudnnDataType(infiniDtype_t data_type,
cudnnDataType_t &cudnn_data_type) const { cudnnDataType_t &cudnn_data_type) const {
if (data_type == INFINI_DTYPE_F16) { if (data_type == INFINI_DTYPE_F16) {
cudnn_data_type = device::cuda::getCudnnDtype(data_type); cudnn_data_type = device::nvidia::getCudnnDtype(data_type);
} else if (data_type == INFINI_DTYPE_F32) { } else if (data_type == INFINI_DTYPE_F32) {
cudnn_data_type = device::cuda::getCudnnDtype(data_type); cudnn_data_type = device::nvidia::getCudnnDtype(data_type);
} else if (data_type == INFINI_DTYPE_BF16) { } else if (data_type == INFINI_DTYPE_BF16) {
cudnn_data_type = device::cuda::getCudnnDtype(data_type); cudnn_data_type = device::nvidia::getCudnnDtype(data_type);
} else { } else {
return INFINI_STATUS_BAD_TENSOR_DTYPE; return INFINI_STATUS_BAD_TENSOR_DTYPE;
} }
...@@ -253,32 +254,42 @@ private: ...@@ -253,32 +254,42 @@ private:
return INFINI_STATUS_SUCCESS; return INFINI_STATUS_SUCCESS;
} }
#endif
public: public:
Opaque(Opaque &&other) noexcept Opaque(Opaque &&other) noexcept
: internal(std::move(other.internal)), : internal(std::move(other.internal)),
x_desc(other.x_desc), workspace_size(other.workspace_size)
y_desc(other.y_desc), // clang-format off
w_desc(other.w_desc), #ifdef ENABLE_CUDNN_API
b_desc(other.b_desc), , x_desc(other.x_desc),
act_desc(other.act_desc), , y_desc(other.y_desc),
conv_desc(other.conv_desc), , w_desc(other.w_desc),
algo(other.algo), , b_desc(other.b_desc),
workspace_size(other.workspace_size) { , act_desc(other.act_desc),
, conv_desc(other.conv_desc),
, algo(other.algo)
#endif
// clang-format on
{
#ifdef ENABLE_CUDNN_API
other.x_desc = nullptr; other.x_desc = nullptr;
other.y_desc = nullptr; other.y_desc = nullptr;
other.w_desc = nullptr; other.w_desc = nullptr;
other.b_desc = nullptr; other.b_desc = nullptr;
other.act_desc = nullptr; other.act_desc = nullptr;
other.conv_desc = nullptr; other.conv_desc = nullptr;
#endif
other.workspace_size = 0; other.workspace_size = 0;
} }
~Opaque() { ~Opaque() {
#ifdef ENABLE_CUDNN_API
CLEANUP_CUDNN_DESCRIPTORS(); CLEANUP_CUDNN_DESCRIPTORS();
#endif
} }
#ifdef ENABLE_CUDNN_API
infiniStatus_t initializeCudnnContext(ConvInfo &info, infiniStatus_t initializeCudnnContext(ConvInfo &info,
infiniDtype_t data_type, infiniDtype_t data_type,
cudnnDataType_t compute_type) { cudnnDataType_t compute_type) {
...@@ -320,17 +331,22 @@ public: ...@@ -320,17 +331,22 @@ public:
return INFINI_STATUS_SUCCESS; return INFINI_STATUS_SUCCESS;
} }
#endif
static inline utils::Result<Opaque> create( static inline utils::Result<Opaque> create(
std::shared_ptr<device::cuda::Handle::Internal> internal_ptr, std::shared_ptr<device::nvidia::Handle::Internal> internal_ptr,
ConvInfo &info, ConvInfo &info,
infiniDtype_t data_type) { infiniDtype_t data_type) {
#ifdef ENABLE_CUDNN_API
Opaque opaque(internal_ptr); Opaque opaque(internal_ptr);
auto status = opaque.initializeCudnnContext(info, data_type, CUDNN_DATA_FLOAT); auto status = opaque.initializeCudnnContext(info, data_type, CUDNN_DATA_FLOAT);
if (status != INFINI_STATUS_SUCCESS) { if (status != INFINI_STATUS_SUCCESS) {
return status; return status;
} }
return utils::Result<Opaque>(std::move(opaque)); return utils::Result<Opaque>(std::move(opaque));
#else
return INFINI_STATUS_SUCCESS;
#endif
} }
}; };
...@@ -351,7 +367,8 @@ infiniStatus_t Descriptor::create( ...@@ -351,7 +367,8 @@ infiniStatus_t Descriptor::create(
const void *strides, const void *strides,
const void *dilations, const void *dilations,
size_t n) { size_t n) {
auto handle = reinterpret_cast<device::cuda::nvidia::Handle *>(handle_); #ifdef ENABLE_CUDNN_API
auto handle = reinterpret_cast<device::nvidia::Handle *>(handle_);
auto dtype = y_desc->dtype(); auto dtype = y_desc->dtype();
CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16); CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16);
...@@ -373,6 +390,9 @@ infiniStatus_t Descriptor::create( ...@@ -373,6 +390,9 @@ infiniStatus_t Descriptor::create(
handle->device, handle->device,
handle->device_id); handle->device_id);
return INFINI_STATUS_SUCCESS; return INFINI_STATUS_SUCCESS;
#else
return INFINI_STATUS_SUCCESS;
#endif
} }
infiniStatus_t Descriptor::calculate( infiniStatus_t Descriptor::calculate(
...@@ -383,6 +403,7 @@ infiniStatus_t Descriptor::calculate( ...@@ -383,6 +403,7 @@ infiniStatus_t Descriptor::calculate(
const void *w, const void *w,
const void *bias, const void *bias,
void *stream) const { void *stream) const {
#ifdef ENABLE_CUDNN_API
const float alpha = 1.0f, beta = 0.0f; const float alpha = 1.0f, beta = 0.0f;
if (bias != nullptr) { if (bias != nullptr) {
CHECK_STATUS(_opaque->internal->useCudnn( CHECK_STATUS(_opaque->internal->useCudnn(
...@@ -428,7 +449,8 @@ infiniStatus_t Descriptor::calculate( ...@@ -428,7 +449,8 @@ infiniStatus_t Descriptor::calculate(
} }
return INFINI_STATUS_SUCCESS; return INFINI_STATUS_SUCCESS;
#else
return INFINI_STATUS_SUCCESS;
#endif
} }
} // namespace op::conv::cuda } // namespace op::conv::nvidia
#endif // ENABLE_CUDNN_API
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment