Unverified Commit a9dbb97a authored by PanZezhong1725's avatar PanZezhong1725 Committed by GitHub
Browse files

Merge pull request #325 from YdrMaster/main

issue/291/style: 根据实际情况将 cuda 改为 nvidia
parents d76a2607 ceb57c2a
......@@ -6,7 +6,7 @@
#include "cpu/cpu_handle.h"
#endif
#ifdef ENABLE_NVIDIA_API
#include "cuda/cuda_handle.h"
#include "nvidia/nvidia_handle.h"
#endif
#ifdef ENABLE_CAMBRICON_API
#include "bang/bang_handle.h"
......@@ -42,7 +42,7 @@ __C infiniStatus_t infiniopCreateHandle(infiniopHandle_t *handle_ptr) {
CREATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
CREATE(INFINI_DEVICE_NVIDIA, cuda::nvidia);
CREATE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_CAMBRICON_API
CREATE(INFINI_DEVICE_CAMBRICON, bang::cambricon);
......@@ -79,7 +79,7 @@ __C infiniStatus_t infiniopDestroyHandle(infiniopHandle_t handle) {
DELETE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
DELETE(INFINI_DEVICE_NVIDIA, cuda::nvidia);
DELETE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_CAMBRICON_API
DELETE(INFINI_DEVICE_CAMBRICON, bang::cambricon);
......
#include "cuda_handle.cuh"
#include "nvidia_handle.cuh"
namespace device::cuda {
namespace device::nvidia {
Handle::Handle(infiniDevice_t device, int device_id)
: InfiniopHandle{device, device_id},
Handle::Handle(int device_id)
: InfiniopHandle{INFINI_DEVICE_NVIDIA, device_id},
_internal(std::make_shared<Handle::Internal>(device_id)) {}
auto Handle::internal() const -> const std::shared_ptr<Internal> & {
......@@ -83,16 +83,9 @@ cudnnDataType_t getCudnnDtype(infiniDtype_t dt) {
}
#endif
namespace nvidia {
Handle::Handle(int device_id)
: cuda::Handle(INFINI_DEVICE_NVIDIA, device_id) {}
infiniStatus_t Handle::create(InfiniopHandle **handle_ptr, int device_id) {
*handle_ptr = new Handle(device_id);
return INFINI_STATUS_SUCCESS;
}
} // namespace nvidia
} // namespace device::cuda
} // namespace device::nvidia
#ifndef __INFINIOP_CUDA_COMMON_CUH__
#define __INFINIOP_CUDA_COMMON_CUH__
#include "cuda_handle.cuh"
#include "infinicore.h"
#include "nvidia_handle.cuh"
namespace device::cuda {
namespace device::nvidia {
#ifdef ENABLE_CUDNN_API
cudnnDataType_t getCudnnDtype(infiniDtype_t dt);
#endif
} // namespace device::cuda
} // namespace device::nvidia
#endif // __INFINIOP_CUDA_COMMON_CUH__
......@@ -3,7 +3,7 @@
#include "../../../utils.h"
#include "../pool.h"
#include "cuda_handle.h"
#include "nvidia_handle.h"
#include <cublas_v2.h>
#include <functional>
......@@ -14,7 +14,7 @@
#define CHECK_CUBLAS(API) CHECK_INTERNAL(API, CUBLAS_STATUS_SUCCESS)
#define CHECK_CUDNN(API) CHECK_INTERNAL(API, CUDNN_STATUS_SUCCESS)
namespace device::cuda {
namespace device::nvidia {
class Handle::Internal {
Pool<cublasHandle_t> blas_handles;
......@@ -48,6 +48,6 @@ public:
int gridSizeZ() const;
};
} // namespace device::cuda
} // namespace device::nvidia
#endif // __INFINIOP_CUDA_HANDLE_CUH__
......@@ -4,30 +4,20 @@
#include "../../handle.h"
#include <memory>
namespace device::cuda {
namespace device::nvidia {
struct Handle : public InfiniopHandle {
Handle(int device_id);
class Internal;
auto internal() const -> const std::shared_ptr<Internal> &;
protected:
Handle(infiniDevice_t device, int device_id);
public:
static infiniStatus_t create(InfiniopHandle **handle_ptr, int device_id);
private:
std::shared_ptr<Internal> _internal;
};
namespace nvidia {
class Handle : public cuda::Handle {
Handle(int device_id);
public:
static infiniStatus_t create(InfiniopHandle **handle_ptr, int device_id);
};
} // namespace nvidia
} // namespace device::cuda
} // namespace device::nvidia
#endif // __INFINIOP_CUDA_HANDLE_H__
......@@ -18,7 +18,7 @@
using cuda_bfloat16 = nv_bfloat16;
using cuda_bfloat162 = nv_bfloat162;
namespace device::cuda {
namespace device::nvidia {
// return the memory offset of original tensor, given the flattened index of broadcasted tensor
__forceinline__ __device__ __host__ size_t
indexToReducedOffset(
......@@ -48,7 +48,7 @@ indexToOffset(
}
return res;
}
} // namespace device::cuda
} // namespace device::nvidia
__forceinline__ __device__ float
exp_(const float val) {
......
......@@ -12,45 +12,45 @@
#include <numeric>
#include <vector>
#define ELEMENTWISE_DESCRIPTOR(OP, NAMESPACE, KERNEL_COMMON) \
\
namespace op::OP::NAMESPACE { \
class Descriptor final : public InfiniopDescriptor { \
infiniDtype_t _dtype; \
op::elementwise::ElementwiseInfo _info; \
std::unique_ptr<op::elementwise::KERNEL_COMMON::DeviceImpl> _device_info; \
size_t _workspace_size; \
\
Descriptor( \
infiniDtype_t dtype, \
op::elementwise::ElementwiseInfo info, \
op::elementwise::KERNEL_COMMON::DeviceImpl *device_info, \
size_t workspace_size, \
infiniDevice_t device_type, \
int device_id) \
: InfiniopDescriptor{device_type, device_id}, \
_dtype(dtype), \
_info(std::move(info)), \
_device_info(std::move(device_info)), \
_workspace_size(workspace_size) {} \
\
public: \
~Descriptor(); \
\
size_t workspaceSize() const { return _workspace_size; } \
\
static infiniStatus_t create( \
infiniopHandle_t handle, \
Descriptor **desc_ptr, \
infiniopTensorDescriptor_t output_desc, \
std::vector<infiniopTensorDescriptor_t> input_descs); \
\
infiniStatus_t calculate( \
void *workspace, size_t workspace_size, \
void *output, \
std::vector<const void *> inputs, \
void *stream) const; \
}; \
#define ELEMENTWISE_DESCRIPTOR(OP, NAMESPACE) \
\
namespace op::OP::NAMESPACE { \
class Descriptor final : public InfiniopDescriptor { \
infiniDtype_t _dtype; \
op::elementwise::ElementwiseInfo _info; \
std::unique_ptr<op::elementwise::NAMESPACE::DeviceImpl> _device_info; \
size_t _workspace_size; \
\
Descriptor( \
infiniDtype_t dtype, \
op::elementwise::ElementwiseInfo info, \
op::elementwise::NAMESPACE::DeviceImpl *device_info, \
size_t workspace_size, \
infiniDevice_t device_type, \
int device_id) \
: InfiniopDescriptor{device_type, device_id}, \
_dtype(dtype), \
_info(std::move(info)), \
_device_info(std::move(device_info)), \
_workspace_size(workspace_size) {} \
\
public: \
~Descriptor(); \
\
size_t workspaceSize() const { return _workspace_size; } \
\
static infiniStatus_t create( \
infiniopHandle_t handle, \
Descriptor **desc_ptr, \
infiniopTensorDescriptor_t output_desc, \
std::vector<infiniopTensorDescriptor_t> input_descs); \
\
infiniStatus_t calculate( \
void *workspace, size_t workspace_size, \
void *output, \
std::vector<const void *> inputs, \
void *stream) const; \
}; \
}
namespace op::elementwise {
......
......@@ -2,11 +2,11 @@
#define __INFINIOP_ELEMENTWISE_CUDA_H__
#include "../../../utils.h"
#include "../../devices/cuda/cuda_common.cuh"
#include "../../devices/cuda/cuda_kernel_common.cuh"
#include "elementwise_cuda_api.cuh"
#include "../../devices/nvidia/nvidia_common.cuh"
#include "../../devices/nvidia/nvidia_kernel_common.cuh"
#include "elementwise_nvidia_api.cuh"
namespace op::elementwise::cuda {
namespace op::elementwise::nvidia {
/**
* @brief Casts an untyped device pointer to a typed pointer of type T.
......@@ -33,7 +33,7 @@ __device__ __forceinline__ const T *typedInputPtr(const void *ptr) {
*/
__device__ __forceinline__ size_t getOutputIndex(size_t idx, bool is_contiguous, size_t ndim,
const size_t *shape, const ptrdiff_t *strides) {
return is_contiguous ? idx : device::cuda::indexToOffset(idx, ndim, shape, strides);
return is_contiguous ? idx : device::nvidia::indexToOffset(idx, ndim, shape, strides);
}
/**
......@@ -61,8 +61,8 @@ struct InputIndexer {
return input_contiguous[input_id]
? idx
: (input_broadcasted[input_id]
? device::cuda::indexToReducedOffset(idx, ndim, output_strides, input_strides + input_id * ndim)
: device::cuda::indexToOffset(idx, ndim, input_shapes + input_id * ndim, input_strides + input_id * ndim));
? device::nvidia::indexToReducedOffset(idx, ndim, output_strides, input_strides + input_id * ndim)
: device::nvidia::indexToOffset(idx, ndim, input_shapes + input_id * ndim, input_strides + input_id * ndim));
}
};
......@@ -186,9 +186,9 @@ INFINIOP_CUDA_KERNEL elementwiseKernel(
}
struct DeviceImpl::Opaque {
std::shared_ptr<device::cuda::Handle::Internal> internal;
std::shared_ptr<device::nvidia::Handle::Internal> internal;
Opaque(const std::shared_ptr<device::cuda::Handle::Internal> &internal)
Opaque(const std::shared_ptr<device::nvidia::Handle::Internal> &internal)
: internal(internal) {}
/**
......@@ -414,6 +414,6 @@ infiniStatus_t DeviceImpl::calculate(const op::elementwise::ElementwiseInfo &inf
std::forward<Args>(args)...);
}
} // namespace op::elementwise::cuda
} // namespace op::elementwise::nvidia
#endif // __INFINIOP_ELEMENTWISE_CUDA_H__
......@@ -3,7 +3,7 @@
#include "../elementwise.h"
namespace op::elementwise::cuda {
namespace op::elementwise::nvidia {
/**
* @brief Define the methods and info needed by CUDA to perform elementwise operation
......@@ -77,7 +77,7 @@ public:
void *stream,
Args &&...args);
};
} // namespace op::elementwise::cuda
} // namespace op::elementwise::nvidia
/**
* @brief Define the process for initializing a Descriptor of an elementwise operation
......@@ -88,22 +88,22 @@ public:
* @param OUT_DESC The output tensor descriptor.
* @param INPUT_DESC_VEC A vector containing input tensor descriptors.
*/
#define CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(HANDLE, DTYPE, OUT_DESC, INPUT_DESC_VEC) \
\
auto info_result = op::elementwise::ElementwiseInfo::create(OUT_DESC, INPUT_DESC_VEC); \
CHECK_RESULT(info_result); \
auto info = info_result.take(); \
auto workspace_size = info.getMetaMemSize() + info.getInputSize() * sizeof(void *); \
\
auto device_impl_result = op::elementwise::cuda::DeviceImpl::create(HANDLE->internal()); \
CHECK_RESULT(device_impl_result); \
\
*desc_ptr = new Descriptor( \
DTYPE, \
std::move(info), \
std::move(device_impl_result.take()), \
workspace_size, \
HANDLE->device, \
#define CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(HANDLE, DTYPE, OUT_DESC, INPUT_DESC_VEC) \
\
auto info_result = op::elementwise::ElementwiseInfo::create(OUT_DESC, INPUT_DESC_VEC); \
CHECK_RESULT(info_result); \
auto info = info_result.take(); \
auto workspace_size = info.getMetaMemSize() + info.getInputSize() * sizeof(void *); \
\
auto device_impl_result = op::elementwise::nvidia::DeviceImpl::create(HANDLE->internal()); \
CHECK_RESULT(device_impl_result); \
\
*desc_ptr = new Descriptor( \
DTYPE, \
std::move(info), \
std::move(device_impl_result.take()), \
workspace_size, \
HANDLE->device, \
HANDLE->device_id);
#endif // __INFINIOP_ELEMENTWISE_CUDA_API_H__
......@@ -3,7 +3,7 @@
#include "../../../elementwise/cpu/elementwise_cpu.h"
ELEMENTWISE_DESCRIPTOR(add, cpu, cpu)
ELEMENTWISE_DESCRIPTOR(add, cpu)
namespace op::add::cpu {
typedef struct AddOp {
......
......@@ -3,6 +3,6 @@
#include "../../../elementwise/metax/elementwise_metax_api.h"
ELEMENTWISE_DESCRIPTOR(add, metax, metax)
ELEMENTWISE_DESCRIPTOR(add, metax)
#endif // __ADD_METAX_API_H__
#include "../../../elementwise/cuda/elementwise_cuda.cuh"
#include "../../../elementwise/nvidia/elementwise_nvidia.cuh"
#include "../cuda/kernel.cuh"
#include "add_nvidia.cuh"
......@@ -13,7 +13,7 @@ infiniStatus_t Descriptor::create(
infiniopTensorDescriptor_t out_desc,
std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
auto handle = reinterpret_cast<device::cuda::Handle *>(handle_);
auto handle = reinterpret_cast<device::nvidia::Handle *>(handle_);
auto dtype = out_desc->dtype();
const auto &a_desc = input_desc_vec.at(0);
......
#ifndef __ADD_CUDA_API_H__
#define __ADD_CUDA_API_H__
#include "../../../elementwise/cuda/elementwise_cuda_api.cuh"
#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh"
ELEMENTWISE_DESCRIPTOR(add, nvidia, cuda)
ELEMENTWISE_DESCRIPTOR(add, nvidia)
#endif // __ADD_CUDA_API_H__
#include "../../../devices/cuda/cuda_common.cuh"
#include "../../../devices/nvidia/nvidia_common.cuh"
#include "causal_softmax_nvidia.cuh"
#include "../../../devices/cuda/cuda_kernel_common.cuh"
#include "../../../devices/nvidia/nvidia_kernel_common.cuh"
#include <cub/block/block_reduce.cuh>
#include "../../../reduce/cuda/reduce.cuh"
......@@ -20,7 +20,7 @@ INFINIOP_CUDA_KERNEL causalSoftmax(
namespace op::causal_softmax::nvidia {
struct Descriptor::Opaque {
std::shared_ptr<device::cuda::Handle::Internal> internal;
std::shared_ptr<device::nvidia::Handle::Internal> internal;
};
Descriptor::~Descriptor() {
......@@ -35,7 +35,7 @@ infiniStatus_t Descriptor::create(
auto info = CausalSoftmaxInfo::create(y_desc, x_desc);
CHECK_RESULT(info);
*desc_ptr = new Descriptor(
new Opaque{reinterpret_cast<device::cuda::Handle *>(handle)->internal()},
new Opaque{reinterpret_cast<device::nvidia::Handle *>(handle)->internal()},
info.take(), 0, handle->device, handle->device_id);
return INFINI_STATUS_SUCCESS;
}
......
......@@ -4,7 +4,7 @@
#include "../../../elementwise/cpu/elementwise_cpu.h"
#include "infiniop/ops/clip.h"
ELEMENTWISE_DESCRIPTOR(clip, cpu, cpu)
ELEMENTWISE_DESCRIPTOR(clip, cpu)
namespace op::clip::cpu {
......
......@@ -3,6 +3,6 @@
#include "../../../elementwise/metax/elementwise_metax_api.h"
ELEMENTWISE_DESCRIPTOR(clip, metax, metax)
ELEMENTWISE_DESCRIPTOR(clip, metax)
#endif // __CLIP_METAX_API_H__
#include "../../../elementwise/cuda/elementwise_cuda.cuh"
#include "../../../elementwise/nvidia/elementwise_nvidia.cuh"
#include "../cuda/kernel.cuh"
#include "clip_nvidia.cuh"
......@@ -13,7 +13,7 @@ infiniStatus_t Descriptor::create(
infiniopTensorDescriptor_t out_desc,
std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
auto handle = reinterpret_cast<device::cuda::Handle *>(handle_);
auto handle = reinterpret_cast<device::nvidia::Handle *>(handle_);
auto dtype = out_desc->dtype();
const auto &in_desc = input_desc_vec.at(0);
......
#ifndef __CLIP_CUDA_API_H__
#define __CLIP_CUDA_API_H__
#include "../../../elementwise/cuda/elementwise_cuda_api.cuh"
#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh"
ELEMENTWISE_DESCRIPTOR(clip, nvidia, cuda)
ELEMENTWISE_DESCRIPTOR(clip, nvidia)
#endif // __CLIP_CUDA_API_H__
......@@ -6,7 +6,7 @@
#include "../../tensor.h"
#ifdef ENABLE_CUDA_API
#include "../../devices/cuda/cuda_handle.cuh"
#include "../../devices/nvidia/nvidia_handle.cuh"
#endif
namespace op::conv {
......
#include "../../../devices/cuda/cuda_common.cuh"
#include "../../../devices/cuda/cuda_handle.cuh"
#include "conv_cuda.cuh"
#ifdef ENABLE_CUDNN_API
#include "../../../devices/nvidia/nvidia_common.cuh"
#include "../../../devices/nvidia/nvidia_handle.cuh"
#include "conv_nvidia.cuh"
#define DESTROY_CUDNN_DESCRIPTOR(desc_ptr, destroy_func) \
do { \
......@@ -22,11 +20,13 @@
DESTROY_CUDNN_DESCRIPTOR(conv_desc, cudnnDestroyConvolutionDescriptor); \
} while (0)
namespace op::conv::cuda {
namespace op::conv::nvidia {
struct Descriptor::Opaque {
std::shared_ptr<device::cuda::Handle::Internal> internal;
std::shared_ptr<device::nvidia::Handle::Internal> internal;
size_t workspace_size = 0;
#ifdef ENABLE_CUDNN_API
cudnnTensorDescriptor_t x_desc = nullptr;
cudnnTensorDescriptor_t y_desc = nullptr;
cudnnFilterDescriptor_t w_desc = nullptr;
......@@ -34,12 +34,13 @@ struct Descriptor::Opaque {
cudnnActivationDescriptor_t act_desc = nullptr;
cudnnConvolutionDescriptor_t conv_desc = nullptr;
cudnnConvolutionFwdAlgo_t algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
size_t workspace_size = 0;
#endif
private:
Opaque(std::shared_ptr<device::cuda::Handle::Internal> internal_ptr)
Opaque(std::shared_ptr<device::nvidia::Handle::Internal> internal_ptr)
: internal(internal_ptr) {}
#ifdef ENABLE_CUDNN_API
void initializeDimensionArrays(const ConvInfo &info,
std::vector<int> &input_dims,
std::vector<int> &output_dims,
......@@ -108,11 +109,11 @@ private:
infiniStatus_t getCudnnDataType(infiniDtype_t data_type,
cudnnDataType_t &cudnn_data_type) const {
if (data_type == INFINI_DTYPE_F16) {
cudnn_data_type = device::cuda::getCudnnDtype(data_type);
cudnn_data_type = device::nvidia::getCudnnDtype(data_type);
} else if (data_type == INFINI_DTYPE_F32) {
cudnn_data_type = device::cuda::getCudnnDtype(data_type);
cudnn_data_type = device::nvidia::getCudnnDtype(data_type);
} else if (data_type == INFINI_DTYPE_BF16) {
cudnn_data_type = device::cuda::getCudnnDtype(data_type);
cudnn_data_type = device::nvidia::getCudnnDtype(data_type);
} else {
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
......@@ -253,32 +254,42 @@ private:
return INFINI_STATUS_SUCCESS;
}
#endif
public:
Opaque(Opaque &&other) noexcept
: internal(std::move(other.internal)),
x_desc(other.x_desc),
y_desc(other.y_desc),
w_desc(other.w_desc),
b_desc(other.b_desc),
act_desc(other.act_desc),
conv_desc(other.conv_desc),
algo(other.algo),
workspace_size(other.workspace_size) {
workspace_size(other.workspace_size)
// clang-format off
#ifdef ENABLE_CUDNN_API
, x_desc(other.x_desc),
, y_desc(other.y_desc),
, w_desc(other.w_desc),
, b_desc(other.b_desc),
, act_desc(other.act_desc),
, conv_desc(other.conv_desc),
, algo(other.algo)
#endif
// clang-format on
{
#ifdef ENABLE_CUDNN_API
other.x_desc = nullptr;
other.y_desc = nullptr;
other.w_desc = nullptr;
other.b_desc = nullptr;
other.act_desc = nullptr;
other.conv_desc = nullptr;
#endif
other.workspace_size = 0;
}
~Opaque() {
#ifdef ENABLE_CUDNN_API
CLEANUP_CUDNN_DESCRIPTORS();
#endif
}
#ifdef ENABLE_CUDNN_API
infiniStatus_t initializeCudnnContext(ConvInfo &info,
infiniDtype_t data_type,
cudnnDataType_t compute_type) {
......@@ -320,17 +331,22 @@ public:
return INFINI_STATUS_SUCCESS;
}
#endif
static inline utils::Result<Opaque> create(
std::shared_ptr<device::cuda::Handle::Internal> internal_ptr,
std::shared_ptr<device::nvidia::Handle::Internal> internal_ptr,
ConvInfo &info,
infiniDtype_t data_type) {
#ifdef ENABLE_CUDNN_API
Opaque opaque(internal_ptr);
auto status = opaque.initializeCudnnContext(info, data_type, CUDNN_DATA_FLOAT);
if (status != INFINI_STATUS_SUCCESS) {
return status;
}
return utils::Result<Opaque>(std::move(opaque));
#else
return INFINI_STATUS_SUCCESS;
#endif
}
};
......@@ -351,7 +367,8 @@ infiniStatus_t Descriptor::create(
const void *strides,
const void *dilations,
size_t n) {
auto handle = reinterpret_cast<device::cuda::nvidia::Handle *>(handle_);
#ifdef ENABLE_CUDNN_API
auto handle = reinterpret_cast<device::nvidia::Handle *>(handle_);
auto dtype = y_desc->dtype();
CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16);
......@@ -373,6 +390,9 @@ infiniStatus_t Descriptor::create(
handle->device,
handle->device_id);
return INFINI_STATUS_SUCCESS;
#else
return INFINI_STATUS_SUCCESS;
#endif
}
infiniStatus_t Descriptor::calculate(
......@@ -383,6 +403,7 @@ infiniStatus_t Descriptor::calculate(
const void *w,
const void *bias,
void *stream) const {
#ifdef ENABLE_CUDNN_API
const float alpha = 1.0f, beta = 0.0f;
if (bias != nullptr) {
CHECK_STATUS(_opaque->internal->useCudnn(
......@@ -428,7 +449,8 @@ infiniStatus_t Descriptor::calculate(
}
return INFINI_STATUS_SUCCESS;
#else
return INFINI_STATUS_SUCCESS;
#endif
}
} // namespace op::conv::cuda
#endif // ENABLE_CUDNN_API
} // namespace op::conv::nvidia
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment