Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
jerrrrry
infinicore
Commits
ceb57c2a
Commit
ceb57c2a
authored
Jul 14, 2025
by
YdrMaster
Browse files
issue/291/style: 根据实际情况将 cuda 改为 nvidia
Signed-off-by:
YdrMaster
<
ydrml@hotmail.com
>
parent
d76a2607
Changes
49
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
153 additions
and
148 deletions
+153
-148
src/infiniop/devices/handle.cc
src/infiniop/devices/handle.cc
+3
-3
src/infiniop/devices/nvidia/nvidia_common.cu
src/infiniop/devices/nvidia/nvidia_common.cu
+5
-12
src/infiniop/devices/nvidia/nvidia_common.cuh
src/infiniop/devices/nvidia/nvidia_common.cuh
+3
-3
src/infiniop/devices/nvidia/nvidia_handle.cuh
src/infiniop/devices/nvidia/nvidia_handle.cuh
+3
-3
src/infiniop/devices/nvidia/nvidia_handle.h
src/infiniop/devices/nvidia/nvidia_handle.h
+5
-15
src/infiniop/devices/nvidia/nvidia_kernel_common.cuh
src/infiniop/devices/nvidia/nvidia_kernel_common.cuh
+2
-2
src/infiniop/elementwise/elementwise.h
src/infiniop/elementwise/elementwise.h
+39
-39
src/infiniop/elementwise/nvidia/elementwise_nvidia.cuh
src/infiniop/elementwise/nvidia/elementwise_nvidia.cuh
+10
-10
src/infiniop/elementwise/nvidia/elementwise_nvidia_api.cuh
src/infiniop/elementwise/nvidia/elementwise_nvidia_api.cuh
+18
-18
src/infiniop/ops/add/cpu/add_cpu.h
src/infiniop/ops/add/cpu/add_cpu.h
+1
-1
src/infiniop/ops/add/metax/add_metax.h
src/infiniop/ops/add/metax/add_metax.h
+1
-1
src/infiniop/ops/add/nvidia/add_nvidia.cu
src/infiniop/ops/add/nvidia/add_nvidia.cu
+2
-2
src/infiniop/ops/add/nvidia/add_nvidia.cuh
src/infiniop/ops/add/nvidia/add_nvidia.cuh
+2
-2
src/infiniop/ops/causal_softmax/nvidia/causal_softmax_nvidia.cu
...finiop/ops/causal_softmax/nvidia/causal_softmax_nvidia.cu
+4
-4
src/infiniop/ops/clip/cpu/clip_cpu.h
src/infiniop/ops/clip/cpu/clip_cpu.h
+1
-1
src/infiniop/ops/clip/metax/clip_metax.h
src/infiniop/ops/clip/metax/clip_metax.h
+1
-1
src/infiniop/ops/clip/nvidia/clip_nvidia.cu
src/infiniop/ops/clip/nvidia/clip_nvidia.cu
+2
-2
src/infiniop/ops/clip/nvidia/clip_nvidia.cuh
src/infiniop/ops/clip/nvidia/clip_nvidia.cuh
+2
-2
src/infiniop/ops/conv/info.h
src/infiniop/ops/conv/info.h
+1
-1
src/infiniop/ops/conv/nvidia/conv_nvidia.cu
src/infiniop/ops/conv/nvidia/conv_nvidia.cu
+48
-26
No files found.
src/infiniop/devices/handle.cc
View file @
ceb57c2a
...
@@ -6,7 +6,7 @@
...
@@ -6,7 +6,7 @@
#include "cpu/cpu_handle.h"
#include "cpu/cpu_handle.h"
#endif
#endif
#ifdef ENABLE_NVIDIA_API
#ifdef ENABLE_NVIDIA_API
#include "
cuda/cud
a_handle.h"
#include "
nvidia/nvidi
a_handle.h"
#endif
#endif
#ifdef ENABLE_CAMBRICON_API
#ifdef ENABLE_CAMBRICON_API
#include "bang/bang_handle.h"
#include "bang/bang_handle.h"
...
@@ -42,7 +42,7 @@ __C infiniStatus_t infiniopCreateHandle(infiniopHandle_t *handle_ptr) {
...
@@ -42,7 +42,7 @@ __C infiniStatus_t infiniopCreateHandle(infiniopHandle_t *handle_ptr) {
CREATE
(
INFINI_DEVICE_CPU
,
cpu
);
CREATE
(
INFINI_DEVICE_CPU
,
cpu
);
#endif
#endif
#ifdef ENABLE_NVIDIA_API
#ifdef ENABLE_NVIDIA_API
CREATE
(
INFINI_DEVICE_NVIDIA
,
cuda
::
nvidia
);
CREATE
(
INFINI_DEVICE_NVIDIA
,
nvidia
);
#endif
#endif
#ifdef ENABLE_CAMBRICON_API
#ifdef ENABLE_CAMBRICON_API
CREATE
(
INFINI_DEVICE_CAMBRICON
,
bang
::
cambricon
);
CREATE
(
INFINI_DEVICE_CAMBRICON
,
bang
::
cambricon
);
...
@@ -79,7 +79,7 @@ __C infiniStatus_t infiniopDestroyHandle(infiniopHandle_t handle) {
...
@@ -79,7 +79,7 @@ __C infiniStatus_t infiniopDestroyHandle(infiniopHandle_t handle) {
DELETE
(
INFINI_DEVICE_CPU
,
cpu
);
DELETE
(
INFINI_DEVICE_CPU
,
cpu
);
#endif
#endif
#ifdef ENABLE_NVIDIA_API
#ifdef ENABLE_NVIDIA_API
DELETE
(
INFINI_DEVICE_NVIDIA
,
cuda
::
nvidia
);
DELETE
(
INFINI_DEVICE_NVIDIA
,
nvidia
);
#endif
#endif
#ifdef ENABLE_CAMBRICON_API
#ifdef ENABLE_CAMBRICON_API
DELETE
(
INFINI_DEVICE_CAMBRICON
,
bang
::
cambricon
);
DELETE
(
INFINI_DEVICE_CAMBRICON
,
bang
::
cambricon
);
...
...
src/infiniop/devices/
cuda/cud
a_common.cu
→
src/infiniop/devices/
nvidia/nvidi
a_common.cu
View file @
ceb57c2a
#include "
cud
a_handle.cuh"
#include "
nvidi
a_handle.cuh"
namespace
device
::
cud
a
{
namespace
device
::
nvidi
a
{
Handle
::
Handle
(
infiniDevice_t
device
,
int
device_id
)
Handle
::
Handle
(
int
device_id
)
:
InfiniopHandle
{
device
,
device_id
},
:
InfiniopHandle
{
INFINI_DEVICE_NVIDIA
,
device_id
},
_internal
(
std
::
make_shared
<
Handle
::
Internal
>
(
device_id
))
{}
_internal
(
std
::
make_shared
<
Handle
::
Internal
>
(
device_id
))
{}
auto
Handle
::
internal
()
const
->
const
std
::
shared_ptr
<
Internal
>
&
{
auto
Handle
::
internal
()
const
->
const
std
::
shared_ptr
<
Internal
>
&
{
...
@@ -83,16 +83,9 @@ cudnnDataType_t getCudnnDtype(infiniDtype_t dt) {
...
@@ -83,16 +83,9 @@ cudnnDataType_t getCudnnDtype(infiniDtype_t dt) {
}
}
#endif
#endif
namespace
nvidia
{
Handle
::
Handle
(
int
device_id
)
:
cuda
::
Handle
(
INFINI_DEVICE_NVIDIA
,
device_id
)
{}
infiniStatus_t
Handle
::
create
(
InfiniopHandle
**
handle_ptr
,
int
device_id
)
{
infiniStatus_t
Handle
::
create
(
InfiniopHandle
**
handle_ptr
,
int
device_id
)
{
*
handle_ptr
=
new
Handle
(
device_id
);
*
handle_ptr
=
new
Handle
(
device_id
);
return
INFINI_STATUS_SUCCESS
;
return
INFINI_STATUS_SUCCESS
;
}
}
}
// namespace nvidia
}
// namespace device::nvidia
}
// namespace device::cuda
src/infiniop/devices/
cuda/cud
a_common.cuh
→
src/infiniop/devices/
nvidia/nvidi
a_common.cuh
View file @
ceb57c2a
#ifndef __INFINIOP_CUDA_COMMON_CUH__
#ifndef __INFINIOP_CUDA_COMMON_CUH__
#define __INFINIOP_CUDA_COMMON_CUH__
#define __INFINIOP_CUDA_COMMON_CUH__
#include "cuda_handle.cuh"
#include "infinicore.h"
#include "infinicore.h"
#include "nvidia_handle.cuh"
namespace
device
::
cud
a
{
namespace
device
::
nvidi
a
{
#ifdef ENABLE_CUDNN_API
#ifdef ENABLE_CUDNN_API
cudnnDataType_t
getCudnnDtype
(
infiniDtype_t
dt
);
cudnnDataType_t
getCudnnDtype
(
infiniDtype_t
dt
);
#endif
#endif
}
// namespace device::
cud
a
}
// namespace device::
nvidi
a
#endif // __INFINIOP_CUDA_COMMON_CUH__
#endif // __INFINIOP_CUDA_COMMON_CUH__
src/infiniop/devices/
cuda/cud
a_handle.cuh
→
src/infiniop/devices/
nvidia/nvidi
a_handle.cuh
View file @
ceb57c2a
...
@@ -3,7 +3,7 @@
...
@@ -3,7 +3,7 @@
#include "../../../utils.h"
#include "../../../utils.h"
#include "../pool.h"
#include "../pool.h"
#include "
cud
a_handle.h"
#include "
nvidi
a_handle.h"
#include <cublas_v2.h>
#include <cublas_v2.h>
#include <functional>
#include <functional>
...
@@ -14,7 +14,7 @@
...
@@ -14,7 +14,7 @@
#define CHECK_CUBLAS(API) CHECK_INTERNAL(API, CUBLAS_STATUS_SUCCESS)
#define CHECK_CUBLAS(API) CHECK_INTERNAL(API, CUBLAS_STATUS_SUCCESS)
#define CHECK_CUDNN(API) CHECK_INTERNAL(API, CUDNN_STATUS_SUCCESS)
#define CHECK_CUDNN(API) CHECK_INTERNAL(API, CUDNN_STATUS_SUCCESS)
namespace
device
::
cud
a
{
namespace
device
::
nvidi
a
{
class
Handle
::
Internal
{
class
Handle
::
Internal
{
Pool
<
cublasHandle_t
>
blas_handles
;
Pool
<
cublasHandle_t
>
blas_handles
;
...
@@ -48,6 +48,6 @@ public:
...
@@ -48,6 +48,6 @@ public:
int
gridSizeZ
()
const
;
int
gridSizeZ
()
const
;
};
};
}
// namespace device::
cud
a
}
// namespace device::
nvidi
a
#endif // __INFINIOP_CUDA_HANDLE_CUH__
#endif // __INFINIOP_CUDA_HANDLE_CUH__
src/infiniop/devices/
cuda/cud
a_handle.h
→
src/infiniop/devices/
nvidia/nvidi
a_handle.h
View file @
ceb57c2a
...
@@ -4,30 +4,20 @@
...
@@ -4,30 +4,20 @@
#include "../../handle.h"
#include "../../handle.h"
#include <memory>
#include <memory>
namespace
device
::
cud
a
{
namespace
device
::
nvidi
a
{
struct
Handle
:
public
InfiniopHandle
{
struct
Handle
:
public
InfiniopHandle
{
Handle
(
int
device_id
);
class
Internal
;
class
Internal
;
auto
internal
()
const
->
const
std
::
shared_ptr
<
Internal
>
&
;
auto
internal
()
const
->
const
std
::
shared_ptr
<
Internal
>
&
;
p
rotected
:
p
ublic
:
Handle
(
infiniDevice_t
device
,
int
device_id
);
static
infiniStatus_t
create
(
InfiniopHandle
**
handle_ptr
,
int
device_id
);
private:
private:
std
::
shared_ptr
<
Internal
>
_internal
;
std
::
shared_ptr
<
Internal
>
_internal
;
};
};
namespace
nvidia
{
}
// namespace device::nvidia
class
Handle
:
public
cuda
::
Handle
{
Handle
(
int
device_id
);
public:
static
infiniStatus_t
create
(
InfiniopHandle
**
handle_ptr
,
int
device_id
);
};
}
// namespace nvidia
}
// namespace device::cuda
#endif // __INFINIOP_CUDA_HANDLE_H__
#endif // __INFINIOP_CUDA_HANDLE_H__
src/infiniop/devices/
cuda/cud
a_kernel_common.cuh
→
src/infiniop/devices/
nvidia/nvidi
a_kernel_common.cuh
View file @
ceb57c2a
...
@@ -18,7 +18,7 @@
...
@@ -18,7 +18,7 @@
using
cuda_bfloat16
=
nv_bfloat16
;
using
cuda_bfloat16
=
nv_bfloat16
;
using
cuda_bfloat162
=
nv_bfloat162
;
using
cuda_bfloat162
=
nv_bfloat162
;
namespace
device
::
cud
a
{
namespace
device
::
nvidi
a
{
// return the memory offset of original tensor, given the flattened index of broadcasted tensor
// return the memory offset of original tensor, given the flattened index of broadcasted tensor
__forceinline__
__device__
__host__
size_t
__forceinline__
__device__
__host__
size_t
indexToReducedOffset
(
indexToReducedOffset
(
...
@@ -48,7 +48,7 @@ indexToOffset(
...
@@ -48,7 +48,7 @@ indexToOffset(
}
}
return
res
;
return
res
;
}
}
}
// namespace device::
cud
a
}
// namespace device::
nvidi
a
__forceinline__
__device__
float
__forceinline__
__device__
float
exp_
(
const
float
val
)
{
exp_
(
const
float
val
)
{
...
...
src/infiniop/elementwise/elementwise.h
View file @
ceb57c2a
...
@@ -12,45 +12,45 @@
...
@@ -12,45 +12,45 @@
#include <numeric>
#include <numeric>
#include <vector>
#include <vector>
#define ELEMENTWISE_DESCRIPTOR(OP, NAMESPACE
, KERNEL_COMMON)
\
#define ELEMENTWISE_DESCRIPTOR(OP, NAMESPACE
)
\
\
\
namespace op::OP::NAMESPACE {
\
namespace op::OP::NAMESPACE { \
class Descriptor final : public InfiniopDescriptor {
\
class Descriptor final : public InfiniopDescriptor { \
infiniDtype_t _dtype;
\
infiniDtype_t _dtype; \
op::elementwise::ElementwiseInfo _info;
\
op::elementwise::ElementwiseInfo _info; \
std::unique_ptr<op::elementwise::
KERNEL_COMMON
::DeviceImpl> _device_info; \
std::unique_ptr<op::elementwise::
NAMESPACE
::DeviceImpl> _device_info; \
size_t _workspace_size;
\
size_t _workspace_size; \
\
\
Descriptor(
\
Descriptor( \
infiniDtype_t dtype,
\
infiniDtype_t dtype, \
op::elementwise::ElementwiseInfo info,
\
op::elementwise::ElementwiseInfo info, \
op::elementwise::
KERNEL_COMMON
::DeviceImpl *device_info, \
op::elementwise::
NAMESPACE
::DeviceImpl *device_info, \
size_t workspace_size,
\
size_t workspace_size, \
infiniDevice_t device_type,
\
infiniDevice_t device_type, \
int device_id)
\
int device_id) \
: InfiniopDescriptor{device_type, device_id},
\
: InfiniopDescriptor{device_type, device_id}, \
_dtype(dtype),
\
_dtype(dtype), \
_info(std::move(info)),
\
_info(std::move(info)), \
_device_info(std::move(device_info)),
\
_device_info(std::move(device_info)), \
_workspace_size(workspace_size) {}
\
_workspace_size(workspace_size) {} \
\
\
public:
\
public: \
~Descriptor();
\
~Descriptor(); \
\
\
size_t workspaceSize() const { return _workspace_size; }
\
size_t workspaceSize() const { return _workspace_size; } \
\
\
static infiniStatus_t create(
\
static infiniStatus_t create( \
infiniopHandle_t handle,
\
infiniopHandle_t handle, \
Descriptor **desc_ptr,
\
Descriptor **desc_ptr, \
infiniopTensorDescriptor_t output_desc,
\
infiniopTensorDescriptor_t output_desc, \
std::vector<infiniopTensorDescriptor_t> input_descs);
\
std::vector<infiniopTensorDescriptor_t> input_descs); \
\
\
infiniStatus_t calculate(
\
infiniStatus_t calculate( \
void *workspace, size_t workspace_size,
\
void *workspace, size_t workspace_size, \
void *output,
\
void *output, \
std::vector<const void *> inputs,
\
std::vector<const void *> inputs, \
void *stream) const;
\
void *stream) const; \
};
\
}; \
}
}
namespace
op
::
elementwise
{
namespace
op
::
elementwise
{
...
...
src/infiniop/elementwise/
cud
a/elementwise_
cud
a.cuh
→
src/infiniop/elementwise/
nvidi
a/elementwise_
nvidi
a.cuh
View file @
ceb57c2a
...
@@ -2,11 +2,11 @@
...
@@ -2,11 +2,11 @@
#define __INFINIOP_ELEMENTWISE_CUDA_H__
#define __INFINIOP_ELEMENTWISE_CUDA_H__
#include "../../../utils.h"
#include "../../../utils.h"
#include "../../devices/
cuda/cud
a_common.cuh"
#include "../../devices/
nvidia/nvidi
a_common.cuh"
#include "../../devices/
cuda/cud
a_kernel_common.cuh"
#include "../../devices/
nvidia/nvidi
a_kernel_common.cuh"
#include "elementwise_
cud
a_api.cuh"
#include "elementwise_
nvidi
a_api.cuh"
namespace
op
::
elementwise
::
cud
a
{
namespace
op
::
elementwise
::
nvidi
a
{
/**
/**
* @brief Casts an untyped device pointer to a typed pointer of type T.
* @brief Casts an untyped device pointer to a typed pointer of type T.
...
@@ -33,7 +33,7 @@ __device__ __forceinline__ const T *typedInputPtr(const void *ptr) {
...
@@ -33,7 +33,7 @@ __device__ __forceinline__ const T *typedInputPtr(const void *ptr) {
*/
*/
__device__
__forceinline__
size_t
getOutputIndex
(
size_t
idx
,
bool
is_contiguous
,
size_t
ndim
,
__device__
__forceinline__
size_t
getOutputIndex
(
size_t
idx
,
bool
is_contiguous
,
size_t
ndim
,
const
size_t
*
shape
,
const
ptrdiff_t
*
strides
)
{
const
size_t
*
shape
,
const
ptrdiff_t
*
strides
)
{
return
is_contiguous
?
idx
:
device
::
cud
a
::
indexToOffset
(
idx
,
ndim
,
shape
,
strides
);
return
is_contiguous
?
idx
:
device
::
nvidi
a
::
indexToOffset
(
idx
,
ndim
,
shape
,
strides
);
}
}
/**
/**
...
@@ -61,8 +61,8 @@ struct InputIndexer {
...
@@ -61,8 +61,8 @@ struct InputIndexer {
return
input_contiguous
[
input_id
]
return
input_contiguous
[
input_id
]
?
idx
?
idx
:
(
input_broadcasted
[
input_id
]
:
(
input_broadcasted
[
input_id
]
?
device
::
cud
a
::
indexToReducedOffset
(
idx
,
ndim
,
output_strides
,
input_strides
+
input_id
*
ndim
)
?
device
::
nvidi
a
::
indexToReducedOffset
(
idx
,
ndim
,
output_strides
,
input_strides
+
input_id
*
ndim
)
:
device
::
cud
a
::
indexToOffset
(
idx
,
ndim
,
input_shapes
+
input_id
*
ndim
,
input_strides
+
input_id
*
ndim
));
:
device
::
nvidi
a
::
indexToOffset
(
idx
,
ndim
,
input_shapes
+
input_id
*
ndim
,
input_strides
+
input_id
*
ndim
));
}
}
};
};
...
@@ -186,9 +186,9 @@ INFINIOP_CUDA_KERNEL elementwiseKernel(
...
@@ -186,9 +186,9 @@ INFINIOP_CUDA_KERNEL elementwiseKernel(
}
}
struct
DeviceImpl
::
Opaque
{
struct
DeviceImpl
::
Opaque
{
std
::
shared_ptr
<
device
::
cud
a
::
Handle
::
Internal
>
internal
;
std
::
shared_ptr
<
device
::
nvidi
a
::
Handle
::
Internal
>
internal
;
Opaque
(
const
std
::
shared_ptr
<
device
::
cud
a
::
Handle
::
Internal
>
&
internal
)
Opaque
(
const
std
::
shared_ptr
<
device
::
nvidi
a
::
Handle
::
Internal
>
&
internal
)
:
internal
(
internal
)
{}
:
internal
(
internal
)
{}
/**
/**
...
@@ -414,6 +414,6 @@ infiniStatus_t DeviceImpl::calculate(const op::elementwise::ElementwiseInfo &inf
...
@@ -414,6 +414,6 @@ infiniStatus_t DeviceImpl::calculate(const op::elementwise::ElementwiseInfo &inf
std
::
forward
<
Args
>
(
args
)...);
std
::
forward
<
Args
>
(
args
)...);
}
}
}
// namespace op::elementwise::
cud
a
}
// namespace op::elementwise::
nvidi
a
#endif // __INFINIOP_ELEMENTWISE_CUDA_H__
#endif // __INFINIOP_ELEMENTWISE_CUDA_H__
src/infiniop/elementwise/
cud
a/elementwise_
cud
a_api.cuh
→
src/infiniop/elementwise/
nvidi
a/elementwise_
nvidi
a_api.cuh
View file @
ceb57c2a
...
@@ -3,7 +3,7 @@
...
@@ -3,7 +3,7 @@
#include "../elementwise.h"
#include "../elementwise.h"
namespace
op
::
elementwise
::
cud
a
{
namespace
op
::
elementwise
::
nvidi
a
{
/**
/**
* @brief Define the methods and info needed by CUDA to perform elementwise operation
* @brief Define the methods and info needed by CUDA to perform elementwise operation
...
@@ -77,7 +77,7 @@ public:
...
@@ -77,7 +77,7 @@ public:
void
*
stream
,
void
*
stream
,
Args
&&
...
args
);
Args
&&
...
args
);
};
};
}
// namespace op::elementwise::
cud
a
}
// namespace op::elementwise::
nvidi
a
/**
/**
* @brief Define the process for initializing a Descriptor of an elementwise operation
* @brief Define the process for initializing a Descriptor of an elementwise operation
...
@@ -88,22 +88,22 @@ public:
...
@@ -88,22 +88,22 @@ public:
* @param OUT_DESC The output tensor descriptor.
* @param OUT_DESC The output tensor descriptor.
* @param INPUT_DESC_VEC A vector containing input tensor descriptors.
* @param INPUT_DESC_VEC A vector containing input tensor descriptors.
*/
*/
#define CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(HANDLE, DTYPE, OUT_DESC, INPUT_DESC_VEC) \
#define CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(HANDLE, DTYPE, OUT_DESC, INPUT_DESC_VEC)
\
\
\
auto info_result = op::elementwise::ElementwiseInfo::create(OUT_DESC, INPUT_DESC_VEC); \
auto info_result = op::elementwise::ElementwiseInfo::create(OUT_DESC, INPUT_DESC_VEC);
\
CHECK_RESULT(info_result); \
CHECK_RESULT(info_result);
\
auto info = info_result.take(); \
auto info = info_result.take();
\
auto workspace_size = info.getMetaMemSize() + info.getInputSize() * sizeof(void *); \
auto workspace_size = info.getMetaMemSize() + info.getInputSize() * sizeof(void *);
\
\
\
auto device_impl_result = op::elementwise::
cud
a::DeviceImpl::create(HANDLE->internal()); \
auto device_impl_result = op::elementwise::
nvidi
a::DeviceImpl::create(HANDLE->internal()); \
CHECK_RESULT(device_impl_result); \
CHECK_RESULT(device_impl_result);
\
\
\
*desc_ptr = new Descriptor( \
*desc_ptr = new Descriptor(
\
DTYPE, \
DTYPE,
\
std::move(info), \
std::move(info),
\
std::move(device_impl_result.take()), \
std::move(device_impl_result.take()),
\
workspace_size, \
workspace_size,
\
HANDLE->device, \
HANDLE->device,
\
HANDLE->device_id);
HANDLE->device_id);
#endif // __INFINIOP_ELEMENTWISE_CUDA_API_H__
#endif // __INFINIOP_ELEMENTWISE_CUDA_API_H__
src/infiniop/ops/add/cpu/add_cpu.h
View file @
ceb57c2a
...
@@ -3,7 +3,7 @@
...
@@ -3,7 +3,7 @@
#include "../../../elementwise/cpu/elementwise_cpu.h"
#include "../../../elementwise/cpu/elementwise_cpu.h"
ELEMENTWISE_DESCRIPTOR
(
add
,
cpu
,
cpu
)
ELEMENTWISE_DESCRIPTOR
(
add
,
cpu
)
namespace
op
::
add
::
cpu
{
namespace
op
::
add
::
cpu
{
typedef
struct
AddOp
{
typedef
struct
AddOp
{
...
...
src/infiniop/ops/add/metax/add_metax.h
View file @
ceb57c2a
...
@@ -3,6 +3,6 @@
...
@@ -3,6 +3,6 @@
#include "../../../elementwise/metax/elementwise_metax_api.h"
#include "../../../elementwise/metax/elementwise_metax_api.h"
ELEMENTWISE_DESCRIPTOR
(
add
,
metax
,
metax
)
ELEMENTWISE_DESCRIPTOR
(
add
,
metax
)
#endif // __ADD_METAX_API_H__
#endif // __ADD_METAX_API_H__
src/infiniop/ops/add/nvidia/add_nvidia.cu
View file @
ceb57c2a
#include "../../../elementwise/
cud
a/elementwise_
cud
a.cuh"
#include "../../../elementwise/
nvidi
a/elementwise_
nvidi
a.cuh"
#include "../cuda/kernel.cuh"
#include "../cuda/kernel.cuh"
#include "add_nvidia.cuh"
#include "add_nvidia.cuh"
...
@@ -13,7 +13,7 @@ infiniStatus_t Descriptor::create(
...
@@ -13,7 +13,7 @@ infiniStatus_t Descriptor::create(
infiniopTensorDescriptor_t
out_desc
,
infiniopTensorDescriptor_t
out_desc
,
std
::
vector
<
infiniopTensorDescriptor_t
>
input_desc_vec
)
{
std
::
vector
<
infiniopTensorDescriptor_t
>
input_desc_vec
)
{
auto
handle
=
reinterpret_cast
<
device
::
cud
a
::
Handle
*>
(
handle_
);
auto
handle
=
reinterpret_cast
<
device
::
nvidi
a
::
Handle
*>
(
handle_
);
auto
dtype
=
out_desc
->
dtype
();
auto
dtype
=
out_desc
->
dtype
();
const
auto
&
a_desc
=
input_desc_vec
.
at
(
0
);
const
auto
&
a_desc
=
input_desc_vec
.
at
(
0
);
...
...
src/infiniop/ops/add/nvidia/add_nvidia.cuh
View file @
ceb57c2a
#ifndef __ADD_CUDA_API_H__
#ifndef __ADD_CUDA_API_H__
#define __ADD_CUDA_API_H__
#define __ADD_CUDA_API_H__
#include "../../../elementwise/
cud
a/elementwise_
cud
a_api.cuh"
#include "../../../elementwise/
nvidi
a/elementwise_
nvidi
a_api.cuh"
ELEMENTWISE_DESCRIPTOR
(
add
,
nvidia
,
cuda
)
ELEMENTWISE_DESCRIPTOR
(
add
,
nvidia
)
#endif // __ADD_CUDA_API_H__
#endif // __ADD_CUDA_API_H__
src/infiniop/ops/causal_softmax/nvidia/causal_softmax_nvidia.cu
View file @
ceb57c2a
#include "../../../devices/
cuda/cud
a_common.cuh"
#include "../../../devices/
nvidia/nvidi
a_common.cuh"
#include "causal_softmax_nvidia.cuh"
#include "causal_softmax_nvidia.cuh"
#include "../../../devices/
cuda/cud
a_kernel_common.cuh"
#include "../../../devices/
nvidia/nvidi
a_kernel_common.cuh"
#include <cub/block/block_reduce.cuh>
#include <cub/block/block_reduce.cuh>
#include "../../../reduce/cuda/reduce.cuh"
#include "../../../reduce/cuda/reduce.cuh"
...
@@ -20,7 +20,7 @@ INFINIOP_CUDA_KERNEL causalSoftmax(
...
@@ -20,7 +20,7 @@ INFINIOP_CUDA_KERNEL causalSoftmax(
namespace
op
::
causal_softmax
::
nvidia
{
namespace
op
::
causal_softmax
::
nvidia
{
struct
Descriptor
::
Opaque
{
struct
Descriptor
::
Opaque
{
std
::
shared_ptr
<
device
::
cud
a
::
Handle
::
Internal
>
internal
;
std
::
shared_ptr
<
device
::
nvidi
a
::
Handle
::
Internal
>
internal
;
};
};
Descriptor
::~
Descriptor
()
{
Descriptor
::~
Descriptor
()
{
...
@@ -35,7 +35,7 @@ infiniStatus_t Descriptor::create(
...
@@ -35,7 +35,7 @@ infiniStatus_t Descriptor::create(
auto
info
=
CausalSoftmaxInfo
::
create
(
y_desc
,
x_desc
);
auto
info
=
CausalSoftmaxInfo
::
create
(
y_desc
,
x_desc
);
CHECK_RESULT
(
info
);
CHECK_RESULT
(
info
);
*
desc_ptr
=
new
Descriptor
(
*
desc_ptr
=
new
Descriptor
(
new
Opaque
{
reinterpret_cast
<
device
::
cud
a
::
Handle
*>
(
handle
)
->
internal
()},
new
Opaque
{
reinterpret_cast
<
device
::
nvidi
a
::
Handle
*>
(
handle
)
->
internal
()},
info
.
take
(),
0
,
handle
->
device
,
handle
->
device_id
);
info
.
take
(),
0
,
handle
->
device
,
handle
->
device_id
);
return
INFINI_STATUS_SUCCESS
;
return
INFINI_STATUS_SUCCESS
;
}
}
...
...
src/infiniop/ops/clip/cpu/clip_cpu.h
View file @
ceb57c2a
...
@@ -4,7 +4,7 @@
...
@@ -4,7 +4,7 @@
#include "../../../elementwise/cpu/elementwise_cpu.h"
#include "../../../elementwise/cpu/elementwise_cpu.h"
#include "infiniop/ops/clip.h"
#include "infiniop/ops/clip.h"
ELEMENTWISE_DESCRIPTOR
(
clip
,
cpu
,
cpu
)
ELEMENTWISE_DESCRIPTOR
(
clip
,
cpu
)
namespace
op
::
clip
::
cpu
{
namespace
op
::
clip
::
cpu
{
...
...
src/infiniop/ops/clip/metax/clip_metax.h
View file @
ceb57c2a
...
@@ -3,6 +3,6 @@
...
@@ -3,6 +3,6 @@
#include "../../../elementwise/metax/elementwise_metax_api.h"
#include "../../../elementwise/metax/elementwise_metax_api.h"
ELEMENTWISE_DESCRIPTOR
(
clip
,
metax
,
metax
)
ELEMENTWISE_DESCRIPTOR
(
clip
,
metax
)
#endif // __CLIP_METAX_API_H__
#endif // __CLIP_METAX_API_H__
src/infiniop/ops/clip/nvidia/clip_nvidia.cu
View file @
ceb57c2a
#include "../../../elementwise/
cud
a/elementwise_
cud
a.cuh"
#include "../../../elementwise/
nvidi
a/elementwise_
nvidi
a.cuh"
#include "../cuda/kernel.cuh"
#include "../cuda/kernel.cuh"
#include "clip_nvidia.cuh"
#include "clip_nvidia.cuh"
...
@@ -13,7 +13,7 @@ infiniStatus_t Descriptor::create(
...
@@ -13,7 +13,7 @@ infiniStatus_t Descriptor::create(
infiniopTensorDescriptor_t
out_desc
,
infiniopTensorDescriptor_t
out_desc
,
std
::
vector
<
infiniopTensorDescriptor_t
>
input_desc_vec
)
{
std
::
vector
<
infiniopTensorDescriptor_t
>
input_desc_vec
)
{
auto
handle
=
reinterpret_cast
<
device
::
cud
a
::
Handle
*>
(
handle_
);
auto
handle
=
reinterpret_cast
<
device
::
nvidi
a
::
Handle
*>
(
handle_
);
auto
dtype
=
out_desc
->
dtype
();
auto
dtype
=
out_desc
->
dtype
();
const
auto
&
in_desc
=
input_desc_vec
.
at
(
0
);
const
auto
&
in_desc
=
input_desc_vec
.
at
(
0
);
...
...
src/infiniop/ops/clip/nvidia/clip_nvidia.cuh
View file @
ceb57c2a
#ifndef __CLIP_CUDA_API_H__
#ifndef __CLIP_CUDA_API_H__
#define __CLIP_CUDA_API_H__
#define __CLIP_CUDA_API_H__
#include "../../../elementwise/
cud
a/elementwise_
cud
a_api.cuh"
#include "../../../elementwise/
nvidi
a/elementwise_
nvidi
a_api.cuh"
ELEMENTWISE_DESCRIPTOR
(
clip
,
nvidia
,
cuda
)
ELEMENTWISE_DESCRIPTOR
(
clip
,
nvidia
)
#endif // __CLIP_CUDA_API_H__
#endif // __CLIP_CUDA_API_H__
src/infiniop/ops/conv/info.h
View file @
ceb57c2a
...
@@ -6,7 +6,7 @@
...
@@ -6,7 +6,7 @@
#include "../../tensor.h"
#include "../../tensor.h"
#ifdef ENABLE_CUDA_API
#ifdef ENABLE_CUDA_API
#include "../../devices/
cuda/cud
a_handle.cuh"
#include "../../devices/
nvidia/nvidi
a_handle.cuh"
#endif
#endif
namespace
op
::
conv
{
namespace
op
::
conv
{
...
...
src/infiniop/ops/conv/
cud
a/conv_
cud
a.cu
→
src/infiniop/ops/conv/
nvidi
a/conv_
nvidi
a.cu
View file @
ceb57c2a
#include "../../../devices/cuda/cuda_common.cuh"
#include "../../../devices/nvidia/nvidia_common.cuh"
#include "../../../devices/cuda/cuda_handle.cuh"
#include "../../../devices/nvidia/nvidia_handle.cuh"
#include "conv_cuda.cuh"
#include "conv_nvidia.cuh"
#ifdef ENABLE_CUDNN_API
#define DESTROY_CUDNN_DESCRIPTOR(desc_ptr, destroy_func) \
#define DESTROY_CUDNN_DESCRIPTOR(desc_ptr, destroy_func) \
do { \
do { \
...
@@ -22,11 +20,13 @@
...
@@ -22,11 +20,13 @@
DESTROY_CUDNN_DESCRIPTOR(conv_desc, cudnnDestroyConvolutionDescriptor); \
DESTROY_CUDNN_DESCRIPTOR(conv_desc, cudnnDestroyConvolutionDescriptor); \
} while (0)
} while (0)
namespace
op
::
conv
::
cud
a
{
namespace
op
::
conv
::
nvidi
a
{
struct
Descriptor
::
Opaque
{
struct
Descriptor
::
Opaque
{
std
::
shared_ptr
<
device
::
cuda
::
Handle
::
Internal
>
internal
;
std
::
shared_ptr
<
device
::
nvidia
::
Handle
::
Internal
>
internal
;
size_t
workspace_size
=
0
;
#ifdef ENABLE_CUDNN_API
cudnnTensorDescriptor_t
x_desc
=
nullptr
;
cudnnTensorDescriptor_t
x_desc
=
nullptr
;
cudnnTensorDescriptor_t
y_desc
=
nullptr
;
cudnnTensorDescriptor_t
y_desc
=
nullptr
;
cudnnFilterDescriptor_t
w_desc
=
nullptr
;
cudnnFilterDescriptor_t
w_desc
=
nullptr
;
...
@@ -34,12 +34,13 @@ struct Descriptor::Opaque {
...
@@ -34,12 +34,13 @@ struct Descriptor::Opaque {
cudnnActivationDescriptor_t
act_desc
=
nullptr
;
cudnnActivationDescriptor_t
act_desc
=
nullptr
;
cudnnConvolutionDescriptor_t
conv_desc
=
nullptr
;
cudnnConvolutionDescriptor_t
conv_desc
=
nullptr
;
cudnnConvolutionFwdAlgo_t
algo
=
CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM
;
cudnnConvolutionFwdAlgo_t
algo
=
CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM
;
size_t
workspace_size
=
0
;
#endif
private:
private:
Opaque
(
std
::
shared_ptr
<
device
::
cud
a
::
Handle
::
Internal
>
internal_ptr
)
Opaque
(
std
::
shared_ptr
<
device
::
nvidi
a
::
Handle
::
Internal
>
internal_ptr
)
:
internal
(
internal_ptr
)
{}
:
internal
(
internal_ptr
)
{}
#ifdef ENABLE_CUDNN_API
void
initializeDimensionArrays
(
const
ConvInfo
&
info
,
void
initializeDimensionArrays
(
const
ConvInfo
&
info
,
std
::
vector
<
int
>
&
input_dims
,
std
::
vector
<
int
>
&
input_dims
,
std
::
vector
<
int
>
&
output_dims
,
std
::
vector
<
int
>
&
output_dims
,
...
@@ -108,11 +109,11 @@ private:
...
@@ -108,11 +109,11 @@ private:
infiniStatus_t
getCudnnDataType
(
infiniDtype_t
data_type
,
infiniStatus_t
getCudnnDataType
(
infiniDtype_t
data_type
,
cudnnDataType_t
&
cudnn_data_type
)
const
{
cudnnDataType_t
&
cudnn_data_type
)
const
{
if
(
data_type
==
INFINI_DTYPE_F16
)
{
if
(
data_type
==
INFINI_DTYPE_F16
)
{
cudnn_data_type
=
device
::
cud
a
::
getCudnnDtype
(
data_type
);
cudnn_data_type
=
device
::
nvidi
a
::
getCudnnDtype
(
data_type
);
}
else
if
(
data_type
==
INFINI_DTYPE_F32
)
{
}
else
if
(
data_type
==
INFINI_DTYPE_F32
)
{
cudnn_data_type
=
device
::
cud
a
::
getCudnnDtype
(
data_type
);
cudnn_data_type
=
device
::
nvidi
a
::
getCudnnDtype
(
data_type
);
}
else
if
(
data_type
==
INFINI_DTYPE_BF16
)
{
}
else
if
(
data_type
==
INFINI_DTYPE_BF16
)
{
cudnn_data_type
=
device
::
cud
a
::
getCudnnDtype
(
data_type
);
cudnn_data_type
=
device
::
nvidi
a
::
getCudnnDtype
(
data_type
);
}
else
{
}
else
{
return
INFINI_STATUS_BAD_TENSOR_DTYPE
;
return
INFINI_STATUS_BAD_TENSOR_DTYPE
;
}
}
...
@@ -253,32 +254,42 @@ private:
...
@@ -253,32 +254,42 @@ private:
return
INFINI_STATUS_SUCCESS
;
return
INFINI_STATUS_SUCCESS
;
}
}
#endif
public:
public:
Opaque
(
Opaque
&&
other
)
noexcept
Opaque
(
Opaque
&&
other
)
noexcept
:
internal
(
std
::
move
(
other
.
internal
)),
:
internal
(
std
::
move
(
other
.
internal
)),
x_desc
(
other
.
x_desc
),
workspace_size
(
other
.
workspace_size
)
y_desc
(
other
.
y_desc
),
// clang-format off
w_desc
(
other
.
w_desc
),
#ifdef ENABLE_CUDNN_API
b_desc
(
other
.
b_desc
),
,
x_desc
(
other
.
x_desc
),
act_desc
(
other
.
act_desc
),
,
y_desc
(
other
.
y_desc
),
conv_desc
(
other
.
conv_desc
),
,
w_desc
(
other
.
w_desc
),
algo
(
other
.
algo
),
,
b_desc
(
other
.
b_desc
),
workspace_size
(
other
.
workspace_size
)
{
,
act_desc
(
other
.
act_desc
),
,
conv_desc
(
other
.
conv_desc
),
,
algo
(
other
.
algo
)
#endif
// clang-format on
{
#ifdef ENABLE_CUDNN_API
other
.
x_desc
=
nullptr
;
other
.
x_desc
=
nullptr
;
other
.
y_desc
=
nullptr
;
other
.
y_desc
=
nullptr
;
other
.
w_desc
=
nullptr
;
other
.
w_desc
=
nullptr
;
other
.
b_desc
=
nullptr
;
other
.
b_desc
=
nullptr
;
other
.
act_desc
=
nullptr
;
other
.
act_desc
=
nullptr
;
other
.
conv_desc
=
nullptr
;
other
.
conv_desc
=
nullptr
;
#endif
other
.
workspace_size
=
0
;
other
.
workspace_size
=
0
;
}
}
~
Opaque
()
{
~
Opaque
()
{
#ifdef ENABLE_CUDNN_API
CLEANUP_CUDNN_DESCRIPTORS
();
CLEANUP_CUDNN_DESCRIPTORS
();
#endif
}
}
#ifdef ENABLE_CUDNN_API
infiniStatus_t
initializeCudnnContext
(
ConvInfo
&
info
,
infiniStatus_t
initializeCudnnContext
(
ConvInfo
&
info
,
infiniDtype_t
data_type
,
infiniDtype_t
data_type
,
cudnnDataType_t
compute_type
)
{
cudnnDataType_t
compute_type
)
{
...
@@ -320,17 +331,22 @@ public:
...
@@ -320,17 +331,22 @@ public:
return
INFINI_STATUS_SUCCESS
;
return
INFINI_STATUS_SUCCESS
;
}
}
#endif
static
inline
utils
::
Result
<
Opaque
>
create
(
static
inline
utils
::
Result
<
Opaque
>
create
(
std
::
shared_ptr
<
device
::
cud
a
::
Handle
::
Internal
>
internal_ptr
,
std
::
shared_ptr
<
device
::
nvidi
a
::
Handle
::
Internal
>
internal_ptr
,
ConvInfo
&
info
,
ConvInfo
&
info
,
infiniDtype_t
data_type
)
{
infiniDtype_t
data_type
)
{
#ifdef ENABLE_CUDNN_API
Opaque
opaque
(
internal_ptr
);
Opaque
opaque
(
internal_ptr
);
auto
status
=
opaque
.
initializeCudnnContext
(
info
,
data_type
,
CUDNN_DATA_FLOAT
);
auto
status
=
opaque
.
initializeCudnnContext
(
info
,
data_type
,
CUDNN_DATA_FLOAT
);
if
(
status
!=
INFINI_STATUS_SUCCESS
)
{
if
(
status
!=
INFINI_STATUS_SUCCESS
)
{
return
status
;
return
status
;
}
}
return
utils
::
Result
<
Opaque
>
(
std
::
move
(
opaque
));
return
utils
::
Result
<
Opaque
>
(
std
::
move
(
opaque
));
#else
return
INFINI_STATUS_SUCCESS
;
#endif
}
}
};
};
...
@@ -351,7 +367,8 @@ infiniStatus_t Descriptor::create(
...
@@ -351,7 +367,8 @@ infiniStatus_t Descriptor::create(
const
void
*
strides
,
const
void
*
strides
,
const
void
*
dilations
,
const
void
*
dilations
,
size_t
n
)
{
size_t
n
)
{
auto
handle
=
reinterpret_cast
<
device
::
cuda
::
nvidia
::
Handle
*>
(
handle_
);
#ifdef ENABLE_CUDNN_API
auto
handle
=
reinterpret_cast
<
device
::
nvidia
::
Handle
*>
(
handle_
);
auto
dtype
=
y_desc
->
dtype
();
auto
dtype
=
y_desc
->
dtype
();
CHECK_DTYPE
(
dtype
,
INFINI_DTYPE_F16
,
INFINI_DTYPE_F32
,
INFINI_DTYPE_BF16
);
CHECK_DTYPE
(
dtype
,
INFINI_DTYPE_F16
,
INFINI_DTYPE_F32
,
INFINI_DTYPE_BF16
);
...
@@ -373,6 +390,9 @@ infiniStatus_t Descriptor::create(
...
@@ -373,6 +390,9 @@ infiniStatus_t Descriptor::create(
handle
->
device
,
handle
->
device
,
handle
->
device_id
);
handle
->
device_id
);
return
INFINI_STATUS_SUCCESS
;
return
INFINI_STATUS_SUCCESS
;
#else
return
INFINI_STATUS_SUCCESS
;
#endif
}
}
infiniStatus_t
Descriptor
::
calculate
(
infiniStatus_t
Descriptor
::
calculate
(
...
@@ -383,6 +403,7 @@ infiniStatus_t Descriptor::calculate(
...
@@ -383,6 +403,7 @@ infiniStatus_t Descriptor::calculate(
const
void
*
w
,
const
void
*
w
,
const
void
*
bias
,
const
void
*
bias
,
void
*
stream
)
const
{
void
*
stream
)
const
{
#ifdef ENABLE_CUDNN_API
const
float
alpha
=
1.0
f
,
beta
=
0.0
f
;
const
float
alpha
=
1.0
f
,
beta
=
0.0
f
;
if
(
bias
!=
nullptr
)
{
if
(
bias
!=
nullptr
)
{
CHECK_STATUS
(
_opaque
->
internal
->
useCudnn
(
CHECK_STATUS
(
_opaque
->
internal
->
useCudnn
(
...
@@ -428,7 +449,8 @@ infiniStatus_t Descriptor::calculate(
...
@@ -428,7 +449,8 @@ infiniStatus_t Descriptor::calculate(
}
}
return
INFINI_STATUS_SUCCESS
;
return
INFINI_STATUS_SUCCESS
;
#else
return
INFINI_STATUS_SUCCESS
;
#endif
}
}
}
// namespace op::conv::cuda
}
// namespace op::conv::nvidia
#endif // ENABLE_CUDNN_API
Prev
1
2
3
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment