Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
jerrrrry
infinicore
Commits
e4605f7c
Unverified
Commit
e4605f7c
authored
Jul 11, 2025
by
PanZezhong1725
Committed by
GitHub
Jul 11, 2025
Browse files
Merge pull request #293 from YdrMaster/distinct-cuda
issue291 合并 cuda 代码
parents
5025ebed
eac2b0ca
Changes
68
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
170 additions
and
205 deletions
+170
-205
src/infiniop/devices/cuda/cuda_kernel_common.cuh
src/infiniop/devices/cuda/cuda_kernel_common.cuh
+6
-4
src/infiniop/devices/maca/maca_kernel_common.h
src/infiniop/devices/maca/maca_kernel_common.h
+7
-6
src/infiniop/elementwise/elementwise.h
src/infiniop/elementwise/elementwise.h
+39
-39
src/infiniop/ops/add/cpu/add_cpu.h
src/infiniop/ops/add/cpu/add_cpu.h
+1
-1
src/infiniop/ops/add/cuda/kernel.cuh
src/infiniop/ops/add/cuda/kernel.cuh
+0
-0
src/infiniop/ops/add/nvidia/add_nvidia.cu
src/infiniop/ops/add/nvidia/add_nvidia.cu
+8
-8
src/infiniop/ops/add/nvidia/add_nvidia.cuh
src/infiniop/ops/add/nvidia/add_nvidia.cuh
+1
-1
src/infiniop/ops/add/operator.cc
src/infiniop/ops/add/operator.cc
+8
-8
src/infiniop/ops/causal_softmax/cuda/kernel.cuh
src/infiniop/ops/causal_softmax/cuda/kernel.cuh
+7
-10
src/infiniop/ops/causal_softmax/maca/causal_softmax_kernel.h
src/infiniop/ops/causal_softmax/maca/causal_softmax_kernel.h
+0
-60
src/infiniop/ops/causal_softmax/metax/causal_softmax_metax.h
src/infiniop/ops/causal_softmax/metax/causal_softmax_metax.h
+8
-0
src/infiniop/ops/causal_softmax/metax/causal_softmax_metax.maca
...finiop/ops/causal_softmax/metax/causal_softmax_metax.maca
+19
-4
src/infiniop/ops/causal_softmax/nvidia/causal_softmax_nvidia.cu
...finiop/ops/causal_softmax/nvidia/causal_softmax_nvidia.cu
+19
-4
src/infiniop/ops/causal_softmax/nvidia/causal_softmax_nvidia.cuh
...iniop/ops/causal_softmax/nvidia/causal_softmax_nvidia.cuh
+8
-0
src/infiniop/ops/causal_softmax/operator.cc
src/infiniop/ops/causal_softmax/operator.cc
+24
-44
src/infiniop/ops/clip/cpu/clip_cpu.h
src/infiniop/ops/clip/cpu/clip_cpu.h
+1
-1
src/infiniop/ops/clip/cuda/kernel.cuh
src/infiniop/ops/clip/cuda/kernel.cuh
+0
-0
src/infiniop/ops/clip/nvidia/clip_nvidia.cu
src/infiniop/ops/clip/nvidia/clip_nvidia.cu
+8
-8
src/infiniop/ops/clip/nvidia/clip_nvidia.cuh
src/infiniop/ops/clip/nvidia/clip_nvidia.cuh
+1
-2
src/infiniop/ops/clip/operator.cc
src/infiniop/ops/clip/operator.cc
+5
-5
No files found.
src/infiniop/devices/cuda/cuda_kernel_common.cuh
View file @
e4605f7c
...
@@ -4,6 +4,9 @@
...
@@ -4,6 +4,9 @@
#define INFINIOP_CUDA_KERNEL __global__ void
#define INFINIOP_CUDA_KERNEL __global__ void
#endif
#endif
#include <cuda_bf16.h>
#include <cuda_fp16.h>
// Possible maximum number of threads per block for CUDA architectures
// Possible maximum number of threads per block for CUDA architectures
// Used for picking correct kernel launch configuration
// Used for picking correct kernel launch configuration
#define CUDA_BLOCK_SIZE_4096 4096
#define CUDA_BLOCK_SIZE_4096 4096
...
@@ -12,8 +15,10 @@
...
@@ -12,8 +15,10 @@
#define CHECK_CUDA(API) CHECK_INTERNAL(API, cudaSuccess)
#define CHECK_CUDA(API) CHECK_INTERNAL(API, cudaSuccess)
namespace
device
::
cuda
{
using
cuda_bfloat16
=
nv_bfloat16
;
using
cuda_bfloat162
=
nv_bfloat162
;
namespace
device
::
cuda
{
// return the memory offset of original tensor, given the flattened index of broadcasted tensor
// return the memory offset of original tensor, given the flattened index of broadcasted tensor
__forceinline__
__device__
__host__
size_t
__forceinline__
__device__
__host__
size_t
indexToReducedOffset
(
indexToReducedOffset
(
...
@@ -45,8 +50,6 @@ indexToOffset(
...
@@ -45,8 +50,6 @@ indexToOffset(
}
}
}
// namespace device::cuda
}
// namespace device::cuda
#ifdef ENABLE_NVIDIA_API
#include <cuda_fp16.h>
__forceinline__
__device__
float
__forceinline__
__device__
float
exp_
(
const
float
val
)
{
exp_
(
const
float
val
)
{
return
expf
(
val
);
return
expf
(
val
);
...
@@ -73,4 +76,3 @@ __forceinline__ __device__ __nv_bfloat16
...
@@ -73,4 +76,3 @@ __forceinline__ __device__ __nv_bfloat16
exp_
(
const
__nv_bfloat16
x
)
{
exp_
(
const
__nv_bfloat16
x
)
{
return
hexp
(
x
);
return
hexp
(
x
);
}
}
#endif
src/infiniop/devices/maca/maca_kernel_common.h
View file @
e4605f7c
#define INFINIOP_MACA_KERNEL __global__ void
#define INFINIOP_MACA_KERNEL __global__ void
// Possible maximum number of threads per block for MACA architectures
// Possible maximum number of threads per block for MACA architectures
// Used for picking correct kernel launch configuration
// Used for picking correct kernel launch configuration
#define MACA_BLOCK_SIZE_1024 1024
#define MACA_BLOCK_SIZE_1024 1024
...
@@ -6,6 +7,9 @@
...
@@ -6,6 +7,9 @@
#define CHECK_MACA(API) CHECK_INTERNAL(API, hcSuccess)
#define CHECK_MACA(API) CHECK_INTERNAL(API, hcSuccess)
using
cuda_bfloat16
=
hpcc_bfloat16
;
using
cuda_bfloat162
=
hpcc_bfloat162
;
namespace
device
::
maca
{
namespace
device
::
maca
{
// return the memory offset of original tensor, given the flattened index of broadcasted tensor
// return the memory offset of original tensor, given the flattened index of broadcasted tensor
...
@@ -39,8 +43,6 @@ indexToOffset(
...
@@ -39,8 +43,6 @@ indexToOffset(
}
}
}
// namespace device::maca
}
// namespace device::maca
#ifdef ENABLE_MACA_API
#include <maca_fp16.h>
__forceinline__
__device__
float
__forceinline__
__device__
float
exp_
(
const
float
val
)
{
exp_
(
const
float
val
)
{
return
expf
(
val
);
return
expf
(
val
);
...
@@ -48,7 +50,7 @@ exp_(const float val) {
...
@@ -48,7 +50,7 @@ exp_(const float val) {
__forceinline__
__device__
long
double
__forceinline__
__device__
long
double
exp_
(
const
long
double
val
)
{
exp_
(
const
long
double
val
)
{
return
exp
l
(
val
);
return
exp
(
val
);
}
}
__forceinline__
__device__
double
__forceinline__
__device__
double
...
@@ -61,8 +63,7 @@ exp_(const __half x) {
...
@@ -61,8 +63,7 @@ exp_(const __half x) {
return
hexp
(
x
);
return
hexp
(
x
);
}
}
__forceinline__
__device__
__hpcc_bfloat16
;
__forceinline__
__device__
__hpcc_bfloat16
exp_
(
const
__hpcc_bfloat16
;
x
)
{
exp_
(
const
__hpcc_bfloat16
x
)
{
return
hexp
(
x
);
return
hexp
(
x
);
}
}
#endif
src/infiniop/elementwise/elementwise.h
View file @
e4605f7c
...
@@ -12,45 +12,45 @@
...
@@ -12,45 +12,45 @@
#include <numeric>
#include <numeric>
#include <vector>
#include <vector>
#define ELEMENTWISE_DESCRIPTOR(OP, NAMESPACE
)
\
#define ELEMENTWISE_DESCRIPTOR(OP, NAMESPACE
, KERNEL_COMMON)
\
\
\
namespace op::OP::NAMESPACE { \
namespace op::OP::NAMESPACE {
\
class Descriptor final : public InfiniopDescriptor { \
class Descriptor final : public InfiniopDescriptor {
\
infiniDtype_t _dtype; \
infiniDtype_t _dtype;
\
op::elementwise::ElementwiseInfo _info; \
op::elementwise::ElementwiseInfo _info;
\
std::unique_ptr<op::elementwise::
NAMESPACE
::DeviceImpl> _device_info; \
std::unique_ptr<op::elementwise::
KERNEL_COMMON
::DeviceImpl> _device_info; \
size_t _workspace_size; \
size_t _workspace_size;
\
\
\
Descriptor( \
Descriptor(
\
infiniDtype_t dtype, \
infiniDtype_t dtype,
\
op::elementwise::ElementwiseInfo info, \
op::elementwise::ElementwiseInfo info,
\
op::elementwise::
NAMESPACE
::DeviceImpl *device_info, \
op::elementwise::
KERNEL_COMMON
::DeviceImpl *device_info, \
size_t workspace_size, \
size_t workspace_size,
\
infiniDevice_t device_type, \
infiniDevice_t device_type,
\
int device_id) \
int device_id)
\
: InfiniopDescriptor{device_type, device_id}, \
: InfiniopDescriptor{device_type, device_id},
\
_dtype(dtype), \
_dtype(dtype),
\
_info(std::move(info)), \
_info(std::move(info)),
\
_device_info(std::move(device_info)), \
_device_info(std::move(device_info)),
\
_workspace_size(workspace_size) {} \
_workspace_size(workspace_size) {}
\
\
\
public: \
public:
\
~Descriptor(); \
~Descriptor();
\
\
\
size_t workspaceSize() const { return _workspace_size; } \
size_t workspaceSize() const { return _workspace_size; }
\
\
\
static infiniStatus_t create( \
static infiniStatus_t create(
\
infiniopHandle_t handle, \
infiniopHandle_t handle,
\
Descriptor **desc_ptr, \
Descriptor **desc_ptr,
\
infiniopTensorDescriptor_t output_desc, \
infiniopTensorDescriptor_t output_desc,
\
std::vector<infiniopTensorDescriptor_t> input_descs); \
std::vector<infiniopTensorDescriptor_t> input_descs);
\
\
\
infiniStatus_t calculate( \
infiniStatus_t calculate(
\
void *workspace, size_t workspace_size, \
void *workspace, size_t workspace_size,
\
void *output, \
void *output,
\
std::vector<const void *> inputs, \
std::vector<const void *> inputs,
\
void *stream) const; \
void *stream) const;
\
}; \
};
\
}
}
namespace
op
::
elementwise
{
namespace
op
::
elementwise
{
...
...
src/infiniop/ops/add/cpu/add_cpu.h
View file @
e4605f7c
...
@@ -3,7 +3,7 @@
...
@@ -3,7 +3,7 @@
#include "../../../elementwise/cpu/elementwise_cpu.h"
#include "../../../elementwise/cpu/elementwise_cpu.h"
ELEMENTWISE_DESCRIPTOR
(
add
,
cpu
)
ELEMENTWISE_DESCRIPTOR
(
add
,
cpu
,
cpu
)
namespace
op
::
add
::
cpu
{
namespace
op
::
add
::
cpu
{
typedef
struct
AddOp
{
typedef
struct
AddOp
{
...
...
src/infiniop/ops/add/cuda/
add_cuda_int
ern
a
l.cuh
→
src/infiniop/ops/add/cuda/
k
ern
e
l.cuh
View file @
e4605f7c
File moved
src/infiniop/ops/add/
cud
a/add_
cud
a.cu
→
src/infiniop/ops/add/
nvidi
a/add_
nvidi
a.cu
View file @
e4605f7c
#include "
add_cuda
.cuh"
#include "
../cuda/kernel
.cuh"
#include "add_
cuda_internal
.cuh"
#include "add_
nvidia
.cuh"
namespace
op
::
add
::
cud
a
{
namespace
op
::
add
::
nvidi
a
{
Descriptor
::~
Descriptor
()
=
default
;
Descriptor
::~
Descriptor
()
=
default
;
...
@@ -43,17 +43,17 @@ infiniStatus_t Descriptor::calculate(
...
@@ -43,17 +43,17 @@ infiniStatus_t Descriptor::calculate(
switch
(
_dtype
)
{
switch
(
_dtype
)
{
case
INFINI_DTYPE_F16
:
case
INFINI_DTYPE_F16
:
return
_device_info
->
calculate
<
256
,
AddOp
,
half
>
(
_info
,
workspace
,
output
,
inputs
,
stream
);
return
_device_info
->
calculate
<
256
,
cuda
::
AddOp
,
half
>
(
_info
,
workspace
,
output
,
inputs
,
stream
);
case
INFINI_DTYPE_BF16
:
case
INFINI_DTYPE_BF16
:
return
_device_info
->
calculate
<
256
,
AddOp
,
__nv_bfloat16
>
(
_info
,
workspace
,
output
,
inputs
,
stream
);
return
_device_info
->
calculate
<
256
,
cuda
::
AddOp
,
__nv_bfloat16
>
(
_info
,
workspace
,
output
,
inputs
,
stream
);
case
INFINI_DTYPE_F32
:
case
INFINI_DTYPE_F32
:
return
_device_info
->
calculate
<
256
,
AddOp
,
float
>
(
_info
,
workspace
,
output
,
inputs
,
stream
);
return
_device_info
->
calculate
<
256
,
cuda
::
AddOp
,
float
>
(
_info
,
workspace
,
output
,
inputs
,
stream
);
case
INFINI_DTYPE_F64
:
case
INFINI_DTYPE_F64
:
return
_device_info
->
calculate
<
256
,
AddOp
,
double
>
(
_info
,
workspace
,
output
,
inputs
,
stream
);
return
_device_info
->
calculate
<
256
,
cuda
::
AddOp
,
double
>
(
_info
,
workspace
,
output
,
inputs
,
stream
);
default:
default:
return
INFINI_STATUS_BAD_TENSOR_DTYPE
;
return
INFINI_STATUS_BAD_TENSOR_DTYPE
;
}
}
return
INFINI_STATUS_SUCCESS
;
return
INFINI_STATUS_SUCCESS
;
}
}
}
// namespace op::add::
cud
a
}
// namespace op::add::
nvidi
a
src/infiniop/ops/add/
cud
a/add_
cud
a.cuh
→
src/infiniop/ops/add/
nvidi
a/add_
nvidi
a.cuh
View file @
e4605f7c
...
@@ -3,6 +3,6 @@
...
@@ -3,6 +3,6 @@
#include "../../../elementwise/cuda/elementwise_cuda_api.cuh"
#include "../../../elementwise/cuda/elementwise_cuda_api.cuh"
ELEMENTWISE_DESCRIPTOR
(
add
,
cuda
)
ELEMENTWISE_DESCRIPTOR
(
add
,
nvidia
,
cuda
)
#endif // __ADD_CUDA_API_H__
#endif // __ADD_CUDA_API_H__
src/infiniop/ops/add/operator.cc
View file @
e4605f7c
...
@@ -6,7 +6,7 @@
...
@@ -6,7 +6,7 @@
#include "cpu/add_cpu.h"
#include "cpu/add_cpu.h"
#endif
#endif
#ifdef ENABLE_NVIDIA_API
#ifdef ENABLE_NVIDIA_API
#include "
cud
a/add_
cud
a.cuh"
#include "
nvidi
a/add_
nvidi
a.cuh"
#endif
#endif
__C
infiniStatus_t
infiniopCreateAddDescriptor
(
__C
infiniStatus_t
infiniopCreateAddDescriptor
(
...
@@ -31,7 +31,7 @@ __C infiniStatus_t infiniopCreateAddDescriptor(
...
@@ -31,7 +31,7 @@ __C infiniStatus_t infiniopCreateAddDescriptor(
CREATE
(
INFINI_DEVICE_CPU
,
cpu
);
CREATE
(
INFINI_DEVICE_CPU
,
cpu
);
#endif
#endif
#ifdef ENABLE_NVIDIA_API
#ifdef ENABLE_NVIDIA_API
CREATE
(
INFINI_DEVICE_NVIDIA
,
cud
a
);
CREATE
(
INFINI_DEVICE_NVIDIA
,
nvidi
a
);
#endif
#endif
default:
default:
...
@@ -46,14 +46,14 @@ __C infiniStatus_t infiniopGetAddWorkspaceSize(infiniopAddDescriptor_t desc, siz
...
@@ -46,14 +46,14 @@ __C infiniStatus_t infiniopGetAddWorkspaceSize(infiniopAddDescriptor_t desc, siz
#define GET(CASE, NAMESPACE) \
#define GET(CASE, NAMESPACE) \
case CASE: \
case CASE: \
*size = reinterpret_cast<op::add::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
*size = reinterpret_cast<op::add::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
return INFINI_STATUS_SUCCESS
;
return INFINI_STATUS_SUCCESS
switch
(
desc
->
device_type
)
{
switch
(
desc
->
device_type
)
{
#ifdef ENABLE_CPU_API
#ifdef ENABLE_CPU_API
GET
(
INFINI_DEVICE_CPU
,
cpu
)
GET
(
INFINI_DEVICE_CPU
,
cpu
)
;
#endif
#endif
#ifdef ENABLE_NVIDIA_API
#ifdef ENABLE_NVIDIA_API
GET
(
INFINI_DEVICE_NVIDIA
,
cud
a
)
GET
(
INFINI_DEVICE_NVIDIA
,
nvidi
a
)
;
#endif
#endif
default:
default:
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
...
@@ -83,7 +83,7 @@ __C infiniStatus_t infiniopAdd(
...
@@ -83,7 +83,7 @@ __C infiniStatus_t infiniopAdd(
CALCULATE
(
INFINI_DEVICE_CPU
,
cpu
);
CALCULATE
(
INFINI_DEVICE_CPU
,
cpu
);
#endif
#endif
#ifdef ENABLE_NVIDIA_API
#ifdef ENABLE_NVIDIA_API
CALCULATE
(
INFINI_DEVICE_NVIDIA
,
cud
a
);
CALCULATE
(
INFINI_DEVICE_NVIDIA
,
nvidi
a
);
#endif
#endif
default:
default:
...
@@ -99,7 +99,7 @@ infiniopDestroyAddDescriptor(infiniopAddDescriptor_t desc) {
...
@@ -99,7 +99,7 @@ infiniopDestroyAddDescriptor(infiniopAddDescriptor_t desc) {
#define DELETE(CASE, NAMESPACE) \
#define DELETE(CASE, NAMESPACE) \
case CASE: \
case CASE: \
delete reinterpret_cast<const op::add::NAMESPACE::Descriptor *>(desc); \
delete reinterpret_cast<const op::add::NAMESPACE::Descriptor *>(desc); \
return INFINI_STATUS_SUCCESS
;
return INFINI_STATUS_SUCCESS
switch
(
desc
->
device_type
)
{
switch
(
desc
->
device_type
)
{
...
@@ -107,7 +107,7 @@ infiniopDestroyAddDescriptor(infiniopAddDescriptor_t desc) {
...
@@ -107,7 +107,7 @@ infiniopDestroyAddDescriptor(infiniopAddDescriptor_t desc) {
DELETE
(
INFINI_DEVICE_CPU
,
cpu
);
DELETE
(
INFINI_DEVICE_CPU
,
cpu
);
#endif
#endif
#ifdef ENABLE_NVIDIA_API
#ifdef ENABLE_NVIDIA_API
DELETE
(
INFINI_DEVICE_NVIDIA
,
cud
a
);
DELETE
(
INFINI_DEVICE_NVIDIA
,
nvidi
a
);
#endif
#endif
default:
default:
...
...
src/infiniop/ops/causal_softmax/cuda/
causal_softmax_
kernel.cuh
→
src/infiniop/ops/causal_softmax/cuda/kernel.cuh
View file @
e4605f7c
#ifndef __CAUSAL_SOFTMAX_KERNEL_CUH__
#
ifndef
__CAUSAL_SOFTMAX_KERNEL_CUH__
#define __CAUSAL_SOFTMAX_KERNEL_CUH__
#define __CAUSAL_SOFTMAX_KERNEL_CUH__
#include "../../../devices/cuda/cuda_kernel_common.cuh"
#include "../../../reduce/cuda/reduce.cuh"
template
<
unsigned
int
BLOCK_SIZE
,
typename
Tdata
,
typename
Tcompute
>
template
<
unsigned
int
BLOCK_SIZE
,
typename
Tdata
,
typename
Tcompute
>
INFINIOP_CUDA_KERNEL
causalSoftmax
(
__device__
void
causalSoftmax
Kernel
(
Tdata
*
y_
,
const
Tdata
*
x_
,
Tdata
*
y_
,
const
Tdata
*
x_
,
size_t
batch
,
size_t
height
,
size_t
width
,
size_t
batch
,
size_t
height
,
size_t
width
,
ptrdiff_t
y_stride_b
,
ptrdiff_t
y_stride_h
,
ptrdiff_t
y_stride_b
,
ptrdiff_t
y_stride_h
,
...
@@ -32,11 +29,11 @@ INFINIOP_CUDA_KERNEL causalSoftmax(
...
@@ -32,11 +29,11 @@ INFINIOP_CUDA_KERNEL causalSoftmax(
// 2 | * * * ... * * * |
// 2 | * * * ... * * * |
// height: 3 col_id->
// height: 3 col_id->
if
(
width
+
blockIdx
.
x
>=
threadIdx
.
x
+
height
)
{
if
(
width
+
blockIdx
.
x
>=
threadIdx
.
x
+
height
)
{
#ifdef ENABLE_NVIDIA_API
if
constexpr
(
std
::
is_same_v
<
Tdata
,
half
>
||
std
::
is_same_v
<
Tdata
,
cuda_bfloat16
>
)
{
y
[
col
]
=
exp
_
(
x
[
col
]
-
max_
);
y
[
col
]
=
h
exp
(
x
[
col
]
-
max_
);
#
else
}
else
{
y
[
col
]
=
exp
(
x
[
col
]
-
max_
);
y
[
col
]
=
exp
(
x
[
col
]
-
max_
);
#endif
}
}
else
{
}
else
{
y
[
col
]
=
Tdata
(
0
);
y
[
col
]
=
Tdata
(
0
);
}
}
...
...
src/infiniop/ops/causal_softmax/maca/causal_softmax_kernel.h
deleted
100644 → 0
View file @
5025ebed
#ifndef __CAUSAL_SOFTMAX_KERNEL_H__
#define __CAUSAL_SOFTMAX_KERNEL_H__
#include "../../../devices/maca/maca_kernel_common.h"
#include "../../../reduce/maca/reduce.h"
// Causal softmax of one row of `x_`, written to the matching row of `y_`.
//
// Indexing used by this kernel:
//   - blockIdx.x selects the row, blockIdx.y selects the batch entry;
//   - each thread starts at column threadIdx.x and advances by BLOCK_SIZE.
//     NOTE(review): this assumes the launch uses blockDim.x == BLOCK_SIZE —
//     confirm at the launch site.
//
// For row r, only the first (width - height + 1 + r) columns take part in the
// softmax; every later column of that row is written as zero.
//
// Template parameters:
//   BLOCK_SIZE  column stride of the per-thread loops (also forwarded to the
//               block reductions).
//   Tdata       element type of x_ / y_.
//   Tcompute    accumulator type used for the row sum.
//
// Parameters:
//   y_, x_                  base pointers of output / input.
//   batch                   not read by the kernel body.
//   height, width           number of rows / columns of one batch entry.
//   y_stride_b, y_stride_h  element strides of y_ per batch entry / per row.
//   x_stride_b, x_stride_h  element strides of x_ per batch entry / per row.
template <unsigned int BLOCK_SIZE, typename Tdata, typename Tcompute>
INFINIOP_MACA_KERNEL causalSoftmax(
    Tdata *y_, const Tdata *x_,
    size_t batch, size_t height, size_t width,
    ptrdiff_t y_stride_b, ptrdiff_t y_stride_h,
    ptrdiff_t x_stride_b, ptrdiff_t x_stride_h) {

    Tdata *y = y_                       // threadIdx.x for col_id
             + blockIdx.y * y_stride_b  // gridDim.y for batch_id
             + blockIdx.x * y_stride_h; // gridDim.x for row_id
    const Tdata *x = x_ + blockIdx.y * x_stride_b + blockIdx.x * x_stride_h;

    // [Reduce] Find max value in each row and store in shared memory.
    // Only the unmasked prefix of the row is passed to the reduction.
    __shared__ Tdata max_;
    Tdata max_0 = op::common_maca::reduce_op::max<BLOCK_SIZE, Tdata>(x, width - height + 1 + blockIdx.x);
    if (threadIdx.x == 0) {
        // Thread 0 publishes the reduction result to the whole block.
        max_ = max_0;
    }
    __syncthreads();

    // [Elementwise] Subtract max value from each element and apply causal mask
    for (size_t col = threadIdx.x; col < width; col += BLOCK_SIZE) {
        //   row_id ↓ |<-     width   ->|
        //          0 | * * * ... *     |
        //          1 | * * * ... * *   |
        //          2 | * * * ... * * * |
        //  height: 3  col_id->
        //
        // The mask must be evaluated on the column actually being written.
        // Using threadIdx.x here is only equivalent while width <= BLOCK_SIZE;
        // on later stride iterations col != threadIdx.x and masked columns
        // would receive exp() values instead of zero.
        if (width + blockIdx.x >= col + height) {
#ifdef ENABLE_MACA_API
            y[col] = exp_(x[col] - max_);
#else
            y[col] = exp(x[col] - max_);
#endif
        } else {
            y[col] = Tdata(0);
        }
    }
    __syncthreads();

    // [Reduce] Find the sum of each updated row and store in shared memory
    __shared__ Tcompute sum_;
    Tcompute sum_0 = op::common_maca::reduce_op::sum<BLOCK_SIZE, Tdata, Tcompute>(y, width);
    if (threadIdx.x == 0) {
        sum_ = sum_0;
    }
    __syncthreads();

    // [Elementwise] Divide each element by the row sum, in place in y
    for (size_t col = threadIdx.x; col < width; col += BLOCK_SIZE) {
        y[col] /= Tdata(sum_);
    }
}
#endif // __CAUSAL_SOFTMAX_KERNEL_H__
src/infiniop/ops/causal_softmax/m
aca
/causal_softmax_m
aca
.h
→
src/infiniop/ops/causal_softmax/m
etax
/causal_softmax_m
etax
.h
View file @
e4605f7c
#ifndef __CAUSAL_SOFTMAX_M
ACA
_H__
#ifndef __CAUSAL_SOFTMAX_M
ETAX
_H__
#define __CAUSAL_SOFTMAX_M
ACA
_H__
#define __CAUSAL_SOFTMAX_M
ETAX
_H__
#include "../causal_softmax.h"
#include "../causal_softmax.h"
DESCRIPTOR
(
m
aca
)
DESCRIPTOR
(
m
etax
)
#endif
#endif
src/infiniop/ops/causal_softmax/m
aca
/causal_softmax_m
aca
.maca
→
src/infiniop/ops/causal_softmax/m
etax
/causal_softmax_m
etax
.maca
View file @
e4605f7c
#include "../../../devices/maca/common_maca.h"
#include "../../../devices/maca/common_maca.h"
#include "causal_softmax_kernel.h"
#include "causal_softmax_metax.h"
#include "causal_softmax_maca.h"
namespace op::causal_softmax::maca {
#include <hccub/block/block_reduce.cuh>
#include "../../../devices/maca/maca_kernel_common.h"
#include "../../../reduce/cuda/reduce.cuh"
#include "../cuda/kernel.cuh"
// Metax kernel entry point for causal softmax.
// INFINIOP_MACA_KERNEL makes this the __global__ function; it forwards every
// argument, unchanged and in the same order, to the device function
// causalSoftmaxKernel with the same template arguments.
template <unsigned int BLOCK_SIZE, typename Tdata, typename Tcompute>
INFINIOP_MACA_KERNEL causalSoftmax(
    Tdata *y,
    const Tdata *x,
    size_t batch,
    size_t height,
    size_t width,
    ptrdiff_t y_stride_b,
    ptrdiff_t y_stride_h,
    ptrdiff_t x_stride_b,
    ptrdiff_t x_stride_h) {
    causalSoftmaxKernel<BLOCK_SIZE, Tdata, Tcompute>(
        y, x,
        batch, height, width,
        y_stride_b, y_stride_h,
        x_stride_b, x_stride_h);
}
namespace op::causal_softmax::metax {
struct Descriptor::Opaque {
struct Descriptor::Opaque {
std::shared_ptr<device::maca::Handle::Internal> internal;
std::shared_ptr<device::maca::Handle::Internal> internal;
...
@@ -75,4 +90,4 @@ infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size,
...
@@ -75,4 +90,4 @@ infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size,
return INFINI_STATUS_SUCCESS;
return INFINI_STATUS_SUCCESS;
}
}
} // namespace op::causal_softmax::m
aca
} // namespace op::causal_softmax::m
etax
src/infiniop/ops/causal_softmax/
cud
a/causal_softmax_
cud
a.cu
→
src/infiniop/ops/causal_softmax/
nvidi
a/causal_softmax_
nvidi
a.cu
View file @
e4605f7c
#include "../../../devices/cuda/cuda_common.cuh"
#include "../../../devices/cuda/cuda_common.cuh"
#include "causal_softmax_cuda.cuh"
#include "causal_softmax_nvidia.cuh"
#include "causal_softmax_kernel.cuh"
namespace
op
::
causal_softmax
::
cuda
{
#include "../../../devices/cuda/cuda_kernel_common.cuh"
#include <cub/block/block_reduce.cuh>
#include "../../../reduce/cuda/reduce.cuh"
#include "../cuda/kernel.cuh"
// NVIDIA kernel entry point for causal softmax.
// INFINIOP_CUDA_KERNEL makes this the __global__ function; it forwards every
// argument, unchanged and in the same order, to the device function
// causalSoftmaxKernel with the same template arguments.
template <unsigned int BLOCK_SIZE, typename Tdata, typename Tcompute>
INFINIOP_CUDA_KERNEL causalSoftmax(
    Tdata *y, const Tdata *x,
    size_t batch, size_t height, size_t width,
    ptrdiff_t y_stride_b, ptrdiff_t y_stride_h,
    ptrdiff_t x_stride_b, ptrdiff_t x_stride_h) {
    causalSoftmaxKernel<BLOCK_SIZE, Tdata, Tcompute>(
        y, x,
        batch, height, width,
        y_stride_b, y_stride_h,
        x_stride_b, x_stride_h);
}
namespace
op
::
causal_softmax
::
nvidia
{
struct
Descriptor
::
Opaque
{
struct
Descriptor
::
Opaque
{
std
::
shared_ptr
<
device
::
cuda
::
Handle
::
Internal
>
internal
;
std
::
shared_ptr
<
device
::
cuda
::
Handle
::
Internal
>
internal
;
...
@@ -79,4 +94,4 @@ infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size,
...
@@ -79,4 +94,4 @@ infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size,
return
INFINI_STATUS_SUCCESS
;
return
INFINI_STATUS_SUCCESS
;
}
}
}
// namespace op::causal_softmax::
cud
a
}
// namespace op::causal_softmax::
nvidi
a
src/infiniop/ops/causal_softmax/
cud
a/causal_softmax_
cud
a.cuh
→
src/infiniop/ops/causal_softmax/
nvidi
a/causal_softmax_
nvidi
a.cuh
View file @
e4605f7c
#ifndef __CAUSAL_SOFTMAX_
CUD
A_H__
#ifndef __CAUSAL_SOFTMAX_
NVIDI
A_H__
#define __CAUSAL_SOFTMAX_
CUD
A_H__
#define __CAUSAL_SOFTMAX_
NVIDI
A_H__
#include "../causal_softmax.h"
#include "../causal_softmax.h"
DESCRIPTOR
(
cud
a
)
DESCRIPTOR
(
nvidi
a
)
#endif
#endif
src/infiniop/ops/causal_softmax/operator.cc
View file @
e4605f7c
...
@@ -6,10 +6,10 @@
...
@@ -6,10 +6,10 @@
#include "cpu/causal_softmax_cpu.h"
#include "cpu/causal_softmax_cpu.h"
#endif
#endif
#ifdef ENABLE_NVIDIA_API
#ifdef ENABLE_NVIDIA_API
#include "
cud
a/causal_softmax_
cud
a.cuh"
#include "
nvidi
a/causal_softmax_
nvidi
a.cuh"
#endif
#endif
#ifdef ENABLE_METAX_API
#ifdef ENABLE_METAX_API
#include "m
aca
/causal_softmax_m
aca
.h"
#include "m
etax
/causal_softmax_m
etax
.h"
#endif
#endif
#ifdef ENABLE_ASCEND_API
#ifdef ENABLE_ASCEND_API
#include "ascend/causal_softmax_ascend.h"
#include "ascend/causal_softmax_ascend.h"
...
@@ -34,10 +34,13 @@ __C infiniStatus_t infiniopCreateCausalSoftmaxDescriptor(
...
@@ -34,10 +34,13 @@ __C infiniStatus_t infiniopCreateCausalSoftmaxDescriptor(
CREATE
(
INFINI_DEVICE_CPU
,
cpu
)
CREATE
(
INFINI_DEVICE_CPU
,
cpu
)
#endif
#endif
#ifdef ENABLE_NVIDIA_API
#ifdef ENABLE_NVIDIA_API
CREATE
(
INFINI_DEVICE_NVIDIA
,
cud
a
)
CREATE
(
INFINI_DEVICE_NVIDIA
,
nvidi
a
)
#endif
#endif
#ifdef ENABLE_METAX_API
#ifdef ENABLE_METAX_API
CREATE
(
INFINI_DEVICE_METAX
,
maca
)
CREATE
(
INFINI_DEVICE_METAX
,
metax
)
#endif
#ifdef ENABLE_ASCEND_API
CREATE
(
INFINI_DEVICE_ASCEND
,
ascend
)
#endif
#endif
#ifdef ENABLE_CAMBRICON_MLU
#ifdef ENABLE_CAMBRICON_MLU
case
DevCambriconMlu
:
{
case
DevCambriconMlu
:
{
...
@@ -45,14 +48,6 @@ __C infiniStatus_t infiniopCreateCausalSoftmaxDescriptor(
...
@@ -45,14 +48,6 @@ __C infiniStatus_t infiniopCreateCausalSoftmaxDescriptor(
// return cnnlCreateCausalSoftmaxDescriptor((BangHandle_t) handle, (CausalSoftmaxCnnlDescriptor_t *) desc_ptr, y_desc);
// return cnnlCreateCausalSoftmaxDescriptor((BangHandle_t) handle, (CausalSoftmaxCnnlDescriptor_t *) desc_ptr, y_desc);
}
}
#endif
#endif
#ifdef ENABLE_ASCEND_API
CREATE
(
INFINI_DEVICE_ASCEND
,
ascend
)
#endif
#ifdef ENABLE_METAX_GPU
case
DevMetaxGpu
:
{
return
macaCreateCausalSoftmaxDescriptor
((
MacaHandle_t
)
handle
,
(
CausalSoftmaxMacaDescriptor_t
*
)
desc_ptr
,
y_desc
);
}
#endif
#ifdef ENABLE_MTHREADS_GPU
#ifdef ENABLE_MTHREADS_GPU
case
DevMthreadsGpu
:
{
case
DevMthreadsGpu
:
{
return
musaCreateCausalSoftmaxDescriptor
((
MusaHandle_t
)
handle
,
(
CausalSoftmaxMusaDescriptor_t
*
)
desc_ptr
,
y_desc
);
return
musaCreateCausalSoftmaxDescriptor
((
MusaHandle_t
)
handle
,
(
CausalSoftmaxMusaDescriptor_t
*
)
desc_ptr
,
y_desc
);
...
@@ -74,7 +69,13 @@ __C infiniStatus_t infiniopGetCausalSoftmaxWorkspaceSize(infiniopCausalSoftmaxDe
...
@@ -74,7 +69,13 @@ __C infiniStatus_t infiniopGetCausalSoftmaxWorkspaceSize(infiniopCausalSoftmaxDe
GET
(
INFINI_DEVICE_CPU
,
cpu
)
GET
(
INFINI_DEVICE_CPU
,
cpu
)
#endif
#endif
#ifdef ENABLE_NVIDIA_API
#ifdef ENABLE_NVIDIA_API
GET
(
INFINI_DEVICE_NVIDIA
,
cuda
)
GET
(
INFINI_DEVICE_NVIDIA
,
nvidia
)
#endif
#ifdef ENABLE_METAX_API
GET
(
INFINI_DEVICE_METAX
,
metax
)
#endif
#ifdef ENABLE_ASCEND_API
GET
(
INFINI_DEVICE_ASCEND
,
ascend
)
#endif
#endif
#ifdef ENABLE_CAMBRICON_MLU
#ifdef ENABLE_CAMBRICON_MLU
case
DevCambriconMlu
:
{
case
DevCambriconMlu
:
{
...
@@ -83,17 +84,6 @@ __C infiniStatus_t infiniopGetCausalSoftmaxWorkspaceSize(infiniopCausalSoftmaxDe
...
@@ -83,17 +84,6 @@ __C infiniStatus_t infiniopGetCausalSoftmaxWorkspaceSize(infiniopCausalSoftmaxDe
}
}
#endif
#endif
#ifdef ENABLE_ASCEND_API
GET
(
INFINI_DEVICE_ASCEND
,
ascend
)
#endif
#ifdef ENABLE_METAX_API
GET
(
INFINI_DEVICE_METAX
,
maca
)
#endif
#ifdef ENABLE_METAX_GPU
case
DevMetaxGpu
:
{
return
macaGetCausalSoftmaxWorkspaceSize
((
CausalSoftmaxMacaDescriptor_t
)
desc
,
size
);
}
#endif
#ifdef ENABLE_MTHREADS_GPU
#ifdef ENABLE_MTHREADS_GPU
case
DevMthreadsGpu
:
{
case
DevMthreadsGpu
:
{
return
musaGetCausalSoftmaxWorkspaceSize
((
CausalSoftmaxMusaDescriptor_t
)
desc
,
size
);
return
musaGetCausalSoftmaxWorkspaceSize
((
CausalSoftmaxMusaDescriptor_t
)
desc
,
size
);
...
@@ -120,10 +110,13 @@ __C infiniStatus_t infiniopCausalSoftmax(
...
@@ -120,10 +110,13 @@ __C infiniStatus_t infiniopCausalSoftmax(
CALCULATE
(
INFINI_DEVICE_CPU
,
cpu
)
CALCULATE
(
INFINI_DEVICE_CPU
,
cpu
)
#endif
#endif
#ifdef ENABLE_NVIDIA_API
#ifdef ENABLE_NVIDIA_API
CALCULATE
(
INFINI_DEVICE_NVIDIA
,
cud
a
)
CALCULATE
(
INFINI_DEVICE_NVIDIA
,
nvidi
a
)
#endif
#endif
#ifdef ENABLE_METAX_API
#ifdef ENABLE_METAX_API
CALCULATE
(
INFINI_DEVICE_METAX
,
maca
)
CALCULATE
(
INFINI_DEVICE_METAX
,
metax
)
#endif
#ifdef ENABLE_ASCEND_API
CALCULATE
(
INFINI_DEVICE_ASCEND
,
ascend
)
#endif
#endif
#ifdef ENABLE_CAMBRICON_MLU
#ifdef ENABLE_CAMBRICON_MLU
case
DevCambriconMlu
:
{
case
DevCambriconMlu
:
{
...
@@ -131,14 +124,6 @@ __C infiniStatus_t infiniopCausalSoftmax(
...
@@ -131,14 +124,6 @@ __C infiniStatus_t infiniopCausalSoftmax(
// return cnnlCausalSoftmax((CausalSoftmaxCnnlDescriptor_t) desc, workspace, workspace_size, data, stream);
// return cnnlCausalSoftmax((CausalSoftmaxCnnlDescriptor_t) desc, workspace, workspace_size, data, stream);
}
}
#endif
#endif
#ifdef ENABLE_ASCEND_API
CALCULATE
(
INFINI_DEVICE_ASCEND
,
ascend
)
#endif
#ifdef ENABLE_METAX_GPU
case
DevMetaxGpu
:
{
return
macaCausalSoftmax
((
CausalSoftmaxMacaDescriptor_t
)
desc
,
workspace
,
workspace_size
,
data
,
stream
);
}
#endif
#ifdef ENABLE_MTHREADS_GPU
#ifdef ENABLE_MTHREADS_GPU
case
DevMthreadsGpu
:
{
case
DevMthreadsGpu
:
{
return
musaCausalSoftmax
((
CausalSoftmaxMusaDescriptor_t
)
desc
,
workspace
,
workspace_size
,
data
,
stream
);
return
musaCausalSoftmax
((
CausalSoftmaxMusaDescriptor_t
)
desc
,
workspace
,
workspace_size
,
data
,
stream
);
...
@@ -160,10 +145,13 @@ __C infiniStatus_t infiniopDestroyCausalSoftmaxDescriptor(infiniopCausalSoftmaxD
...
@@ -160,10 +145,13 @@ __C infiniStatus_t infiniopDestroyCausalSoftmaxDescriptor(infiniopCausalSoftmaxD
DESTROY
(
INFINI_DEVICE_CPU
,
cpu
)
DESTROY
(
INFINI_DEVICE_CPU
,
cpu
)
#endif
#endif
#ifdef ENABLE_NVIDIA_API
#ifdef ENABLE_NVIDIA_API
DESTROY
(
INFINI_DEVICE_NVIDIA
,
cud
a
)
DESTROY
(
INFINI_DEVICE_NVIDIA
,
nvidi
a
)
#endif
#endif
#ifdef ENABLE_METAX_API
#ifdef ENABLE_METAX_API
DESTROY
(
INFINI_DEVICE_METAX
,
maca
)
DESTROY
(
INFINI_DEVICE_METAX
,
metax
)
#endif
#ifdef ENABLE_ASCEND_API
DESTROY
(
INFINI_DEVICE_ASCEND
,
ascend
)
#endif
#endif
#ifdef ENABLE_CAMBRICON_MLU
#ifdef ENABLE_CAMBRICON_MLU
case
DevCambriconMlu
:
{
case
DevCambriconMlu
:
{
...
@@ -171,14 +159,6 @@ __C infiniStatus_t infiniopDestroyCausalSoftmaxDescriptor(infiniopCausalSoftmaxD
...
@@ -171,14 +159,6 @@ __C infiniStatus_t infiniopDestroyCausalSoftmaxDescriptor(infiniopCausalSoftmaxD
// return cnnlDestroyCausalSoftmaxDescriptor((CausalSoftmaxCnnlDescriptor_t) desc);
// return cnnlDestroyCausalSoftmaxDescriptor((CausalSoftmaxCnnlDescriptor_t) desc);
}
}
#endif
#endif
#ifdef ENABLE_ASCEND_API
DESTROY
(
INFINI_DEVICE_ASCEND
,
ascend
)
#endif
#ifdef ENABLE_METAX_GPU
case
DevMetaxGpu
:
{
return
macaDestroyCausalSoftmaxDescriptor
((
CausalSoftmaxMacaDescriptor_t
)
desc
);
}
#endif
#ifdef ENABLE_MTHREADS_GPU
#ifdef ENABLE_MTHREADS_GPU
case
DevMthreadsGpu
:
case
DevMthreadsGpu
:
return
musaDestroyCausalSoftmaxDescriptor
((
CausalSoftmaxMusaDescriptor_t
)
desc
);
return
musaDestroyCausalSoftmaxDescriptor
((
CausalSoftmaxMusaDescriptor_t
)
desc
);
...
...
src/infiniop/ops/clip/cpu/clip_cpu.h
View file @
e4605f7c
...
@@ -4,7 +4,7 @@
...
@@ -4,7 +4,7 @@
#include "../../../elementwise/cpu/elementwise_cpu.h"
#include "../../../elementwise/cpu/elementwise_cpu.h"
#include "infiniop/ops/clip.h"
#include "infiniop/ops/clip.h"
ELEMENTWISE_DESCRIPTOR
(
clip
,
cpu
)
ELEMENTWISE_DESCRIPTOR
(
clip
,
cpu
,
cpu
)
namespace
op
::
clip
::
cpu
{
namespace
op
::
clip
::
cpu
{
...
...
src/infiniop/ops/clip/cuda/
clip_cuda_int
ern
a
l.cuh
→
src/infiniop/ops/clip/cuda/
k
ern
e
l.cuh
View file @
e4605f7c
File moved
src/infiniop/ops/clip/
cud
a/clip_
cud
a.cu
→
src/infiniop/ops/clip/
nvidi
a/clip_
nvidi
a.cu
View file @
e4605f7c
#include "
clip_cuda
.cuh"
#include "
../cuda/kernel
.cuh"
#include "clip_
cuda_internal
.cuh"
#include "clip_
nvidia
.cuh"
namespace
op
::
clip
::
cud
a
{
namespace
op
::
clip
::
nvidi
a
{
Descriptor
::~
Descriptor
()
=
default
;
Descriptor
::~
Descriptor
()
=
default
;
...
@@ -45,17 +45,17 @@ infiniStatus_t Descriptor::calculate(
...
@@ -45,17 +45,17 @@ infiniStatus_t Descriptor::calculate(
switch
(
_dtype
)
{
switch
(
_dtype
)
{
case
INFINI_DTYPE_F16
:
case
INFINI_DTYPE_F16
:
return
_device_info
->
calculate
<
256
,
ClipOp
,
half
>
(
_info
,
workspace
,
output
,
inputs
,
stream
);
return
_device_info
->
calculate
<
256
,
cuda
::
ClipOp
,
half
>
(
_info
,
workspace
,
output
,
inputs
,
stream
);
case
INFINI_DTYPE_F32
:
case
INFINI_DTYPE_F32
:
return
_device_info
->
calculate
<
256
,
ClipOp
,
float
>
(
_info
,
workspace
,
output
,
inputs
,
stream
);
return
_device_info
->
calculate
<
256
,
cuda
::
ClipOp
,
float
>
(
_info
,
workspace
,
output
,
inputs
,
stream
);
case
INFINI_DTYPE_F64
:
case
INFINI_DTYPE_F64
:
return
_device_info
->
calculate
<
256
,
ClipOp
,
double
>
(
_info
,
workspace
,
output
,
inputs
,
stream
);
return
_device_info
->
calculate
<
256
,
cuda
::
ClipOp
,
double
>
(
_info
,
workspace
,
output
,
inputs
,
stream
);
case
INFINI_DTYPE_BF16
:
case
INFINI_DTYPE_BF16
:
return
_device_info
->
calculate
<
256
,
ClipOp
,
__nv_bfloat16
>
(
_info
,
workspace
,
output
,
inputs
,
stream
);
return
_device_info
->
calculate
<
256
,
cuda
::
ClipOp
,
__nv_bfloat16
>
(
_info
,
workspace
,
output
,
inputs
,
stream
);
default:
default:
return
INFINI_STATUS_BAD_TENSOR_DTYPE
;
return
INFINI_STATUS_BAD_TENSOR_DTYPE
;
}
}
return
INFINI_STATUS_SUCCESS
;
return
INFINI_STATUS_SUCCESS
;
}
}
}
// namespace op::clip::
cud
a
}
// namespace op::clip::
nvidi
a
src/infiniop/ops/clip/
cud
a/clip_
cud
a.cuh
→
src/infiniop/ops/clip/
nvidi
a/clip_
nvidi
a.cuh
View file @
e4605f7c
...
@@ -2,8 +2,7 @@
...
@@ -2,8 +2,7 @@
#define __CLIP_CUDA_API_H__
#define __CLIP_CUDA_API_H__
#include "../../../elementwise/cuda/elementwise_cuda_api.cuh"
#include "../../../elementwise/cuda/elementwise_cuda_api.cuh"
#include "infiniop/ops/clip.h"
ELEMENTWISE_DESCRIPTOR
(
clip
,
cuda
)
ELEMENTWISE_DESCRIPTOR
(
clip
,
nvidia
,
cuda
)
#endif // __CLIP_CUDA_API_H__
#endif // __CLIP_CUDA_API_H__
src/infiniop/ops/clip/operator.cc
View file @
e4605f7c
...
@@ -6,7 +6,7 @@
...
@@ -6,7 +6,7 @@
#include "cpu/clip_cpu.h"
#include "cpu/clip_cpu.h"
#endif
#endif
#ifdef ENABLE_NVIDIA_API
#ifdef ENABLE_NVIDIA_API
#include "
cud
a/clip_
cud
a.cuh"
#include "
nvidi
a/clip_
nvidi
a.cuh"
#endif
#endif
__C
infiniStatus_t
infiniopCreateClipDescriptor
(
__C
infiniStatus_t
infiniopCreateClipDescriptor
(
...
@@ -31,7 +31,7 @@ __C infiniStatus_t infiniopCreateClipDescriptor(
...
@@ -31,7 +31,7 @@ __C infiniStatus_t infiniopCreateClipDescriptor(
CREATE
(
INFINI_DEVICE_CPU
,
cpu
);
CREATE
(
INFINI_DEVICE_CPU
,
cpu
);
#endif
#endif
#ifdef ENABLE_NVIDIA_API
#ifdef ENABLE_NVIDIA_API
CREATE
(
INFINI_DEVICE_NVIDIA
,
cud
a
);
CREATE
(
INFINI_DEVICE_NVIDIA
,
nvidi
a
);
#endif
#endif
default:
default:
...
@@ -53,7 +53,7 @@ __C infiniStatus_t infiniopGetClipWorkspaceSize(infiniopClipDescriptor_t desc, s
...
@@ -53,7 +53,7 @@ __C infiniStatus_t infiniopGetClipWorkspaceSize(infiniopClipDescriptor_t desc, s
GET
(
INFINI_DEVICE_CPU
,
cpu
)
GET
(
INFINI_DEVICE_CPU
,
cpu
)
#endif
#endif
#ifdef ENABLE_NVIDIA_API
#ifdef ENABLE_NVIDIA_API
GET
(
INFINI_DEVICE_NVIDIA
,
cud
a
)
GET
(
INFINI_DEVICE_NVIDIA
,
nvidi
a
)
#endif
#endif
}
}
...
@@ -83,7 +83,7 @@ __C infiniStatus_t infiniopClip(
...
@@ -83,7 +83,7 @@ __C infiniStatus_t infiniopClip(
CALCULATE
(
INFINI_DEVICE_CPU
,
cpu
);
CALCULATE
(
INFINI_DEVICE_CPU
,
cpu
);
#endif
#endif
#ifdef ENABLE_NVIDIA_API
#ifdef ENABLE_NVIDIA_API
CALCULATE
(
INFINI_DEVICE_NVIDIA
,
cud
a
);
CALCULATE
(
INFINI_DEVICE_NVIDIA
,
nvidi
a
);
#endif
#endif
default:
default:
...
@@ -107,7 +107,7 @@ infiniopDestroyClipDescriptor(infiniopClipDescriptor_t desc) {
...
@@ -107,7 +107,7 @@ infiniopDestroyClipDescriptor(infiniopClipDescriptor_t desc) {
DELETE
(
INFINI_DEVICE_CPU
,
cpu
);
DELETE
(
INFINI_DEVICE_CPU
,
cpu
);
#endif
#endif
#ifdef ENABLE_NVIDIA_API
#ifdef ENABLE_NVIDIA_API
DELETE
(
INFINI_DEVICE_NVIDIA
,
cud
a
);
DELETE
(
INFINI_DEVICE_NVIDIA
,
nvidi
a
);
#endif
#endif
default:
default:
...
...
Prev
1
2
3
4
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment