Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
jerrrrry
infinicore
Commits
0166515c
"qa/L0_tensorflow_lint/pylintrc" did not exist on "0963b288168bae18df8360000b835cd601b4de33"
Unverified
Commit
0166515c
authored
Aug 07, 2025
by
PanZezhong1725
Committed by
GitHub
Aug 07, 2025
Browse files
Merge branch 'main' into issue/300
parents
f0300ff3
a23c4d13
Changes
175
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
441 additions
and
186 deletions
+441
-186
src/infiniop/ops/add/metax/add_metax.h
src/infiniop/ops/add/metax/add_metax.h
+8
-0
src/infiniop/ops/add/metax/add_metax.maca
src/infiniop/ops/add/metax/add_metax.maca
+62
-0
src/infiniop/ops/add/nvidia/add_nvidia.cu
src/infiniop/ops/add/nvidia/add_nvidia.cu
+13
-9
src/infiniop/ops/add/nvidia/add_nvidia.cuh
src/infiniop/ops/add/nvidia/add_nvidia.cuh
+8
-0
src/infiniop/ops/add/operator.cc
src/infiniop/ops/add/operator.cc
+40
-13
src/infiniop/ops/causal_softmax/cuda/kernel.cuh
src/infiniop/ops/causal_softmax/cuda/kernel.cuh
+7
-10
src/infiniop/ops/causal_softmax/maca/causal_softmax_kernel.h
src/infiniop/ops/causal_softmax/maca/causal_softmax_kernel.h
+0
-60
src/infiniop/ops/causal_softmax/metax/causal_softmax_metax.h
src/infiniop/ops/causal_softmax/metax/causal_softmax_metax.h
+8
-0
src/infiniop/ops/causal_softmax/metax/causal_softmax_metax.maca
...finiop/ops/causal_softmax/metax/causal_softmax_metax.maca
+32
-11
src/infiniop/ops/causal_softmax/nvidia/causal_softmax_nvidia.cu
...finiop/ops/causal_softmax/nvidia/causal_softmax_nvidia.cu
+22
-7
src/infiniop/ops/causal_softmax/nvidia/causal_softmax_nvidia.cuh
...iniop/ops/causal_softmax/nvidia/causal_softmax_nvidia.cuh
+8
-0
src/infiniop/ops/causal_softmax/operator.cc
src/infiniop/ops/causal_softmax/operator.cc
+41
-49
src/infiniop/ops/clip/cpu/clip_cpu.cc
src/infiniop/ops/clip/cpu/clip_cpu.cc
+3
-1
src/infiniop/ops/clip/cuda/kernel.cuh
src/infiniop/ops/clip/cuda/kernel.cuh
+4
-7
src/infiniop/ops/clip/metax/clip_metax.h
src/infiniop/ops/clip/metax/clip_metax.h
+8
-0
src/infiniop/ops/clip/metax/clip_metax.maca
src/infiniop/ops/clip/metax/clip_metax.maca
+63
-0
src/infiniop/ops/clip/nvidia/clip_nvidia.cu
src/infiniop/ops/clip/nvidia/clip_nvidia.cu
+13
-9
src/infiniop/ops/clip/nvidia/clip_nvidia.cuh
src/infiniop/ops/clip/nvidia/clip_nvidia.cuh
+8
-0
src/infiniop/ops/clip/operator.cc
src/infiniop/ops/clip/operator.cc
+37
-10
src/infiniop/ops/conv/conv.h
src/infiniop/ops/conv/conv.h
+56
-0
No files found.
src/infiniop/ops/add/metax/add_metax.h
0 → 100644
View file @
0166515c
#ifndef __ADD_METAX_API_H__
#define __ADD_METAX_API_H__

#include "../../../elementwise/metax/elementwise_metax_api.h"

// Declares the op::add::metax::Descriptor class via the shared
// elementwise descriptor macro (operator name, device namespace).
ELEMENTWISE_DESCRIPTOR(add, metax)

#endif // __ADD_METAX_API_H__
src/infiniop/ops/add/metax/add_metax.maca
0 → 100644
View file @
0166515c
#include "add_metax.h"
#include "../../../elementwise/metax/elementwise_metax.h"
#include "../cuda/kernel.cuh"
// Element-wise addition (c = a + b) for the METAX backend. The device
// kernel functor (cuda::AddOp) is shared with the CUDA backend via
// ../cuda/kernel.cuh; only descriptor plumbing lives here.
namespace op::add::metax {
Descriptor::~Descriptor() = default;
// Validates dtype/shape of (out, a, b) and builds the elementwise
// descriptor. input_desc_vec is expected to hold exactly the two
// addend descriptors at indices 0 and 1.
infiniStatus_t Descriptor::create(
infiniopHandle_t handle_,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t out_desc,
std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
auto handle = reinterpret_cast<device::metax::Handle *>(handle_);
auto dtype = out_desc->dtype();
const auto &a_desc = input_desc_vec.at(0);
const auto &b_desc = input_desc_vec.at(1);
const auto &c_shape = out_desc->shape();
const auto &a_shape = a_desc->shape();
const auto &b_shape = b_desc->shape();
// These CHECK_* macros return an error status early on failure.
CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16);
CHECK_SAME_SHAPE(c_shape, a_shape, b_shape);
// create METAX elementwise descriptor (fills *desc_ptr)
CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
return INFINI_STATUS_SUCCESS;
}
// Dispatches the add kernel on the stream, selecting the element type
// from the dtype captured at create() time. 256 is the block size.
infiniStatus_t Descriptor::calculate(
void *workspace,
size_t workspace_size,
void *output,
std::vector<const void *> inputs,
void *stream) const {
if (workspace_size < _workspace_size) {
return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
}
switch (_dtype) {
case INFINI_DTYPE_F16:
return _device_info->calculate<256, cuda::AddOp, half>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_BF16:
return _device_info->calculate<256, cuda::AddOp, cuda_bfloat16>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_F32:
return _device_info->calculate<256, cuda::AddOp, float>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_F64:
return _device_info->calculate<256, cuda::AddOp, double>(_info, workspace, output, inputs, stream);
default:
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
// Unreachable: every switch case returns above.
return INFINI_STATUS_SUCCESS;
}
} // namespace op::add::metax
src/infiniop/ops/
sub/cuda/sub_cud
a.cu
→
src/infiniop/ops/
add/nvidia/add_nvidi
a.cu
View file @
0166515c
#include "sub_cuda.cuh"
#include "sub_cuda_internal.cuh"
#include "../../../elementwise/nvidia/elementwise_nvidia.cuh"
namespace
op
::
sub
::
cuda
{
#include "../cuda/kernel.cuh"
#include "add_nvidia.cuh"
namespace
op
::
add
::
nvidia
{
Descriptor
::~
Descriptor
()
=
default
;
...
...
@@ -11,7 +13,7 @@ infiniStatus_t Descriptor::create(
infiniopTensorDescriptor_t
out_desc
,
std
::
vector
<
infiniopTensorDescriptor_t
>
input_desc_vec
)
{
auto
handle
=
reinterpret_cast
<
device
::
cud
a
::
Handle
*>
(
handle_
);
auto
handle
=
reinterpret_cast
<
device
::
nvidi
a
::
Handle
*>
(
handle_
);
auto
dtype
=
out_desc
->
dtype
();
const
auto
&
a_desc
=
input_desc_vec
.
at
(
0
);
...
...
@@ -20,7 +22,7 @@ infiniStatus_t Descriptor::create(
const
auto
&
a_shape
=
a_desc
->
shape
();
const
auto
&
b_shape
=
b_desc
->
shape
();
CHECK_DTYPE
(
dtype
,
INFINI_DTYPE_F16
,
INFINI_DTYPE_F32
,
INFINI_DTYPE_F64
);
CHECK_DTYPE
(
dtype
,
INFINI_DTYPE_F16
,
INFINI_DTYPE_F32
,
INFINI_DTYPE_F64
,
INFINI_DTYPE_BF16
);
CHECK_SAME_SHAPE
(
c_shape
,
a_shape
,
b_shape
);
...
...
@@ -43,15 +45,17 @@ infiniStatus_t Descriptor::calculate(
switch
(
_dtype
)
{
case
INFINI_DTYPE_F16
:
return
_device_info
->
calculate
<
256
,
SubOp
,
half
>
(
_info
,
workspace
,
output
,
inputs
,
stream
);
return
_device_info
->
calculate
<
256
,
cuda
::
AddOp
,
half
>
(
_info
,
workspace
,
output
,
inputs
,
stream
);
case
INFINI_DTYPE_BF16
:
return
_device_info
->
calculate
<
256
,
cuda
::
AddOp
,
cuda_bfloat16
>
(
_info
,
workspace
,
output
,
inputs
,
stream
);
case
INFINI_DTYPE_F32
:
return
_device_info
->
calculate
<
256
,
Sub
Op
,
float
>
(
_info
,
workspace
,
output
,
inputs
,
stream
);
return
_device_info
->
calculate
<
256
,
cuda
::
Add
Op
,
float
>
(
_info
,
workspace
,
output
,
inputs
,
stream
);
case
INFINI_DTYPE_F64
:
return
_device_info
->
calculate
<
256
,
Sub
Op
,
double
>
(
_info
,
workspace
,
output
,
inputs
,
stream
);
return
_device_info
->
calculate
<
256
,
cuda
::
Add
Op
,
double
>
(
_info
,
workspace
,
output
,
inputs
,
stream
);
default:
return
INFINI_STATUS_BAD_TENSOR_DTYPE
;
}
return
INFINI_STATUS_SUCCESS
;
}
}
// namespace op::
sub::cud
a
}
// namespace op::
add::nvidi
a
src/infiniop/ops/add/
cud
a/add_
cud
a.cuh
→
src/infiniop/ops/add/
nvidi
a/add_
nvidi
a.cuh
View file @
0166515c
#ifndef __ADD_CUDA_API_H__
#define __ADD_CUDA_API_H__
#include "../../../elementwise/
cud
a/elementwise_
cud
a_api.cuh"
#include "../../../elementwise/
nvidi
a/elementwise_
nvidi
a_api.cuh"
ELEMENTWISE_DESCRIPTOR
(
add
,
cud
a
)
ELEMENTWISE_DESCRIPTOR
(
add
,
nvidi
a
)
#endif // __ADD_CUDA_API_H__
src/infiniop/ops/add/operator.cc
View file @
0166515c
...
...
@@ -5,8 +5,11 @@
#ifdef ENABLE_CPU_API
#include "cpu/add_cpu.h"
#endif
#ifdef ENABLE_CUDA_API
#include "cuda/add_cuda.cuh"
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API)
#include "nvidia/add_nvidia.cuh"
#endif
#ifdef ENABLE_METAX_API
#include "metax/add_metax.h"
#endif
__C
infiniStatus_t
infiniopCreateAddDescriptor
(
...
...
@@ -30,8 +33,14 @@ __C infiniStatus_t infiniopCreateAddDescriptor(
#ifdef ENABLE_CPU_API
CREATE
(
INFINI_DEVICE_CPU
,
cpu
);
#endif
#ifdef ENABLE_CUDA_API
CREATE
(
INFINI_DEVICE_NVIDIA
,
cuda
);
#ifdef ENABLE_NVIDIA_API
CREATE
(
INFINI_DEVICE_NVIDIA
,
nvidia
);
#endif
#ifdef ENABLE_ILUVATAR_API
CREATE
(
INFINI_DEVICE_ILUVATAR
,
nvidia
);
#endif
#ifdef ENABLE_METAX_API
CREATE
(
INFINI_DEVICE_METAX
,
metax
);
#endif
default:
...
...
@@ -46,14 +55,20 @@ __C infiniStatus_t infiniopGetAddWorkspaceSize(infiniopAddDescriptor_t desc, siz
#define GET(CASE, NAMESPACE) \
case CASE: \
*size = reinterpret_cast<op::add::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
return INFINI_STATUS_SUCCESS
;
return INFINI_STATUS_SUCCESS
switch
(
desc
->
device_type
)
{
#ifdef ENABLE_CPU_API
GET
(
INFINI_DEVICE_CPU
,
cpu
)
GET
(
INFINI_DEVICE_CPU
,
cpu
);
#endif
#ifdef ENABLE_NVIDIA_API
GET
(
INFINI_DEVICE_NVIDIA
,
nvidia
);
#endif
#ifdef ENABLE_ILUVATAR_API
GET
(
INFINI_DEVICE_ILUVATAR
,
nvidia
);
#endif
#ifdef ENABLE_
CUDA
_API
GET
(
INFINI_DEVICE_
NVIDIA
,
cuda
)
#ifdef ENABLE_
METAX
_API
GET
(
INFINI_DEVICE_
METAX
,
metax
);
#endif
default:
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
...
...
@@ -82,8 +97,14 @@ __C infiniStatus_t infiniopAdd(
#ifdef ENABLE_CPU_API
CALCULATE
(
INFINI_DEVICE_CPU
,
cpu
);
#endif
#ifdef ENABLE_CUDA_API
CALCULATE
(
INFINI_DEVICE_NVIDIA
,
cuda
);
#ifdef ENABLE_NVIDIA_API
CALCULATE
(
INFINI_DEVICE_NVIDIA
,
nvidia
);
#endif
#ifdef ENABLE_ILUVATAR_API
CALCULATE
(
INFINI_DEVICE_ILUVATAR
,
nvidia
);
#endif
#ifdef ENABLE_METAX_API
CALCULATE
(
INFINI_DEVICE_METAX
,
metax
);
#endif
default:
...
...
@@ -99,15 +120,21 @@ infiniopDestroyAddDescriptor(infiniopAddDescriptor_t desc) {
#define DELETE(CASE, NAMESPACE) \
case CASE: \
delete reinterpret_cast<const op::add::NAMESPACE::Descriptor *>(desc); \
return INFINI_STATUS_SUCCESS
;
return INFINI_STATUS_SUCCESS
switch
(
desc
->
device_type
)
{
#ifdef ENABLE_CPU_API
DELETE
(
INFINI_DEVICE_CPU
,
cpu
);
#endif
#ifdef ENABLE_CUDA_API
DELETE
(
INFINI_DEVICE_NVIDIA
,
cuda
);
#ifdef ENABLE_NVIDIA_API
DELETE
(
INFINI_DEVICE_NVIDIA
,
nvidia
);
#endif
#ifdef ENABLE_ILUVATAR_API
DELETE
(
INFINI_DEVICE_ILUVATAR
,
nvidia
);
#endif
#ifdef ENABLE_METAX_API
DELETE
(
INFINI_DEVICE_METAX
,
metax
);
#endif
default:
...
...
src/infiniop/ops/causal_softmax/cuda/
causal_softmax_
kernel.cuh
→
src/infiniop/ops/causal_softmax/cuda/kernel.cuh
View file @
0166515c
#ifndef __CAUSAL_SOFTMAX_KERNEL_CUH__
#
ifndef
__CAUSAL_SOFTMAX_KERNEL_CUH__
#define __CAUSAL_SOFTMAX_KERNEL_CUH__
#include "../../../devices/cuda/cuda_kernel_common.cuh"
#include "../../../reduce/cuda/reduce.cuh"
template
<
unsigned
int
BLOCK_SIZE
,
typename
Tdata
,
typename
Tcompute
>
INFINIOP_CUDA_KERNEL
causalSoftmax
(
__device__
void
causalSoftmax
Kernel
(
Tdata
*
y_
,
const
Tdata
*
x_
,
size_t
batch
,
size_t
height
,
size_t
width
,
ptrdiff_t
y_stride_b
,
ptrdiff_t
y_stride_h
,
...
...
@@ -32,11 +29,11 @@ INFINIOP_CUDA_KERNEL causalSoftmax(
// 2 | * * * ... * * * |
// height: 3 col_id->
if
(
width
+
blockIdx
.
x
>=
threadIdx
.
x
+
height
)
{
#ifdef ENABLE_CUDA_API
y
[
col
]
=
exp
_
(
x
[
col
]
-
max_
);
#
else
if
constexpr
(
std
::
is_same_v
<
Tdata
,
half
>
||
std
::
is_same_v
<
Tdata
,
cuda_bfloat16
>
)
{
y
[
col
]
=
h
exp
(
x
[
col
]
-
max_
);
}
else
{
y
[
col
]
=
exp
(
x
[
col
]
-
max_
);
#endif
}
}
else
{
y
[
col
]
=
Tdata
(
0
);
}
...
...
src/infiniop/ops/causal_softmax/maca/causal_softmax_kernel.h
deleted
100644 → 0
View file @
f0300ff3
#ifndef __CAUSAL_SOFTMAX_KERNEL_H__
#define __CAUSAL_SOFTMAX_KERNEL_H__
#include "../../../devices/maca/maca_kernel_common.h"
#include "../../../reduce/maca/reduce.h"
template
<
unsigned
int
BLOCK_SIZE
,
typename
Tdata
,
typename
Tcompute
>
INFINIOP_MACA_KERNEL
causalSoftmax
(
Tdata
*
y_
,
const
Tdata
*
x_
,
size_t
batch
,
size_t
height
,
size_t
width
,
ptrdiff_t
y_stride_b
,
ptrdiff_t
y_stride_h
,
ptrdiff_t
x_stride_b
,
ptrdiff_t
x_stride_h
)
{
Tdata
*
y
=
y_
// threadIdx.x for col_id
+
blockIdx
.
y
*
y_stride_b
// gridDim.y for batch_id
+
blockIdx
.
x
*
y_stride_h
;
// gridDim.x for row_id
const
Tdata
*
x
=
x_
+
blockIdx
.
y
*
x_stride_b
+
blockIdx
.
x
*
x_stride_h
;
// [Reduce] Find max value in each row and store in shared memory
__shared__
Tdata
max_
;
Tdata
max_0
=
op
::
common_maca
::
reduce_op
::
max
<
BLOCK_SIZE
,
Tdata
>
(
x
,
width
-
height
+
1
+
blockIdx
.
x
);
if
(
threadIdx
.
x
==
0
)
{
max_
=
max_0
;
}
__syncthreads
();
// [Elementwise] Subtract max value from each element and apply causal mask
for
(
size_t
col
=
threadIdx
.
x
;
col
<
width
;
col
+=
BLOCK_SIZE
)
{
// row_id ↓ |<- width ->|
// 0 | * * * ... * |
// 1 | * * * ... * * |
// 2 | * * * ... * * * |
// height: 3 col_id->
if
(
width
+
blockIdx
.
x
>=
threadIdx
.
x
+
height
)
{
#ifdef ENABLE_MACA_API
y
[
col
]
=
exp_
(
x
[
col
]
-
max_
);
#else
y
[
col
]
=
exp
(
x
[
col
]
-
max_
);
#endif
}
else
{
y
[
col
]
=
Tdata
(
0
);
}
}
__syncthreads
();
// [Reduce] Find the sum of each updated row and store in shared memory
__shared__
Tcompute
sum_
;
Tcompute
sum_0
=
op
::
common_maca
::
reduce_op
::
sum
<
BLOCK_SIZE
,
Tdata
,
Tcompute
>
(
y
,
width
);
if
(
threadIdx
.
x
==
0
)
{
sum_
=
sum_0
;
}
__syncthreads
();
// [Elementwise] Divide each element by the sum and store in shared memory
for
(
size_t
col
=
threadIdx
.
x
;
col
<
width
;
col
+=
BLOCK_SIZE
)
{
y
[
col
]
/=
Tdata
(
sum_
);
}
}
#endif // __CAUSAL_SOFTMAX_KERNEL_H__
src/infiniop/ops/causal_softmax/m
aca
/causal_softmax_m
aca
.h
→
src/infiniop/ops/causal_softmax/m
etax
/causal_softmax_m
etax
.h
View file @
0166515c
#ifndef __CAUSAL_SOFTMAX_M
ACA
_H__
#define __CAUSAL_SOFTMAX_M
ACA
_H__
#ifndef __CAUSAL_SOFTMAX_M
ETAX
_H__
#define __CAUSAL_SOFTMAX_M
ETAX
_H__
#include "../causal_softmax.h"
DESCRIPTOR
(
m
aca
)
DESCRIPTOR
(
m
etax
)
#endif
src/infiniop/ops/causal_softmax/m
aca
/causal_softmax_m
aca
.maca
→
src/infiniop/ops/causal_softmax/m
etax
/causal_softmax_m
etax
.maca
View file @
0166515c
#include "../../../devices/maca/common_maca.h"
#include "causal_softmax_kernel.h"
#include "causal_softmax_maca.h"
#include "../../../devices/metax/metax_common.h"
#include "causal_softmax_metax.h"
namespace op::causal_softmax::maca {
#include <hccub/block/block_reduce.cuh>
#include "../../../devices/metax/metax_kernel_common.h"
#include "../../../reduce/cuda/reduce.cuh"
#include "../cuda/kernel.cuh"
template <unsigned int BLOCK_SIZE, typename Tdata, typename Tcompute>
INFINIOP_METAX_KERNEL causalSoftmax(
Tdata *y, const Tdata *x,
size_t batch, size_t height, size_t width,
ptrdiff_t y_stride_b, ptrdiff_t y_stride_h,
ptrdiff_t x_stride_b, ptrdiff_t x_stride_h) {
causalSoftmaxKernel<BLOCK_SIZE, Tdata, Tcompute>(y, x, batch, height, width, y_stride_b, y_stride_h, x_stride_b, x_stride_h);
}
namespace op::causal_softmax::metax {
struct Descriptor::Opaque {
std::shared_ptr<device::m
aca
::Handle::Internal> internal;
std::shared_ptr<device::m
etax
::Handle::Internal> internal;
};
Descriptor::~Descriptor() {
...
...
@@ -20,7 +35,7 @@ infiniStatus_t Descriptor::create(
auto info = CausalSoftmaxInfo::create(y_desc, x_desc);
CHECK_RESULT(info);
*desc_ptr = new Descriptor(
new Opaque{reinterpret_cast<device::m
aca
::Handle *>(handle)->internal()},
new Opaque{reinterpret_cast<device::m
etax
::Handle *>(handle)->internal()},
info.take(), 0, handle->device, handle->device_id);
return INFINI_STATUS_SUCCESS;
}
...
...
@@ -38,6 +53,12 @@ infiniStatus_t launchKernel(void *y, const void *x, infiniDtype_t dtype,
batch_size, seq_len, total_seq_len,
y_stride_b, y_stride_i,
x_stride_b, x_stride_i);
} else if (dtype == INFINI_DTYPE_BF16) {
causalSoftmax<BLOCK_SIZE, __hpcc_bfloat16, float>
<<<grid, BLOCK_SIZE, 0, stream>>>((__hpcc_bfloat16 *)y, (const __hpcc_bfloat16 *)x,
batch_size, seq_len, total_seq_len,
y_stride_b, y_stride_i,
x_stride_b, x_stride_i);
} else if (dtype == INFINI_DTYPE_F32) {
causalSoftmax<BLOCK_SIZE, float, float>
<<<grid, BLOCK_SIZE, 0, stream>>>((float *)y, (const float *)x,
...
...
@@ -55,12 +76,12 @@ infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size,
const void *x,
void *stream_) const {
hcStream_t stream = (hcStream_t)stream_;
if (_opaque->internal->maxThreadsPerBlock() == M
ACA
_BLOCK_SIZE_1024) {
CHECK_STATUS(launchKernel<M
ACA
_BLOCK_SIZE_1024>(
if (_opaque->internal->maxThreadsPerBlock() == M
ETAX
_BLOCK_SIZE_1024) {
CHECK_STATUS(launchKernel<M
ETAX
_BLOCK_SIZE_1024>(
y, x, _info.dtype, _info.batch_size, _info.seq_len, _info.total_seq_len,
_info.y_stride_b, _info.y_stride_i, _info.x_stride_b, _info.x_stride_i, stream));
} else if (_opaque->internal->maxThreadsPerBlock() == M
ACA
_BLOCK_SIZE_512) {
CHECK_STATUS(launchKernel<M
ACA
_BLOCK_SIZE_512>(
} else if (_opaque->internal->maxThreadsPerBlock() == M
ETAX
_BLOCK_SIZE_512) {
CHECK_STATUS(launchKernel<M
ETAX
_BLOCK_SIZE_512>(
y, x, _info.dtype, _info.batch_size, _info.seq_len, _info.total_seq_len,
_info.y_stride_b, _info.y_stride_i, _info.x_stride_b, _info.x_stride_i, stream));
} else {
...
...
@@ -69,4 +90,4 @@ infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size,
return INFINI_STATUS_SUCCESS;
}
} // namespace op::causal_softmax::m
aca
} // namespace op::causal_softmax::m
etax
src/infiniop/ops/causal_softmax/
cud
a/causal_softmax_
cud
a.cu
→
src/infiniop/ops/causal_softmax/
nvidi
a/causal_softmax_
nvidi
a.cu
View file @
0166515c
#include "../../../devices/cuda/cuda_common.cuh"
#include "causal_softmax_cuda.cuh"
#include "causal_softmax_kernel.cuh"
#include "../../../devices/nvidia/nvidia_common.cuh"
#include "causal_softmax_nvidia.cuh"
namespace
op
::
causal_softmax
::
cuda
{
#include "../../../devices/nvidia/nvidia_kernel_common.cuh"
#include <cub/block/block_reduce.cuh>
#include "../../../reduce/cuda/reduce.cuh"
#include "../cuda/kernel.cuh"
template
<
unsigned
int
BLOCK_SIZE
,
typename
Tdata
,
typename
Tcompute
>
INFINIOP_CUDA_KERNEL
causalSoftmax
(
Tdata
*
y
,
const
Tdata
*
x
,
size_t
batch
,
size_t
height
,
size_t
width
,
ptrdiff_t
y_stride_b
,
ptrdiff_t
y_stride_h
,
ptrdiff_t
x_stride_b
,
ptrdiff_t
x_stride_h
)
{
causalSoftmaxKernel
<
BLOCK_SIZE
,
Tdata
,
Tcompute
>
(
y
,
x
,
batch
,
height
,
width
,
y_stride_b
,
y_stride_h
,
x_stride_b
,
x_stride_h
);
}
namespace
op
::
causal_softmax
::
nvidia
{
struct
Descriptor
::
Opaque
{
std
::
shared_ptr
<
device
::
cud
a
::
Handle
::
Internal
>
internal
;
std
::
shared_ptr
<
device
::
nvidi
a
::
Handle
::
Internal
>
internal
;
};
Descriptor
::~
Descriptor
()
{
...
...
@@ -20,7 +35,7 @@ infiniStatus_t Descriptor::create(
auto
info
=
CausalSoftmaxInfo
::
create
(
y_desc
,
x_desc
);
CHECK_RESULT
(
info
);
*
desc_ptr
=
new
Descriptor
(
new
Opaque
{
reinterpret_cast
<
device
::
cud
a
::
Handle
*>
(
handle
)
->
internal
()},
new
Opaque
{
reinterpret_cast
<
device
::
nvidi
a
::
Handle
*>
(
handle
)
->
internal
()},
info
.
take
(),
0
,
handle
->
device
,
handle
->
device_id
);
return
INFINI_STATUS_SUCCESS
;
}
...
...
@@ -79,4 +94,4 @@ infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size,
return
INFINI_STATUS_SUCCESS
;
}
}
// namespace op::causal_softmax::
cud
a
}
// namespace op::causal_softmax::
nvidi
a
src/infiniop/ops/causal_softmax/
cud
a/causal_softmax_
cud
a.cuh
→
src/infiniop/ops/causal_softmax/
nvidi
a/causal_softmax_
nvidi
a.cuh
View file @
0166515c
#ifndef __CAUSAL_SOFTMAX_
CUD
A_H__
#define __CAUSAL_SOFTMAX_
CUD
A_H__
#ifndef __CAUSAL_SOFTMAX_
NVIDI
A_H__
#define __CAUSAL_SOFTMAX_
NVIDI
A_H__
#include "../causal_softmax.h"
DESCRIPTOR
(
cud
a
)
DESCRIPTOR
(
nvidi
a
)
#endif
src/infiniop/ops/causal_softmax/operator.cc
View file @
0166515c
...
...
@@ -5,11 +5,11 @@
#ifdef ENABLE_CPU_API
#include "cpu/causal_softmax_cpu.h"
#endif
#ifdef
ENABLE_CUDA
_API
#include "
cud
a/causal_softmax_
cud
a.cuh"
#if
def
ined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR
_API
)
#include "
nvidi
a/causal_softmax_
nvidi
a.cuh"
#endif
#ifdef ENABLE_METAX_API
#include "m
aca
/causal_softmax_m
aca
.h"
#include "m
etax
/causal_softmax_m
etax
.h"
#endif
#ifdef ENABLE_ASCEND_API
#include "ascend/causal_softmax_ascend.h"
...
...
@@ -33,11 +33,17 @@ __C infiniStatus_t infiniopCreateCausalSoftmaxDescriptor(
#ifdef ENABLE_CPU_API
CREATE
(
INFINI_DEVICE_CPU
,
cpu
)
#endif
#ifdef ENABLE_CUDA_API
CREATE
(
INFINI_DEVICE_NVIDIA
,
cuda
)
#ifdef ENABLE_NVIDIA_API
CREATE
(
INFINI_DEVICE_NVIDIA
,
nvidia
)
#endif
#ifdef ENABLE_ILUVATAR_API
CREATE
(
INFINI_DEVICE_ILUVATAR
,
nvidia
);
#endif
#ifdef ENABLE_METAX_API
CREATE
(
INFINI_DEVICE_METAX
,
maca
)
CREATE
(
INFINI_DEVICE_METAX
,
metax
)
#endif
#ifdef ENABLE_ASCEND_API
CREATE
(
INFINI_DEVICE_ASCEND
,
ascend
)
#endif
#ifdef ENABLE_CAMBRICON_MLU
case
DevCambriconMlu
:
{
...
...
@@ -45,14 +51,6 @@ __C infiniStatus_t infiniopCreateCausalSoftmaxDescriptor(
// return cnnlCreateCausalSoftmaxDescriptor((BangHandle_t) handle, (CausalSoftmaxCnnlDescriptor_t *) desc_ptr, y_desc);
}
#endif
#ifdef ENABLE_ASCEND_API
CREATE
(
INFINI_DEVICE_ASCEND
,
ascend
)
#endif
#ifdef ENABLE_METAX_GPU
case
DevMetaxGpu
:
{
return
macaCreateCausalSoftmaxDescriptor
((
MacaHandle_t
)
handle
,
(
CausalSoftmaxMacaDescriptor_t
*
)
desc_ptr
,
y_desc
);
}
#endif
#ifdef ENABLE_MTHREADS_GPU
case
DevMthreadsGpu
:
{
return
musaCreateCausalSoftmaxDescriptor
((
MusaHandle_t
)
handle
,
(
CausalSoftmaxMusaDescriptor_t
*
)
desc_ptr
,
y_desc
);
...
...
@@ -73,8 +71,17 @@ __C infiniStatus_t infiniopGetCausalSoftmaxWorkspaceSize(infiniopCausalSoftmaxDe
#ifdef ENABLE_CPU_API
GET
(
INFINI_DEVICE_CPU
,
cpu
)
#endif
#ifdef ENABLE_CUDA_API
GET
(
INFINI_DEVICE_NVIDIA
,
cuda
)
#ifdef ENABLE_NVIDIA_API
GET
(
INFINI_DEVICE_NVIDIA
,
nvidia
)
#endif
#ifdef ENABLE_ILUVATAR_API
GET
(
INFINI_DEVICE_ILUVATAR
,
nvidia
);
#endif
#ifdef ENABLE_METAX_API
GET
(
INFINI_DEVICE_METAX
,
metax
)
#endif
#ifdef ENABLE_ASCEND_API
GET
(
INFINI_DEVICE_ASCEND
,
ascend
)
#endif
#ifdef ENABLE_CAMBRICON_MLU
case
DevCambriconMlu
:
{
...
...
@@ -83,17 +90,6 @@ __C infiniStatus_t infiniopGetCausalSoftmaxWorkspaceSize(infiniopCausalSoftmaxDe
}
#endif
#ifdef ENABLE_ASCEND_API
GET
(
INFINI_DEVICE_ASCEND
,
ascend
)
#endif
#ifdef ENABLE_METAX_API
GET
(
INFINI_DEVICE_METAX
,
maca
)
#endif
#ifdef ENABLE_METAX_GPU
case
DevMetaxGpu
:
{
return
macaGetCausalSoftmaxWorkspaceSize
((
CausalSoftmaxMacaDescriptor_t
)
desc
,
size
);
}
#endif
#ifdef ENABLE_MTHREADS_GPU
case
DevMthreadsGpu
:
{
return
musaGetCausalSoftmaxWorkspaceSize
((
CausalSoftmaxMusaDescriptor_t
)
desc
,
size
);
...
...
@@ -119,11 +115,17 @@ __C infiniStatus_t infiniopCausalSoftmax(
#ifdef ENABLE_CPU_API
CALCULATE
(
INFINI_DEVICE_CPU
,
cpu
)
#endif
#ifdef ENABLE_CUDA_API
CALCULATE
(
INFINI_DEVICE_NVIDIA
,
cuda
)
#ifdef ENABLE_NVIDIA_API
CALCULATE
(
INFINI_DEVICE_NVIDIA
,
nvidia
)
#endif
#ifdef ENABLE_ILUVATAR_API
CALCULATE
(
INFINI_DEVICE_ILUVATAR
,
nvidia
);
#endif
#ifdef ENABLE_METAX_API
CALCULATE
(
INFINI_DEVICE_METAX
,
maca
)
CALCULATE
(
INFINI_DEVICE_METAX
,
metax
)
#endif
#ifdef ENABLE_ASCEND_API
CALCULATE
(
INFINI_DEVICE_ASCEND
,
ascend
)
#endif
#ifdef ENABLE_CAMBRICON_MLU
case
DevCambriconMlu
:
{
...
...
@@ -131,14 +133,6 @@ __C infiniStatus_t infiniopCausalSoftmax(
// return cnnlCausalSoftmax((CausalSoftmaxCnnlDescriptor_t) desc, workspace, workspace_size, data, stream);
}
#endif
#ifdef ENABLE_ASCEND_API
CALCULATE
(
INFINI_DEVICE_ASCEND
,
ascend
)
#endif
#ifdef ENABLE_METAX_GPU
case
DevMetaxGpu
:
{
return
macaCausalSoftmax
((
CausalSoftmaxMacaDescriptor_t
)
desc
,
workspace
,
workspace_size
,
data
,
stream
);
}
#endif
#ifdef ENABLE_MTHREADS_GPU
case
DevMthreadsGpu
:
{
return
musaCausalSoftmax
((
CausalSoftmaxMusaDescriptor_t
)
desc
,
workspace
,
workspace_size
,
data
,
stream
);
...
...
@@ -159,11 +153,17 @@ __C infiniStatus_t infiniopDestroyCausalSoftmaxDescriptor(infiniopCausalSoftmaxD
#ifdef ENABLE_CPU_API
DESTROY
(
INFINI_DEVICE_CPU
,
cpu
)
#endif
#ifdef ENABLE_CUDA_API
DESTROY
(
INFINI_DEVICE_NVIDIA
,
cuda
)
#ifdef ENABLE_NVIDIA_API
DESTROY
(
INFINI_DEVICE_NVIDIA
,
nvidia
)
#endif
#ifdef ENABLE_ILUVATAR_API
DESTROY
(
INFINI_DEVICE_ILUVATAR
,
nvidia
);
#endif
#ifdef ENABLE_METAX_API
DESTROY
(
INFINI_DEVICE_METAX
,
maca
)
DESTROY
(
INFINI_DEVICE_METAX
,
metax
)
#endif
#ifdef ENABLE_ASCEND_API
DESTROY
(
INFINI_DEVICE_ASCEND
,
ascend
)
#endif
#ifdef ENABLE_CAMBRICON_MLU
case
DevCambriconMlu
:
{
...
...
@@ -171,14 +171,6 @@ __C infiniStatus_t infiniopDestroyCausalSoftmaxDescriptor(infiniopCausalSoftmaxD
// return cnnlDestroyCausalSoftmaxDescriptor((CausalSoftmaxCnnlDescriptor_t) desc);
}
#endif
#ifdef ENABLE_ASCEND_API
DESTROY
(
INFINI_DEVICE_ASCEND
,
ascend
)
#endif
#ifdef ENABLE_METAX_GPU
case
DevMetaxGpu
:
{
return
macaDestroyCausalSoftmaxDescriptor
((
CausalSoftmaxMacaDescriptor_t
)
desc
);
}
#endif
#ifdef ENABLE_MTHREADS_GPU
case
DevMthreadsGpu
:
return
musaDestroyCausalSoftmaxDescriptor
((
CausalSoftmaxMusaDescriptor_t
)
desc
);
...
...
src/infiniop/ops/clip/cpu/clip_cpu.cc
View file @
0166515c
...
...
@@ -21,7 +21,7 @@ infiniStatus_t Descriptor::create(
const
auto
&
min_shape
=
min_desc
->
shape
();
const
auto
&
max_shape
=
max_desc
->
shape
();
CHECK_DTYPE
(
dtype
,
INFINI_DTYPE_F16
,
INFINI_DTYPE_F32
,
INFINI_DTYPE_F64
);
CHECK_DTYPE
(
dtype
,
INFINI_DTYPE_F16
,
INFINI_DTYPE_F32
,
INFINI_DTYPE_F64
,
INFINI_DTYPE_BF16
);
CHECK_SAME_SHAPE
(
out_shape
,
in_shape
);
CHECK_SAME_SHAPE
(
out_shape
,
min_shape
);
CHECK_SAME_SHAPE
(
out_shape
,
max_shape
);
...
...
@@ -45,6 +45,8 @@ infiniStatus_t Descriptor::calculate(
return
_device_info
->
calculate
<
ClipOp
,
float
>
(
_info
,
output
,
inputs
,
stream
);
case
INFINI_DTYPE_F64
:
return
_device_info
->
calculate
<
ClipOp
,
double
>
(
_info
,
output
,
inputs
,
stream
);
case
INFINI_DTYPE_BF16
:
return
_device_info
->
calculate
<
ClipOp
,
bf16_t
>
(
_info
,
output
,
inputs
,
stream
);
default:
return
INFINI_STATUS_BAD_TENSOR_DTYPE
;
}
...
...
src/infiniop/ops/clip/cuda/
clip_cuda_int
ern
a
l.cuh
→
src/infiniop/ops/clip/cuda/
k
ern
e
l.cuh
View file @
0166515c
#ifndef __CLIP_CUDA_H__
#define __CLIP_CUDA_H__
#include "../../../elementwise/cuda/elementwise_cuda.cuh"
#include <cuda_bf16.h>
#include <cuda_fp16.h>
namespace
op
::
clip
::
cuda
{
typedef
struct
ClipOp
{
...
...
@@ -13,15 +9,16 @@ public:
template
<
typename
T
>
__device__
__forceinline__
T
operator
()(
const
T
&
x
,
const
T
&
min_val
,
const
T
&
max_val
)
const
{
if
constexpr
(
std
::
is_same_v
<
T
,
half2
>
||
std
::
is_same_v
<
T
,
nv
_bfloat162
>
)
{
#ifndef ENABLE_ILUVATAR_
CUDA_
API
if
constexpr
(
std
::
is_same_v
<
T
,
half2
>
||
std
::
is_same_v
<
T
,
cuda
_bfloat162
>
)
{
#ifndef ENABLE_ILUVATAR_API
return
__hmax2
(
__hmin2
(
x
,
max_val
),
min_val
);
#else
return
{
std
::
clamp
(
x
.
x
,
min_val
.
x
,
max_val
.
x
),
std
::
clamp
(
x
.
y
,
min_val
.
y
,
max_val
.
y
)};
#endif
}
}
else
{
return
std
::
clamp
(
x
,
min_val
,
max_val
);
}
}
}
ClipOp
;
}
// namespace op::clip::cuda
...
...
src/infiniop/ops/clip/metax/clip_metax.h
0 → 100644
View file @
0166515c
#ifndef __CLIP_METAX_API_H__
#define __CLIP_METAX_API_H__

#include "../../../elementwise/metax/elementwise_metax_api.h"

// Declares the op::clip::metax::Descriptor class via the shared
// elementwise descriptor macro (operator name, device namespace).
ELEMENTWISE_DESCRIPTOR(clip, metax)

#endif // __CLIP_METAX_API_H__
src/infiniop/ops/clip/metax/clip_metax.maca
0 → 100644
View file @
0166515c
#include "../../../elementwise/metax/elementwise_metax.h"
#include "../cuda/kernel.cuh"
#include "clip_metax.h"
// Element-wise clip (out = clamp(in, min, max)) for the METAX backend.
// The device functor (cuda::ClipOp) is shared with the CUDA backend via
// ../cuda/kernel.cuh; only descriptor plumbing lives here.
namespace op::clip::metax {
Descriptor::~Descriptor() = default;
// Validates dtype and that in/min/max all match the output shape, then
// builds the elementwise descriptor. input_desc_vec is expected to hold
// the input, min, and max tensor descriptors at indices 0, 1, 2.
infiniStatus_t Descriptor::create(
infiniopHandle_t handle_,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t out_desc,
std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
auto handle = reinterpret_cast<device::metax::Handle *>(handle_);
auto dtype = out_desc->dtype();
const auto &in_desc = input_desc_vec.at(0);
const auto &min_desc = input_desc_vec.at(1);
const auto &max_desc = input_desc_vec.at(2);
const auto &out_shape = out_desc->shape();
const auto &in_shape = in_desc->shape();
const auto &min_shape = min_desc->shape();
const auto &max_shape = max_desc->shape();
// These CHECK_* macros return an error status early on failure.
CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16);
CHECK_SAME_SHAPE(out_shape, in_shape);
CHECK_SAME_SHAPE(out_shape, min_shape);
CHECK_SAME_SHAPE(out_shape, max_shape);
// create METAX elementwise descriptor (fills *desc_ptr)
CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec);
return INFINI_STATUS_SUCCESS;
}
// Dispatches the clip kernel on the stream, selecting the element type
// from the dtype captured at create() time. 256 is the block size.
infiniStatus_t Descriptor::calculate(
void *workspace,
size_t workspace_size,
void *output,
std::vector<const void *> inputs,
void *stream) const {
if (workspace_size < _workspace_size) {
return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
}
switch (_dtype) {
case INFINI_DTYPE_F16:
return _device_info->calculate<256, cuda::ClipOp, half>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_F32:
return _device_info->calculate<256, cuda::ClipOp, float>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_F64:
return _device_info->calculate<256, cuda::ClipOp, double>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_BF16:
return _device_info->calculate<256, cuda::ClipOp, cuda_bfloat16>(_info, workspace, output, inputs, stream);
default:
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
// Unreachable: every switch case returns above.
return INFINI_STATUS_SUCCESS;
}
} // namespace op::clip::metax
src/infiniop/ops/clip/
cud
a/clip_
cud
a.cu
→
src/infiniop/ops/clip/
nvidi
a/clip_
nvidi
a.cu
View file @
0166515c
#include "clip_cuda.cuh"
#include "clip_cuda_internal.cuh"
#include "../../../elementwise/nvidia/elementwise_nvidia.cuh"
namespace
op
::
clip
::
cuda
{
#include "../cuda/kernel.cuh"
#include "clip_nvidia.cuh"
namespace
op
::
clip
::
nvidia
{
Descriptor
::~
Descriptor
()
=
default
;
...
...
@@ -11,7 +13,7 @@ infiniStatus_t Descriptor::create(
infiniopTensorDescriptor_t
out_desc
,
std
::
vector
<
infiniopTensorDescriptor_t
>
input_desc_vec
)
{
auto
handle
=
reinterpret_cast
<
device
::
cud
a
::
Handle
*>
(
handle_
);
auto
handle
=
reinterpret_cast
<
device
::
nvidi
a
::
Handle
*>
(
handle_
);
auto
dtype
=
out_desc
->
dtype
();
const
auto
&
in_desc
=
input_desc_vec
.
at
(
0
);
...
...
@@ -22,7 +24,7 @@ infiniStatus_t Descriptor::create(
const
auto
&
min_shape
=
min_desc
->
shape
();
const
auto
&
max_shape
=
max_desc
->
shape
();
CHECK_DTYPE
(
dtype
,
INFINI_DTYPE_F16
,
INFINI_DTYPE_F32
,
INFINI_DTYPE_F64
);
CHECK_DTYPE
(
dtype
,
INFINI_DTYPE_F16
,
INFINI_DTYPE_F32
,
INFINI_DTYPE_F64
,
INFINI_DTYPE_BF16
);
CHECK_SAME_SHAPE
(
out_shape
,
in_shape
);
CHECK_SAME_SHAPE
(
out_shape
,
min_shape
);
CHECK_SAME_SHAPE
(
out_shape
,
max_shape
);
...
...
@@ -45,15 +47,17 @@ infiniStatus_t Descriptor::calculate(
switch
(
_dtype
)
{
case
INFINI_DTYPE_F16
:
return
_device_info
->
calculate
<
256
,
ClipOp
,
half
>
(
_info
,
workspace
,
output
,
inputs
,
stream
);
return
_device_info
->
calculate
<
256
,
cuda
::
ClipOp
,
half
>
(
_info
,
workspace
,
output
,
inputs
,
stream
);
case
INFINI_DTYPE_F32
:
return
_device_info
->
calculate
<
256
,
ClipOp
,
float
>
(
_info
,
workspace
,
output
,
inputs
,
stream
);
return
_device_info
->
calculate
<
256
,
cuda
::
ClipOp
,
float
>
(
_info
,
workspace
,
output
,
inputs
,
stream
);
case
INFINI_DTYPE_F64
:
return
_device_info
->
calculate
<
256
,
ClipOp
,
double
>
(
_info
,
workspace
,
output
,
inputs
,
stream
);
return
_device_info
->
calculate
<
256
,
cuda
::
ClipOp
,
double
>
(
_info
,
workspace
,
output
,
inputs
,
stream
);
case
INFINI_DTYPE_BF16
:
return
_device_info
->
calculate
<
256
,
cuda
::
ClipOp
,
cuda_bfloat16
>
(
_info
,
workspace
,
output
,
inputs
,
stream
);
default:
return
INFINI_STATUS_BAD_TENSOR_DTYPE
;
}
return
INFINI_STATUS_SUCCESS
;
}
}
// namespace op::clip::
cud
a
}
// namespace op::clip::
nvidi
a
src/infiniop/ops/clip/
cud
a/clip_
cud
a.cuh
→
src/infiniop/ops/clip/
nvidi
a/clip_
nvidi
a.cuh
View file @
0166515c
#ifndef __CLIP_CUDA_API_H__
#define __CLIP_CUDA_API_H__
#include "../../../elementwise/cuda/elementwise_cuda_api.cuh"
#include "infiniop/ops/clip.h"
#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh"
ELEMENTWISE_DESCRIPTOR
(
clip
,
cud
a
)
ELEMENTWISE_DESCRIPTOR
(
clip
,
nvidi
a
)
#endif // __CLIP_CUDA_API_H__
src/infiniop/ops/clip/operator.cc
View file @
0166515c
...
...
@@ -5,8 +5,11 @@
#ifdef ENABLE_CPU_API
#include "cpu/clip_cpu.h"
#endif
#ifdef ENABLE_CUDA_API
#include "cuda/clip_cuda.cuh"
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API)
#include "nvidia/clip_nvidia.cuh"
#endif
#ifdef ENABLE_METAX_API
#include "metax/clip_metax.h"
#endif
__C
infiniStatus_t
infiniopCreateClipDescriptor
(
...
...
@@ -30,8 +33,14 @@ __C infiniStatus_t infiniopCreateClipDescriptor(
#ifdef ENABLE_CPU_API
CREATE
(
INFINI_DEVICE_CPU
,
cpu
);
#endif
#ifdef ENABLE_CUDA_API
CREATE
(
INFINI_DEVICE_NVIDIA
,
cuda
);
#ifdef ENABLE_NVIDIA_API
CREATE
(
INFINI_DEVICE_NVIDIA
,
nvidia
);
#endif
#ifdef ENABLE_ILUVATAR_API
CREATE
(
INFINI_DEVICE_ILUVATAR
,
nvidia
);
#endif
#ifdef ENABLE_METAX_API
CREATE
(
INFINI_DEVICE_METAX
,
metax
);
#endif
default:
...
...
@@ -52,8 +61,14 @@ __C infiniStatus_t infiniopGetClipWorkspaceSize(infiniopClipDescriptor_t desc, s
#ifdef ENABLE_CPU_API
GET
(
INFINI_DEVICE_CPU
,
cpu
)
#endif
#ifdef ENABLE_CUDA_API
GET
(
INFINI_DEVICE_NVIDIA
,
cuda
)
#ifdef ENABLE_NVIDIA_API
GET
(
INFINI_DEVICE_NVIDIA
,
nvidia
)
#endif
#ifdef ENABLE_ILUVATAR_API
GET
(
INFINI_DEVICE_ILUVATAR
,
nvidia
);
#endif
#ifdef ENABLE_METAX_API
GET
(
INFINI_DEVICE_METAX
,
metax
)
#endif
}
...
...
@@ -82,8 +97,14 @@ __C infiniStatus_t infiniopClip(
#ifdef ENABLE_CPU_API
CALCULATE
(
INFINI_DEVICE_CPU
,
cpu
);
#endif
#ifdef ENABLE_CUDA_API
CALCULATE
(
INFINI_DEVICE_NVIDIA
,
cuda
);
#ifdef ENABLE_NVIDIA_API
CALCULATE
(
INFINI_DEVICE_NVIDIA
,
nvidia
);
#endif
#ifdef ENABLE_ILUVATAR_API
CALCULATE
(
INFINI_DEVICE_ILUVATAR
,
nvidia
);
#endif
#ifdef ENABLE_METAX_API
CALCULATE
(
INFINI_DEVICE_METAX
,
metax
);
#endif
default:
...
...
@@ -106,8 +127,14 @@ infiniopDestroyClipDescriptor(infiniopClipDescriptor_t desc) {
#ifdef ENABLE_CPU_API
DELETE
(
INFINI_DEVICE_CPU
,
cpu
);
#endif
#ifdef ENABLE_CUDA_API
DELETE
(
INFINI_DEVICE_NVIDIA
,
cuda
);
#ifdef ENABLE_NVIDIA_API
DELETE
(
INFINI_DEVICE_NVIDIA
,
nvidia
);
#endif
#ifdef ENABLE_ILUVATAR_API
DELETE
(
INFINI_DEVICE_ILUVATAR
,
nvidia
);
#endif
#ifdef ENABLE_METAX_API
DELETE
(
INFINI_DEVICE_METAX
,
metax
);
#endif
default:
...
...
src/infiniop/ops/conv/conv.h
0 → 100644
View file @
0166515c
#ifndef __CONV_H__
#define __CONV_H__
#include "../../operator.h"
#include "info.h"
// DESCRIPTOR(NAMESPACE) stamps out op::conv::NAMESPACE::Descriptor, the
// per-backend convolution descriptor. Each backend (cpu, nvidia, metax,
// ...) invokes this macro once and then defines the declared members
// (the destructor, create(), and calculate()) in its own source file.
// The constructor is private; instances are produced only by create().
#define DESCRIPTOR(NAMESPACE) \
\
namespace op::conv::NAMESPACE { \
class Descriptor final : public InfiniopDescriptor { \
struct Opaque; /* backend-private state, defined in the backend .cc/.cu */ \
Opaque *_opaque; \
infiniDtype_t _dtype; \
ConvInfo _info; \
size_t _workspace_size; \
\
Descriptor( \
infiniDtype_t dtype, \
ConvInfo info, \
size_t workspace_size_, \
Opaque *opaque, \
infiniDevice_t device_type, \
int device_id) \
: InfiniopDescriptor{device_type, device_id}, \
_opaque(opaque), \
_dtype(dtype), \
_info(info), \
_workspace_size(workspace_size_) {} \
\
public: \
~Descriptor(); \
\
size_t workspaceSize() const { return _workspace_size; } \
\
/* Validates tensors/params and allocates a new descriptor into *desc_ptr. */ \
static infiniStatus_t create( \
infiniopHandle_t handle, \
Descriptor **desc_ptr, \
infiniopTensorDescriptor_t y, \
infiniopTensorDescriptor_t x, \
infiniopTensorDescriptor_t w, \
infiniopTensorDescriptor_t b, \
const void *pads, \
const void *strides, \
const void *dilations, \
size_t n); \
\
/* Runs y = conv(x, w) + bias on the given stream using the workspace. */ \
infiniStatus_t calculate( \
void *workspace, size_t workspace_size, \
void *y, \
const void *x, \
const void *w, \
const void *bias, \
void *stream) const; \
}; \
}
#endif // __CONV_H__
Prev
1
2
3
4
5
6
7
…
9
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment