Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
jerrrrry
infinicore
Commits
362f0187
Commit
362f0187
authored
Feb 28, 2026
by
wooway777
Browse files
issue/1032 - provide an alternate cuda swiglu
parent
718b18cf
Changes
9
Show whitespace changes
Inline
Side-by-side
Showing
9 changed files
with
494 additions
and
45 deletions
+494
-45
src/infiniop/ops/swiglu/cuda/kernel_cuda.cuh
src/infiniop/ops/swiglu/cuda/kernel_cuda.cuh
+83
-0
src/infiniop/ops/swiglu/info.h
src/infiniop/ops/swiglu/info.h
+62
-0
src/infiniop/ops/swiglu/metax/swiglu_metax_cuda.h
src/infiniop/ops/swiglu/metax/swiglu_metax_cuda.h
+8
-0
src/infiniop/ops/swiglu/metax/swiglu_metax_cuda.maca
src/infiniop/ops/swiglu/metax/swiglu_metax_cuda.maca
+115
-0
src/infiniop/ops/swiglu/nvidia/swiglu_nvidia_cuda.cu
src/infiniop/ops/swiglu/nvidia/swiglu_nvidia_cuda.cu
+118
-0
src/infiniop/ops/swiglu/nvidia/swiglu_nvidia_cuda.cuh
src/infiniop/ops/swiglu/nvidia/swiglu_nvidia_cuda.cuh
+8
-0
src/infiniop/ops/swiglu/operator.cc
src/infiniop/ops/swiglu/operator.cc
+50
-45
src/infiniop/ops/swiglu/swiglu_cuda.h
src/infiniop/ops/swiglu/swiglu_cuda.h
+48
-0
test/infinicore/ops/swiglu.py
test/infinicore/ops/swiglu.py
+2
-0
No files found.
src/infiniop/ops/swiglu/cuda/kernel_cuda.cuh
0 → 100644
View file @
362f0187
#ifndef __SWIGLU_CUDA_KERNEL_CUH__
#define __SWIGLU_CUDA_KERNEL_CUH__
// Logistic sigmoid 1 / (1 + exp(-x)), specialized per device data type.
//
// Each branch uses the narrowest intrinsics available for T:
//   half2 / half          — fp16 rcp/exp intrinsics (fast, reduced precision)
//   cuda_bfloat162 / cuda_bfloat16 — computed in float, rounded back to bf16
//   float                 — round-to-nearest rcp/add with fast __expf
//   anything else         — plain std::exp fallback (e.g. double)
//
// Fix vs. original: float-path literals were written as plain `1` / bare
// constants; use explicit `1.0f` so no accidental int/double conversion
// sneaks into the float intrinsic paths (CUDA float-literal idiom).
template <typename T>
__device__ __forceinline__ T sigmoid(const T &x) {
    if constexpr (std::is_same_v<T, half2>) {
        // 1 / (1 + exp(-x)) element-wise on the two packed halves.
        return h2rcp(__hadd2(make_half2(1.0f, 1.0f), h2exp(__hneg2(x))));
    } else if constexpr (std::is_same_v<T, half>) {
        // exp is evaluated in float via __expf, then rounded back to half.
        return hrcp(__hadd(half(1.0f), __float2half(__expf(__half2float(__hneg(x))))));
    } else if constexpr (std::is_same_v<T, cuda_bfloat162>) {
        // bf16x2 has no native rcp/exp path here: unpack to float, compute,
        // repack with round-to-nearest.
        float x0 = __bfloat162float(__low2bfloat16(x));
        float x1 = __bfloat162float(__high2bfloat16(x));
        float sig0 = __frcp_rn(__fadd_rn(1.0f, __expf(-x0)));
        float sig1 = __frcp_rn(__fadd_rn(1.0f, __expf(-x1)));
        return __floats2bfloat162_rn(sig0, sig1);
    } else if constexpr (std::is_same_v<T, cuda_bfloat16>) {
        float xf = __bfloat162float(x);
        return __float2bfloat16_rn(__frcp_rn(__fadd_rn(1.0f, __expf(-xf))));
    } else if constexpr (std::is_same_v<T, float>) {
        // was: __fadd_rn(1, ...) — int literal; make the float type explicit.
        return __frcp_rn(__fadd_rn(1.0f, __expf(-x)));
    } else {
        // Generic fallback (e.g. double); relies on std::exp being usable in
        // device code for T.
        return 1 / (1 + std::exp(-x));
    }
}
// SwiGLU: c = b * sigmoid(b) * a, element-wise over a (batch, seq_len,
// hidden_dim) view (2-D inputs are handled by the caller setting batch = 1
// and *_strides_0 = 0).
//
// Launch contract (set up by the host wrappers): 1-D grid, one thread per
// output element, `length` = batch * seq_len * hidden_dim. BLOCK_SIZE is
// currently unused inside the kernel body but kept in the signature for
// interface stability with the callers' instantiations.
//
// NOTE: the innermost (hidden_dim) axis is addressed without a stride —
// assumes the last dimension of a, b, c is contiguous. TODO confirm the
// descriptor layer guarantees this.
//
// Fix vs. original: all index arithmetic was done in `int`, so
// (tid % seq_len) * strides could overflow for large tensors; indices are
// now accumulated in ptrdiff_t and the linear id kept in size_t. The
// signature is unchanged.
template <typename T, unsigned int BLOCK_SIZE>
__device__ void SwiGLUCudaKernel(
    T *c,
    const T *a,
    const T *b,
    int length,
    size_t batch,
    size_t seq_len,
    size_t hidden_dim,
    ptrdiff_t c_strides_0,
    ptrdiff_t c_strides_1,
    ptrdiff_t a_strides_0,
    ptrdiff_t a_strides_1,
    ptrdiff_t b_strides_0,
    ptrdiff_t b_strides_1) {
    // Widen before multiplying so blockIdx.x * blockDim.x cannot overflow int.
    size_t tid = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
    if (tid < (size_t)length) {
        // Decompose the flat id into (batch, seq, hidden) coordinates and
        // fold each coordinate through the per-tensor strides.
        ptrdiff_t ind_c = (ptrdiff_t)(tid % hidden_dim); // innermost: stride 1 assumed
        ptrdiff_t ind_a = ind_c;
        ptrdiff_t ind_b = ind_c;
        tid /= hidden_dim;
        ind_c += (ptrdiff_t)(tid % seq_len) * c_strides_1;
        ind_a += (ptrdiff_t)(tid % seq_len) * a_strides_1;
        ind_b += (ptrdiff_t)(tid % seq_len) * b_strides_1;
        tid /= seq_len;
        ind_c += (ptrdiff_t)(tid % batch) * c_strides_0;
        ind_a += (ptrdiff_t)(tid % batch) * a_strides_0;
        ind_b += (ptrdiff_t)(tid % batch) * b_strides_0;
        T gate = b[ind_b]; // gate operand (passed through silu)
        T up = a[ind_a];   // linear ("up") operand
        if constexpr (std::is_same_v<T, half2>) {
            c[ind_c] = __hmul2(__hmul2(gate, sigmoid(gate)), up);
        } else if constexpr (std::is_same_v<T, half>) {
            c[ind_c] = __hmul(__hmul(gate, sigmoid(gate)), up);
        } else if constexpr (std::is_same_v<T, cuda_bfloat162>) {
            // bf16x2: unpack, multiply in float, repack (no native fused path).
            cuda_bfloat162 sig = sigmoid(gate);
            float gate0 = __bfloat162float(__low2bfloat16(gate));
            float gate1 = __bfloat162float(__high2bfloat16(gate));
            float sig0 = __bfloat162float(__low2bfloat16(sig));
            float sig1 = __bfloat162float(__high2bfloat16(sig));
            float up0 = __bfloat162float(__low2bfloat16(up));
            float up1 = __bfloat162float(__high2bfloat16(up));
            float res0 = __fmul_rn(__fmul_rn(gate0, sig0), up0);
            float res1 = __fmul_rn(__fmul_rn(gate1, sig1), up1);
            c[ind_c] = __floats2bfloat162_rn(res0, res1);
        } else if constexpr (std::is_same_v<T, cuda_bfloat16>) {
            cuda_bfloat16 sig = sigmoid(gate);
            float gatef = __bfloat162float(gate);
            float sigf = __bfloat162float(sig);
            float upf = __bfloat162float(up);
            c[ind_c] = __float2bfloat16_rn(__fmul_rn(__fmul_rn(gatef, sigf), upf));
        } else if constexpr (std::is_same_v<T, float>) {
            c[ind_c] = __fmul_rn(__fmul_rn(gate, sigmoid(gate)), up);
        } else {
            // Generic fallback (e.g. double).
            c[ind_c] = gate * sigmoid(gate) * up;
        }
    }
}
#endif // __SWIGLU_CUDA_KERNEL_CUH__
src/infiniop/ops/swiglu/info.h
0 → 100644
View file @
362f0187
#ifndef __SWIGLU_CUDA_INFO_H__
#define __SWIGLU_CUDA_INFO_H__
#include "../../../utils.h"
#include "../../operator.h"
#include "../../tensor.h"
namespace op::swiglu_cuda {

// Metadata extracted once at descriptor-creation time and handed to the
// CUDA SwiGLU kernels: dtype, flattened element count, logical
// (batch, seq_len, hidden_dim) sizes, and the two leading strides of each
// of c, a, b. The innermost stride is not stored — the kernels index the
// last dimension without a stride (assumes it is contiguous; TODO confirm
// the descriptor layer enforces that).
class SwiGLUCudaInfo {
    // Private default ctor: instances are only produced by the factory below.
    // NOTE(review): the factory uses aggregate init SwiGLUCudaInfo{...};
    // that is valid up to C++17 (ctor is defaulted, not user-provided) but
    // C++20 tightened aggregates to "no user-declared ctors" — confirm the
    // project's language standard.
    SwiGLUCudaInfo() = default;

public:
    infiniDtype_t dtype;                     // element type shared by c, a, b
    size_t length;                           // batch * seq_len * hidden_dim
    size_t batch, seq_len, hidden_dim;       // logical sizes (batch = 1 for 2-D)
    ptrdiff_t c_strides_0, c_strides_1;      // c: batch stride, seq stride
    ptrdiff_t a_strides_0, a_strides_1;      // a: batch stride, seq stride
    ptrdiff_t b_strides_0, b_strides_1;      // b: batch stride, seq stride

    // Validates the three descriptors (same dtype, same shape, dtype in
    // {F16, BF16, F32, F64}) and packs the kernel-relevant metadata.
    // Returns a status on validation failure via utils::Result's
    // status conversion; CHECK_DTYPE / CHECK_SAME_SHAPE may also return
    // early (project macros).
    // NOTE(review): F64 passes validation here, but the nvidia/metax
    // dispatchers only handle F16/F32/BF16 — F64 will fail later with
    // BAD_TENSOR_DTYPE at calculate(); confirm intent.
    // Assumes ndim is 2 or 3: shape[ndim - 2] underflows for 1-D tensors —
    // TODO confirm callers never pass ndim < 2.
    static utils::Result<SwiGLUCudaInfo> createSwiGLUCudaInfo(
        infiniopTensorDescriptor_t c_desc,
        infiniopTensorDescriptor_t a_desc,
        infiniopTensorDescriptor_t b_desc) {
        auto dtype = c_desc->dtype();
        if (dtype != a_desc->dtype() || dtype != b_desc->dtype()) {
            return INFINI_STATUS_BAD_TENSOR_DTYPE;
        }
        CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_BF16, INFINI_DTYPE_F32, INFINI_DTYPE_F64);
        auto shape = c_desc->shape();
        CHECK_SAME_SHAPE(shape, a_desc->shape(), b_desc->shape());
        auto ndim = c_desc->ndim();
        size_t hidden_dim = shape[ndim - 1];
        size_t seq_len = shape[ndim - 2];
        // 2-D inputs are treated as a single batch with batch stride 0.
        size_t batch = (ndim == 3 ? shape[0] : 1);
        size_t length = batch * seq_len * hidden_dim;
        ptrdiff_t c_strides_0 = (ndim == 3 ? c_desc->strides()[0] : 0);
        ptrdiff_t c_strides_1 = (ndim == 3 ? c_desc->strides()[1] : c_desc->strides()[0]);
        ptrdiff_t a_strides_0 = (ndim == 3 ? a_desc->strides()[0] : 0);
        ptrdiff_t a_strides_1 = (ndim == 3 ? a_desc->strides()[1] : a_desc->strides()[0]);
        ptrdiff_t b_strides_0 = (ndim == 3 ? b_desc->strides()[0] : 0);
        ptrdiff_t b_strides_1 = (ndim == 3 ? b_desc->strides()[1] : b_desc->strides()[0]);
        return utils::Result<SwiGLUCudaInfo>(SwiGLUCudaInfo{dtype,
                                                            length,
                                                            batch,
                                                            seq_len,
                                                            hidden_dim,
                                                            c_strides_0,
                                                            c_strides_1,
                                                            a_strides_0,
                                                            a_strides_1,
                                                            b_strides_0,
                                                            b_strides_1});
    }
};

}
// namespace op::swiglu_cuda
#endif // __SWIGLU_CUDA_INFO_H__
src/infiniop/ops/swiglu/metax/swiglu_metax_cuda.h
0 → 100644
View file @
362f0187
#ifndef __SWIGLU_CUDA_METAX_H__
#define __SWIGLU_CUDA_METAX_H__
#include "../swiglu_cuda.h"
DESCRIPTOR
(
metax
)
#endif
src/infiniop/ops/swiglu/metax/swiglu_metax_cuda.maca
0 → 100644
View file @
362f0187
#include "../../../devices/metax/metax_common.h"
#include "../../../devices/metax/metax_kernel_common.h"
#include "../cuda/kernel_cuda.cuh"
#include "swiglu_metax_cuda.h"
// Metax entry kernel for SwiGLU: thin __global__-style wrapper (via
// INFINIOP_METAX_KERNEL) that forwards to the shared device-side
// SwiGLUCudaKernel. One thread per output element; `length` bounds-check
// happens inside the device function.
template <typename T, unsigned int BLOCK_SIZE>
INFINIOP_METAX_KERNEL SwiGLUCuda(
    T *c,
    const T *a,
    const T *b,
    int length,
    size_t batch, size_t seq_len, size_t hidden_dim,
    ptrdiff_t c_strides_0, ptrdiff_t c_strides_1,
    ptrdiff_t a_strides_0, ptrdiff_t a_strides_1,
    ptrdiff_t b_strides_0, ptrdiff_t b_strides_1) {
    SwiGLUCudaKernel<T, BLOCK_SIZE>(c, a, b, length, batch, seq_len, hidden_dim,
                                    c_strides_0, c_strides_1,
                                    a_strides_0, a_strides_1,
                                    b_strides_0, b_strides_1);
}
namespace op::swiglu_cuda::metax {

// Device-handle internals kept behind an opaque pointer so the public
// header stays free of Metax headers.
struct Descriptor::Opaque {
    std::shared_ptr<device::metax::Handle::Internal> internal;
};

Descriptor::~Descriptor() {
    delete _opaque;
}

// Validates the tensor descriptors via SwiGLUCudaInfo and builds a
// descriptor with zero workspace requirement.
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t c_desc,
    infiniopTensorDescriptor_t a_desc,
    infiniopTensorDescriptor_t b_desc) {
    auto info = SwiGLUCudaInfo::createSwiGLUCudaInfo(c_desc, a_desc, b_desc);
    CHECK_RESULT(info);
    *desc_ptr = new Descriptor(
        new Opaque{reinterpret_cast<device::metax::Handle *>(handle)->internal()},
        info.take(), 0, handle->device, handle->device_id);
    return INFINI_STATUS_SUCCESS;
}

// Launches SwiGLUCuda over info.length elements with BLOCK_SIZE threads per
// block. `workspace` is accepted for interface symmetry but unused.
// Fix vs. original: sizes and strides were all truncated through `int`
// locals before the launch, silently corrupting 64-bit strides; they are
// now passed at their native widths (the kernel already takes
// size_t / ptrdiff_t). Only `length` stays int, matching the kernel
// parameter — assumes info.length < 2^31 (TODO: validate at create()).
template <unsigned int BLOCK_SIZE, typename T>
infiniStatus_t calculate_swiglu_cuda(
    const SwiGLUCudaInfo &info,
    T *c,
    const T *a,
    const T *b,
    hcStream_t stream,
    void *workspace) {
    int length = (int)info.length;
    int num_blocks = (length + BLOCK_SIZE - 1) / BLOCK_SIZE; // ceil-div
    SwiGLUCuda<T, BLOCK_SIZE>
        <<<num_blocks, BLOCK_SIZE, 0, stream>>>(c, a, b, length,
                                                info.batch, info.seq_len, info.hidden_dim,
                                                info.c_strides_0, info.c_strides_1,
                                                info.a_strides_0, info.a_strides_1,
                                                info.b_strides_0, info.b_strides_1);
    return INFINI_STATUS_SUCCESS;
}

// Dispatches on dtype and the device's max-threads-per-block.
// NOTE(review): only F16/F32/BF16 dispatch here, although the info layer
// also admits F64 — F64 falls through to BAD_TENSOR_DTYPE.
infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    void *c,
    const void *a,
    const void *b,
    void *stream_) const {
    if (workspace_size < _workspace_size) {
        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
    }
    hcStream_t stream = (hcStream_t)stream_;
#define CALCULATE_SWIGLU_CUDA(BLOCK_SIZE, TDATA) \
    calculate_swiglu_cuda<BLOCK_SIZE, TDATA>(_info, (TDATA *)c, (const TDATA *)a, (const TDATA *)b, stream, workspace)
#define CALCULATE_SWIGLU_CUDA_WITH_BLOCK_SIZE(BLOCK_SIZE)              \
    {                                                                  \
        if (_info.dtype == INFINI_DTYPE_F16)                           \
            return CALCULATE_SWIGLU_CUDA(BLOCK_SIZE, half);            \
        else if (_info.dtype == INFINI_DTYPE_F32)                      \
            return CALCULATE_SWIGLU_CUDA(BLOCK_SIZE, float);           \
        else if (_info.dtype == INFINI_DTYPE_BF16)                     \
            return CALCULATE_SWIGLU_CUDA(BLOCK_SIZE, __hpcc_bfloat16); \
        else                                                           \
            return INFINI_STATUS_BAD_TENSOR_DTYPE;                     \
    }
    if (_opaque->internal->maxThreadsPerBlock() == METAX_BLOCK_SIZE_1024) {
        CALCULATE_SWIGLU_CUDA_WITH_BLOCK_SIZE(METAX_BLOCK_SIZE_1024)
    } else {
        return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED;
    }
    return INFINI_STATUS_SUCCESS;
}
} // namespace op::swiglu_cuda::metax
src/infiniop/ops/swiglu/nvidia/swiglu_nvidia_cuda.cu
0 → 100644
View file @
362f0187
#include "../../../devices/nvidia/nvidia_common.cuh"
#include "../../../devices/nvidia/nvidia_kernel_common.cuh"
#include "../cuda/kernel_cuda.cuh"
#include "swiglu_nvidia_cuda.cuh"
// NVIDIA entry kernel for SwiGLU: thin __global__-style wrapper (via
// INFINIOP_CUDA_KERNEL) that forwards to the shared device-side
// SwiGLUCudaKernel. One thread per output element; `length` bounds-check
// happens inside the device function.
template <typename T, unsigned int BLOCK_SIZE>
INFINIOP_CUDA_KERNEL SwiGLUCuda(
    T *c,
    const T *a,
    const T *b,
    int length,
    size_t batch,
    size_t seq_len,
    size_t hidden_dim,
    ptrdiff_t c_strides_0,
    ptrdiff_t c_strides_1,
    ptrdiff_t a_strides_0,
    ptrdiff_t a_strides_1,
    ptrdiff_t b_strides_0,
    ptrdiff_t b_strides_1) {
    SwiGLUCudaKernel<T, BLOCK_SIZE>(c, a, b, length, batch, seq_len, hidden_dim,
                                    c_strides_0, c_strides_1,
                                    a_strides_0, a_strides_1,
                                    b_strides_0, b_strides_1);
}
namespace op::swiglu_cuda::nvidia {

// Device-handle internals kept behind an opaque pointer so the public
// header stays free of CUDA runtime headers.
struct Descriptor::Opaque {
    std::shared_ptr<device::nvidia::Handle::Internal> internal;
};

Descriptor::~Descriptor() {
    delete _opaque;
}

// Validates the tensor descriptors via SwiGLUCudaInfo and builds a
// descriptor with zero workspace requirement.
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t c_desc,
    infiniopTensorDescriptor_t a_desc,
    infiniopTensorDescriptor_t b_desc) {
    auto info = SwiGLUCudaInfo::createSwiGLUCudaInfo(c_desc, a_desc, b_desc);
    CHECK_RESULT(info);
    *desc_ptr = new Descriptor(
        new Opaque{reinterpret_cast<device::nvidia::Handle *>(handle)->internal()},
        info.take(), 0, handle->device, handle->device_id);
    return INFINI_STATUS_SUCCESS;
}

// Launches SwiGLUCuda over info.length elements with BLOCK_SIZE threads per
// block. `workspace` is accepted for interface symmetry but unused.
// Fix vs. original: sizes and strides were all truncated through `int`
// locals before the launch, silently corrupting 64-bit strides; they are
// now passed at their native widths (the kernel already takes
// size_t / ptrdiff_t). Only `length` stays int, matching the kernel
// parameter — assumes info.length < 2^31 (TODO: validate at create()).
template <unsigned int BLOCK_SIZE, typename T>
infiniStatus_t calculate_swiglu_cuda(
    const SwiGLUCudaInfo &info,
    T *c,
    const T *a,
    const T *b,
    cudaStream_t stream,
    void *workspace) {
    int length = (int)info.length;
    int num_blocks = (length + BLOCK_SIZE - 1) / BLOCK_SIZE; // ceil-div
    SwiGLUCuda<T, BLOCK_SIZE>
        <<<num_blocks, BLOCK_SIZE, 0, stream>>>(c, a, b, length,
                                                info.batch, info.seq_len, info.hidden_dim,
                                                info.c_strides_0, info.c_strides_1,
                                                info.a_strides_0, info.a_strides_1,
                                                info.b_strides_0, info.b_strides_1);
    return INFINI_STATUS_SUCCESS;
}

// Dispatches on dtype and the device's max-threads-per-block.
// NOTE(review): CUDA caps threads-per-block at 1024 on all current NVIDIA
// GPUs, so the CUDA_BLOCK_SIZE_4096 branch looks unreachable (and a 4096-
// thread launch would fail) — confirm what that constant represents.
// NOTE(review): only F16/F32/BF16 dispatch here, although the info layer
// also admits F64 — F64 falls through to BAD_TENSOR_DTYPE.
infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    void *c,
    const void *a,
    const void *b,
    void *stream_) const {
    if (workspace_size < _workspace_size) {
        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
    }
    cudaStream_t stream = (cudaStream_t)stream_;
#define CALCULATE_SWIGLU_CUDA(BLOCK_SIZE, TDATA) \
    calculate_swiglu_cuda<BLOCK_SIZE, TDATA>(_info, (TDATA *)c, (const TDATA *)a, (const TDATA *)b, stream, workspace)
#define CALCULATE_SWIGLU_CUDA_WITH_BLOCK_SIZE(BLOCK_SIZE)            \
    {                                                                \
        if (_info.dtype == INFINI_DTYPE_F16)                         \
            return CALCULATE_SWIGLU_CUDA(BLOCK_SIZE, half);          \
        else if (_info.dtype == INFINI_DTYPE_F32)                    \
            return CALCULATE_SWIGLU_CUDA(BLOCK_SIZE, float);         \
        else if (_info.dtype == INFINI_DTYPE_BF16)                   \
            return CALCULATE_SWIGLU_CUDA(BLOCK_SIZE, __nv_bfloat16); \
        else                                                         \
            return INFINI_STATUS_BAD_TENSOR_DTYPE;                   \
    }
    if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_1024) {
        CALCULATE_SWIGLU_CUDA_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_1024)
    } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_512) {
        CALCULATE_SWIGLU_CUDA_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_512)
    } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_4096) {
        CALCULATE_SWIGLU_CUDA_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_4096)
    } else {
        return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED;
    }
    return INFINI_STATUS_SUCCESS;
}
} // namespace op::swiglu_cuda::nvidia
// namespace op::swiglu_cuda::nvidia
src/infiniop/ops/swiglu/nvidia/swiglu_nvidia_cuda.cuh
0 → 100644
View file @
362f0187
#ifndef __SWIGLU_CUDA_NVIDIA_H__
#define __SWIGLU_CUDA_NVIDIA_H__
#include "../swiglu_cuda.h"
DESCRIPTOR
(
nvidia
)
#endif
src/infiniop/ops/swiglu/operator.cc
View file @
362f0187
...
...
@@ -6,11 +6,7 @@
#include "cpu/swiglu_cpu.h"
#endif
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) || defined(ENABLE_HYGON_API) || defined(ENABLE_ALI_API)
#if defined(ENABLE_NINETOOTHED)
#include "ninetoothed/swiglu.h"
#else
#include "nvidia/swiglu_nvidia.cuh"
#endif
#include "nvidia/swiglu_nvidia_cuda.cuh"
#endif
#ifdef ENABLE_KUNLUN_API
#include "kunlun/swiglu_kunlun.h"
...
...
@@ -19,7 +15,7 @@
#if defined(ENABLE_NINETOOTHED)
#include "ninetoothed/swiglu.h"
#else
#include "metax/swiglu_metax.h"
#include "metax/swiglu_metax
_cuda
.h"
#endif
#endif
#ifdef ENABLE_CAMBRICON_API
...
...
@@ -48,33 +44,38 @@ __C infiniStatus_t infiniopCreateSwiGLUDescriptor(
{a_desc, \
b_desc})
#define CREATE_CUDA(CASE, NAMESPACE) \
case CASE: \
return op::swiglu_cuda::NAMESPACE::Descriptor::create( \
handle, \
reinterpret_cast<op::swiglu_cuda::NAMESPACE::Descriptor **>(desc_ptr), \
c_desc, \
a_desc, \
b_desc)
switch
(
handle
->
device
)
{
#ifdef ENABLE_CPU_API
CREATE
(
INFINI_DEVICE_CPU
,
cpu
);
#endif
#ifdef ENABLE_NVIDIA_API
#ifdef ENABLE_NINETOOTHED
CREATE
(
INFINI_DEVICE_NVIDIA
,
ninetoothed
);
#else
CREATE
(
INFINI_DEVICE_NVIDIA
,
nvidia
);
#endif
CREATE_CUDA
(
INFINI_DEVICE_NVIDIA
,
nvidia
);
#endif
#ifdef ENABLE_ILUVATAR_API
#ifdef ENABLE_NINETOOTHED
CREATE
(
INFINI_DEVICE_ILUVATAR
,
ninetoothed
);
#else
CREATE
(
INFINI_DEVICE_ILUVATAR
,
nvidia
);
CREATE
_CUDA
(
INFINI_DEVICE_ILUVATAR
,
nvidia
);
#endif
#endif
#ifdef ENABLE_ALI_API
CREATE
(
INFINI_DEVICE_ALI
,
nvidia
);
CREATE
_CUDA
(
INFINI_DEVICE_ALI
,
nvidia
);
#endif
#ifdef ENABLE_QY_API
CREATE
(
INFINI_DEVICE_QY
,
nvidia
);
#endif
#ifdef ENABLE_HYGON_API
CREATE
(
INFINI_DEVICE_HYGON
,
nvidia
);
CREATE
_CUDA
(
INFINI_DEVICE_HYGON
,
nvidia
);
#endif
#ifdef ENABLE_KUNLUN_API
CREATE
(
INFINI_DEVICE_KUNLUN
,
kunlun
);
...
...
@@ -83,7 +84,7 @@ __C infiniStatus_t infiniopCreateSwiGLUDescriptor(
#ifdef ENABLE_NINETOOTHED
CREATE
(
INFINI_DEVICE_METAX
,
ninetoothed
);
#else
CREATE
(
INFINI_DEVICE_METAX
,
metax
);
CREATE
_CUDA
(
INFINI_DEVICE_METAX
,
metax
);
#endif
#endif
#ifdef ENABLE_CAMBRICON_API
...
...
@@ -110,32 +111,34 @@ __C infiniStatus_t infiniopGetSwiGLUWorkspaceSize(infiniopSwiGLUDescriptor_t des
*size = reinterpret_cast<op::swiglu::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
return INFINI_STATUS_SUCCESS
#define GET_CUDA(CASE, NAMESPACE) \
case CASE: \
*size = reinterpret_cast<op::swiglu_cuda::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
return INFINI_STATUS_SUCCESS
switch
(
desc
->
device_type
)
{
#ifdef ENABLE_CPU_API
GET
(
INFINI_DEVICE_CPU
,
cpu
);
#endif
#ifdef ENABLE_NVIDIA_API
#ifdef ENABLE_NINETOOTHED
GET
(
INFINI_DEVICE_NVIDIA
,
ninetoothed
);
#else
GET
(
INFINI_DEVICE_NVIDIA
,
nvidia
);
#endif
GET_CUDA
(
INFINI_DEVICE_NVIDIA
,
nvidia
);
#endif
#ifdef ENABLE_ILUVATAR_API
#ifdef ENABLE_NINETOOTHED
GET
(
INFINI_DEVICE_ILUVATAR
,
ninetoothed
);
#else
GET
(
INFINI_DEVICE_ILUVATAR
,
nvidia
);
GET
_CUDA
(
INFINI_DEVICE_ILUVATAR
,
nvidia
);
#endif
#endif
#ifdef ENABLE_ALI_API
GET
(
INFINI_DEVICE_ALI
,
nvidia
);
GET
_CUDA
(
INFINI_DEVICE_ALI
,
nvidia
);
#endif
#ifdef ENABLE_QY_API
GET
(
INFINI_DEVICE_QY
,
nvidia
);
GET
_CUDA
(
INFINI_DEVICE_QY
,
nvidia
);
#endif
#ifdef ENABLE_HYGON_API
GET
(
INFINI_DEVICE_HYGON
,
nvidia
);
GET
_CUDA
(
INFINI_DEVICE_HYGON
,
nvidia
);
#endif
#ifdef ENABLE_KUNLUN_API
GET
(
INFINI_DEVICE_KUNLUN
,
kunlun
);
...
...
@@ -144,7 +147,7 @@ __C infiniStatus_t infiniopGetSwiGLUWorkspaceSize(infiniopSwiGLUDescriptor_t des
#ifdef ENABLE_NINETOOTHED
GET
(
INFINI_DEVICE_METAX
,
ninetoothed
);
#else
GET
(
INFINI_DEVICE_METAX
,
metax
);
GET
_CUDA
(
INFINI_DEVICE_METAX
,
metax
);
#endif
#endif
#ifdef ENABLE_CAMBRICON_API
...
...
@@ -177,33 +180,34 @@ __C infiniStatus_t infiniopSwiGLU(
return reinterpret_cast<const op::swiglu::NAMESPACE::Descriptor *>(desc) \
->calculate(workspace, workspace_size, c, {a, b}, stream)
#define CALCULATE_CUDA(CASE, NAMESPACE) \
case CASE: \
return reinterpret_cast<const op::swiglu_cuda::NAMESPACE::Descriptor *>(desc) \
->calculate(workspace, workspace_size, c, a, b, stream)
switch
(
desc
->
device_type
)
{
#ifdef ENABLE_CPU_API
CALCULATE
(
INFINI_DEVICE_CPU
,
cpu
);
#endif
#ifdef ENABLE_NVIDIA_API
#ifdef ENABLE_NINETOOTHED
CALCULATE
(
INFINI_DEVICE_NVIDIA
,
ninetoothed
);
#else
CALCULATE
(
INFINI_DEVICE_NVIDIA
,
nvidia
);
#endif
CALCULATE_CUDA
(
INFINI_DEVICE_NVIDIA
,
nvidia
);
#endif
#ifdef ENABLE_ILUVATAR_API
#ifdef ENABLE_NINETOOTHED
CALCULATE
(
INFINI_DEVICE_ILUVATAR
,
ninetoothed
);
#else
CALCULATE
(
INFINI_DEVICE_ILUVATAR
,
nvidia
);
CALCULATE
_CUDA
(
INFINI_DEVICE_ILUVATAR
,
nvidia
);
#endif
#endif
#ifdef ENABLE_ALI_API
CALCULATE
(
INFINI_DEVICE_ALI
,
nvidia
);
CALCULATE
_CUDA
(
INFINI_DEVICE_ALI
,
nvidia
);
#endif
#ifdef ENABLE_QY_API
CALCULATE
(
INFINI_DEVICE_QY
,
nvidia
);
CALCULATE
_CUDA
(
INFINI_DEVICE_QY
,
nvidia
);
#endif
#ifdef ENABLE_HYGON_API
CALCULATE
(
INFINI_DEVICE_HYGON
,
nvidia
);
CALCULATE
_CUDA
(
INFINI_DEVICE_HYGON
,
nvidia
);
#endif
#ifdef ENABLE_KUNLUN_API
CALCULATE
(
INFINI_DEVICE_KUNLUN
,
kunlun
);
...
...
@@ -212,7 +216,7 @@ __C infiniStatus_t infiniopSwiGLU(
#ifdef ENABLE_NINETOOTHED
CALCULATE
(
INFINI_DEVICE_METAX
,
ninetoothed
);
#else
CALCULATE
(
INFINI_DEVICE_METAX
,
metax
);
CALCULATE
_CUDA
(
INFINI_DEVICE_METAX
,
metax
);
#endif
#endif
#ifdef ENABLE_CAMBRICON_API
...
...
@@ -240,33 +244,34 @@ infiniopDestroySwiGLUDescriptor(infiniopSwiGLUDescriptor_t desc) {
delete reinterpret_cast<const op::swiglu::NAMESPACE::Descriptor *>(desc); \
return INFINI_STATUS_SUCCESS;
#define DELETE_CUDA(CASE, NAMESPACE) \
case CASE: \
delete reinterpret_cast<const op::swiglu_cuda::NAMESPACE::Descriptor *>(desc); \
return INFINI_STATUS_SUCCESS;
switch
(
desc
->
device_type
)
{
#ifdef ENABLE_CPU_API
DELETE
(
INFINI_DEVICE_CPU
,
cpu
);
#endif
#ifdef ENABLE_NVIDIA_API
#ifdef ENABLE_NINETOOTHED
DELETE
(
INFINI_DEVICE_NVIDIA
,
ninetoothed
);
#else
DELETE
(
INFINI_DEVICE_NVIDIA
,
nvidia
);
#endif
DELETE_CUDA
(
INFINI_DEVICE_NVIDIA
,
nvidia
);
#endif
#ifdef ENABLE_ILUVATAR_API
#ifdef ENABLE_NINETOOTHED
DELETE
(
INFINI_DEVICE_ILUVATAR
,
ninetoothed
);
#else
DELETE
(
INFINI_DEVICE_ILUVATAR
,
nvidia
);
DELETE
_CUDA
(
INFINI_DEVICE_ILUVATAR
,
nvidia
);
#endif
#endif
#ifdef ENABLE_ALI_API
DELETE
(
INFINI_DEVICE_ALI
,
nvidia
);
DELETE
_CUDA
(
INFINI_DEVICE_ALI
,
nvidia
);
#endif
#ifdef ENABLE_QY_API
DELETE
(
INFINI_DEVICE_QY
,
nvidia
);
DELETE
_CUDA
(
INFINI_DEVICE_QY
,
nvidia
);
#endif
#ifdef ENABLE_HYGON_API
DELETE
(
INFINI_DEVICE_HYGON
,
nvidia
);
DELETE
_CUDA
(
INFINI_DEVICE_HYGON
,
nvidia
);
#endif
#ifdef ENABLE_KUNLUN_API
DELETE
(
INFINI_DEVICE_KUNLUN
,
kunlun
);
...
...
@@ -275,7 +280,7 @@ infiniopDestroySwiGLUDescriptor(infiniopSwiGLUDescriptor_t desc) {
#ifdef ENABLE_NINETOOTHED
DELETE
(
INFINI_DEVICE_METAX
,
ninetoothed
);
#else
DELETE
(
INFINI_DEVICE_METAX
,
metax
);
DELETE
_CUDA
(
INFINI_DEVICE_METAX
,
metax
);
#endif
#endif
#ifdef ENABLE_CAMBRICON_API
...
...
src/infiniop/ops/swiglu/swiglu_cuda.h
0 → 100644
View file @
362f0187
#ifndef SWIGLU_CUDA_H
#define SWIGLU_CUDA_H
#include "../../operator.h"
#include "info.h"
// DESCRIPTOR(NAMESPACE) stamps out the per-backend descriptor class in
// namespace op::swiglu_cuda::NAMESPACE (e.g. nvidia, metax). Each backend
// supplies, in its own .cu/.maca file, the definitions of:
//   - struct Descriptor::Opaque (backend handle internals)
//   - ~Descriptor(), create(), calculate()
// The class owns an Opaque*, the precomputed SwiGLUCudaInfo, and the
// required workspace size; construction goes through the private ctor
// invoked from the backend's create().
#define DESCRIPTOR(NAMESPACE)                                                \
                                                                             \
    namespace op::swiglu_cuda::NAMESPACE {                                   \
    class Descriptor final : public InfiniopDescriptor {                     \
        struct Opaque;                                                       \
        Opaque *_opaque;                                                     \
        SwiGLUCudaInfo _info;                                                \
        size_t _workspace_size;                                              \
                                                                             \
        Descriptor(                                                          \
            Opaque *opaque,                                                  \
            SwiGLUCudaInfo info,                                             \
            size_t workspace_size,                                           \
            infiniDevice_t device_type,                                      \
            int device_id)                                                   \
            : InfiniopDescriptor{device_type, device_id},                    \
              _opaque(opaque),                                               \
              _info(info),                                                   \
              _workspace_size(workspace_size) {}                             \
                                                                             \
    public:                                                                  \
        ~Descriptor();                                                       \
                                                                             \
        size_t workspaceSize() const { return _workspace_size; }             \
                                                                             \
        static infiniStatus_t create(                                        \
            infiniopHandle_t handle,                                         \
            Descriptor **desc_ptr,                                           \
            infiniopTensorDescriptor_t c_desc,                               \
            infiniopTensorDescriptor_t a_desc,                               \
            infiniopTensorDescriptor_t b_desc);                              \
                                                                             \
        infiniStatus_t calculate(                                            \
            void *workspace, size_t workspace_size,                          \
            void *c,                                                         \
            const void *a,                                                   \
            const void *b,                                                   \
            void *stream) const;                                             \
    };                                                                       \
    }
#endif // SWIGLU_CUDA_H
test/infinicore/ops/swiglu.py
View file @
362f0187
...
...
@@ -34,6 +34,8 @@ _TEST_CASES_DATA = [
# Large tensors
((
16
,
5632
),
None
,
None
,
None
),
((
4
,
4
,
5632
),
None
,
None
,
None
),
((
1
,
512
,
16384
),
(
16777216
,
32768
,
1
),
(
16777216
,
32768
,
1
),
None
),
((
1
,
16384
,
16384
),
(
536870912
,
32768
,
1
),
(
536870912
,
32768
,
1
),
None
),
]
# Tolerance configuration
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment