Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
jerrrrry
infinicore
Commits
05247bb7
Commit
05247bb7
authored
Jul 10, 2025
by
PanZezhong
Browse files
issue/291/refactor: 适配沐曦
Signed-off-by:
PanZezhong
<
panzezhong@qiyuanlab.com
>
parent
abf1e021
Changes
19
Hide whitespace changes
Inline
Side-by-side
Showing
19 changed files
with
155 additions
and
158 deletions
+155
-158
src/infiniop/devices/cuda/cuda_kernel_common.cuh
src/infiniop/devices/cuda/cuda_kernel_common.cuh
+1
-0
src/infiniop/devices/maca/maca_kernel_common.h
src/infiniop/devices/maca/maca_kernel_common.h
+5
-7
src/infiniop/ops/gemm/maca/gemm_maca.cc
src/infiniop/ops/gemm/maca/gemm_maca.cc
+5
-4
src/infiniop/ops/rms_norm/cuda/kernel.cuh
src/infiniop/ops/rms_norm/cuda/kernel.cuh
+2
-2
src/infiniop/ops/rms_norm/metax/rms_norm_metax.maca
src/infiniop/ops/rms_norm/metax/rms_norm_metax.maca
+29
-11
src/infiniop/ops/rms_norm/nvidia/rms_norm_nvidia.cu
src/infiniop/ops/rms_norm/nvidia/rms_norm_nvidia.cu
+21
-9
src/infiniop/ops/rope/cuda/kernel.cuh
src/infiniop/ops/rope/cuda/kernel.cuh
+4
-4
src/infiniop/ops/rope/maca/rope_maca_kernel.h
src/infiniop/ops/rope/maca/rope_maca_kernel.h
+0
-42
src/infiniop/ops/rope/metax/rope_metax.h
src/infiniop/ops/rope/metax/rope_metax.h
+1
-1
src/infiniop/ops/rope/metax/rope_metax.maca
src/infiniop/ops/rope/metax/rope_metax.maca
+29
-4
src/infiniop/ops/rope/nvidia/rope_nvidia.cu
src/infiniop/ops/rope/nvidia/rope_nvidia.cu
+23
-3
src/infiniop/ops/rope/operator.cc
src/infiniop/ops/rope/operator.cc
+5
-5
src/infiniop/ops/swiglu/cuda/kernel.cuh
src/infiniop/ops/swiglu/cuda/kernel.cuh
+6
-10
src/infiniop/ops/swiglu/maca/swiglu_maca_internal.h
src/infiniop/ops/swiglu/maca/swiglu_maca_internal.h
+0
-40
src/infiniop/ops/swiglu/metax/swiglu_metax.h
src/infiniop/ops/swiglu/metax/swiglu_metax.h
+1
-1
src/infiniop/ops/swiglu/metax/swiglu_metax.maca
src/infiniop/ops/swiglu/metax/swiglu_metax.maca
+13
-8
src/infiniop/ops/swiglu/nvidia/swiglu_nvidia.cu
src/infiniop/ops/swiglu/nvidia/swiglu_nvidia.cu
+4
-1
src/infiniop/ops/swiglu/operator.cc
src/infiniop/ops/swiglu/operator.cc
+5
-5
xmake.lua
xmake.lua
+1
-1
No files found.
src/infiniop/devices/cuda/cuda_kernel_common.cuh
View file @
05247bb7
...
...
@@ -16,6 +16,7 @@
#define CHECK_CUDA(API) CHECK_INTERNAL(API, cudaSuccess)
using
cuda_bfloat16
=
nv_bfloat16
;
using
cuda_bfloat162
=
nv_bfloat162
;
namespace
device
::
cuda
{
// return the memory offset of original tensor, given the flattened index of broadcasted tensor
...
...
src/infiniop/devices/maca/maca_kernel_common.h
View file @
05247bb7
#define INFINIOP_MACA_KERNEL __global__ void
#include <maca_bf16.h>
#include <maca_fp16.h>
// Possible maximum number of threads per block for MACA architectures
// Used for picking correct kernel launch configuration
#define MACA_BLOCK_SIZE_1024 1024
...
...
@@ -10,7 +7,8 @@
#define CHECK_MACA(API) CHECK_INTERNAL(API, hcSuccess)
using
cuda_bfloat16
=
maca_bfloat16
;
using
cuda_bfloat16
=
hpcc_bfloat16
;
using
cuda_bfloat162
=
hpcc_bfloat162
;
namespace
device
::
maca
{
...
...
@@ -52,7 +50,7 @@ exp_(const float val) {
__forceinline__
__device__
long
double
exp_
(
const
long
double
val
)
{
return
exp
l
(
val
);
return
exp
(
val
);
}
__forceinline__
__device__
double
...
...
@@ -65,7 +63,7 @@ exp_(const __half x) {
return
hexp
(
x
);
}
__forceinline__
__device__
__hpcc_bfloat16
;
exp_
(
const
__hpcc_bfloat16
;
x
)
{
__forceinline__
__device__
__hpcc_bfloat16
exp_
(
const
__hpcc_bfloat16
x
)
{
return
hexp
(
x
);
}
src/infiniop/ops/gemm/maca/gemm_maca.cc
View file @
05247bb7
...
...
@@ -21,9 +21,7 @@ infiniStatus_t Descriptor::create(
auto
handle
=
reinterpret_cast
<
device
::
maca
::
Handle
*>
(
handle_
);
auto
dtype
=
c_desc
->
dtype
();
if
(
dtype
!=
INFINI_DTYPE_F16
&&
dtype
!=
INFINI_DTYPE_F32
)
{
return
INFINI_STATUS_BAD_TENSOR_DTYPE
;
}
CHECK_DTYPE
(
dtype
,
INFINI_DTYPE_F16
,
INFINI_DTYPE_F32
,
INFINI_DTYPE_BF16
);
auto
result
=
MatmulInfo
::
create
(
c_desc
,
a_desc
,
b_desc
,
MatrixLayout
::
COL_MAJOR
);
CHECK_RESULT
(
result
);
...
...
@@ -53,7 +51,10 @@ infiniStatus_t Descriptor::calculate(
a_type
=
b_type
=
c_type
=
HPCC_R_16F
;
compute_type
=
HCBLAS_COMPUTE_32F
;
break
;
case
INFINI_DTYPE_BF16
:
a_type
=
b_type
=
c_type
=
HPCC_R_16BF
;
compute_type
=
HCBLAS_COMPUTE_32F
;
break
;
case
INFINI_DTYPE_F32
:
a_type
=
b_type
=
c_type
=
HPCC_R_32F
;
compute_type
=
HCBLAS_COMPUTE_32F_FAST_TF32
;
...
...
src/infiniop/ops/rms_norm/cuda/kernel.cuh
View file @
05247bb7
#ifndef __RMS_NORM_CUDA_KERNEL_H__
#define __RMS_NORM_CUDA_KERNEL_H__
template
<
unsigned
int
BLOCK_SIZE
,
typename
Tdata
,
typename
Tweight
,
typename
Tcompute
>
INFINIOP_CUDA_KERNEL
rmsnormBlock
(
template
<
unsigned
int
BLOCK_SIZE
,
typename
Tcompute
,
typename
Tdata
,
typename
Tweight
>
__device__
void
rmsnormBlock
(
Tdata
*
__restrict__
y
,
ptrdiff_t
stride_y
,
const
Tdata
*
__restrict__
x
,
...
...
src/infiniop/ops/rms_norm/metax/rms_norm_metax.maca
View file @
05247bb7
#include "../../../devices/maca/common_maca.h"
#include "../cuda/rms_norm_kernel.cuh"
#include "rms_norm_metax.cuh"
#include "../../../devices/maca/maca_kernel_common.h"
#include <cub/block/block_reduce.cuh>
#include "../../../reduce/cuda/reduce.cuh"
#include "../cuda/kernel.cuh"
// MACA kernel entry point for RMS normalization.
//
// INFINIOP_MACA_KERNEL expands to `__global__ void`, so this template is the
// launchable symbol. It contains no logic of its own: every argument is handed
// unchanged to the shared device routine rmsnormBlock (../cuda/kernel.cuh).
//
// Template parameters (same order as rmsnormBlock):
//   BLOCK_SIZE - compile-time block size forwarded to rmsnormBlock
//   Tcompute   - forwarded to rmsnormBlock (presumably the accumulation type)
//   Tdata      - element type behind y and x
//   Tweight    - element type behind w
//
// Arguments:
//   y, stride_y - non-const data pointer and its stride (presumably the output)
//   x, stride_x - const data pointer and its stride (presumably the input)
//   w           - const weight pointer
//   dim         - size_t forwarded as-is (presumably the normalized length)
//   epsilon     - float forwarded as-is
template <unsigned int BLOCK_SIZE, typename Tcompute, typename Tdata, typename Tweight>
INFINIOP_MACA_KERNEL rmsnormKernel(
    Tdata *__restrict__ y,
    ptrdiff_t stride_y,
    const Tdata *__restrict__ x,
    ptrdiff_t stride_x,
    const Tweight *__restrict__ w,
    size_t dim,
    float epsilon) {
    // Tdata/Tweight are written out explicitly; they match what deduction
    // from the pointer arguments would produce.
    rmsnormBlock<BLOCK_SIZE, Tcompute, Tdata, Tweight>(
        y, stride_y,
        x, stride_x,
        w, dim, epsilon);
}
namespace op::rms_norm::maca {
struct Descriptor::Opaque {
...
...
@@ -46,14 +64,14 @@ infiniStatus_t launchKernel(
float epsilon,
hcStream_t maca_stream) {
#define LAUNCH_KERNEL(Tdata, Tweight, Tcompute) \
rmsnorm
Block
<BLOCK_SIZE, Tdata, Tweight
, Tcompute
><<<batch_size, BLOCK_SIZE, 0, maca_stream>>>( \
reinterpret_cast<Tdata *>(y), \
stride_y, \
reinterpret_cast<const Tdata *>(x), \
stride_x, \
reinterpret_cast<const Tweight *>(w), \
dim, \
#define LAUNCH_KERNEL(Tdata, Tweight, Tcompute)
\
rmsnorm
Kernel
<BLOCK_SIZE,
Tcompute,
Tdata, Tweight><<<batch_size, BLOCK_SIZE, 0, maca_stream>>>( \
reinterpret_cast<Tdata *>(y),
\
stride_y,
\
reinterpret_cast<const Tdata *>(x),
\
stride_x,
\
reinterpret_cast<const Tweight *>(w),
\
dim,
\
epsilon)
if (atype == INFINI_DTYPE_F16 && wtype == INFINI_DTYPE_F16) {
...
...
@@ -91,8 +109,8 @@ infiniStatus_t Descriptor::calculate(
auto maca_stream = reinterpret_cast<hcStream_t>(stream);
// launch kernel with different block sizes
if (_opaque->internal->maxThreadsPerBlock() ==
CUD
A_BLOCK_SIZE_1024) {
CHECK_STATUS(launchKernel<
CUD
A_BLOCK_SIZE_1024>(batch_size, dim, y, _info.atype, stride_y, x, stride_x, w, _info.wtype, _info.epsilon, maca_stream));
if (_opaque->internal->maxThreadsPerBlock() ==
MAC
A_BLOCK_SIZE_1024) {
CHECK_STATUS(launchKernel<
MAC
A_BLOCK_SIZE_1024>(batch_size, dim, y, _info.atype, stride_y, x, stride_x, w, _info.wtype, _info.epsilon, maca_stream));
} else {
return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED;
}
...
...
src/infiniop/ops/rms_norm/nvidia/rms_norm_nvidia.cu
View file @
05247bb7
...
...
@@ -8,6 +8,18 @@
#include "../cuda/kernel.cuh"
// NVIDIA kernel entry point for RMS normalization.
//
// The body only forwards its arguments to the shared device routine
// rmsnormBlock from ../cuda/kernel.cuh; INFINIOP_CUDA_KERNEL is defined
// elsewhere (presumably as `__global__ void` - confirm in the CUDA common header).
//
// Template parameters (same order as rmsnormBlock):
//   BLOCK_SIZE - compile-time block size forwarded to rmsnormBlock
//   Tcompute   - forwarded to rmsnormBlock (presumably the accumulation type)
//   Tdata      - element type behind y and x
//   Tweight    - element type behind w
//
// Arguments:
//   y, stride_y - non-const data pointer and its stride (presumably the output)
//   x, stride_x - const data pointer and its stride (presumably the input)
//   w           - const weight pointer
//   dim         - size_t forwarded as-is (presumably the normalized length)
//   epsilon     - float forwarded as-is
template <unsigned int BLOCK_SIZE, typename Tcompute, typename Tdata, typename Tweight>
INFINIOP_CUDA_KERNEL rmsnormKernel(
    Tdata *__restrict__ y,
    ptrdiff_t stride_y,
    const Tdata *__restrict__ x,
    ptrdiff_t stride_x,
    const Tweight *__restrict__ w,
    size_t dim,
    float epsilon) {
    // Tdata/Tweight are written out explicitly; they match what deduction
    // from the pointer arguments would produce.
    rmsnormBlock<BLOCK_SIZE, Tcompute, Tdata, Tweight>(
        y, stride_y,
        x, stride_x,
        w, dim, epsilon);
}
namespace
op
::
rms_norm
::
nvidia
{
struct
Descriptor
::
Opaque
{
...
...
@@ -52,14 +64,14 @@ infiniStatus_t launchKernel(
float
epsilon
,
cudaStream_t
cuda_stream
)
{
#define LAUNCH_KERNEL(Tdata, Tweight, Tcompute) \
rmsnorm
Block
<BLOCK_SIZE, Tdata, Tweight
, Tcompute
><<<batch_size, BLOCK_SIZE, 0, cuda_stream>>>( \
reinterpret_cast<Tdata *>(y), \
stride_y, \
reinterpret_cast<const Tdata *>(x), \
stride_x, \
reinterpret_cast<const Tweight *>(w), \
dim, \
#define LAUNCH_KERNEL(Tdata, Tweight, Tcompute)
\
rmsnorm
Kernel
<BLOCK_SIZE,
Tcompute,
Tdata, Tweight><<<batch_size, BLOCK_SIZE, 0, cuda_stream>>>( \
reinterpret_cast<Tdata *>(y),
\
stride_y,
\
reinterpret_cast<const Tdata *>(x),
\
stride_x,
\
reinterpret_cast<const Tweight *>(w),
\
dim,
\
epsilon)
if
(
atype
==
INFINI_DTYPE_F16
&&
wtype
==
INFINI_DTYPE_F16
)
{
...
...
@@ -108,4 +120,4 @@ infiniStatus_t Descriptor::calculate(
}
return
INFINI_STATUS_SUCCESS
;
}
}
// namespace op::rms_norm::
cud
a
}
// namespace op::rms_norm::
nvidi
a
src/infiniop/ops/rope/cuda/kernel.cuh
View file @
05247bb7
...
...
@@ -2,7 +2,7 @@
#define __INFINIOP_ROPE_CUDA_KERNEL_CUH__
template
<
typename
Tdata
,
typename
Tindex
,
typename
Tangle
>
INFINIOP_CUDA_KERNEL
ropeThreadPerItem
(
__device__
void
ropeThreadPerItem
Block
(
Tdata
*
y_
,
const
Tdata
*
x_
,
const
Tindex
*
__restrict__
pos_ids
,
...
...
@@ -28,9 +28,9 @@ INFINIOP_CUDA_KERNEL ropeThreadPerItem(
Tangle
y0
=
x
.
x
*
cos__
-
x
.
y
*
sin__
,
y1
=
x
.
x
*
sin__
+
x
.
y
*
cos__
;
y
=
half2
(
y0
,
y1
);
}
else
if
constexpr
(
std
::
is_same
<
Tdata
,
__nv
_bfloat16
>::
value
)
{
auto
&
y
=
reinterpret_cast
<
__nv
_bfloat162
&>
(
y_
[
y_offset
+
2
*
i
]);
auto
&
x
=
reinterpret_cast
<
const
__nv
_bfloat162
&>
(
x_
[
x_offset
+
2
*
i
]);
}
else
if
constexpr
(
std
::
is_same
<
Tdata
,
cuda
_bfloat16
>::
value
)
{
auto
&
y
=
reinterpret_cast
<
cuda
_bfloat162
&>
(
y_
[
y_offset
+
2
*
i
]);
auto
&
x
=
reinterpret_cast
<
const
cuda
_bfloat162
&>
(
x_
[
x_offset
+
2
*
i
]);
Tangle
x0
=
__low2bfloat16
(
x
);
Tangle
x1
=
__high2bfloat16
(
x
);
...
...
src/infiniop/ops/rope/maca/rope_maca_kernel.h
deleted
100644 → 0
View file @
abf1e021
#ifndef __INFINIOP_ROPE_MACA_KERNEL_H__
#define __INFINIOP_ROPE_MACA_KERNEL_H__
#include "../../../devices/maca/maca_kernel_common.h"
// RoPE kernel for MACA: rotates consecutive element pairs of x_ by the
// sin/cos table entry selected through pos_ids and stores the result in y_.
//
// Indexing, as written in the body:
//   - blockIdx.x scales the *_stride_seqlen strides and selects pos_ids[blockIdx.x];
//     blockIdx.y scales the *_stride_nhead strides.
//   - Threads of a block walk the pair index from threadIdx.x up to table_dim
//     in steps of blockDim.x.
//   - Pair p uses table entry [pos_id * table_dim + p] and touches elements
//     2p and 2p + 1 relative to the block's base offset.
//
// For each pair (x0, x1) with table values (s, c):
//   y0 = x0 * c - x1 * s
//   y1 = x0 * s + x1 * c
//
// Tdata == half takes a path that loads and stores each pair as one half2;
// every other Tdata reads the two elements separately, computes in Tangle, and
// converts back with Tdata(...).
template <typename Tdata, typename Tindex, typename Tangle>
INFINIOP_MACA_KERNEL ropeThreadPerItem(
    Tdata *y_,
    const Tdata *x_,
    const Tindex *__restrict__ pos_ids,
    const Tangle *__restrict__ sin_table,
    const Tangle *__restrict__ cos_table,
    size_t table_dim,
    ptrdiff_t y_stride_seqlen,
    ptrdiff_t y_stride_nhead,
    ptrdiff_t x_stride_seqlen,
    ptrdiff_t x_stride_nhead) {

    // Base offsets of this block inside the output and input buffers.
    const auto out_base = blockIdx.x * y_stride_seqlen + blockIdx.y * y_stride_nhead;
    const auto in_base = blockIdx.x * x_stride_seqlen + blockIdx.y * x_stride_nhead;

    // Row of the sin/cos tables selected by the position id at blockIdx.x.
    const size_t position = size_t(pos_ids[blockIdx.x]);
    const auto table_base = position * table_dim;

    for (size_t pair = threadIdx.x; pair < table_dim; pair += blockDim.x) {
        Tangle sin_v = sin_table[table_base + pair];
        Tangle cos_v = cos_table[table_base + pair];

        // Index of the first element of the pair; the second one is at +1.
        const auto out_idx = out_base + 2 * pair;
        const auto in_idx = in_base + 2 * pair;

        if constexpr (std::is_same_v<Tdata, half>) {
            // View the two adjacent halves as a single half2.
            auto &dst = reinterpret_cast<half2 &>(y_[out_idx]);
            auto &src = reinterpret_cast<const half2 &>(x_[in_idx]);
            Tangle rot0 = src.x * cos_v - src.y * sin_v;
            Tangle rot1 = src.x * sin_v + src.y * cos_v;
            dst = half2(rot0, rot1);
        } else {
            // Read both inputs before writing either output.
            Tangle in0 = x_[in_idx];
            Tangle in1 = x_[in_idx + 1];
            y_[out_idx] = Tdata(in0 * cos_v - in1 * sin_v);
            y_[out_idx + 1] = Tdata(in0 * sin_v + in1 * cos_v);
        }
    }
}
#endif
src/infiniop/ops/rope/m
aca
/rope_m
aca
.h
→
src/infiniop/ops/rope/m
etax
/rope_m
etax
.h
View file @
05247bb7
...
...
@@ -3,6 +3,6 @@
#include "../rope.h"
DESCRIPTOR
(
m
aca
)
DESCRIPTOR
(
m
etax
)
#endif // __INFINIOP_ROPE_MACA_H__
src/infiniop/ops/rope/m
aca
/rope_m
aca
.maca
→
src/infiniop/ops/rope/m
etax
/rope_m
etax
.maca
View file @
05247bb7
#include "../../../devices/maca/common_maca.h"
#include "rope_maca.h"
#include "rope_maca_kernel.h"
#include "rope_metax.h"
#include "../../../devices/maca/maca_kernel_common.h"
#include "../cuda/kernel.cuh"
// MACA kernel entry point for RoPE.
//
// INFINIOP_MACA_KERNEL makes this the launchable symbol; the computation lives
// in the shared device routine ropeThreadPerItemBlock (../cuda/kernel.cuh),
// which receives every argument unchanged and in the same order.
//
// Template parameters:
//   Tdata  - element type of y_ and x_
//   Tindex - element type of pos_ids
//   Tangle - element type of sin_table and cos_table
//
// Arguments:
//   y_, x_               - non-const and const data pointers
//   pos_ids              - index array forwarded to ropeThreadPerItemBlock
//   sin_table, cos_table - angle tables forwarded to ropeThreadPerItemBlock
//   table_dim            - size_t forwarded as-is
//   *_stride_seqlen/*_stride_nhead - ptrdiff_t strides for y_ and x_
template <typename Tdata, typename Tindex, typename Tangle>
INFINIOP_MACA_KERNEL ropeThreadPerItemKernel(
    Tdata *y_,
    const Tdata *x_,
    const Tindex *__restrict__ pos_ids,
    const Tangle *__restrict__ sin_table,
    const Tangle *__restrict__ cos_table,
    size_t table_dim,
    ptrdiff_t y_stride_seqlen,
    ptrdiff_t y_stride_nhead,
    ptrdiff_t x_stride_seqlen,
    ptrdiff_t x_stride_nhead) {
    // Template arguments are spelled out; they equal what deduction from the
    // pointer arguments would select.
    ropeThreadPerItemBlock<Tdata, Tindex, Tangle>(
        y_, x_,
        pos_ids,
        sin_table, cos_table, table_dim,
        y_stride_seqlen, y_stride_nhead,
        x_stride_seqlen, x_stride_nhead);
}
namespace op::rope::m
aca
{
namespace op::rope::m
etax
{
struct Descriptor::Opaque {
std::shared_ptr<device::maca::Handle::Internal> internal;
...
...
@@ -50,7 +73,7 @@ infiniStatus_t calculateRoPE(const RoPEInfo &info,
dimy = uint32_t(info.nhead);
int nthreads = std::max(int(info.table_dim), block_size);
ropeThreadPerItem<<<dim3(dimx, dimy), nthreads, 0, stream>>>(
ropeThreadPerItem
Kernel
<<<dim3(dimx, dimy), nthreads, 0, stream>>>(
y, x, pos_ids, sin_table, cos_table, info.table_dim,
info.y_stride_seqlen, info.y_stride_nhead, info.x_stride_seqlen, info.x_stride_nhead);
...
...
@@ -102,6 +125,8 @@ infiniStatus_t Descriptor::calculate(
switch (_info.data_type) {
case INFINI_DTYPE_F16:
ROPE_TYPE(half);
case INFINI_DTYPE_BF16:
ROPE_TYPE(cuda_bfloat16);
case INFINI_DTYPE_F32:
ROPE_TYPE(float);
case INFINI_DTYPE_F64:
...
...
src/infiniop/ops/rope/nvidia/rope_nvidia.cu
View file @
05247bb7
...
...
@@ -5,6 +5,26 @@
#include "../cuda/kernel.cuh"
// NVIDIA kernel entry point for RoPE.
//
// All arguments are forwarded unchanged, in the same order, to the shared
// device routine ropeThreadPerItemBlock from ../cuda/kernel.cuh.
// INFINIOP_CUDA_KERNEL is defined elsewhere (presumably as `__global__ void` -
// confirm in the CUDA common header).
//
// Template parameters:
//   Tdata  - element type of y_ and x_
//   Tindex - element type of pos_ids
//   Tangle - element type of sin_table and cos_table
//
// Arguments:
//   y_, x_               - non-const and const data pointers
//   pos_ids              - index array forwarded to ropeThreadPerItemBlock
//   sin_table, cos_table - angle tables forwarded to ropeThreadPerItemBlock
//   table_dim            - size_t forwarded as-is
//   *_stride_seqlen/*_stride_nhead - ptrdiff_t strides for y_ and x_
template <typename Tdata, typename Tindex, typename Tangle>
INFINIOP_CUDA_KERNEL ropeThreadPerItemKernel(
    Tdata *y_,
    const Tdata *x_,
    const Tindex *__restrict__ pos_ids,
    const Tangle *__restrict__ sin_table,
    const Tangle *__restrict__ cos_table,
    size_t table_dim,
    ptrdiff_t y_stride_seqlen,
    ptrdiff_t y_stride_nhead,
    ptrdiff_t x_stride_seqlen,
    ptrdiff_t x_stride_nhead) {
    // Template arguments are spelled out; they equal what deduction from the
    // pointer arguments would select.
    ropeThreadPerItemBlock<Tdata, Tindex, Tangle>(
        y_, x_,
        pos_ids,
        sin_table, cos_table, table_dim,
        y_stride_seqlen, y_stride_nhead,
        x_stride_seqlen, x_stride_nhead);
}
namespace
op
::
rope
::
nvidia
{
struct
Descriptor
::
Opaque
{
...
...
@@ -53,7 +73,7 @@ infiniStatus_t calculateRoPE(const RoPEInfo &info,
dimy
=
uint32_t
(
info
.
nhead
);
int
nthreads
=
std
::
max
(
int
(
info
.
table_dim
),
block_size
);
ropeThreadPerItem
<<<
dim3
(
dimx
,
dimy
),
nthreads
,
0
,
stream
>>>
(
ropeThreadPerItem
Kernel
<<<
dim3
(
dimx
,
dimy
),
nthreads
,
0
,
stream
>>>
(
y
,
x
,
pos_ids
,
sin_table
,
cos_table
,
info
.
table_dim
,
info
.
y_stride_seqlen
,
info
.
y_stride_nhead
,
info
.
x_stride_seqlen
,
info
.
x_stride_nhead
);
...
...
@@ -106,7 +126,7 @@ infiniStatus_t Descriptor::calculate(
case
INFINI_DTYPE_F16
:
ROPE_TYPE
(
half
);
case
INFINI_DTYPE_BF16
:
ROPE_TYPE
(
__nv
_bfloat16
);
ROPE_TYPE
(
cuda
_bfloat16
);
case
INFINI_DTYPE_F32
:
ROPE_TYPE
(
float
);
case
INFINI_DTYPE_F64
:
...
...
@@ -121,4 +141,4 @@ infiniStatus_t Descriptor::calculate(
#undef ROPE_TYPE
#undef CALCULATE_ROPE
}
// namespace op::rope::
cud
a
}
// namespace op::rope::
nvidi
a
src/infiniop/ops/rope/operator.cc
View file @
05247bb7
...
...
@@ -12,7 +12,7 @@
#include "ascend/rope_ascend.h"
#endif
#ifdef ENABLE_METAX_API
#include "m
aca
/rope_m
aca
.h"
#include "m
etax
/rope_m
etax
.h"
#endif
__C
infiniStatus_t
infiniopCreateRoPEDescriptor
(
...
...
@@ -43,7 +43,7 @@ __C infiniStatus_t infiniopCreateRoPEDescriptor(
CREATE
(
INFINI_DEVICE_NVIDIA
,
nvidia
);
#endif
#ifdef ENABLE_METAX_API
CREATE
(
INFINI_DEVICE_METAX
,
m
aca
);
CREATE
(
INFINI_DEVICE_METAX
,
m
etax
);
#endif
#ifdef ENABLE_ASCEND_API
CREATE
(
INFINI_DEVICE_ASCEND
,
ascend
);
...
...
@@ -84,7 +84,7 @@ __C infiniStatus_t infiniopGetRoPEWorkspaceSize(infiniopRoPEDescriptor_t desc,
GET
(
INFINI_DEVICE_NVIDIA
,
nvidia
);
#endif
#ifdef ENABLE_METAX_API
GET
(
INFINI_DEVICE_METAX
,
m
aca
);
GET
(
INFINI_DEVICE_METAX
,
m
etax
);
#endif
#ifdef ENABLE_CAMBRICON_MLU
case
DevCambriconMlu
:
{
...
...
@@ -135,7 +135,7 @@ __C infiniStatus_t infiniopRoPE(
CALCULATE
(
INFINI_DEVICE_NVIDIA
,
nvidia
);
#endif
#ifdef ENABLE_METAX_API
CALCULATE
(
INFINI_DEVICE_METAX
,
m
aca
);
CALCULATE
(
INFINI_DEVICE_METAX
,
m
etax
);
#endif
#ifdef ENABLE_CAMBRICON_MLU
case
DevCambriconMlu
:
{
...
...
@@ -181,7 +181,7 @@ infiniopDestroyRoPEDescriptor(infiniopRoPEDescriptor_t desc) {
DELETE
(
INFINI_DEVICE_NVIDIA
,
nvidia
);
#endif
#ifdef ENABLE_METAX_API
DELETE
(
INFINI_DEVICE_METAX
,
m
aca
);
DELETE
(
INFINI_DEVICE_METAX
,
m
etax
);
#endif
#ifdef ENABLE_CAMBRICON_MLU
case
DevCambriconMlu
:
{
...
...
src/infiniop/ops/swiglu/cuda/kernel.cuh
View file @
05247bb7
#ifndef __SWIGLU_CUDA_H__
#define __SWIGLU_CUDA_H__
#include "../../../elementwise/cuda/elementwise_cuda.cuh"
#include <cuda_bf16.h>
#include <cuda_fp16.h>
namespace
op
::
swiglu
::
cuda
{
typedef
struct
SwiGLUOp
{
private:
...
...
@@ -14,13 +10,13 @@ private:
return
h2rcp
(
__hadd2
(
make_half2
(
1
,
1
),
h2exp
(
__hneg2
(
x
))));
}
else
if
constexpr
(
std
::
is_same_v
<
T
,
half
>
)
{
return
hrcp
(
__hadd
(
half
(
1.
f
),
__float2half
(
__expf
(
__half2float
(
__hneg
(
x
))))));
}
else
if
constexpr
(
std
::
is_same_v
<
T
,
__nv
_bfloat162
>
)
{
}
else
if
constexpr
(
std
::
is_same_v
<
T
,
cuda
_bfloat162
>
)
{
float
x0
=
__bfloat162float
(
__low2bfloat16
(
x
));
float
x1
=
__bfloat162float
(
__high2bfloat16
(
x
));
float
sig0
=
__frcp_rn
(
__fadd_rn
(
1.0
f
,
__expf
(
-
x0
)));
float
sig1
=
__frcp_rn
(
__fadd_rn
(
1.0
f
,
__expf
(
-
x1
)));
return
__floats2bfloat162_rn
(
sig0
,
sig1
);
}
else
if
constexpr
(
std
::
is_same_v
<
T
,
__nv
_bfloat16
>
)
{
}
else
if
constexpr
(
std
::
is_same_v
<
T
,
cuda
_bfloat16
>
)
{
float
xf
=
__bfloat162float
(
x
);
return
__float2bfloat16_rn
(
__frcp_rn
(
__fadd_rn
(
1.0
f
,
__expf
(
-
xf
))));
}
else
if
constexpr
(
std
::
is_same_v
<
T
,
float
>
)
{
...
...
@@ -38,8 +34,8 @@ public:
return
__hmul2
(
__hmul2
(
gate
,
sigmoid
(
gate
)),
up
);
}
else
if
constexpr
(
std
::
is_same_v
<
T
,
half
>
)
{
return
__hmul
(
__hmul
(
gate
,
sigmoid
(
gate
)),
up
);
}
else
if
constexpr
(
std
::
is_same_v
<
T
,
__nv
_bfloat162
>
)
{
__nv
_bfloat162
sig
=
sigmoid
(
gate
);
}
else
if
constexpr
(
std
::
is_same_v
<
T
,
cuda
_bfloat162
>
)
{
cuda
_bfloat162
sig
=
sigmoid
(
gate
);
float
gate0
=
__bfloat162float
(
__low2bfloat16
(
gate
));
float
gate1
=
__bfloat162float
(
__high2bfloat16
(
gate
));
float
sig0
=
__bfloat162float
(
__low2bfloat16
(
sig
));
...
...
@@ -49,8 +45,8 @@ public:
float
res0
=
__fmul_rn
(
__fmul_rn
(
gate0
,
sig0
),
up0
);
float
res1
=
__fmul_rn
(
__fmul_rn
(
gate1
,
sig1
),
up1
);
return
__floats2bfloat162_rn
(
res0
,
res1
);
}
else
if
constexpr
(
std
::
is_same_v
<
T
,
__nv
_bfloat16
>
)
{
__nv
_bfloat16
sig
=
sigmoid
(
gate
);
}
else
if
constexpr
(
std
::
is_same_v
<
T
,
cuda
_bfloat16
>
)
{
cuda
_bfloat16
sig
=
sigmoid
(
gate
);
float
gatef
=
__bfloat162float
(
gate
);
float
sigf
=
__bfloat162float
(
sig
);
float
upf
=
__bfloat162float
(
up
);
...
...
src/infiniop/ops/swiglu/maca/swiglu_maca_internal.h
deleted
100644 → 0
View file @
abf1e021
#ifndef __SWIGLU_MACA_H__
#define __SWIGLU_MACA_H__
#include "../../../elementwise/maca/elementwise_maca.h"
#include <hctlass/half.h>
namespace op::swiglu::maca {

// Element-wise SwiGLU functor: for inputs (up, gate) it returns
//   gate * sigmoid(gate) * up
// with dedicated intrinsic-based paths for half2, half and float, and a
// plain-operator fallback for every other T.
typedef struct SwiGLUOp {
public:
    // Number of input operands consumed per output element (up and gate).
    static constexpr size_t num_inputs = 2;

    // Computes gate * sigmoid(gate) * up for one element (or one half2 pair).
    // The product is always formed as (gate * sigmoid(gate)) first, then
    // multiplied by up.
    template <typename T>
    __device__ __forceinline__ T operator()(const T &up, const T &gate) const {
        // Every branch needs sigmoid(gate), so evaluate it once up front.
        const auto gate_sig = sigmoid(gate);

        if constexpr (std::is_same_v<T, half2>) {
            const auto activated = __hmul2(gate, gate_sig);
            return __hmul2(activated, up);
        } else if constexpr (std::is_same_v<T, half>) {
            const auto activated = __hmul(gate, gate_sig);
            return __hmul(activated, up);
        } else if constexpr (std::is_same_v<T, float>) {
            const auto activated = __fmul_rn(gate, gate_sig);
            return __fmul_rn(activated, up);
        } else {
            return gate * gate_sig * up;
        }
    }

private:
    // Logistic sigmoid 1 / (1 + exp(-x)), specialised per element type:
    //   half2 - packed half intrinsics (h2exp / h2rcp)
    //   half  - exponent evaluated in float via __expf, reciprocal via hrcp
    //   float - __expf with __fadd_rn / __frcp_rn
    //   other - std::exp with ordinary arithmetic
    template <typename T>
    __device__ __forceinline__ T sigmoid(const T &x) const {
        if constexpr (std::is_same_v<T, half2>) {
            const auto denom = __hadd2(make_half2(1, 1), h2exp(__hneg2(x)));
            return h2rcp(denom);
        } else if constexpr (std::is_same_v<T, half>) {
            const auto exp_neg = __float2half(__expf(__half2float(__hneg(x))));
            return hrcp(__hadd(half(1.f), exp_neg));
        } else if constexpr (std::is_same_v<T, float>) {
            const auto denom = __fadd_rn(1, __expf(-x));
            return __frcp_rn(denom);
        } else {
            const auto denom = 1 + std::exp(-x);
            return 1 / denom;
        }
    }
} SwiGLUOp;

} // namespace op::swiglu::maca
#endif
src/infiniop/ops/swiglu/m
aca
/swiglu_m
aca
.h
→
src/infiniop/ops/swiglu/m
etax
/swiglu_m
etax
.h
View file @
05247bb7
...
...
@@ -3,6 +3,6 @@
#include "../../../elementwise/maca/elementwise_maca_api.h"
ELEMENTWISE_DESCRIPTOR
(
swiglu
,
maca
)
ELEMENTWISE_DESCRIPTOR
(
swiglu
,
metax
,
maca
)
#endif // __SWIGLU_MACA_API_H__
src/infiniop/ops/swiglu/m
aca
/swiglu_m
aca
.maca
→
src/infiniop/ops/swiglu/m
etax
/swiglu_m
etax
.maca
View file @
05247bb7
#include "swiglu_maca.h"
#include "swiglu_maca_internal.h"
#include "swiglu_metax.h"
namespace op::swiglu::maca {
#include "../../../elementwise/maca/elementwise_maca.h"
#include "../cuda/kernel.cuh"
namespace op::swiglu::metax {
Descriptor::~Descriptor() = default;
...
...
@@ -20,7 +23,7 @@ infiniStatus_t Descriptor::create(
const auto &up_shape = up_desc->shape();
const auto &gate_shape = gate_desc->shape();
CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64);
CHECK_DTYPE(dtype, INFINI_DTYPE_F16,
INFINI_DTYPE_BF16,
INFINI_DTYPE_F32, INFINI_DTYPE_F64);
CHECK_SAME_SHAPE(out_shape, up_shape, gate_shape);
// create MACA elementwise descriptor
...
...
@@ -42,15 +45,17 @@ infiniStatus_t Descriptor::calculate(
switch (_dtype) {
case INFINI_DTYPE_F16:
return _device_info->calculate<256, SwiGLUOp, half>(_info, workspace, output, inputs, stream);
return _device_info->calculate<256, cuda::SwiGLUOp, half>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_BF16:
return _device_info->calculate<256, cuda::SwiGLUOp, cuda_bfloat16>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_F32:
return _device_info->calculate<256, SwiGLUOp, float>(_info, workspace, output, inputs, stream);
return _device_info->calculate<256,
cuda::
SwiGLUOp, float>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_F64:
return _device_info->calculate<256, SwiGLUOp, double>(_info, workspace, output, inputs, stream);
return _device_info->calculate<256,
cuda::
SwiGLUOp, double>(_info, workspace, output, inputs, stream);
default:
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
return INFINI_STATUS_SUCCESS;
}
} // namespace op::swiglu::m
aca
} // namespace op::swiglu::m
etax
src/infiniop/ops/swiglu/nvidia/swiglu_nvidia.cu
View file @
05247bb7
#include "swiglu_nvidia.cuh"
#include "../../../elementwise/cuda/elementwise_cuda.cuh"
#include "../cuda/kernel.cuh"
namespace
op
::
swiglu
::
nvidia
{
...
...
@@ -44,7 +47,7 @@ infiniStatus_t Descriptor::calculate(
case
INFINI_DTYPE_F16
:
return
_device_info
->
calculate
<
256
,
cuda
::
SwiGLUOp
,
half
>
(
_info
,
workspace
,
output
,
inputs
,
stream
);
case
INFINI_DTYPE_BF16
:
return
_device_info
->
calculate
<
256
,
cuda
::
SwiGLUOp
,
__nv
_bfloat16
>
(
_info
,
workspace
,
output
,
inputs
,
stream
);
return
_device_info
->
calculate
<
256
,
cuda
::
SwiGLUOp
,
cuda
_bfloat16
>
(
_info
,
workspace
,
output
,
inputs
,
stream
);
case
INFINI_DTYPE_F32
:
return
_device_info
->
calculate
<
256
,
cuda
::
SwiGLUOp
,
float
>
(
_info
,
workspace
,
output
,
inputs
,
stream
);
case
INFINI_DTYPE_F64
:
...
...
src/infiniop/ops/swiglu/operator.cc
View file @
05247bb7
...
...
@@ -12,7 +12,7 @@
#include "kunlun/swiglu_kunlun.h"
#endif
#ifdef ENABLE_METAX_API
#include "m
aca
/swiglu_m
aca
.h"
#include "m
etax
/swiglu_m
etax
.h"
#endif
#ifdef ENABLE_ASCEND_API
#include "ascend/swiglu_ascend.h"
...
...
@@ -46,7 +46,7 @@ __C infiniStatus_t infiniopCreateSwiGLUDescriptor(
CREATE
(
INFINI_DEVICE_KUNLUN
,
kunlun
);
#endif
#ifdef ENABLE_METAX_API
CREATE
(
INFINI_DEVICE_METAX
,
m
aca
);
CREATE
(
INFINI_DEVICE_METAX
,
m
etax
);
#endif
#ifdef ENABLE_CAMBRICON_MLU
case
DevCambriconMlu
:
{
...
...
@@ -96,7 +96,7 @@ __C infiniStatus_t infiniopGetSwiGLUWorkspaceSize(infiniopSwiGLUDescriptor_t des
GET
(
INFINI_DEVICE_KUNLUN
,
kunlun
);
#endif
#ifdef ENABLE_METAX_API
GET
(
INFINI_DEVICE_METAX
,
m
aca
);
GET
(
INFINI_DEVICE_METAX
,
m
etax
);
#endif
#ifdef ENABLE_CAMBRICON_MLU
case
DevCambriconMlu
:
{
...
...
@@ -144,7 +144,7 @@ __C infiniStatus_t infiniopSwiGLU(
CALCULATE
(
INFINI_DEVICE_KUNLUN
,
kunlun
);
#endif
#ifdef ENABLE_METAX_API
CALCULATE
(
INFINI_DEVICE_METAX
,
m
aca
);
CALCULATE
(
INFINI_DEVICE_METAX
,
m
etax
);
#endif
#ifdef ENABLE_CAMBRICON_MLU
case
DevCambriconMlu
:
{
...
...
@@ -190,7 +190,7 @@ infiniopDestroySwiGLUDescriptor(infiniopSwiGLUDescriptor_t desc) {
DELETE
(
INFINI_DEVICE_KUNLUN
,
kunlun
);
#endif
#ifdef ENABLE_METAX_API
DELETE
(
INFINI_DEVICE_METAX
,
m
aca
);
DELETE
(
INFINI_DEVICE_METAX
,
m
etax
);
#endif
#ifdef ENABLE_CAMBRICON_MLU
case
DevCambriconMlu
:
{
...
...
xmake.lua
View file @
05247bb7
...
...
@@ -174,7 +174,7 @@ target("infini-utils")
add_cxflags
(
"-fPIC"
,
"-Wno-unknown-pragmas"
)
if
has_config
(
"omp"
)
then
add_cxflags
(
"-fopenmp"
)
add_ldflags
(
"-fopenmp"
)
add_ldflags
(
"-fopenmp"
,
{
force
=
true
}
)
end
end
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment