Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
jerrrrry
infinicore
Commits
d20b674e
"vscode:/vscode.git/clone" did not exist on "b5a809a0652d6506c284b9021354dbbfded7a03b"
Commit
d20b674e
authored
May 06, 2025
by
crapromer
Browse files
Migrate elementwise base from cuda to maca, and implement swiglu with test pass
parent
c203635b
Changes
7
Hide whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
518 additions
and
5 deletions
+518
-5
src/infiniop/devices/maca/maca_kernel_common.h
src/infiniop/devices/maca/maca_kernel_common.h
+68
-0
src/infiniop/elementwise/maca/elementwise_maca.h
src/infiniop/elementwise/maca/elementwise_maca.h
+264
-0
src/infiniop/elementwise/maca/elementwise_maca_api.h
src/infiniop/elementwise/maca/elementwise_maca_api.h
+59
-0
src/infiniop/ops/swiglu/maca/swiglu_maca.h
src/infiniop/ops/swiglu/maca/swiglu_maca.h
+8
-0
src/infiniop/ops/swiglu/maca/swiglu_maca.maca
src/infiniop/ops/swiglu/maca/swiglu_maca.maca
+56
-0
src/infiniop/ops/swiglu/maca/swiglu_maca_internal.h
src/infiniop/ops/swiglu/maca/swiglu_maca_internal.h
+35
-0
src/infiniop/ops/swiglu/operator.cc
src/infiniop/ops/swiglu/operator.cc
+28
-5
No files found.
src/infiniop/devices/maca/maca_kernel_common.h
0 → 100644
View file @
d20b674e
#ifdef ENABLE_SUGON_MACA_API
#define INFINIOP_MACA_KERNEL __launch_bounds__(512) __global__ void
#else
#define INFINIOP_MACA_KERNEL __global__ void
#endif
// Posible maximum number of threads per block for MACA architectures
// Used for picking correct kernel launch configuration
#define MACA_BLOCK_SIZE_1024 1024
#define MACA_BLOCK_SIZE_512 512
#define CHECK_MACA(API) CHECK_INTERNAL(API, hcSuccess)
namespace
device
::
maca
{
// return the memory offset of original tensor, given the flattened index of broadcasted tensor
__forceinline__
__device__
__host__
size_t
indexToReducedOffset
(
size_t
flat_index
,
size_t
ndim
,
const
ptrdiff_t
*
broadcasted_strides
,
const
ptrdiff_t
*
target_strides
)
{
size_t
res
=
0
;
for
(
size_t
i
=
0
;
i
<
ndim
;
++
i
)
{
res
+=
flat_index
/
broadcasted_strides
[
i
]
*
target_strides
[
i
];
flat_index
%=
broadcasted_strides
[
i
];
}
return
res
;
}
// get the memory offset of the given element in a tensor given its flat index
__forceinline__
__device__
__host__
size_t
indexToOffset
(
size_t
flat_index
,
size_t
ndim
,
const
size_t
*
shape
,
const
ptrdiff_t
*
strides
)
{
size_t
res
=
0
;
for
(
size_t
i
=
ndim
;
i
--
>
0
;)
{
res
+=
(
flat_index
%
shape
[
i
])
*
strides
[
i
];
flat_index
/=
shape
[
i
];
}
return
res
;
}
}
// namespace device::maca
#ifdef ENABLE_MACA_API
#include <maca_fp16.h>
__forceinline__
__device__
float
exp_
(
const
float
val
)
{
return
expf
(
val
);
}
__forceinline__
__device__
long
double
exp_
(
const
long
double
val
)
{
return
expl
(
val
);
}
__forceinline__
__device__
double
exp_
(
const
double
val
)
{
return
exp
(
val
);
}
__forceinline__
__device__
__half
exp_
(
const
__half
x
)
{
return
hexp
(
x
);
}
#endif
src/infiniop/elementwise/maca/elementwise_maca.h
0 → 100644
View file @
d20b674e
#ifndef __INFINIOP_ELEMENTWISE_MACA_H__
#define __INFINIOP_ELEMENTWISE_MACA_H__
#include "../../../utils.h"
#include "../../devices/maca/common_maca.h"
#include "../../devices/maca/maca_kernel_common.h"
#include "elementwise_maca_api.h"
namespace
op
::
elementwise
::
maca
{
template
<
typename
T
>
__device__
__forceinline__
const
T
*
typedInputPtr
(
const
void
*
ptr
)
{
return
reinterpret_cast
<
const
T
*>
(
ptr
);
}
__device__
__forceinline__
size_t
getOutputIndex
(
size_t
idx
,
bool
is_contiguous
,
size_t
ndim
,
const
size_t
*
shape
,
const
ptrdiff_t
*
strides
)
{
return
is_contiguous
?
idx
:
device
::
maca
::
indexToOffset
(
idx
,
ndim
,
shape
,
strides
);
}
struct
InputIndexer
{
size_t
idx
;
size_t
ndim
;
const
bool
*
input_contiguous
;
const
bool
*
input_broadcasted
;
const
size_t
*
input_shapes
;
const
ptrdiff_t
*
input_strides
;
const
ptrdiff_t
*
output_strides
;
__device__
__forceinline__
size_t
operator
()(
size_t
input_id
)
const
{
return
input_contiguous
[
input_id
]
?
idx
:
(
input_broadcasted
[
input_id
]
?
device
::
maca
::
indexToReducedOffset
(
idx
,
ndim
,
output_strides
,
input_strides
+
input_id
*
ndim
)
:
device
::
maca
::
indexToOffset
(
idx
,
ndim
,
input_shapes
+
input_id
*
ndim
,
input_strides
+
input_id
*
ndim
));
}
};
template
<
typename
F
,
size_t
...
Is
>
__device__
__forceinline__
void
unpackInputsAndApply
(
F
&&
f
,
std
::
index_sequence
<
Is
...
>
)
{
f
(
std
::
integral_constant
<
size_t
,
Is
>
{}...);
}
template
<
size_t
N
,
typename
Op
,
typename
Tdata
,
typename
...
Args
>
INFINIOP_MACA_KERNEL
elementwiseKernel
(
size_t
output_size
,
size_t
ndim
,
bool
output_contiguous
,
const
bool
*
__restrict__
input_contiguous
,
const
bool
*
__restrict__
input_broadcasted
,
const
size_t
*
__restrict__
output_shape
,
const
size_t
*
__restrict__
input_shapes
,
const
ptrdiff_t
*
__restrict__
output_strides
,
const
ptrdiff_t
*
__restrict__
input_strides
,
Tdata
*
output
,
const
void
*
const
*
inputs
,
size_t
offset
,
Args
...
args
)
{
size_t
idx
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
+
offset
;
if
(
idx
<
output_size
)
{
const
Tdata
*
const
*
typed_inputs
=
reinterpret_cast
<
const
Tdata
*
const
*>
(
inputs
);
size_t
out_idx
=
getOutputIndex
(
idx
,
output_contiguous
,
ndim
,
output_shape
,
output_strides
);
InputIndexer
indexer
{
idx
,
ndim
,
input_contiguous
,
input_broadcasted
,
input_shapes
,
input_strides
,
output_strides
};
unpackInputsAndApply
(
[
&
](
auto
...
Is
)
{
output
[
out_idx
]
=
Op
{}(
typed_inputs
[
Is
.
value
][
indexer
(
Is
.
value
)]...,
std
::
forward
<
Args
>
(
args
)...);
},
std
::
make_index_sequence
<
N
>
{});
}
}
template
<
typename
Op
,
typename
Tout
,
typename
...
Tin
>
INFINIOP_MACA_KERNEL
elementwiseKernel
(
size_t
output_size
,
size_t
ndim
,
bool
output_contiguous
,
const
bool
*
__restrict__
input_contiguous
,
const
bool
*
__restrict__
input_broadcasted
,
const
size_t
*
__restrict__
output_shape
,
const
size_t
*
__restrict__
input_shapes
,
const
ptrdiff_t
*
__restrict__
output_strides
,
const
ptrdiff_t
*
__restrict__
input_strides
,
Tout
*
output
,
const
void
*
const
*
__restrict__
inputs
,
size_t
offset
)
{
size_t
idx
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
+
offset
;
if
(
idx
<
output_size
)
{
size_t
out_idx
=
getOutputIndex
(
idx
,
output_contiguous
,
ndim
,
output_shape
,
output_strides
);
InputIndexer
indexer
{
idx
,
ndim
,
input_contiguous
,
input_broadcasted
,
input_shapes
,
input_strides
,
output_strides
};
unpackInputsAndApply
(
[
&
](
auto
...
Is
)
{
output
[
out_idx
]
=
Op
{}.
template
operator
()
<
Tout
,
Tin
...>(
(
typedInputPtr
<
Tin
>
(
inputs
[
Is
.
value
])[
indexer
(
Is
.
value
)])...);
},
std
::
index_sequence_for
<
Tin
...
>
{});
}
}
struct
DeviceImpl
::
Opaque
{
std
::
shared_ptr
<
device
::
maca
::
Handle
::
Internal
>
internal
;
Opaque
(
const
std
::
shared_ptr
<
device
::
maca
::
Handle
::
Internal
>
&
internal
)
:
internal
(
internal
)
{}
template
<
size_t
BLOCK_SIZE
,
size_t
N
,
typename
Op
,
typename
Tdata
,
typename
...
Args
>
infiniStatus_t
calculateImpl
(
const
op
::
elementwise
::
ElementwiseInfo
&
info
,
void
*
workspace
,
void
*
output
,
const
std
::
vector
<
const
void
*>
&
inputs
,
hcStream_t
stream
,
Args
&&
...
args
)
{
return
launchElementwiseKernel
<
BLOCK_SIZE
,
N
>
(
info
,
workspace
,
reinterpret_cast
<
Tdata
*>
(
output
),
inputs
,
elementwiseKernel
<
N
,
Op
,
Tdata
,
Args
...
>
,
stream
,
std
::
forward
<
Args
>
(
args
)...);
}
template
<
size_t
BLOCK_SIZE
,
size_t
N
,
typename
Op
,
typename
Tout
,
typename
...
Tin
,
typename
...
Args
,
std
::
enable_if_t
<
(
sizeof
...(
Tin
)
==
Op
::
num_inputs
),
int
>
=
0
>
infiniStatus_t
calculateImpl
(
const
op
::
elementwise
::
ElementwiseInfo
&
info
,
void
*
workspace
,
void
*
output
,
const
std
::
vector
<
const
void
*>
&
inputs
,
hcStream_t
stream
,
Args
&&
...
args
)
{
return
launchElementwiseKernel
<
BLOCK_SIZE
,
N
>
(
info
,
workspace
,
reinterpret_cast
<
Tout
*>
(
output
),
inputs
,
elementwiseKernel
<
Op
,
Tout
,
Tin
...
>
,
stream
);
}
private:
template
<
size_t
N
>
infiniStatus_t
infoToDevice
(
const
op
::
elementwise
::
ElementwiseInfo
&
info
,
void
*
workspace
,
const
void
*
const
*
h_inputs_arr
,
const
void
**&
d_inputs_arr
,
const
bool
*&
d_input_contiguous
,
const
bool
*&
d_input_broadcasted
,
const
size_t
*&
d_output_shape
,
const
ptrdiff_t
*&
d_output_strides
,
const
size_t
*&
d_input_shapes
,
const
ptrdiff_t
*&
d_input_strides
,
hcStream_t
stream
)
const
{
constexpr
auto
input_size
=
N
;
const
auto
ndim
=
info
.
getNdim
();
constexpr
auto
input_arr_size
=
N
*
sizeof
(
*
h_inputs_arr
);
const
int8_t
*
info_meta_start
=
info
.
getMetaStart
();
const
int8_t
*
d_meta_start
=
reinterpret_cast
<
int8_t
*>
(
workspace
)
+
input_arr_size
;
// copy the input pointer array and meta to device
CHECK_MACA
(
hcMemcpyAsync
(
workspace
,
h_inputs_arr
,
input_arr_size
,
hcMemcpyHostToDevice
,
stream
));
CHECK_MACA
(
hcMemcpyAsync
((
void
*
)
d_meta_start
,
info_meta_start
,
info
.
getMetaMemSize
(),
hcMemcpyHostToDevice
,
stream
));
// offset/assign the pointers
d_inputs_arr
=
reinterpret_cast
<
const
void
**>
(
workspace
);
d_output_shape
=
reinterpret_cast
<
const
size_t
*>
(
d_meta_start
);
d_output_strides
=
reinterpret_cast
<
const
ptrdiff_t
*>
(
d_output_shape
+
ndim
);
d_input_shapes
=
reinterpret_cast
<
const
size_t
*>
(
d_output_strides
+
ndim
);
d_input_strides
=
reinterpret_cast
<
const
ptrdiff_t
*>
(
d_input_shapes
+
input_size
*
ndim
);
d_input_contiguous
=
reinterpret_cast
<
const
bool
*>
(
d_input_strides
+
input_size
*
ndim
);
d_input_broadcasted
=
reinterpret_cast
<
const
bool
*>
(
d_input_contiguous
+
input_size
);
return
INFINI_STATUS_SUCCESS
;
}
template
<
size_t
BLOCK_SIZE
,
size_t
N
,
typename
KernelFunc
,
typename
Tout
,
typename
...
Args
>
infiniStatus_t
launchElementwiseKernel
(
const
op
::
elementwise
::
ElementwiseInfo
&
info
,
void
*
workspace
,
Tout
*
output
,
const
std
::
vector
<
const
void
*>
&
inputs
,
KernelFunc
kernel_func
,
hcStream_t
stream
,
Args
&&
...
args
)
{
auto
output_size
=
info
.
getOutputSize
();
if
(
output_size
==
0
)
{
return
INFINI_STATUS_SUCCESS
;
}
// Device pointers
const
void
**
d_inputs_arr
=
nullptr
;
const
bool
*
d_input_contiguous
=
nullptr
;
const
bool
*
d_input_broadcasted
=
nullptr
;
const
size_t
*
d_output_shape
=
nullptr
;
const
ptrdiff_t
*
d_output_strides
=
nullptr
;
const
size_t
*
d_input_shapes
=
nullptr
;
const
ptrdiff_t
*
d_input_strides
=
nullptr
;
CHECK_STATUS
(
infoToDevice
<
N
>
(
info
,
workspace
,
inputs
.
data
(),
d_inputs_arr
,
d_input_contiguous
,
d_input_broadcasted
,
d_output_shape
,
d_output_strides
,
d_input_shapes
,
d_input_strides
,
stream
));
dim3
blockDims
(
std
::
min
(
BLOCK_SIZE
,
static_cast
<
size_t
>
(
internal
->
maxThreadsPerBlock
())));
dim3
gridDims
(
std
::
min
(
CEIL_DIV
(
output_size
,
blockDims
.
x
),
static_cast
<
size_t
>
(
internal
->
gridSizeX
())));
size_t
step
=
gridDims
.
x
*
blockDims
.
x
;
for
(
size_t
i
=
0
;
i
<
output_size
;
i
+=
step
)
{
kernel_func
<<<
gridDims
,
blockDims
,
0
,
stream
>>>
(
output_size
,
info
.
getNdim
(),
info
.
isOutputContiguous
(),
d_input_contiguous
,
d_input_broadcasted
,
d_output_shape
,
d_input_shapes
,
d_output_strides
,
d_input_strides
,
output
,
reinterpret_cast
<
const
void
**>
(
d_inputs_arr
),
i
,
std
::
forward
<
Args
>
(
args
)...);
}
return
INFINI_STATUS_SUCCESS
;
}
};
template
<
typename
...
Args
>
utils
::
Result
<
DeviceImpl
*>
DeviceImpl
::
create
(
Args
&&
...
args
)
{
auto
opaque
=
std
::
make_shared
<
Opaque
>
(
std
::
forward
<
Args
>
(
args
)...);
return
utils
::
Result
<
DeviceImpl
*>
(
new
DeviceImpl
(
opaque
));
}
/* Invoke elementwise operation for different input types */
template
<
unsigned
int
BLOCK_SIZE
,
typename
Op
,
typename
Tout
,
typename
...
Tin
,
typename
...
Args
,
std
::
enable_if_t
<
(
sizeof
...(
Tin
)
==
Op
::
num_inputs
),
int
>
>
infiniStatus_t
DeviceImpl
::
calculate
(
const
op
::
elementwise
::
ElementwiseInfo
&
info
,
void
*
workspace
,
void
*
output
,
const
std
::
vector
<
const
void
*>
&
inputs
,
void
*
stream
,
Args
&&
...
args
)
{
constexpr
size_t
N
=
Op
::
num_inputs
;
static_assert
(
sizeof
...(
Tin
)
==
N
,
"Input type count mismatch"
);
return
_opaque
->
calculateImpl
<
BLOCK_SIZE
,
N
,
Op
,
Tout
,
Tin
...
>
(
info
,
workspace
,
output
,
inputs
,
reinterpret_cast
<
hcStream_t
>
(
stream
),
std
::
forward
<
Args
>
(
args
)...);
}
/* Invoke elementwise operation when all inputs have the same dtype */
template
<
unsigned
int
BLOCK_SIZE
,
typename
Op
,
typename
Tdata
,
typename
...
Args
>
infiniStatus_t
DeviceImpl
::
calculate
(
const
op
::
elementwise
::
ElementwiseInfo
&
info
,
void
*
workspace
,
void
*
output
,
const
std
::
vector
<
const
void
*>
&
inputs
,
void
*
stream
,
Args
&&
...
args
)
{
constexpr
size_t
N
=
Op
::
num_inputs
;
return
_opaque
->
calculateImpl
<
BLOCK_SIZE
,
N
,
Op
,
Tdata
>
(
info
,
workspace
,
output
,
inputs
,
reinterpret_cast
<
hcStream_t
>
(
stream
),
std
::
forward
<
Args
>
(
args
)...);
}
}
// namespace op::elementwise::maca
#endif
\ No newline at end of file
src/infiniop/elementwise/maca/elementwise_maca_api.h
0 → 100644
View file @
d20b674e
#ifndef __INFINIOP_ELEMENTWISE_MACA_API_H__
#define __INFINIOP_ELEMENTWISE_MACA_API_H__
#include "../elementwise.h"
namespace
op
::
elementwise
::
maca
{
class
DeviceImpl
final
{
struct
Opaque
;
std
::
shared_ptr
<
Opaque
>
_opaque
;
DeviceImpl
(
std
::
shared_ptr
<
Opaque
>
opaque
)
:
_opaque
(
std
::
move
(
opaque
))
{}
public:
~
DeviceImpl
()
=
default
;
template
<
typename
...
Args
>
static
utils
::
Result
<
DeviceImpl
*>
create
(
Args
&&
...
args
);
template
<
unsigned
int
BLOCK_SIZE
,
typename
Op
,
typename
Tdata
,
typename
...
Args
>
infiniStatus_t
calculate
(
const
op
::
elementwise
::
ElementwiseInfo
&
info
,
void
*
workspace
,
void
*
output
,
const
std
::
vector
<
const
void
*>
&
inputs
,
void
*
stream
,
Args
&&
...
args
);
template
<
unsigned
int
BLOCK_SIZE
,
typename
Op
,
typename
Tout
,
typename
...
Tin
,
typename
...
Args
,
std
::
enable_if_t
<
(
sizeof
...(
Tin
)
==
Op
::
num_inputs
),
int
>
=
0
>
infiniStatus_t
calculate
(
const
op
::
elementwise
::
ElementwiseInfo
&
info
,
void
*
workspace
,
void
*
output
,
const
std
::
vector
<
const
void
*>
&
inputs
,
void
*
stream
,
Args
&&
...
args
);
};
}
// namespace op::elementwise::maca
#define CREATE_ELEMENTWISE_MACA_DESCRIPTOR(HANDLE, DTYPE, OUT_DESC, INPUT_DESC_VEC) \
\
auto info_result = op::elementwise::ElementwiseInfo::create(OUT_DESC, INPUT_DESC_VEC); \
CHECK_RESULT(info_result); \
auto info = info_result.take(); \
auto workspace_size = info.getMetaMemSize() + info.getInputSize() * sizeof(void *); \
\
auto device_impl_result = op::elementwise::maca::DeviceImpl::create(HANDLE->internal()); \
CHECK_RESULT(device_impl_result); \
\
*desc_ptr = new Descriptor( \
DTYPE, \
std::move(info), \
std::move(device_impl_result.take()), \
workspace_size, \
HANDLE->device, \
HANDLE->device_id);
#endif // __INFINIOP_ELEMENTWISE_MACA_API_H__
\ No newline at end of file
src/infiniop/ops/swiglu/maca/swiglu_maca.h
0 → 100644
View file @
d20b674e
#ifndef __SWIGLU_MACA_API_H__
#define __SWIGLU_MACA_API_H__
#include "../../../elementwise/maca/elementwise_maca_api.h"
ELEMENTWISE_DESCRIPTOR
(
swiglu
,
maca
)
#endif // __SWIGLU_MACA_API_H__
\ No newline at end of file
src/infiniop/ops/swiglu/maca/swiglu_maca.maca
0 → 100644
View file @
d20b674e
#include "swiglu_maca.h"
#include "swiglu_maca_internal.h"
namespace op::swiglu::maca {
Descriptor::~Descriptor() = default;
infiniStatus_t Descriptor::create(
infiniopHandle_t handle_,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t out_desc,
std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
auto handle = reinterpret_cast<device::maca::Handle *>(handle_);
auto dtype = out_desc->dtype();
const auto &up_desc = input_desc_vec.at(0);
const auto &gate_desc = input_desc_vec.at(1);
const auto &out_shape = out_desc->shape();
const auto &up_shape = up_desc->shape();
const auto &gate_shape = gate_desc->shape();
CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64);
CHECK_SAME_SHAPE(out_shape, up_shape, gate_shape);
// create MACA elementwise descriptor
CREATE_ELEMENTWISE_MACA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
return INFINI_STATUS_SUCCESS;
}
infiniStatus_t Descriptor::calculate(
void *workspace,
size_t workspace_size,
void *output,
std::vector<const void *> inputs,
void *stream) const {
if (workspace_size < _workspace_size) {
return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
}
switch (_dtype) {
case INFINI_DTYPE_F16:
return _device_info->calculate<256, SwiGLUOp, half>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_F32:
return _device_info->calculate<256, SwiGLUOp, float>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_F64:
return _device_info->calculate<256, SwiGLUOp, double>(_info, workspace, output, inputs, stream);
default:
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
return INFINI_STATUS_SUCCESS;
}
} // namespace op::swiglu::maca
src/infiniop/ops/swiglu/maca/swiglu_maca_internal.h
0 → 100644
View file @
d20b674e
#include "../../../elementwise/maca/elementwise_maca.h"
#include <hctlass/half.h>
namespace
op
::
swiglu
::
maca
{
typedef
struct
SwiGLUOp
{
private:
template
<
typename
T
>
__device__
__forceinline__
T
sigmoid
(
const
T
&
x
)
const
{
// if constexpr (std::is_same_v<T, half2>) {
// return h2rcp(__hadd2(make_half2(1, 1), h2exp(__hneg2(x))));
// } else
if
constexpr
(
std
::
is_same_v
<
T
,
half
>
)
{
return
hrcp
(
__hadd
(
half
(
1.
f
),
__float2half
(
__expf
(
__half2float
(
__hneg
(
x
))))));
}
else
if
constexpr
(
std
::
is_same_v
<
T
,
float
>
)
{
return
__frcp_rd
(
__fadd_rd
(
1
,
__expf
(
-
x
)));
}
else
{
return
1
/
(
1
+
std
::
exp
(
-
x
));
}
}
public:
static
constexpr
size_t
num_inputs
=
2
;
template
<
typename
T
>
__device__
__forceinline__
T
operator
()(
const
T
&
up
,
const
T
&
gate
)
const
{
if
constexpr
(
std
::
is_same_v
<
T
,
half2
>
)
{
return
__hmul2
(
__hmul2
(
gate
,
sigmoid
(
gate
)),
up
);
}
else
if
constexpr
(
std
::
is_same_v
<
T
,
half
>
)
{
return
__hmul
(
__hmul
(
gate
,
sigmoid
(
gate
)),
up
);
}
else
if
constexpr
(
std
::
is_same_v
<
T
,
float
>
)
{
return
__fmul_rd
(
__fmul_rd
(
gate
,
sigmoid
(
gate
)),
up
);
}
else
{
return
gate
*
sigmoid
(
gate
)
*
up
;
}
}
}
SwiGLUOp
;
}
// namespace op::swiglu::maca
\ No newline at end of file
src/infiniop/ops/swiglu/operator.cc
View file @
d20b674e
...
@@ -11,6 +11,9 @@
...
@@ -11,6 +11,9 @@
#ifdef ENABLE_KUNLUN_API
#ifdef ENABLE_KUNLUN_API
#include "kunlun/swiglu_kunlun.h"
#include "kunlun/swiglu_kunlun.h"
#endif
#endif
#ifdef ENABLE_METAX_API
#include "maca/swiglu_maca.h"
#endif
#ifdef ENABLE_ASCEND_API
#ifdef ENABLE_ASCEND_API
#include "ascend/swiglu_ascend.h"
#include "ascend/swiglu_ascend.h"
#endif
#endif
...
@@ -39,8 +42,13 @@ __C infiniStatus_t infiniopCreateSwiGLUDescriptor(
...
@@ -39,8 +42,13 @@ __C infiniStatus_t infiniopCreateSwiGLUDescriptor(
#ifdef ENABLE_CUDA_API
#ifdef ENABLE_CUDA_API
CREATE
(
INFINI_DEVICE_NVIDIA
,
cuda
);
CREATE
(
INFINI_DEVICE_NVIDIA
,
cuda
);
#endif
#endif
<
<
<
<
<
<
<
HEAD
#ifdef ENABLE_KUNLUN_API
#ifdef ENABLE_KUNLUN_API
CREATE
(
INFINI_DEVICE_KUNLUN
,
kunlun
);
CREATE
(
INFINI_DEVICE_KUNLUN
,
kunlun
);
=======
#ifdef ENABLE_METAX_API
CREATE
(
INFINI_DEVICE_METAX
,
maca
);
>>>>>>>
f3a0177
(
Migrate
elementwise
base
from
cuda
to
maca
,
and
implement
swiglu
with
test
pass
)
#endif
#endif
#ifdef ENABLE_CAMBRICON_MLU
#ifdef ENABLE_CAMBRICON_MLU
case
DevCambriconMlu
:
{
case
DevCambriconMlu
:
{
...
@@ -84,13 +92,18 @@ __C infiniStatus_t infiniopGetSwiGLUWorkspaceSize(infiniopSwiGLUDescriptor_t des
...
@@ -84,13 +92,18 @@ __C infiniStatus_t infiniopGetSwiGLUWorkspaceSize(infiniopSwiGLUDescriptor_t des
GET
(
INFINI_DEVICE_CPU
,
cpu
)
GET
(
INFINI_DEVICE_CPU
,
cpu
)
#endif
#endif
#ifdef ENABLE_CUDA_API
#ifdef ENABLE_CUDA_API
GET
(
INFINI_DEVICE_NVIDIA
,
cuda
)
GET
(
INFINI_DEVICE_NVIDIA
,
cuda
)
#endif
#endif
<
<
<
<
<
<
<
HEAD
#ifdef ENABLE_KUNLUN_API
#ifdef ENABLE_KUNLUN_API
GET
(
INFINI_DEVICE_KUNLUN
,
kunlun
)
GET
(
INFINI_DEVICE_KUNLUN
,
kunlun
)
=======
#ifdef ENABLE_METAX_API
GET
(
INFINI_DEVICE_METAX
,
maca
);
>>>>>>>
f3a0177
(
Migrate
elementwise
base
from
cuda
to
maca
,
and
implement
swiglu
with
test
pass
)
#endif
#endif
#ifdef ENABLE_CAMBRICON_MLU
#ifdef ENABLE_CAMBRICON_MLU
case
DevCambriconMlu
:
{
case
DevCambriconMlu
:
{
return
bangGetSwiGLUWorkspaceSize
((
SwiGLUBangDescriptor_t
)
desc
,
size
);
return
bangGetSwiGLUWorkspaceSize
((
SwiGLUBangDescriptor_t
)
desc
,
size
);
}
}
#endif
#endif
...
@@ -136,8 +149,13 @@ __C infiniStatus_t infiniopSwiGLU(
...
@@ -136,8 +149,13 @@ __C infiniStatus_t infiniopSwiGLU(
#ifdef ENABLE_CUDA_API
#ifdef ENABLE_CUDA_API
CALCULATE
(
INFINI_DEVICE_NVIDIA
,
cuda
);
CALCULATE
(
INFINI_DEVICE_NVIDIA
,
cuda
);
#endif
#endif
<
<
<
<
<
<
<
HEAD
#ifdef ENABLE_KUNLUN_API
#ifdef ENABLE_KUNLUN_API
CALCULATE
(
INFINI_DEVICE_KUNLUN
,
kunlun
);
CALCULATE
(
INFINI_DEVICE_KUNLUN
,
kunlun
);
=======
#ifdef ENABLE_METAX_API
CALCULATE
(
INFINI_DEVICE_METAX
,
maca
);
>>>>>>>
f3a0177
(
Migrate
elementwise
base
from
cuda
to
maca
,
and
implement
swiglu
with
test
pass
)
#endif
#endif
#ifdef ENABLE_CAMBRICON_MLU
#ifdef ENABLE_CAMBRICON_MLU
case
DevCambriconMlu
:
{
case
DevCambriconMlu
:
{
...
@@ -179,8 +197,13 @@ infiniopDestroySwiGLUDescriptor(infiniopSwiGLUDescriptor_t desc) {
...
@@ -179,8 +197,13 @@ infiniopDestroySwiGLUDescriptor(infiniopSwiGLUDescriptor_t desc) {
#ifdef ENABLE_CUDA_API
#ifdef ENABLE_CUDA_API
DELETE
(
INFINI_DEVICE_NVIDIA
,
cuda
);
DELETE
(
INFINI_DEVICE_NVIDIA
,
cuda
);
#endif
#endif
<
<
<
<
<
<
<
HEAD
#ifdef ENABLE_KUNLUN_API
#ifdef ENABLE_KUNLUN_API
DELETE
(
INFINI_DEVICE_KUNLUN
,
kunlun
);
DELETE
(
INFINI_DEVICE_KUNLUN
,
kunlun
);
=======
#ifdef ENABLE_METAX_API
DELETE
(
INFINI_DEVICE_METAX
,
maca
);
>>>>>>>
f3a0177
(
Migrate
elementwise
base
from
cuda
to
maca
,
and
implement
swiglu
with
test
pass
)
#endif
#endif
#ifdef ENABLE_CAMBRICON_MLU
#ifdef ENABLE_CAMBRICON_MLU
case
DevCambriconMlu
:
{
case
DevCambriconMlu
:
{
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment