Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
jerrrrry
infinicore
Commits
8b760951
Commit
8b760951
authored
Aug 20, 2025
by
zhangyue
Browse files
Merge branch 'main' of
https://github.com/InfiniTensor/InfiniCore
into issue-385
parents
eb3972eb
d4b03cf7
Changes
54
Hide whitespace changes
Inline
Side-by-side
Showing
14 changed files
with
226 additions
and
61 deletions
+226
-61
src/infiniop/ops/swiglu/bang/swiglu_bang_internal.mlu
src/infiniop/ops/swiglu/bang/swiglu_bang_internal.mlu
+37
-0
src/infiniop/ops/swiglu/moore/siwglu_moore_kernel.h
src/infiniop/ops/swiglu/moore/siwglu_moore_kernel.h
+86
-0
src/infiniop/ops/swiglu/moore/swiglu_moore.h
src/infiniop/ops/swiglu/moore/swiglu_moore.h
+8
-0
src/infiniop/ops/swiglu/moore/swiglu_moore.mu
src/infiniop/ops/swiglu/moore/swiglu_moore.mu
+61
-0
src/infiniop/ops/swiglu/operator.cc
src/infiniop/ops/swiglu/operator.cc
+22
-47
src/infinirt/infinirt.cc
src/infinirt/infinirt.cc
+1
-1
src/infinirt/moore/infinirt_moore.cc
src/infinirt/moore/infinirt_moore.cc
+1
-1
src/infinirt/moore/infinirt_moore.h
src/infinirt/moore/infinirt_moore.h
+0
-0
test/infiniop-test/test_generate/infiniop_test.py
test/infiniop-test/test_generate/infiniop_test.py
+1
-1
test/infiniop/libinfiniop/utils.py
test/infiniop/libinfiniop/utils.py
+2
-4
test/infiniop/swiglu.py
test/infiniop/swiglu.py
+2
-2
xmake.lua
xmake.lua
+1
-1
xmake/bang.lua
xmake/bang.lua
+1
-1
xmake/moore.lua
xmake/moore.lua
+3
-3
No files found.
src/infiniop/ops/swiglu/bang/swiglu_bang_internal.mlu
0 → 100644
View file @
8b760951
#ifndef __SWIGLU_BANG_INTERNAL_H__
#define __SWIGLU_BANG_INTERNAL_H__

#include "../../../elementwise/bang/elementwise_bang_kernel.h"
#include "bang.h"
#include "bang_device_functions.h"

// Elementwise SwiGLU functor for the Cambricon BANG (MLU) backend:
// out[i] = up[i] * gate[i] * sigmoid(gate[i]).
typedef struct SwiGLUOp {
public:
    // SwiGLU consumes exactly two input tensors: up and gate.
    static constexpr size_t num_inputs = 2;

    // Computes SwiGLU over a contiguous chunk of `num_elements` values.
    // NOTE: `out` doubles as scratch space — each __bang_* call below writes
    // its intermediate result into `out` before the final product, so the
    // exact call order matters and must not be rearranged.
    template <typename T>
    __mlu_device__ void operator()(T *out, const T *up, const T *gate, size_t num_elements) const {
        if constexpr (std::is_same_v<T, half> || std::is_same_v<T, bfloat16_t>) {
            // Half-precision path: vectorized sigmoid activation, then
            // out = sigmoid(gate) * gate * up.
            __bang_active_sigmoid(out, gate, num_elements);
            __bang_mul(out, out, gate, num_elements);
            __bang_mul(out, out, up, num_elements);
        } else if constexpr (std::is_same_v<T, float>) {
            // Float path builds the sigmoid manually:
            // out = exp(-gate); out += 1; out = gate / out  -> gate * sigmoid(gate)
            // then out *= up.
            __bang_neg(out, gate, num_elements);
            __bang_active_exphp(out, out, num_elements);
            __bang_add_scalar(out, out, 1.0f, num_elements);
            __bang_div(out, gate, out, num_elements);
            __bang_mul(out, up, out, num_elements);
        } else {
            // Scalar fallback for any other element type (computed in double
            // precision via the 1.0 literal, then narrowed on assignment).
            for (size_t i = 0; i < num_elements; ++i) {
                out[i] = up[i] * gate[i] / (1.0 + std::exp(-gate[i]));
            }
        }
    }
} SwiGLUOp;

// Generate the kernel launcher and its explicit instantiations through the
// shared elementwise macros (defined in elementwise_bang_kernel.h).
LAUNCH_ELEMENTWISE_KERNEL_IMPL(SwiGLU, SwiGLUOp)
LAUNCH_ELEMENTWISE_KERNEL_INSTANTIATE(SwiGLU, half)
LAUNCH_ELEMENTWISE_KERNEL_INSTANTIATE(SwiGLU, bfloat16_t)
LAUNCH_ELEMENTWISE_KERNEL_INSTANTIATE(SwiGLU, float)

#endif // __SWIGLU_BANG_INTERNAL_H__
src/infiniop/ops/swiglu/moore/siwglu_moore_kernel.h
0 → 100644
View file @
8b760951
// Fix: this Moore/MUSA header reused the guard __SWIGLU_CUDA_H__, which
// collides with the CUDA backend's swiglu header guard — if both headers were
// ever pulled into one translation unit, the second would be silently skipped.
// Give the Moore kernel its own guard.
#ifndef __SWIGLU_MOORE_KERNEL_H__
#define __SWIGLU_MOORE_KERNEL_H__

/*
 * SwiGLU operation implementation for the MUSA (Moore Threads) backend.
 *
 * It uses the 'op::swiglu::cuda' namespace to maintain a consistent code
 * structure and interface with the CUDA implementation, ensuring code
 * alignment across different hardware platforms. The namespace must stay
 * 'cuda': swiglu_moore.mu dispatches via cuda::SwiGLUOp.
 */
namespace op::swiglu::cuda {

// Elementwise SwiGLU functor: result = up * gate * sigmoid(gate).
typedef struct SwiGLUOp {
private:
    // Computes sigmoid(x) = 1 / (1 + exp(-x)) with the best primitive
    // available for each element type on the MUSA platform.
    template <typename T>
    __device__ __forceinline__ T sigmoid(const T &x) const {
        if constexpr (std::is_same_v<T, half2>) {
            // Packed half2: 1 / (1 + exp(-x)) per lane.
            return h2rcp(__hadd2(make_half2(1, 1), h2exp(__hneg2(x))));
        } else if constexpr (std::is_same_v<T, half>) {
            // The CUDA implementation relied on the half-precision intrinsic
            // hrcp, which is not supported on the MUSA platform. Convert to
            // float, compute, and cast back instead.
            float xf = __half2float(x);
            float sigf = 1.0f / (1.0f + std::exp(-xf));
            return __float2half(sigf);
        } else if constexpr (std::is_same_v<T, cuda_bfloat162>) {
            // Packed bf16x2: unpack both lanes to float, compute, repack.
            float x0 = __bfloat162float(__low2bfloat16(x));
            float x1 = __bfloat162float(__high2bfloat16(x));
            float sig0 = __frcp_rn(__fadd_rn(1.0f, __expf(-x0)));
            float sig1 = __frcp_rn(__fadd_rn(1.0f, __expf(-x1)));
            return __floats2bfloat162_rn(sig0, sig1);
        } else if constexpr (std::is_same_v<T, cuda_bfloat16>) {
            float xf = __bfloat162float(x);
            return __float2bfloat16_rn(__frcp_rn(__fadd_rn(1.0f, __expf(-xf))));
        } else if constexpr (std::is_same_v<T, float>) {
            return __frcp_rn(__fadd_rn(1, __expf(-x)));
        } else {
            // Generic fallback (e.g. double).
            return 1 / (1 + std::exp(-x));
        }
    }

public:
    // SwiGLU consumes exactly two inputs: up and gate.
    static constexpr size_t num_inputs = 2;

    // Returns up * gate * sigmoid(gate) for a single element (or packed pair).
    template <typename T>
    __device__ __forceinline__ T operator()(const T &up, const T &gate) const {
        if constexpr (std::is_same_v<T, half2>) {
            return __hmul2(__hmul2(gate, sigmoid(gate)), up);
        } else if constexpr (std::is_same_v<T, half>) {
            return __hmul(__hmul(gate, sigmoid(gate)), up);
        } else if constexpr (std::is_same_v<T, cuda_bfloat162>) {
            cuda_bfloat162 sig = sigmoid(gate);
            // On MUSA, __low2float()/__high2float() extract and convert the
            // bfloat16 lanes in a single step. This replaces CUDA's two-step
            // sequence (__low2bfloat16 followed by __bfloat162float), since
            // MUSA may not support __low2bfloat16.
            float gate0 = __low2float(gate);
            float gate1 = __high2float(gate);
            float sig0 = __low2float(sig);
            float sig1 = __high2float(sig);
            float up0 = __low2float(up);
            float up1 = __high2float(up);
            float res0 = __fmul_rn(__fmul_rn(gate0, sig0), up0);
            float res1 = __fmul_rn(__fmul_rn(gate1, sig1), up1);
            return __floats2bfloat162_rn(res0, res1);
        } else if constexpr (std::is_same_v<T, cuda_bfloat16>) {
            cuda_bfloat16 sig = sigmoid(gate);
            float gatef = __bfloat162float(gate);
            float sigf = __bfloat162float(sig);
            float upf = __bfloat162float(up);
            return __float2bfloat16_rn(__fmul_rn(__fmul_rn(gatef, sigf), upf));
        } else if constexpr (std::is_same_v<T, float>) {
            return __fmul_rn(__fmul_rn(gate, sigmoid(gate)), up);
        } else {
            return gate * sigmoid(gate) * up;
        }
    }
} SwiGLUOp;

} // namespace op::swiglu::cuda

#endif // __SWIGLU_MOORE_KERNEL_H__
src/infiniop/ops/swiglu/moore/swiglu_moore.h
0 → 100644
View file @
8b760951
#ifndef __SWIGLU_MOORE_API_H__
#define __SWIGLU_MOORE_API_H__

#include "../../../elementwise/moore/elementwise_moore_api.h"

// Declares the op::swiglu::moore Descriptor type via the shared elementwise
// descriptor macro (defined in elementwise_moore_api.h); the implementation
// lives in swiglu_moore.mu.
ELEMENTWISE_DESCRIPTOR(swiglu, moore)

#endif // __SWIGLU_MOORE_API_H__
src/infiniop/ops/swiglu/moore/swiglu_moore.mu
0 → 100644
View file @
8b760951
#include "swiglu_moore.h"
#include "../../../elementwise/moore/elementwise_moore.h"
#include "siwglu_moore_kernel.h"
namespace op::swiglu::moore {

Descriptor::~Descriptor() = default;

/**
 * @brief Creates a SwiGLU descriptor for the Moore (MUSA) backend.
 *
 * Validates that the output dtype is one of F16/BF16/F32/F64 and that the
 * output, up, and gate tensors all share the same shape, then builds the
 * underlying Moore elementwise descriptor.
 *
 * @param handle_        Backend handle (must be a device::moore::Handle).
 * @param desc_ptr       Receives the newly created descriptor.
 * @param out_desc       Descriptor of the output tensor.
 * @param input_desc_vec Exactly two inputs: up (index 0) and gate (index 1).
 * @return INFINI_STATUS_SUCCESS on success; otherwise the status produced by
 *         the CHECK_* / CREATE_* macros.
 */
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle_,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t out_desc,
    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
    auto handle = reinterpret_cast<device::moore::Handle *>(handle_);
    auto dtype = out_desc->dtype();

    const auto &up_desc = input_desc_vec.at(0);
    const auto &gate_desc = input_desc_vec.at(1);
    const auto &out_shape = out_desc->shape();
    const auto &up_shape = up_desc->shape();
    const auto &gate_shape = gate_desc->shape();

    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_BF16, INFINI_DTYPE_F32, INFINI_DTYPE_F64);
    CHECK_SAME_SHAPE(out_shape, up_shape, gate_shape);

    // create MOORE elementwise descriptor
    CREATE_ELEMENTWISE_MOORE_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)

    return INFINI_STATUS_SUCCESS;
}

/**
 * @brief Runs the SwiGLU kernel for this descriptor's dtype.
 *
 * @param workspace      Scratch buffer; must be at least _workspace_size bytes.
 * @param workspace_size Size of @p workspace in bytes.
 * @param output         Output tensor buffer.
 * @param inputs         Input buffers: {up, gate}.
 * @param stream         Backend stream to launch on.
 * @return INFINI_STATUS_INSUFFICIENT_WORKSPACE if the workspace is too small,
 *         INFINI_STATUS_BAD_TENSOR_DTYPE for unsupported dtypes, otherwise the
 *         status of the elementwise launch.
 */
infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    void *output,
    std::vector<const void *> inputs,
    void *stream) const {
    if (workspace_size < _workspace_size) {
        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
    }

    // Dispatch on element type. The 256 template argument is the compile-time
    // launch parameter used by the shared elementwise path — presumably the
    // thread-block size; confirm against elementwise_moore.h.
    // Fix: removed the unreachable `return INFINI_STATUS_SUCCESS;` that
    // followed this switch — every case (including default) already returns.
    switch (_dtype) {
    case INFINI_DTYPE_F16:
        return _device_info->calculate<256, cuda::SwiGLUOp, half>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_BF16:
        return _device_info->calculate<256, cuda::SwiGLUOp, cuda_bfloat16>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_F32:
        return _device_info->calculate<256, cuda::SwiGLUOp, float>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_F64:
        return _device_info->calculate<256, cuda::SwiGLUOp, double>(_info, workspace, output, inputs, stream);
    default:
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
}

} // namespace op::swiglu::moore
src/infiniop/ops/swiglu/operator.cc
View file @
8b760951
...
...
@@ -14,9 +14,15 @@
#ifdef ENABLE_METAX_API
#include "metax/swiglu_metax.h"
#endif
#ifdef ENABLE_CAMBRICON_API
#include "bang/swiglu_bang.h"
#endif
#ifdef ENABLE_ASCEND_API
#include "ascend/swiglu_ascend.h"
#endif
#ifdef ENABLE_MOORE_API
#include "moore/swiglu_moore.h"
#endif
__C
infiniStatus_t
infiniopCreateSwiGLUDescriptor
(
infiniopHandle_t
handle
,
...
...
@@ -51,27 +57,14 @@ __C infiniStatus_t infiniopCreateSwiGLUDescriptor(
#ifdef ENABLE_METAX_API
CREATE
(
INFINI_DEVICE_METAX
,
metax
);
#endif
#ifdef ENABLE_CAMBRICON_MLU
case
DevCambriconMlu
:
{
return
bangCreateSwiGLUDescriptor
((
BangHandle_t
)
handle
,
(
SwiGLUBangDescriptor_t
*
)
desc_ptr
,
c_desc
,
a_desc
,
b_desc
);
}
#ifdef ENABLE_CAMBRICON_API
CREATE
(
INFINI_DEVICE_CAMBRICON
,
bang
);
#endif
#ifdef ENABLE_ASCEND_API
CREATE
(
INFINI_DEVICE_ASCEND
,
ascend
);
#endif
#ifdef ENABLE_METAX_GPU
case
DevMetaxGpu
:
{
return
macaCreateSwiGLUDescriptor
((
MacaHandle_t
)
handle
,
(
SwiGLUMacaDescriptor_t
*
)
desc_ptr
,
c_desc
,
a_desc
,
b_desc
);
}
#endif
#ifdef ENABLE_MTHREADS_GPU
case
DevMthreadsGpu
:
return
musaCreateSwiGLUDescriptor
(
handle
,
(
SwiGLUMusaDescriptor_t
*
)
desc_ptr
,
c_desc
,
a_desc
,
b_desc
);
#ifdef ENABLE_MOORE_API
CREATE
(
INFINI_DEVICE_MOORE
,
moore
);
#endif
default:
...
...
@@ -104,18 +97,14 @@ __C infiniStatus_t infiniopGetSwiGLUWorkspaceSize(infiniopSwiGLUDescriptor_t des
#ifdef ENABLE_METAX_API
GET
(
INFINI_DEVICE_METAX
,
metax
);
#endif
#ifdef ENABLE_CAMBRICON_MLU
case
DevCambriconMlu
:
{
return
bangGetSwiGLUWorkspaceSize
((
SwiGLUBangDescriptor_t
)
desc
,
size
);
}
#ifdef ENABLE_CAMBRICON_API
GET
(
INFINI_DEVICE_CAMBRICON
,
bang
);
#endif
#ifdef ENABLE_ASCEND_API
GET
(
INFINI_DEVICE_ASCEND
,
ascend
);
#endif
#ifdef ENABLE_MTHREADS_GPU
case
DevMthreadsGpu
:
{
return
musaGetSwiGLUWorkspaceSize
((
SwiGLUMusaDescriptor_t
)
desc
,
size
);
}
#ifdef ENABLE_MOORE_API
GET
(
INFINI_DEVICE_MOORE
,
moore
);
#endif
}
...
...
@@ -155,21 +144,14 @@ __C infiniStatus_t infiniopSwiGLU(
#ifdef ENABLE_METAX_API
CALCULATE
(
INFINI_DEVICE_METAX
,
metax
);
#endif
#ifdef ENABLE_CAMBRICON_MLU
case
DevCambriconMlu
:
{
return
bangSwiGLU
((
SwiGLUBangDescriptor_t
)
desc
,
c
,
a
,
b
,
stream
);
}
#ifdef ENABLE_CAMBRICON_API
CALCULATE
(
INFINI_DEVICE_CAMBRICON
,
bang
);
#endif
#ifdef ENABLE_ASCEND_API
CALCULATE
(
INFINI_DEVICE_ASCEND
,
ascend
);
#endif
#ifdef ENABLE_METAX_GPU
case
DevMetaxGpu
:
return
macaSwiGLU
((
SwiGLUMacaDescriptor_t
)
desc
,
c
,
a
,
b
,
stream
);
#endif
#ifdef ENABLE_MTHREADS_GPU
case
DevMthreadsGpu
:
return
musaSwiGLU
((
SwiGLUMusaDescriptor_t
)
desc
,
c
,
a
,
b
,
stream
);
#ifdef ENABLE_MOORE_API
CALCULATE
(
INFINI_DEVICE_MOORE
,
moore
);
#endif
default:
...
...
@@ -204,21 +186,14 @@ infiniopDestroySwiGLUDescriptor(infiniopSwiGLUDescriptor_t desc) {
#ifdef ENABLE_METAX_API
DELETE
(
INFINI_DEVICE_METAX
,
metax
);
#endif
#ifdef ENABLE_CAMBRICON_MLU
case
DevCambriconMlu
:
{
return
bangDestroySwiGLUDescriptor
((
SwiGLUBangDescriptor_t
)
desc
);
}
#ifdef ENABLE_CAMBRICON_API
DELETE
(
INFINI_DEVICE_CAMBRICON
,
bang
);
#endif
#ifdef ENABLE_ASCEND_API
DELETE
(
INFINI_DEVICE_ASCEND
,
ascend
)
#endif
#ifdef ENABLE_METAX_GPU
case
DevMetaxGpu
:
return
macaDestroySwiGLUDescriptor
((
SwiGLUMacaDescriptor_t
)
desc
);
#endif
#ifdef ENABLE_MTHREADS_GPU
case
DevMthreadsGpu
:
return
musaDestroySwiGLUDescriptor
((
SwiGLUMusaDescriptor_t
)
desc
);
#ifdef ENABLE_MOORE_API
DELETE
(
INFINI_DEVICE_MOORE
,
moore
);
#endif
default:
...
...
src/infinirt/infinirt.cc
View file @
8b760951
...
...
@@ -6,7 +6,7 @@
#include "cuda/infinirt_cuda.cuh"
#include "kunlun/infinirt_kunlun.h"
#include "metax/infinirt_metax.h"
#include "m
usa
/infinirt_m
usa
.h"
#include "m
oore
/infinirt_m
oore
.h"
thread_local
infiniDevice_t
CURRENT_DEVICE_TYPE
=
INFINI_DEVICE_CPU
;
thread_local
int
CURRENT_DEVICE_ID
=
0
;
...
...
src/infinirt/m
usa
/infinirt_m
usa
.cc
→
src/infinirt/m
oore
/infinirt_m
oore
.cc
View file @
8b760951
#include "infinirt_m
usa
.h"
#include "infinirt_m
oore
.h"
#include "../../utils.h"
#include <musa_runtime.h>
#include <musa_runtime_api.h>
...
...
src/infinirt/m
usa
/infinirt_m
usa
.h
→
src/infinirt/m
oore
/infinirt_m
oore
.h
View file @
8b760951
File moved
test/infiniop-test/test_generate/infiniop_test.py
View file @
8b760951
...
...
@@ -15,7 +15,7 @@ def np_dtype_to_ggml(tensor_dtype: np.dtype):
return
GGMLQuantizationType
.
F32
elif
tensor_dtype
==
np
.
float64
:
return
GGMLQuantizationType
.
F64
elif
tensor_dtype
==
np
.
bool
:
elif
tensor_dtype
==
np
.
bool
_
:
return
GGMLQuantizationType
.
Q8_K
elif
tensor_dtype
==
np
.
int8
:
return
GGMLQuantizationType
.
I8
...
...
test/infiniop/libinfiniop/utils.py
View file @
8b760951
...
...
@@ -605,11 +605,9 @@ def get_test_devices(args):
def
get_sync_func
(
device
):
import
torch
device_str
=
torch_device_map
[
device
]
if
device
==
InfiniDeviceEnum
.
CPU
:
if
device
==
InfiniDeviceEnum
.
CPU
or
device
==
InfiniDeviceEnum
.
CAMBRICON
:
sync
=
None
else
:
sync
=
getattr
(
torch
,
device_
str
).
synchronize
sync
=
getattr
(
torch
,
torch_
device_
map
[
device
]
).
synchronize
return
sync
test/infiniop/swiglu.py
View file @
8b760951
...
...
@@ -64,8 +64,8 @@ _TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.BF16, InfiniDtype.F32]
# Tolerance map for different data types
_TOLERANCE_MAP
=
{
InfiniDtype
.
F16
:
{
"atol"
:
1e-3
,
"rtol"
:
1e-3
},
InfiniDtype
.
BF16
:
{
"atol"
:
5e-3
,
"rtol"
:
5
e-
3
},
InfiniDtype
.
F32
:
{
"atol"
:
2
e-
7
,
"rtol"
:
1e-
7
},
InfiniDtype
.
BF16
:
{
"atol"
:
5e-3
,
"rtol"
:
1
e-
2
},
InfiniDtype
.
F32
:
{
"atol"
:
1
e-
5
,
"rtol"
:
1e-
5
},
}
DEBUG
=
False
...
...
xmake.lua
View file @
8b760951
...
...
@@ -119,7 +119,7 @@ option_end()
if
has_config
(
"moore-gpu"
)
then
add_defines
(
"ENABLE_MOORE_API"
)
includes
(
"xmake/m
usa
.lua"
)
includes
(
"xmake/m
oore
.lua"
)
end
-- 海光
...
...
xmake/bang.lua
View file @
8b760951
local
NEUWARE_HOME
=
os.getenv
(
"NEUWARE_HOME"
)
or
"/usr/local/neuware"
add_includedirs
(
path
.
join
(
NEUWARE_HOME
,
"include"
))
add_includedirs
(
path
.
join
(
NEUWARE_HOME
,
"include"
)
,
{
public
=
true
}
)
add_linkdirs
(
path
.
join
(
NEUWARE_HOME
,
"lib64"
))
add_linkdirs
(
path
.
join
(
NEUWARE_HOME
,
"lib"
))
add_links
(
"libcnrt.so"
)
...
...
xmake/m
usa
.lua
→
xmake/m
oore
.lua
View file @
8b760951
...
...
@@ -42,8 +42,8 @@ target("infiniop-moore")
set_languages
(
"cxx17"
)
set_warnings
(
"all"
,
"error"
)
add_cxflags
(
"-lstdc++"
,
"-fPIC"
,
"-Wno-comment"
)
add_files
(
"../src/infiniop/devices/m
usa
/*.cc"
)
add_files
(
"../src/infiniop/ops/*/m
usa
/*.mu"
,
{
rule
=
"mu"
})
add_files
(
"../src/infiniop/devices/m
oore
/*.cc"
)
add_files
(
"../src/infiniop/ops/*/m
oore
/*.mu"
,
{
rule
=
"mu"
})
target_end
()
target
(
"infinirt-moore"
)
...
...
@@ -53,5 +53,5 @@ target("infinirt-moore")
add_deps
(
"infini-utils"
)
set_warnings
(
"all"
,
"error"
)
add_cxflags
(
"-lstdc++"
,
"-fPIC"
)
add_files
(
"../src/infinirt/m
usa
/*.cc"
)
add_files
(
"../src/infinirt/m
oore
/*.cc"
)
target_end
()
Prev
1
2
3
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment