Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
jerrrrry
infinicore
Commits
00c30a4a
Unverified
Commit
00c30a4a
authored
Aug 20, 2025
by
PanZezhong1725
Committed by
GitHub
Aug 20, 2025
Browse files
Merge branch 'main' into issue/240
parents
724d9692
d4b03cf7
Changes
25
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
779 additions
and
61 deletions
+779
-61
src/infiniop/devices/handle.cc
src/infiniop/devices/handle.cc
+3
-3
src/infiniop/devices/moore/moore_common.h
src/infiniop/devices/moore/moore_common.h
+3
-3
src/infiniop/devices/moore/moore_handle.cc
src/infiniop/devices/moore/moore_handle.cc
+3
-3
src/infiniop/devices/moore/moore_handle.h
src/infiniop/devices/moore/moore_handle.h
+5
-5
src/infiniop/devices/moore/moore_kernel_common.h
src/infiniop/devices/moore/moore_kernel_common.h
+79
-0
src/infiniop/elementwise/moore/elementwise_moore.h
src/infiniop/elementwise/moore/elementwise_moore.h
+264
-0
src/infiniop/elementwise/moore/elementwise_moore_api.h
src/infiniop/elementwise/moore/elementwise_moore_api.h
+59
-0
src/infiniop/ops/gemm/moore/gemm_moore.h
src/infiniop/ops/gemm/moore/gemm_moore.h
+8
-0
src/infiniop/ops/gemm/moore/gemm_moore.mu
src/infiniop/ops/gemm/moore/gemm_moore.mu
+125
-0
src/infiniop/ops/gemm/musa/gemm_musa.h
src/infiniop/ops/gemm/musa/gemm_musa.h
+0
-8
src/infiniop/ops/gemm/operator.cc
src/infiniop/ops/gemm/operator.cc
+5
-5
src/infiniop/ops/rms_norm/cuda/kernel.cuh
src/infiniop/ops/rms_norm/cuda/kernel.cuh
+1
-1
src/infiniop/ops/rms_norm/moore/rms_norm_moore.h
src/infiniop/ops/rms_norm/moore/rms_norm_moore.h
+8
-0
src/infiniop/ops/rms_norm/moore/rms_norm_moore.mu
src/infiniop/ops/rms_norm/moore/rms_norm_moore.mu
+43
-17
src/infiniop/ops/rms_norm/operator.cc
src/infiniop/ops/rms_norm/operator.cc
+5
-5
src/infiniop/ops/swiglu/moore/siwglu_moore_kernel.h
src/infiniop/ops/swiglu/moore/siwglu_moore_kernel.h
+86
-0
src/infiniop/ops/swiglu/moore/swiglu_moore.h
src/infiniop/ops/swiglu/moore/swiglu_moore.h
+8
-0
src/infiniop/ops/swiglu/moore/swiglu_moore.mu
src/infiniop/ops/swiglu/moore/swiglu_moore.mu
+61
-0
src/infiniop/ops/swiglu/operator.cc
src/infiniop/ops/swiglu/operator.cc
+12
-10
src/infinirt/infinirt.cc
src/infinirt/infinirt.cc
+1
-1
No files found.
src/infiniop/devices/handle.cc
View file @
00c30a4a
...
...
@@ -15,7 +15,7 @@
#include "ascend/ascend_handle.h"
#endif
#ifdef ENABLE_MOORE_API
#include "m
usa/musa
_handle.h"
#include "m
oore/moore
_handle.h"
#endif
#ifdef ENABLE_KUNLUN_API
#include "kunlun/kunlun_handle.h"
...
...
@@ -54,7 +54,7 @@ __C infiniStatus_t infiniopCreateHandle(infiniopHandle_t *handle_ptr) {
CREATE
(
INFINI_DEVICE_ASCEND
,
ascend
);
#endif
#ifdef ENABLE_MOORE_API
CREATE
(
INFINI_DEVICE_MOORE
,
m
usa
);
CREATE
(
INFINI_DEVICE_MOORE
,
m
oore
);
#endif
#ifdef ENABLE_KUNLUN_API
CREATE
(
INFINI_DEVICE_KUNLUN
,
kunlun
);
...
...
@@ -94,7 +94,7 @@ __C infiniStatus_t infiniopDestroyHandle(infiniopHandle_t handle) {
DELETE
(
INFINI_DEVICE_ASCEND
,
ascend
);
#endif
#ifdef ENABLE_MOORE_API
DELETE
(
INFINI_DEVICE_MOORE
,
m
usa
);
DELETE
(
INFINI_DEVICE_MOORE
,
m
oore
);
#endif
#ifdef ENABLE_KUNLUN_API
DELETE
(
INFINI_DEVICE_KUNLUN
,
kunlun
);
...
...
src/infiniop/devices/m
usa/
common
_musa
.h
→
src/infiniop/devices/m
oore/moore_
common.h
View file @
00c30a4a
#include "../../../utils.h"
#include "../pool.h"
#include "m
usa
_handle.h"
#include "m
oore
_handle.h"
#include <mublas.h>
#include <mudnn.h>
#include <musa.h>
...
...
@@ -10,7 +10,7 @@
#define CHECK_MUBLAS(API) CHECK_INTERNAL(API, MUBLAS_STATUS_SUCCESS)
#define CHECK_MUDNN(API) CHECK_INTERNAL((int)API, (int)::musa::dnn::Status::SUCCESS)
namespace
device
::
m
usa
{
namespace
device
::
m
oore
{
class
Handle
::
Internal
{
Pool
<
std
::
unique_ptr
<
mublasHandle_t
>>
mublas_handles
;
...
...
@@ -39,4 +39,4 @@ public:
int
gridSizeZ
()
const
;
};
}
// namespace device::m
usa
}
// namespace device::m
oore
src/infiniop/devices/m
usa/musa
_handle.cc
→
src/infiniop/devices/m
oore/moore
_handle.cc
View file @
00c30a4a
#include "common
_musa
.h"
#include "
moore_
common.h"
namespace
device
::
m
usa
{
namespace
device
::
m
oore
{
Handle
::
Handle
(
infiniDevice_t
device
,
int
device_id
)
:
InfiniopHandle
{
device
,
device_id
},
_internal
(
std
::
make_shared
<
Handle
::
Internal
>
(
device_id
))
{}
...
...
@@ -67,4 +67,4 @@ infiniStatus_t Handle::create(InfiniopHandle **handle_ptr, int device_id) {
return
INFINI_STATUS_SUCCESS
;
}
}
// namespace device::m
usa
}
// namespace device::m
oore
src/infiniop/devices/m
usa/musa
_handle.h
→
src/infiniop/devices/m
oore/moore
_handle.h
View file @
00c30a4a
#ifndef __INFINIOP_M
USA
_HANDLE_H__
#define __INFINIOP_M
USA
_HANDLE_H__
#ifndef __INFINIOP_M
OORE
_HANDLE_H__
#define __INFINIOP_M
OORE
_HANDLE_H__
#include "../../handle.h"
#include <memory>
namespace
device
::
m
usa
{
namespace
device
::
m
oore
{
struct
Handle
:
public
InfiniopHandle
{
Handle
(
int
device_id
);
class
Internal
;
...
...
@@ -20,6 +20,6 @@ private:
std
::
shared_ptr
<
Internal
>
_internal
;
};
}
// namespace device::m
usa
}
// namespace device::m
oore
#endif // __INFINIOP_M
USA
_HANDLE_H__
#endif // __INFINIOP_M
OORE
_HANDLE_H__
src/infiniop/devices/moore/moore_kernel_common.h
0 → 100644
View file @
00c30a4a
#define INFINIOP_MOORE_KERNEL __global__ void
#include <musa_bf16.h>
#include <musa_fp16.h>
// Possible maximum number of threads per block for MUSA architectures
// Used for picking correct kernel launch configuration
#define MOORE_BLOCK_SIZE_2048 2048
#define MOORE_BLOCK_SIZE_1024 1024
#define MOORE_BLOCK_SIZE_512 512
#define CHECK_MOORE(API) CHECK_INTERNAL(API, musaSuccess)
using
cuda_bfloat16
=
mt_bfloat16
;
using
cuda_bfloat162
=
mt_bfloat162
;
namespace
device
::
moore
{
// Maps a flat index over a broadcasted tensor to the element offset in the original
// (non-broadcasted) tensor.
//
// For every dimension, in order, the coordinate is obtained by dividing the remaining
// index by that dimension's broadcasted stride; it is then scaled by the matching
// target stride and accumulated. The remainder is carried to the next dimension.
//
// flat_index           flattened index into the broadcasted tensor
// ndim                 number of entries read from both stride arrays
// broadcasted_strides  strides used to decompose flat_index (must be non-zero)
// target_strides       strides of the tensor whose offset is returned
__forceinline__ __device__ __host__ size_t
indexToReducedOffset(size_t flat_index,
                     size_t ndim,
                     const ptrdiff_t *broadcasted_strides,
                     const ptrdiff_t *target_strides) {
    size_t offset = 0;
    size_t remaining = flat_index;
    size_t dim = 0;
    while (dim < ndim) {
        const size_t coord = remaining / broadcasted_strides[dim];
        offset += coord * target_strides[dim];
        remaining %= broadcasted_strides[dim];
        ++dim;
    }
    return offset;
}
// Converts a flat (row-major) element index into a memory offset for a tensor
// described by `shape` and `strides`.
//
// Dimensions are processed from the last to the first: the coordinate along a
// dimension is the remaining index modulo that dimension's extent, and the index is
// then divided by the extent before moving on to the previous dimension.
//
// flat_index  flattened element index
// ndim        number of entries read from `shape` and `strides`
// shape       extent of every dimension (must be non-zero)
// strides     stride of every dimension, in elements
__forceinline__ __device__ __host__ size_t
indexToOffset(size_t flat_index,
              size_t ndim,
              const size_t *shape,
              const ptrdiff_t *strides) {
    size_t offset = 0;
    size_t remaining = flat_index;
    for (size_t visited = 0; visited < ndim; ++visited) {
        const size_t dim = ndim - 1 - visited;
        offset += (remaining % shape[dim]) * strides[dim];
        remaining /= shape[dim];
    }
    return offset;
}
}
// namespace device::moore
// Single-precision overload of exp_: evaluates e^val with expf.
__forceinline__ __device__ float exp_(const float val) {
    const float result = expf(val);
    return result;
}
// Extended-precision overload of exp_: evaluates e^val with exp.
__forceinline__ __device__ long double exp_(const long double val) {
    const long double result = exp(val);
    return result;
}
// Double-precision overload of exp_: evaluates e^val with exp.
__forceinline__ __device__ double exp_(const double val) {
    const double result = exp(val);
    return result;
}
// Half-precision overload of exp_.
// The MUSA headers may not provide hexp, so the value is widened to float,
// exponentiated with expf, and narrowed back to __half.
__forceinline__ __device__ __half exp_(const __half x) {
    return __float2half(expf(__half2float(x)));
}
// bfloat16 overload of exp_.
// <musa_bf16.h> may not provide hexp, so the value is widened to float,
// exponentiated with expf, and narrowed back to __mt_bfloat16.
__forceinline__ __device__ __mt_bfloat16 exp_(const __mt_bfloat16 x) {
    return __float2bfloat16(expf(__bfloat162float(x)));
}
src/infiniop/elementwise/moore/elementwise_moore.h
0 → 100644
View file @
00c30a4a
#ifndef __INFINIOP_ELEMENTWISE_MOORE_H__
#define __INFINIOP_ELEMENTWISE_MOORE_H__
#include "../../../utils.h"
#include "../../devices/moore/moore_common.h"
#include "../../devices/moore/moore_kernel_common.h"
#include "elementwise_moore_api.h"
namespace
op
::
elementwise
::
moore
{
// Reinterprets an untyped input buffer pointer as a pointer to const T.
// No checking is performed; the caller chooses T.
template <typename T>
__device__ __forceinline__ const T *typedInputPtr(const void *ptr) {
    const T *typed = static_cast<const T *>(ptr);
    return typed;
}
// Returns the element offset at which output element `idx` is stored.
// A contiguous output uses the flat index unchanged; otherwise the offset is
// derived from `shape` and `strides` through device::moore::indexToOffset.
__device__ __forceinline__ size_t getOutputIndex(size_t idx,
                                                 bool is_contiguous,
                                                 size_t ndim,
                                                 const size_t *shape,
                                                 const ptrdiff_t *strides) {
    if (is_contiguous) {
        return idx;
    }
    return device::moore::indexToOffset(idx, ndim, shape, strides);
}
// Computes, for one output element, the element offset to read from each input.
//
// The per-input arrays are indexed by input id; `input_shapes` and `input_strides`
// hold `ndim` consecutive entries per input.
struct InputIndexer {
    size_t idx;                      // flat index of the output element being produced
    size_t ndim;                     // number of dimensions
    const bool *input_contiguous;    // per input: true when the flat index can be used directly
    const bool *input_broadcasted;   // per input: true when the input is broadcast to the output
    const size_t *input_shapes;      // ndim extents per input
    const ptrdiff_t *input_strides;  // ndim strides per input
    const ptrdiff_t *output_strides; // strides used to decompose idx for broadcast inputs

    // Returns the element offset inside input `input_id` for the current `idx`.
    __device__ __forceinline__ size_t operator()(size_t input_id) const {
        if (input_contiguous[input_id]) {
            return idx;
        }

        const ptrdiff_t *strides = input_strides + input_id * ndim;
        if (input_broadcasted[input_id]) {
            return device::moore::indexToReducedOffset(idx, ndim, output_strides, strides);
        }

        const size_t *shape = input_shapes + input_id * ndim;
        return device::moore::indexToOffset(idx, ndim, shape, strides);
    }
};
// Invokes `f` once, passing one std::integral_constant<size_t, I> argument for every
// index I in the given index sequence. This lets a generic lambda receive the input
// indices as compile-time values (accessible through `.value`).
template <typename F, size_t... Is>
__device__ __forceinline__ void unpackInputsAndApply(F &&f, std::index_sequence<Is...>) {
    f(std::integral_constant<size_t, Is>()...);
}
// Elementwise kernel for operators whose N inputs all share the output type Tdata.
//
// Each thread handles at most one output element: its global thread index plus
// `offset`. Threads whose index is not below `output_size` do nothing. The thread
// evaluates Op{}(in_0[i_0], ..., in_{N-1}[i_{N-1}], args...) and stores the result at
// the output offset returned by getOutputIndex.
//
// output_size        total number of output elements
// ndim               number of dimensions of the output (and of every input)
// output_contiguous  true when the flat index can be used as the output offset
// input_contiguous   per input: flat index can be used directly
// input_broadcasted  per input: input is broadcast to the output
// output_shape       ndim output extents
// input_shapes       ndim extents per input
// output_strides     ndim output strides
// input_strides      ndim strides per input
// output             output buffer
// inputs             array of N input buffer pointers
// offset             value added to the global thread index
// args               extra arguments passed to Op after the input elements
template <size_t N, typename Op, typename Tdata, typename... Args>
INFINIOP_MOORE_KERNEL elementwiseKernel(
    size_t output_size,
    size_t ndim,
    bool output_contiguous,
    const bool *__restrict__ input_contiguous,
    const bool *__restrict__ input_broadcasted,
    const size_t *__restrict__ output_shape,
    const size_t *__restrict__ input_shapes,
    const ptrdiff_t *__restrict__ output_strides,
    const ptrdiff_t *__restrict__ input_strides,
    Tdata *output,
    const void *const *inputs,
    size_t offset,
    Args... args) {

    // Widen before multiplying: blockIdx.x * blockDim.x is otherwise a 32-bit product
    // that wraps once a launch spans 2^32 or more threads.
    const size_t idx = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x + offset;
    if (idx >= output_size) {
        return;
    }

    const Tdata *const *typed_inputs = reinterpret_cast<const Tdata *const *>(inputs);
    const size_t out_idx = getOutputIndex(idx, output_contiguous, ndim, output_shape, output_strides);
    const InputIndexer indexer{idx, ndim, input_contiguous, input_broadcasted,
                               input_shapes, input_strides, output_strides};

    unpackInputsAndApply(
        [&](auto... Is) {
            output[out_idx] = Op{}(typed_inputs[Is.value][indexer(Is.value)]...,
                                   std::forward<Args>(args)...);
        },
        std::make_index_sequence<N>{});
}
// Elementwise kernel for operators whose inputs may have different types.
//
// Tin... lists the element type of every input, in order; Tout is the output type.
// Each thread handles at most one output element: its global thread index plus
// `offset`. Threads whose index is not below `output_size` do nothing. The thread
// calls Op's templated call operator as operator()<Tout, Tin...>(in_0[i_0], ...) and
// stores the result at the output offset returned by getOutputIndex.
//
// output_size        total number of output elements
// ndim               number of dimensions of the output (and of every input)
// output_contiguous  true when the flat index can be used as the output offset
// input_contiguous   per input: flat index can be used directly
// input_broadcasted  per input: input is broadcast to the output
// output_shape       ndim output extents
// input_shapes       ndim extents per input
// output_strides     ndim output strides
// input_strides      ndim strides per input
// output             output buffer
// inputs             array of untyped input buffer pointers, one per entry of Tin
// offset             value added to the global thread index
template <typename Op, typename Tout, typename... Tin>
INFINIOP_MOORE_KERNEL elementwiseKernel(
    size_t output_size,
    size_t ndim,
    bool output_contiguous,
    const bool *__restrict__ input_contiguous,
    const bool *__restrict__ input_broadcasted,
    const size_t *__restrict__ output_shape,
    const size_t *__restrict__ input_shapes,
    const ptrdiff_t *__restrict__ output_strides,
    const ptrdiff_t *__restrict__ input_strides,
    Tout *output,
    const void *const *__restrict__ inputs,
    size_t offset) {

    // Widen before multiplying: blockIdx.x * blockDim.x is otherwise a 32-bit product
    // that wraps once a launch spans 2^32 or more threads.
    const size_t idx = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x + offset;
    if (idx >= output_size) {
        return;
    }

    const size_t out_idx = getOutputIndex(idx, output_contiguous, ndim, output_shape, output_strides);
    const InputIndexer indexer{idx, ndim, input_contiguous, input_broadcasted,
                               input_shapes, input_strides, output_strides};

    unpackInputsAndApply(
        [&](auto... Is) {
            output[out_idx] = Op{}.template operator()<Tout, Tin...>(
                (typedInputPtr<Tin>(inputs[Is.value])[indexer(Is.value)])...);
        },
        std::index_sequence_for<Tin...>{});
}
// Host-side implementation behind DeviceImpl for the Moore (MUSA) backend.
//
// Holds the device handle internals and knows how to stage the elementwise metadata
// in the workspace and launch the elementwise kernels.
struct DeviceImpl::Opaque {
    std::shared_ptr<device::moore::Handle::Internal> internal;

    Opaque(const std::shared_ptr<device::moore::Handle::Internal> &internal)
        : internal(internal) {}

    // Same-dtype path: every input and the output use element type Tdata.
    // `args` are forwarded to the kernel and end up as trailing arguments of Op.
    //
    // The kernel is instantiated with the decayed argument types so that its
    // parameters are always passed by value: passing an lvalue here deduces Args as
    // a reference type, and a reference to host memory must not become a kernel
    // parameter.
    template <uint32_t BLOCK_SIZE, size_t N, typename Op, typename Tdata, typename... Args>
    infiniStatus_t calculateImpl(const op::elementwise::ElementwiseInfo &info,
                                 void *workspace,
                                 void *output,
                                 const std::vector<const void *> &inputs,
                                 musaStream_t stream,
                                 Args &&...args) {
        return launchElementwiseKernel<BLOCK_SIZE, N>(
            info, workspace,
            reinterpret_cast<Tdata *>(output), inputs,
            elementwiseKernel<N, Op, Tdata, std::decay_t<Args>...>,
            stream,
            std::forward<Args>(args)...);
    }

    // Mixed-dtype path: Tin... gives the element type of every input and Tout the
    // output type. Only selected when the number of input types equals
    // Op::num_inputs. `args` is accepted but not passed on to the kernel.
    template <uint32_t BLOCK_SIZE, size_t N, typename Op, typename Tout, typename... Tin,
              typename... Args,
              std::enable_if_t<(sizeof...(Tin) == Op::num_inputs), int> = 0>
    infiniStatus_t calculateImpl(const op::elementwise::ElementwiseInfo &info,
                                 void *workspace,
                                 void *output,
                                 const std::vector<const void *> &inputs,
                                 musaStream_t stream,
                                 Args &&...args) {
        return launchElementwiseKernel<BLOCK_SIZE, N>(
            info, workspace,
            reinterpret_cast<Tout *>(output), inputs,
            elementwiseKernel<Op, Tout, Tin...>,
            stream);
    }

private:
    // Copies the input pointer array and the metadata of `info` into `workspace`
    // (asynchronously on `stream`) and sets the output reference parameters to the
    // corresponding locations inside the workspace.
    //
    // Workspace layout, in order:
    //   N input pointers
    //   output shape      (ndim x size_t)
    //   output strides    (ndim x ptrdiff_t)
    //   input shapes      (N * ndim x size_t)
    //   input strides     (N * ndim x ptrdiff_t)
    //   input contiguous  (N x bool)
    //   input broadcasted (N x bool)
    //
    // Returns INFINI_STATUS_SUCCESS once both copies have been enqueued; a failing
    // copy is reported through CHECK_MOORE.
    template <size_t N>
    infiniStatus_t infoToDevice(
        const op::elementwise::ElementwiseInfo &info,
        void *workspace,
        const void *const *h_inputs_arr,
        const void **&d_inputs_arr,
        const bool *&d_input_contiguous,
        const bool *&d_input_broadcasted,
        const size_t *&d_output_shape,
        const ptrdiff_t *&d_output_strides,
        const size_t *&d_input_shapes,
        const ptrdiff_t *&d_input_strides,
        musaStream_t stream) const {

        constexpr auto input_size = N;
        const auto ndim = info.getNdim();
        constexpr auto input_arr_size = N * sizeof(*h_inputs_arr);
        const int8_t *info_meta_start = info.getMetaStart();
        // The metadata is placed directly after the input pointer array.
        int8_t *const d_meta_start = reinterpret_cast<int8_t *>(workspace) + input_arr_size;

        // copy the input pointer array and meta to device
        CHECK_MOORE(musaMemcpyAsync(workspace, h_inputs_arr, input_arr_size,
                                    musaMemcpyHostToDevice, stream));
        CHECK_MOORE(musaMemcpyAsync(d_meta_start, info_meta_start, info.getMetaMemSize(),
                                    musaMemcpyHostToDevice, stream));

        // offset/assign the pointers
        d_inputs_arr = reinterpret_cast<const void **>(workspace);
        d_output_shape = reinterpret_cast<const size_t *>(d_meta_start);
        d_output_strides = reinterpret_cast<const ptrdiff_t *>(d_output_shape + ndim);
        d_input_shapes = reinterpret_cast<const size_t *>(d_output_strides + ndim);
        d_input_strides = reinterpret_cast<const ptrdiff_t *>(d_input_shapes + input_size * ndim);
        d_input_contiguous = reinterpret_cast<const bool *>(d_input_strides + input_size * ndim);
        d_input_broadcasted = reinterpret_cast<const bool *>(d_input_contiguous + input_size);

        return INFINI_STATUS_SUCCESS;
    }

    // Stages the metadata on the device and launches `kernel_func` as many times as
    // needed to cover every output element.
    //
    // The block size is BLOCK_SIZE capped by the device's maxThreadsPerBlock(); the
    // grid size is the number of blocks needed, capped by the device's gridSizeX().
    // Each launch covers `grid * block` elements starting at the launch's offset.
    // Returns immediately with success when the output is empty.
    template <uint32_t BLOCK_SIZE, size_t N, typename KernelFunc, typename Tout, typename... Args>
    infiniStatus_t launchElementwiseKernel(
        const op::elementwise::ElementwiseInfo &info,
        void *workspace,
        Tout *output,
        const std::vector<const void *> &inputs,
        KernelFunc kernel_func,
        musaStream_t stream,
        Args &&...args) {

        const auto output_size = info.getOutputSize();
        if (output_size == 0) {
            return INFINI_STATUS_SUCCESS;
        }

        // Device pointers
        const void **d_inputs_arr = nullptr;
        const bool *d_input_contiguous = nullptr;
        const bool *d_input_broadcasted = nullptr;
        const size_t *d_output_shape = nullptr;
        const ptrdiff_t *d_output_strides = nullptr;
        const size_t *d_input_shapes = nullptr;
        const ptrdiff_t *d_input_strides = nullptr;

        CHECK_STATUS(infoToDevice<N>(info, workspace, inputs.data(), d_inputs_arr,
                                     d_input_contiguous, d_input_broadcasted,
                                     d_output_shape, d_output_strides,
                                     d_input_shapes, d_input_strides, stream));

        dim3 blockDims(std::min(BLOCK_SIZE, static_cast<uint32_t>(internal->maxThreadsPerBlock())));

        // Clamp in size_t before narrowing so that a block count above 2^32 cannot be
        // truncated (possibly to zero) ahead of the comparison with the grid limit.
        const size_t blocks_needed = static_cast<size_t>(CEIL_DIV(output_size, blockDims.x));
        const size_t max_blocks = static_cast<size_t>(internal->gridSizeX());
        dim3 gridDims(static_cast<uint32_t>(std::min(blocks_needed, max_blocks)));

        // Widen before multiplying: a 32-bit product could wrap to a small or zero
        // step, which would make the loop below overlap launches or never terminate.
        const size_t step = static_cast<size_t>(gridDims.x) * blockDims.x;

        const auto ndim = info.getNdim();
        const bool output_contiguous = info.isOutputContiguous();

        for (size_t i = 0; i < output_size; i += step) {
            // `args` is passed as lvalues on purpose: the same values are needed for
            // every launch, so they must not be moved from.
            kernel_func<<<gridDims, blockDims, 0, stream>>>(
                output_size, ndim, output_contiguous,
                d_input_contiguous, d_input_broadcasted,
                d_output_shape, d_input_shapes,
                d_output_strides, d_input_strides,
                output, reinterpret_cast<const void **>(d_inputs_arr),
                i, args...);
        }

        return INFINI_STATUS_SUCCESS;
    }
};
// Builds a DeviceImpl whose Opaque state is constructed from `args`.
// The returned utils::Result wraps a raw pointer obtained from `new`.
template <typename... Args>
utils::Result<DeviceImpl *> DeviceImpl::create(Args &&...args) {
    std::shared_ptr<Opaque> state = std::make_shared<Opaque>(std::forward<Args>(args)...);
    DeviceImpl *impl = new DeviceImpl(state);
    return utils::Result<DeviceImpl *>(impl);
}
/* Runs an elementwise operator whose inputs may have different element types.
 *
 * Tin... lists the input element types and must contain exactly Op::num_inputs
 * entries; Tout is the output element type. `stream` is reinterpreted as a
 * musaStream_t and the call is delegated to the Opaque implementation. */
template <uint32_t BLOCK_SIZE, typename Op, typename Tout, typename... Tin, typename... Args,
          std::enable_if_t<(sizeof...(Tin) == Op::num_inputs), int>>
infiniStatus_t DeviceImpl::calculate(const op::elementwise::ElementwiseInfo &info,
                                     void *workspace,
                                     void *output,
                                     const std::vector<const void *> &inputs,
                                     void *stream,
                                     Args &&...args) {
    constexpr size_t N = Op::num_inputs;
    static_assert(sizeof...(Tin) == N, "Input type count mismatch");

    musaStream_t musa_stream = reinterpret_cast<musaStream_t>(stream);
    return _opaque->calculateImpl<BLOCK_SIZE, N, Op, Tout, Tin...>(
        info, workspace, output, inputs, musa_stream,
        std::forward<Args>(args)...);
}
/* Runs an elementwise operator whose inputs and output all share element type Tdata.
 *
 * The number of inputs is taken from Op::num_inputs. `stream` is reinterpreted as a
 * musaStream_t and the call, including `args`, is delegated to the Opaque
 * implementation. */
template <uint32_t BLOCK_SIZE, typename Op, typename Tdata, typename... Args>
infiniStatus_t DeviceImpl::calculate(const op::elementwise::ElementwiseInfo &info,
                                     void *workspace,
                                     void *output,
                                     const std::vector<const void *> &inputs,
                                     void *stream,
                                     Args &&...args) {
    constexpr size_t N = Op::num_inputs;

    musaStream_t musa_stream = reinterpret_cast<musaStream_t>(stream);
    return _opaque->calculateImpl<BLOCK_SIZE, N, Op, Tdata>(
        info, workspace, output, inputs, musa_stream,
        std::forward<Args>(args)...);
}
}
// namespace op::elementwise::moore
#endif
src/infiniop/elementwise/moore/elementwise_moore_api.h
0 → 100644
View file @
00c30a4a
#ifndef __INFINIOP_ELEMENTWISE_MOORE_API_H__
#define __INFINIOP_ELEMENTWISE_MOORE_API_H__
#include "../elementwise.h"
namespace
op
::
elementwise
::
moore
{
/**
 * Entry point for running elementwise operators on a Moore (MUSA) device.
 *
 * The implementation details live in the nested Opaque type, which is only declared
 * here. Instances are obtained through create(); the constructor is private.
 */
class DeviceImpl final {
    struct Opaque;
    std::shared_ptr<Opaque> _opaque;

    DeviceImpl(std::shared_ptr<Opaque> opaque)
        : _opaque(std::move(opaque)) {}

public:
    ~DeviceImpl() = default;

    /// Creates a DeviceImpl; `args` are forwarded to the Opaque state.
    template <typename... Args>
    static utils::Result<DeviceImpl *> create(Args &&...args);

    /// Runs operator `Op` when all inputs and the output share element type `Tdata`.
    template <uint32_t BLOCK_SIZE, typename Op, typename Tdata, typename... Args>
    infiniStatus_t calculate(
        const op::elementwise::ElementwiseInfo &info,
        void *workspace,
        void *output,
        const std::vector<const void *> &inputs,
        void *stream,
        Args &&...args);

    /// Runs operator `Op` with per-input element types `Tin...` and output type `Tout`.
    /// Only participates in overload resolution when the number of entries in `Tin`
    /// equals `Op::num_inputs`.
    template <uint32_t BLOCK_SIZE, typename Op, typename Tout, typename... Tin,
              typename... Args,
              std::enable_if_t<(sizeof...(Tin) == Op::num_inputs), int> = 0>
    infiniStatus_t calculate(
        const op::elementwise::ElementwiseInfo &info,
        void *workspace,
        void *output,
        const std::vector<const void *> &inputs,
        void *stream,
        Args &&...args);
};
}
// namespace op::elementwise::moore
// Creates an elementwise operator descriptor for the Moore backend.
//
// Expands to a sequence of statements in the caller's scope:
//   1. builds an ElementwiseInfo from OUT_DESC and INPUT_DESC_VEC,
//   2. computes the workspace size as the metadata size plus one pointer per input,
//   3. creates the Moore DeviceImpl from HANDLE->internal(),
//   4. stores a newly allocated Descriptor in *desc_ptr.
// CHECK_RESULT is applied to both fallible steps.
//
// The expansion introduces the local names info_result, info, workspace_size and
// device_impl_result, and requires `desc_ptr` and `Descriptor` to be visible at the
// point of use. HANDLE is used unparenthesized, so pass a plain identifier.
#define CREATE_ELEMENTWISE_MOORE_DESCRIPTOR(HANDLE, DTYPE, OUT_DESC, INPUT_DESC_VEC) \
\
auto info_result = op::elementwise::ElementwiseInfo::create(OUT_DESC, INPUT_DESC_VEC); \
CHECK_RESULT(info_result); \
auto info = info_result.take(); \
auto workspace_size = info.getMetaMemSize() + info.getInputSize() * sizeof(void *); \
\
auto device_impl_result = op::elementwise::moore::DeviceImpl::create(HANDLE->internal()); \
CHECK_RESULT(device_impl_result); \
\
*desc_ptr = new Descriptor( \
DTYPE, \
std::move(info), \
std::move(device_impl_result.take()), \
workspace_size, \
HANDLE->device, \
HANDLE->device_id);
#endif // __INFINIOP_ELEMENTWISE_MOORE_API_H__
src/infiniop/ops/gemm/moore/gemm_moore.h
0 → 100644
View file @
00c30a4a
#ifndef __GEMM_MOORE_H__
#define __GEMM_MOORE_H__
#include "../gemm.h"
DESCRIPTOR
(
moore
)
#endif // __GEMM_MOORE_H__
src/infiniop/ops/gemm/m
usa
/gemm_m
usa
.mu
→
src/infiniop/ops/gemm/m
oore
/gemm_m
oore
.mu
View file @
00c30a4a
#include "../../../devices/m
usa/
common
_musa
.h"
#include "../../../devices/m
usa/musa
_handle.h"
#include "gemm_m
usa
.h"
#include "../../../devices/m
oore/moore_
common.h"
#include "../../../devices/m
oore/moore
_handle.h"
#include "gemm_m
oore
.h"
namespace op::gemm::m
usa
{
namespace op::gemm::m
oore
{
struct Descriptor::Opaque {
std::shared_ptr<device::m
usa
::Handle::Internal> internal;
std::shared_ptr<device::m
oore
::Handle::Internal> internal;
};
Descriptor::~Descriptor() {
...
...
@@ -18,10 +18,10 @@ infiniStatus_t Descriptor::create(
infiniopTensorDescriptor_t c_desc,
infiniopTensorDescriptor_t a_desc,
infiniopTensorDescriptor_t b_desc) {
auto handle = reinterpret_cast<device::m
usa
::Handle *>(handle_);
auto handle = reinterpret_cast<device::m
oore
::Handle *>(handle_);
auto dtype = c_desc->dtype();
CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32);
CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32
, INFINI_DTYPE_BF16
);
auto result = MatmulInfo::create(c_desc, a_desc, b_desc, MatrixLayout::COL_MAJOR);
CHECK_RESULT(result);
...
...
@@ -33,41 +33,63 @@ infiniStatus_t Descriptor::create(
return INFINI_STATUS_SUCCESS;
}
template <typename Tdata>
infiniStatus_t calculate(
const MatmulInfo &info,
std::shared_ptr<device::musa::Handle::Internal> &_internal,
infiniStatus_t Descriptor::calculate(
void *workspace,
size_t workspace_size,
void *c,
float beta,
const void *a,
const void *b,
float alpha,
void *stream) {
void *stream)
const
{
musaDataType a_type, b_type, c_type;
mublasComputeType_t compute_type;
Tdata alpha_, beta_;
if constexpr (std::is_same<Tdata, half>::value) {
alpha_ = __float2half(alpha);
beta_ = __float2half(beta);
// MUSA's GEMM operations require that the scalar values alpha and beta have the same data type as the matrices.
// This ensures correct computation during the muBLAS GEMM operation.
// Declare half-precision variables to handle F16 types.
half alpha_h, beta_h;
// Initialize generic void pointers for alpha and beta.
// They point to the original float values
// It will be used directly when the GEMM operation is performed with F32 data.
const void *p_alpha = &alpha;
const void *p_beta = &beta;
switch (_dtype) {
case INFINI_DTYPE_F16:
a_type = b_type = c_type = MUSA_R_16F;
compute_type = MUBLAS_COMPUTE_16F;
} else {
alpha_ = alpha;
beta_ = beta;
// Convert alpha/beta to half-precision and update the pointers.
alpha_h = __float2half(alpha);
beta_h = __float2half(beta);
p_alpha = &alpha_h;
p_beta = &beta_h;
break;
case INFINI_DTYPE_BF16:
a_type = b_type = c_type = MUSA_R_16BF;
compute_type = MUBLAS_COMPUTE_32F;
break;
case INFINI_DTYPE_F32:
a_type = b_type = c_type = MUSA_R_32F;
compute_type = MUBLAS_COMPUTE_32F_FAST_TF32;
break;
default:
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
if (info.is_transed) {
if (
_
info.is_transed) {
std::swap(a, b);
}
auto op_a = info.a_matrix.row_stride == 1 ? MUBLAS_OP_N : MUBLAS_OP_T;
auto op_b = info.b_matrix.row_stride == 1 ? MUBLAS_OP_N : MUBLAS_OP_T;
auto op_a =
_
info.a_matrix.row_stride == 1 ? MUBLAS_OP_N : MUBLAS_OP_T;
auto op_b =
_
info.b_matrix.row_stride == 1 ? MUBLAS_OP_N : MUBLAS_OP_T;
CHECK_STATUS(_internal->useMublas(
CHECK_STATUS(_
opaque->
internal->useMublas(
(musaStream_t)stream,
[&](mublasHandle_t handle) {
CHECK_MUBLAS(
...
...
@@ -75,24 +97,24 @@ infiniStatus_t calculate(
handle,
op_a,
op_b,
static_cast<int>(info.m),
static_cast<int>(info.n),
static_cast<int>(info.k),
&
alpha
_
,
static_cast<int>(
_
info.m),
static_cast<int>(
_
info.n),
static_cast<int>(
_
info.k),
p_
alpha,
a,
a_type,
static_cast<int>(info.a_matrix.ld()),
info.a_matrix.stride,
static_cast<int>(
_
info.a_matrix.ld()),
_
info.a_matrix.stride,
b,
b_type,
static_cast<int>(info.b_matrix.ld()),
info.b_matrix.stride,
&
beta
_
,
static_cast<int>(
_
info.b_matrix.ld()),
_
info.b_matrix.stride,
p_
beta,
c,
c_type,
static_cast<int>(info.c_matrix.ld()),
info.c_matrix.stride,
static_cast<int>(info.batch),
static_cast<int>(
_
info.c_matrix.ld()),
_
info.c_matrix.stride,
static_cast<int>(
_
info.batch),
compute_type,
MUBLAS_GEMM_DEFAULT));
return INFINI_STATUS_SUCCESS;
...
...
@@ -100,22 +122,4 @@ infiniStatus_t calculate(
return INFINI_STATUS_SUCCESS;
}
infiniStatus_t Descriptor::calculate(void *workspace,
size_t workspace_size,
void *c,
float beta,
const void *a,
const void *b,
float alpha,
void *stream) const {
switch (_dtype) {
case INFINI_DTYPE_F16:
return musa::calculate<half>(_info, _opaque->internal, c, beta, a, b, alpha, stream);
case INFINI_DTYPE_F32:
return musa::calculate<float>(_info,_opaque->internal, c, beta, a, b, alpha, stream);
default:
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
}
} // namespace op::gemm::musa
} // namespace op::gemm::moore
src/infiniop/ops/gemm/musa/gemm_musa.h
deleted
100644 → 0
View file @
724d9692
#ifndef __GEMM_MUSA_H__
#define __GEMM_MUSA_H__
#include "../gemm.h"
DESCRIPTOR
(
musa
)
#endif // __GEMM_MUSA_H__
src/infiniop/ops/gemm/operator.cc
View file @
00c30a4a
...
...
@@ -18,7 +18,7 @@
#include "metax/gemm_metax.h"
#endif
#ifdef ENABLE_MOORE_API
#include "m
usa
/gemm_m
usa
.h"
#include "m
oore
/gemm_m
oore
.h"
#endif
#ifdef ENABLE_KUNLUN_API
#include "kunlun/gemm_kunlun.h"
...
...
@@ -61,7 +61,7 @@ __C infiniStatus_t infiniopCreateGemmDescriptor(
CREATE
(
INFINI_DEVICE_METAX
,
metax
);
#endif
#ifdef ENABLE_MOORE_API
CREATE
(
INFINI_DEVICE_MOORE
,
m
usa
);
CREATE
(
INFINI_DEVICE_MOORE
,
m
oore
);
#endif
#ifdef ENABLE_KUNLUN_API
...
...
@@ -106,7 +106,7 @@ infiniopGetGemmWorkspaceSize(
GET
(
INFINI_DEVICE_METAX
,
metax
);
#endif
#ifdef ENABLE_MOORE_API
GET
(
INFINI_DEVICE_MOORE
,
m
usa
);
GET
(
INFINI_DEVICE_MOORE
,
m
oore
);
#endif
#ifdef ENABLE_KUNLUN_API
GET
(
INFINI_DEVICE_KUNLUN
,
kunlun
);
...
...
@@ -158,7 +158,7 @@ __C infiniStatus_t infiniopGemm(
CALCULATE
(
INFINI_DEVICE_METAX
,
metax
);
#endif
#ifdef ENABLE_MOORE_API
CALCULATE
(
INFINI_DEVICE_MOORE
,
m
usa
);
CALCULATE
(
INFINI_DEVICE_MOORE
,
m
oore
);
#endif
#ifdef ENABLE_KUNLUN_API
CALCULATE
(
INFINI_DEVICE_KUNLUN
,
kunlun
);
...
...
@@ -200,7 +200,7 @@ infiniopDestroyGemmDescriptor(infiniopGemmDescriptor_t desc) {
DELETE
(
INFINI_DEVICE_METAX
,
metax
);
#endif
#ifdef ENABLE_MOORE_API
DELETE
(
INFINI_DEVICE_MOORE
,
m
usa
);
DELETE
(
INFINI_DEVICE_MOORE
,
m
oore
);
#endif
#ifdef ENABLE_KUNLUN_API
DELETE
(
INFINI_DEVICE_KUNLUN
,
kunlun
);
...
...
src/infiniop/ops/rms_norm/cuda/kernel.cuh
View file @
00c30a4a
...
...
@@ -22,7 +22,7 @@ __device__ void rmsnormBlock(
// Thread_0 computes RMS=1/sqrt(ss/dim+epsilon) and stores in shared memory
__shared__
Tcompute
rms
;
if
(
threadIdx
.
x
==
0
)
{
rms
=
T
data
(
rsqrtf
(
ss
/
Tcompute
(
dim
)
+
epsilon
));
rms
=
T
compute
(
rsqrtf
(
ss
/
Tcompute
(
dim
)
+
epsilon
));
}
__syncthreads
();
...
...
src/infiniop/ops/rms_norm/m
usa
/rms_norm_m
usa.cu
h
→
src/infiniop/ops/rms_norm/m
oore
/rms_norm_m
oore.
h
View file @
00c30a4a
#ifndef __RMS_NORM_M
USA_CU
H__
#define __RMS_NORM_M
USA_CU
H__
#ifndef __RMS_NORM_M
OORE_
H__
#define __RMS_NORM_M
OORE_
H__
#include "../rms_norm.h"
DESCRIPTOR
(
m
usa
)
DESCRIPTOR
(
m
oore
)
#endif
src/infiniop/ops/rms_norm/m
usa
/rms_norm_m
usa
.mu
→
src/infiniop/ops/rms_norm/m
oore
/rms_norm_m
oore
.mu
View file @
00c30a4a
#include "../../../devices/musa/common_musa.h"
#include "../cuda/rms_norm_kernel.cuh"
#include "rms_norm_musa.cuh"
#include "../../../devices/moore/moore_common.h"
#include "rms_norm_moore.h"
namespace op::rms_norm::musa {
#include "../../../devices/moore/moore_kernel_common.h"
#include <cub/block/block_reduce.cuh>
#include "../../../reduce/cuda/reduce.cuh"
#include "../cuda/kernel.cuh"
// Kernel entry point for RMS normalization on Moore devices.
// Forwards every argument unchanged to rmsnormBlock, passing BLOCK_SIZE and Tcompute
// as explicit template arguments.
// NOTE(review): rmsnormBlock is presumably the device routine from "../cuda/kernel.cuh" — confirm.
template <unsigned int BLOCK_SIZE, typename Tcompute, typename Tdata, typename Tweight>
INFINIOP_MOORE_KERNEL rmsnormKernel(
Tdata *__restrict__ y,
ptrdiff_t stride_y,
const Tdata *__restrict__ x,
ptrdiff_t stride_x,
const Tweight *__restrict__ w,
size_t dim,
float epsilon) {
rmsnormBlock<BLOCK_SIZE, Tcompute>(y, stride_y, x, stride_x, w, dim, epsilon);
}
namespace op::rms_norm::moore {
struct Descriptor::Opaque {
std::shared_ptr<device::m
usa
::Handle::Internal> internal;
std::shared_ptr<device::m
oore
::Handle::Internal> internal;
};
Descriptor::~Descriptor() {
...
...
@@ -29,7 +47,7 @@ infiniStatus_t Descriptor::create(
}
*desc_ptr = new Descriptor(
new Opaque{reinterpret_cast<device::m
usa
::Handle *>(handle)->internal()},
new Opaque{reinterpret_cast<device::m
oore
::Handle *>(handle)->internal()},
std::move(info),
0,
handle->device, handle->device_id);
...
...
@@ -46,20 +64,24 @@ infiniStatus_t launchKernel(
float epsilon,
musaStream_t musa_stream) {
#define LAUNCH_KERNEL(Tdata, Tweight, Tcompute) \
rmsnorm
Block
<BLOCK_SIZE, Tdata, Tweight
, Tcompute
><<<batch_size, BLOCK_SIZE, 0, musa_stream>>>( \
reinterpret_cast<Tdata *>(y), \
stride_y, \
reinterpret_cast<const Tdata *>(x), \
stride_x, \
reinterpret_cast<const Tweight *>(w), \
dim, \
#define LAUNCH_KERNEL(Tdata, Tweight, Tcompute)
\
rmsnorm
Kernel
<BLOCK_SIZE,
Tcompute,
Tdata, Tweight><<<batch_size, BLOCK_SIZE, 0, musa_stream>>>( \
reinterpret_cast<Tdata *>(y),
\
stride_y,
\
reinterpret_cast<const Tdata *>(x),
\
stride_x,
\
reinterpret_cast<const Tweight *>(w),
\
dim,
\
epsilon)
if (atype == INFINI_DTYPE_F16 && wtype == INFINI_DTYPE_F16) {
LAUNCH_KERNEL(half, half, float);
} else if (atype == INFINI_DTYPE_F16 && wtype == INFINI_DTYPE_F32) {
LAUNCH_KERNEL(half, float, float);
} else if (atype == INFINI_DTYPE_BF16 && wtype == INFINI_DTYPE_BF16) {
LAUNCH_KERNEL(__mt_bfloat16, __mt_bfloat16, float);
} else if (atype == INFINI_DTYPE_BF16 && wtype == INFINI_DTYPE_F32) {
LAUNCH_KERNEL(__mt_bfloat16, float, float);
} else if (atype == INFINI_DTYPE_F32 && wtype == INFINI_DTYPE_F32) {
LAUNCH_KERNEL(float, float, float);
} else {
...
...
@@ -87,11 +109,15 @@ infiniStatus_t Descriptor::calculate(
auto musa_stream = reinterpret_cast<musaStream_t>(stream);
// launch kernel with different block sizes
if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_1024) {
CHECK_STATUS(launchKernel<CUDA_BLOCK_SIZE_1024>(batch_size, dim, y, _info.atype, stride_y, x, stride_x, w, _info.wtype, _info.epsilon, musa_stream));
if (_opaque->internal->maxThreadsPerBlock() == MOORE_BLOCK_SIZE_1024) {
CHECK_STATUS(launchKernel<MOORE_BLOCK_SIZE_1024>(batch_size, dim, y, _info.atype, stride_y, x, stride_x, w, _info.wtype, _info.epsilon, musa_stream));
} else if (_opaque->internal->maxThreadsPerBlock() == MOORE_BLOCK_SIZE_512) {
CHECK_STATUS(launchKernel<MOORE_BLOCK_SIZE_512>(batch_size, dim, y, _info.atype, stride_y, x, stride_x, w, _info.wtype, _info.epsilon, musa_stream));
} else if (_opaque->internal->maxThreadsPerBlock() == MOORE_BLOCK_SIZE_2048) {
CHECK_STATUS(launchKernel<MOORE_BLOCK_SIZE_2048>(batch_size, dim, y, _info.atype, stride_y, x, stride_x, w, _info.wtype, _info.epsilon, musa_stream));
} else {
return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED;
}
return INFINI_STATUS_SUCCESS;
}
} // namespace op::rms_norm::m
usa
} // namespace op::rms_norm::m
oore
src/infiniop/ops/rms_norm/operator.cc
View file @
00c30a4a
...
...
@@ -18,7 +18,7 @@
#include "metax/rms_norm_metax.cuh"
#endif
#ifdef ENABLE_MOORE_API
#include "m
usa
/rms_norm_m
usa.cu
h"
#include "m
oore
/rms_norm_m
oore.
h"
#endif
#ifdef ENABLE_KUNLUN_API
#include "kunlun/rms_norm_kunlun.h"
...
...
@@ -65,7 +65,7 @@ __C infiniStatus_t infiniopCreateRMSNormDescriptor(
CREATE
(
INFINI_DEVICE_METAX
,
metax
);
#endif
#ifdef ENABLE_MOORE_API
CREATE
(
INFINI_DEVICE_MOORE
,
m
usa
);
CREATE
(
INFINI_DEVICE_MOORE
,
m
oore
);
#endif
}
...
...
@@ -104,7 +104,7 @@ __C infiniStatus_t infiniopGetRMSNormWorkspaceSize(infiniopRMSNormDescriptor_t d
GET
(
INFINI_DEVICE_METAX
,
metax
);
#endif
#ifdef ENABLE_MOORE_API
GET
(
INFINI_DEVICE_MOORE
,
m
usa
);
GET
(
INFINI_DEVICE_MOORE
,
m
oore
);
#endif
}
...
...
@@ -144,7 +144,7 @@ __C infiniStatus_t infiniopRMSNorm(infiniopRMSNormDescriptor_t desc, void *works
CALCULATE
(
INFINI_DEVICE_METAX
,
metax
);
#endif
#ifdef ENABLE_MOORE_API
CALCULATE
(
INFINI_DEVICE_MOORE
,
m
usa
);
CALCULATE
(
INFINI_DEVICE_MOORE
,
m
oore
);
#endif
}
...
...
@@ -183,7 +183,7 @@ __C infiniStatus_t infiniopDestroyRMSNormDescriptor(infiniopRMSNormDescriptor_t
DESTROY
(
INFINI_DEVICE_METAX
,
metax
);
#endif
#ifdef ENABLE_MOORE_API
DESTROY
(
INFINI_DEVICE_MOORE
,
m
usa
);
DESTROY
(
INFINI_DEVICE_MOORE
,
m
oore
);
#endif
}
...
...
src/infiniop/ops/swiglu/moore/siwglu_moore_kernel.h
0 → 100644
View file @
00c30a4a
#ifndef __SWIGLU_CUDA_H__
#define __SWIGLU_CUDA_H__
/*
* This file contains the SwiGLU operation implementation for the MUSA backend.
*
* It uses the 'op::swiglu::cuda' namespace to maintain a consistent code structure
* and interface with the CUDA implementation, ensuring code alignment across different
* hardware platforms.
*/
namespace
op
::
swiglu
::
cuda
{
/**
 * Element-wise SwiGLU functor: computes `gate * sigmoid(gate) * up`.
 *
 * One `if constexpr` branch exists per supported element type (half2, half,
 * cuda_bfloat162, cuda_bfloat16, float); any other type (e.g. double) falls
 * through to plain arithmetic operators.
 */
struct SwiGLUOp {
private:
    /**
     * Logistic sigmoid, 1 / (1 + exp(-x)), specialised per element type.
     *
     * @param x input value (scalar, or a packed pair for half2 / cuda_bfloat162)
     * @return sigmoid(x) expressed in the same type T
     */
    template <typename T>
    __device__ __forceinline__ T sigmoid(const T &x) const {
        if constexpr (std::is_same_v<T, half2>) {
            return h2rcp(__hadd2(make_half2(1, 1), h2exp(__hneg2(x))));
        } else if constexpr (std::is_same_v<T, half>) {
            // The CUDA implementation relies on the half-precision intrinsic
            // `hrcp`, which is not supported on the MUSA platform. To stay
            // portable, the input is widened to float, the sigmoid is evaluated
            // with ordinary floating-point arithmetic, and the result is
            // narrowed back to half.
            const float xf = __half2float(x);
            const float sigf = 1.0f / (1.0f + std::exp(-xf));
            return __float2half(sigf);
        } else if constexpr (std::is_same_v<T, cuda_bfloat162>) {
            // Use __low2float / __high2float, the same accessors operator()
            // uses below, instead of __low2bfloat16 / __high2bfloat16, which
            // may not be available on MUSA.
            const float x0 = __low2float(x);
            const float x1 = __high2float(x);
            const float sig0 = __frcp_rn(__fadd_rn(1.0f, __expf(-x0)));
            const float sig1 = __frcp_rn(__fadd_rn(1.0f, __expf(-x1)));
            return __floats2bfloat162_rn(sig0, sig1);
        } else if constexpr (std::is_same_v<T, cuda_bfloat16>) {
            const float xf = __bfloat162float(x);
            return __float2bfloat16_rn(__frcp_rn(__fadd_rn(1.0f, __expf(-xf))));
        } else if constexpr (std::is_same_v<T, float>) {
            return __frcp_rn(__fadd_rn(1.0f, __expf(-x)));
        } else {
            // Generic fallback for any remaining type.
            return 1 / (1 + std::exp(-x));
        }
    }

public:
    /// Number of input tensors the functor consumes: `up` and `gate`.
    static constexpr size_t num_inputs = 2;

    /**
     * Applies SwiGLU to one element (or one packed pair of elements).
     *
     * @param up   value from the "up" input
     * @param gate value from the "gate" input
     * @return gate * sigmoid(gate) * up, in type T
     */
    template <typename T>
    __device__ __forceinline__ T operator()(const T &up, const T &gate) const {
        if constexpr (std::is_same_v<T, half2>) {
            return __hmul2(__hmul2(gate, sigmoid(gate)), up);
        } else if constexpr (std::is_same_v<T, half>) {
            return __hmul(__hmul(gate, sigmoid(gate)), up);
        } else if constexpr (std::is_same_v<T, cuda_bfloat162>) {
            const cuda_bfloat162 sig = sigmoid(gate);

            // On the MUSA platform, `__low2float()` and `__high2float()` extract
            // a bfloat16 lane and convert it to float in one step. They replace
            // the two-step CUDA idiom (`__low2bfloat16` followed by
            // `__bfloat162float`), since MUSA may not support `__low2bfloat16`.
            const float gate0 = __low2float(gate);
            const float gate1 = __high2float(gate);
            const float sig0 = __low2float(sig);
            const float sig1 = __high2float(sig);
            const float up0 = __low2float(up);
            const float up1 = __high2float(up);

            const float res0 = __fmul_rn(__fmul_rn(gate0, sig0), up0);
            const float res1 = __fmul_rn(__fmul_rn(gate1, sig1), up1);
            return __floats2bfloat162_rn(res0, res1);
        } else if constexpr (std::is_same_v<T, cuda_bfloat16>) {
            const cuda_bfloat16 sig = sigmoid(gate);
            const float gatef = __bfloat162float(gate);
            const float sigf = __bfloat162float(sig);
            const float upf = __bfloat162float(up);
            return __float2bfloat16_rn(__fmul_rn(__fmul_rn(gatef, sigf), upf));
        } else if constexpr (std::is_same_v<T, float>) {
            return __fmul_rn(__fmul_rn(gate, sigmoid(gate)), up);
        } else {
            return gate * sigmoid(gate) * up;
        }
    }
};
}
// namespace op::swiglu::cuda
#endif // __SWIGLU_CUDA_H__
src/infiniop/ops/swiglu/moore/swiglu_moore.h
0 → 100644
View file @
00c30a4a
#ifndef __SWIGLU_MOORE_API_H__
#define __SWIGLU_MOORE_API_H__
#include "../../../elementwise/moore/elementwise_moore_api.h"
// Expands the project's ELEMENTWISE_DESCRIPTOR macro for (swiglu, moore).
// NOTE(review): presumably this declares op::swiglu::moore::Descriptor, whose
// members are defined in swiglu_moore.mu -- confirm in elementwise_moore_api.h.
ELEMENTWISE_DESCRIPTOR
(
swiglu
,
moore
)
#endif // __SWIGLU_MOORE_API_H__
src/infiniop/ops/swiglu/moore/swiglu_moore.mu
0 → 100644
View file @
00c30a4a
#include "swiglu_moore.h"
#include "../../../elementwise/moore/elementwise_moore.h"
#include "siwglu_moore_kernel.h"
namespace op::swiglu::moore {
// Defaulted destructor: no cleanup code is written by hand here.
Descriptor::~Descriptor() = default;
// Creates the SwiGLU descriptor for the Moore backend.
//
// Parameters:
//   handle_        - opaque handle, reinterpreted as device::moore::Handle*.
//   desc_ptr       - out-parameter for the new descriptor.
//                    NOTE(review): presumably written by
//                    CREATE_ELEMENTWISE_MOORE_DESCRIPTOR -- confirm in
//                    elementwise_moore_api.h.
//   out_desc       - descriptor of the output tensor; its dtype drives dispatch.
//   input_desc_vec - input descriptors; index 0 is `up`, index 1 is `gate`.
//
// Returns INFINI_STATUS_SUCCESS when the end of the function is reached.
// NOTE(review): CHECK_DTYPE / CHECK_SAME_SHAPE are project macros; presumably
// they return an error status early on a mismatch -- confirm.
infiniStatus_t Descriptor::create(
infiniopHandle_t handle_,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t out_desc,
std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
auto handle = reinterpret_cast<device::moore::Handle *>(handle_);
auto dtype = out_desc->dtype();
// std::vector::at throws std::out_of_range if fewer than two inputs are given.
const auto &up_desc = input_desc_vec.at(0);
const auto &gate_desc = input_desc_vec.at(1);
const auto &out_shape = out_desc->shape();
const auto &up_shape = up_desc->shape();
const auto &gate_shape = gate_desc->shape();
// Check the output dtype against F16, BF16, F32 and F64.
CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_BF16, INFINI_DTYPE_F32, INFINI_DTYPE_F64);
// Check the output, `up` and `gate` shapes against each other.
CHECK_SAME_SHAPE(out_shape, up_shape, gate_shape);
// Create the MOORE elementwise descriptor (the macro body is defined elsewhere).
CREATE_ELEMENTWISE_MOORE_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
return INFINI_STATUS_SUCCESS;
}
// Runs SwiGLU on the Moore backend by forwarding to the elementwise
// implementation instantiated for the descriptor's dtype.
//
// Parameters:
//   workspace      - scratch buffer forwarded to the elementwise launcher.
//   workspace_size - size of `workspace`; must be at least _workspace_size.
//   output         - destination buffer.
//   inputs         - input buffers, forwarded unchanged.
//   stream         - opaque stream handle, forwarded unchanged.
//
// Returns INFINI_STATUS_INSUFFICIENT_WORKSPACE if the workspace is too small,
// INFINI_STATUS_BAD_TENSOR_DTYPE for an unsupported dtype, and otherwise the
// status reported by the elementwise launcher.
infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    void *output,
    std::vector<const void *> inputs,
    void *stream) const {

    // First template argument shared by every dispatch below.
    // NOTE(review): presumably the thread-block size -- confirm in elementwise_moore.h.
    constexpr int kBlockSize = 256;

    const bool workspace_too_small = workspace_size < _workspace_size;
    if (workspace_too_small) {
        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
    }

    switch (_dtype) {
    case INFINI_DTYPE_F16: {
        return _device_info->calculate<kBlockSize, cuda::SwiGLUOp, half>(_info, workspace, output, inputs, stream);
    }
    case INFINI_DTYPE_BF16: {
        return _device_info->calculate<kBlockSize, cuda::SwiGLUOp, cuda_bfloat16>(_info, workspace, output, inputs, stream);
    }
    case INFINI_DTYPE_F32: {
        return _device_info->calculate<kBlockSize, cuda::SwiGLUOp, float>(_info, workspace, output, inputs, stream);
    }
    case INFINI_DTYPE_F64: {
        return _device_info->calculate<kBlockSize, cuda::SwiGLUOp, double>(_info, workspace, output, inputs, stream);
    }
    default: {
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
    }

    // Not reached: every switch path above returns.
    return INFINI_STATUS_SUCCESS;
}
} // namespace op::swiglu::moore
src/infiniop/ops/swiglu/operator.cc
View file @
00c30a4a
...
...
@@ -20,6 +20,9 @@
#ifdef ENABLE_ASCEND_API
#include "ascend/swiglu_ascend.h"
#endif
#ifdef ENABLE_MOORE_API
#include "moore/swiglu_moore.h"
#endif
__C
infiniStatus_t
infiniopCreateSwiGLUDescriptor
(
infiniopHandle_t
handle
,
...
...
@@ -60,10 +63,8 @@ __C infiniStatus_t infiniopCreateSwiGLUDescriptor(
#ifdef ENABLE_ASCEND_API
CREATE
(
INFINI_DEVICE_ASCEND
,
ascend
);
#endif
#ifdef ENABLE_MTHREADS_GPU
case
DevMthreadsGpu
:
return
musaCreateSwiGLUDescriptor
(
handle
,
(
SwiGLUMusaDescriptor_t
*
)
desc_ptr
,
c_desc
,
a_desc
,
b_desc
);
#ifdef ENABLE_MOORE_API
CREATE
(
INFINI_DEVICE_MOORE
,
moore
);
#endif
default:
...
...
@@ -101,6 +102,9 @@ __C infiniStatus_t infiniopGetSwiGLUWorkspaceSize(infiniopSwiGLUDescriptor_t des
#endif
#ifdef ENABLE_ASCEND_API
GET
(
INFINI_DEVICE_ASCEND
,
ascend
);
#endif
#ifdef ENABLE_MOORE_API
GET
(
INFINI_DEVICE_MOORE
,
moore
);
#endif
}
...
...
@@ -146,9 +150,8 @@ __C infiniStatus_t infiniopSwiGLU(
#ifdef ENABLE_ASCEND_API
CALCULATE
(
INFINI_DEVICE_ASCEND
,
ascend
);
#endif
#ifdef ENABLE_MTHREADS_GPU
case
DevMthreadsGpu
:
return
musaSwiGLU
((
SwiGLUMusaDescriptor_t
)
desc
,
c
,
a
,
b
,
stream
);
#ifdef ENABLE_MOORE_API
CALCULATE
(
INFINI_DEVICE_MOORE
,
moore
);
#endif
default:
...
...
@@ -189,9 +192,8 @@ infiniopDestroySwiGLUDescriptor(infiniopSwiGLUDescriptor_t desc) {
#ifdef ENABLE_ASCEND_API
DELETE
(
INFINI_DEVICE_ASCEND
,
ascend
)
#endif
#ifdef ENABLE_MTHREADS_GPU
case
DevMthreadsGpu
:
return
musaDestroySwiGLUDescriptor
((
SwiGLUMusaDescriptor_t
)
desc
);
#ifdef ENABLE_MOORE_API
DELETE
(
INFINI_DEVICE_MOORE
,
moore
);
#endif
default:
...
...
src/infinirt/infinirt.cc
View file @
00c30a4a
...
...
@@ -6,7 +6,7 @@
#include "cuda/infinirt_cuda.cuh"
#include "kunlun/infinirt_kunlun.h"
#include "metax/infinirt_metax.h"
#include "m
usa
/infinirt_m
usa
.h"
#include "m
oore
/infinirt_m
oore
.h"
thread_local
infiniDevice_t
CURRENT_DEVICE_TYPE
=
INFINI_DEVICE_CPU
;
thread_local
int
CURRENT_DEVICE_ID
=
0
;
...
...
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment