Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
jerrrrry
infinicore
Commits
9c87dbb1
Unverified
Commit
9c87dbb1
authored
Aug 27, 2025
by
spike-zhu
Committed by
GitHub
Aug 27, 2025
Browse files
issue/251: 摩尔平台 rope 算子开发
parent
f2bb97ad
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
237 additions
and
23 deletions
+237
-23
src/infiniop/ops/rope/moore/rope_kernel_moore.h
src/infiniop/ops/rope/moore/rope_kernel_moore.h
+70
-0
src/infiniop/ops/rope/moore/rope_moore.h
src/infiniop/ops/rope/moore/rope_moore.h
+8
-0
src/infiniop/ops/rope/moore/rope_moore.mu
src/infiniop/ops/rope/moore/rope_moore.mu
+144
-0
src/infiniop/ops/rope/operator.cc
src/infiniop/ops/rope/operator.cc
+15
-23
No files found.
src/infiniop/ops/rope/moore/rope_kernel_moore.h
0 → 100644
View file @
9c87dbb1
#ifndef __INFINIOP_ROPE_MOORE_KERNEL_H__
#define __INFINIOP_ROPE_MOORE_KERNEL_H__
/*
* This file contains the RoPE operation implementation for the MUSA backend.
*
* It maintain a consistent code structure and interface with the CUDA implementation,
* which ensuring code alignment across different hardware platforms.
*/
template
<
typename
Tdata
,
typename
Tindex
,
typename
Tangle
>
__device__
void
ropeThreadPerItemBlock
(
Tdata
*
y_
,
const
Tdata
*
x_
,
const
Tindex
*
__restrict__
pos_ids
,
const
Tangle
*
__restrict__
sin_table
,
const
Tangle
*
__restrict__
cos_table
,
size_t
table_dim
,
ptrdiff_t
y_stride_seqlen
,
ptrdiff_t
y_stride_nhead
,
ptrdiff_t
x_stride_seqlen
,
ptrdiff_t
x_stride_nhead
)
{
auto
y_offset
=
blockIdx
.
x
*
y_stride_seqlen
+
blockIdx
.
y
*
y_stride_nhead
;
auto
x_offset
=
blockIdx
.
x
*
x_stride_seqlen
+
blockIdx
.
y
*
x_stride_nhead
;
size_t
pos_id
=
size_t
(
pos_ids
[
blockIdx
.
x
]);
auto
table_offset
=
pos_id
*
table_dim
;
for
(
size_t
i
=
threadIdx
.
x
;
i
<
table_dim
;
i
+=
blockDim
.
x
)
{
Tangle
sin__
=
sin_table
[
table_offset
+
i
],
cos__
=
cos_table
[
table_offset
+
i
];
if
constexpr
(
std
::
is_same
<
Tdata
,
half
>::
value
)
{
auto
&
y
=
reinterpret_cast
<
half2
&>
(
y_
[
y_offset
+
2
*
i
]);
auto
&
x
=
reinterpret_cast
<
const
half2
&>
(
x_
[
x_offset
+
2
*
i
]);
Tangle
y0
=
x
.
x
*
cos__
-
x
.
y
*
sin__
,
y1
=
x
.
x
*
sin__
+
x
.
y
*
cos__
;
y
=
half2
(
y0
,
y1
);
}
else
if
constexpr
(
std
::
is_same
<
Tdata
,
cuda_bfloat16
>::
value
)
{
auto
&
y
=
reinterpret_cast
<
cuda_bfloat162
&>
(
y_
[
y_offset
+
2
*
i
]);
auto
&
x
=
reinterpret_cast
<
const
cuda_bfloat162
&>
(
x_
[
x_offset
+
2
*
i
]);
/*
* The original code used CUDA-specific functions (__low2bfloat16, __high2bfloat16)
* to extract bfloat16 values from a packed variable.
*
* This code has been modified for the MUSA platform, which does not support
* these CUDA built-in functions. Instead, MUSA provides a different set of
* built-in functions (`__low2float`, `__high2float`) that directly convert
* the bfloat16 values to float.
*
* This change ensures cross-platform compatibility and resolves compilation errors.
*/
Tangle
x0
=
__low2float
(
x
);
Tangle
x1
=
__high2float
(
x
);
Tangle
y0
=
x0
*
cos__
-
x1
*
sin__
;
Tangle
y1
=
x0
*
sin__
+
x1
*
cos__
;
y
=
__floats2bfloat162_rn
(
y0
,
y1
);
}
else
{
Tangle
x0
=
x_
[
x_offset
+
2
*
i
],
x1
=
x_
[
x_offset
+
2
*
i
+
1
];
y_
[
y_offset
+
2
*
i
]
=
Tdata
(
x0
*
cos__
-
x1
*
sin__
);
y_
[
y_offset
+
2
*
i
+
1
]
=
Tdata
(
x0
*
sin__
+
x1
*
cos__
);
}
}
}
#endif
src/infiniop/ops/rope/moore/rope_moore.h
0 → 100644
View file @
9c87dbb1
#ifndef __INFINIOP_ROPE_MOORE_H__
#define __INFINIOP_ROPE_MOORE_H__
#include "../rope.h"
DESCRIPTOR
(
moore
)
#endif // __INFINIOP_ROPE_MOORE_H__
src/infiniop/ops/rope/moore/rope_moore.mu
0 → 100644
View file @
9c87dbb1
#include "../../../devices/moore/moore_common.h"
#include "rope_moore.h"
#include "../../../devices/moore/moore_kernel_common.h"
#include "rope_kernel_moore.h"
template <typename Tdata, typename Tindex, typename Tangle>
INFINIOP_MOORE_KERNEL ropeThreadPerItemKernel(
Tdata *y_,
const Tdata *x_,
const Tindex *__restrict__ pos_ids,
const Tangle *__restrict__ sin_table,
const Tangle *__restrict__ cos_table,
size_t table_dim,
ptrdiff_t y_stride_seqlen,
ptrdiff_t y_stride_nhead,
ptrdiff_t x_stride_seqlen,
ptrdiff_t x_stride_nhead) {
ropeThreadPerItemBlock(
y_, x_, pos_ids,
sin_table, cos_table,
table_dim,
y_stride_seqlen, y_stride_nhead,
x_stride_seqlen, x_stride_nhead);
}
namespace op::rope::moore {
struct Descriptor::Opaque {
std::shared_ptr<device::moore::Handle::Internal> internal;
};
Descriptor::~Descriptor() {
delete _opaque;
}
infiniStatus_t Descriptor::create(
infiniopHandle_t handle_,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t y_desc,
infiniopTensorDescriptor_t x_desc,
infiniopTensorDescriptor_t pos_desc,
infiniopTensorDescriptor_t sin_desc,
infiniopTensorDescriptor_t cos_desc) {
auto handle = reinterpret_cast<device::moore::Handle *>(handle_);
auto info = RoPEInfo::createRoPEInfo(y_desc, x_desc, pos_desc, sin_desc, cos_desc);
CHECK_RESULT(info);
// Create descriptor
*desc_ptr = new Descriptor(
info.take(),
0,
new Opaque{reinterpret_cast<device::moore::Handle *>(handle)->internal()},
handle->device,
handle->device_id);
return INFINI_STATUS_SUCCESS;
}
template <typename Tdata, typename Tindex>
infiniStatus_t calculateRoPE(const RoPEInfo &info,
int block_size,
Tdata *y,
const Tdata *x,
const Tindex *pos_ids,
const Tdata *sin_table,
const Tdata *cos_table,
musaStream_t stream) {
auto dimx = uint32_t(info.seqlen),
dimy = uint32_t(info.nhead);
int nthreads = std::max(int(info.table_dim), block_size);
ropeThreadPerItemKernel<<<dim3(dimx, dimy), nthreads, 0, stream>>>(
y, x, pos_ids, sin_table, cos_table, info.table_dim,
info.y_stride_seqlen, info.y_stride_nhead, info.x_stride_seqlen, info.x_stride_nhead);
return INFINI_STATUS_SUCCESS;
}
#define CALCULATE_ROPE(TDATA, TINDEX) \
calculateRoPE(_info, \
_opaque->internal->maxThreadsPerBlock(), \
(TDATA *)y, \
(const TDATA *)x, \
(const TINDEX *)pos_ids, \
(const TDATA *)sin_table, \
(const TDATA *)cos_table, \
(musaStream_t)stream)
#define ROPE_TYPE(TDATA) \
switch (_info.pos_type) { \
case INFINI_DTYPE_U8: \
return CALCULATE_ROPE(TDATA, uint8_t); \
case INFINI_DTYPE_U16: \
return CALCULATE_ROPE(TDATA, uint16_t); \
case INFINI_DTYPE_U32: \
return CALCULATE_ROPE(TDATA, uint32_t); \
case INFINI_DTYPE_U64: \
return CALCULATE_ROPE(TDATA, uint64_t); \
case INFINI_DTYPE_I8: \
return CALCULATE_ROPE(TDATA, int8_t); \
case INFINI_DTYPE_I16: \
return CALCULATE_ROPE(TDATA, int16_t); \
case INFINI_DTYPE_I32: \
return CALCULATE_ROPE(TDATA, int32_t); \
case INFINI_DTYPE_I64: \
return CALCULATE_ROPE(TDATA, int64_t); \
default: \
return INFINI_STATUS_BAD_TENSOR_DTYPE; \
}
infiniStatus_t Descriptor::calculate(
void *workspace,
size_t workspace_size,
void *y,
const void *x,
const void *pos_ids,
const void *sin_table,
const void *cos_table,
void *stream) const {
switch (_info.data_type) {
case INFINI_DTYPE_F16:
ROPE_TYPE(half);
case INFINI_DTYPE_BF16:
ROPE_TYPE(cuda_bfloat16);
case INFINI_DTYPE_F32:
ROPE_TYPE(float);
case INFINI_DTYPE_F64:
ROPE_TYPE(double);
default:
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
return INFINI_STATUS_SUCCESS;
}
#undef ROPE_TYPE
#undef CALCULATE_ROPE
} // namespace op::rope::moore
src/infiniop/ops/rope/operator.cc
View file @
9c87dbb1
...
...
@@ -20,6 +20,9 @@
#ifdef ENABLE_KUNLUN_API
#include "kunlun/rope_kunlun.h"
#endif
#ifdef ENABLE_MOORE_API
#include "moore/rope_moore.h"
#endif
__C
infiniStatus_t
infiniopCreateRoPEDescriptor
(
infiniopHandle_t
handle
,
...
...
@@ -51,6 +54,9 @@ __C infiniStatus_t infiniopCreateRoPEDescriptor(
#ifdef ENABLE_ILUVATAR_API
CREATE
(
INFINI_DEVICE_ILUVATAR
,
nvidia
);
#endif
#ifdef ENABLE_MOORE_API
CREATE
(
INFINI_DEVICE_MOORE
,
moore
);
#endif
#ifdef ENABLE_METAX_API
CREATE
(
INFINI_DEVICE_METAX
,
metax
);
#endif
...
...
@@ -62,13 +68,6 @@ __C infiniStatus_t infiniopCreateRoPEDescriptor(
#endif
#ifdef ENABLE_CAMBRICON_API
CREATE
(
INFINI_DEVICE_CAMBRICON
,
bang
);
#endif
#ifdef ENABLE_MTHREADS_GPU
case
DevMthreadsGpu
:
{
return
musaCreateRoPEDescriptor
((
MusaHandle_t
)
handle
,
(
RoPEMusaDescriptor_t
*
)
desc_ptr
,
t
,
pos_ids
,
sin_table
,
cos_table
);
}
#endif
}
...
...
@@ -94,6 +93,9 @@ __C infiniStatus_t infiniopGetRoPEWorkspaceSize(infiniopRoPEDescriptor_t desc,
#ifdef ENABLE_ILUVATAR_API
GET
(
INFINI_DEVICE_ILUVATAR
,
nvidia
);
#endif
#ifdef ENABLE_MOORE_API
GET
(
INFINI_DEVICE_MOORE
,
moore
);
#endif
#ifdef ENABLE_METAX_API
GET
(
INFINI_DEVICE_METAX
,
metax
);
#endif
...
...
@@ -105,11 +107,6 @@ __C infiniStatus_t infiniopGetRoPEWorkspaceSize(infiniopRoPEDescriptor_t desc,
#endif
#ifdef ENABLE_ASCEND_API
GET
(
INFINI_DEVICE_ASCEND
,
ascend
);
#endif
#ifdef ENABLE_MTHREADS_GPU
case
DevMthreadsGpu
:
{
return
musaGetRoPEWorkspaceSize
((
RoPEMusaDescriptor_t
)
desc
,
size
);
}
#endif
}
...
...
@@ -144,6 +141,9 @@ __C infiniStatus_t infiniopRoPE(
#ifdef ENABLE_ILUVATAR_API
CALCULATE
(
INFINI_DEVICE_ILUVATAR
,
nvidia
);
#endif
#ifdef ENABLE_MOORE_API
CALCULATE
(
INFINI_DEVICE_MOORE
,
moore
);
#endif
#ifdef ENABLE_METAX_API
CALCULATE
(
INFINI_DEVICE_METAX
,
metax
);
#endif
...
...
@@ -155,12 +155,6 @@ __C infiniStatus_t infiniopRoPE(
#endif
#ifdef ENABLE_ASCEND_API
CALCULATE
(
INFINI_DEVICE_ASCEND
,
ascend
);
#endif
#ifdef ENABLE_MTHREADS_GPU
case
DevMthreadsGpu
:
{
return
musaRoPE
((
RoPEMusaDescriptor_t
)
desc
,
workspace
,
workspace_size
,
t
,
pos_ids
,
sin_table
,
cos_table
,
stream
);
}
#endif
}
...
...
@@ -187,6 +181,9 @@ infiniopDestroyRoPEDescriptor(infiniopRoPEDescriptor_t desc) {
#ifdef ENABLE_ILUVATAR_API
DELETE
(
INFINI_DEVICE_ILUVATAR
,
nvidia
);
#endif
#ifdef ENABLE_MOORE_API
DELETE
(
INFINI_DEVICE_MOORE
,
moore
);
#endif
#ifdef ENABLE_METAX_API
DELETE
(
INFINI_DEVICE_METAX
,
metax
);
#endif
...
...
@@ -198,11 +195,6 @@ infiniopDestroyRoPEDescriptor(infiniopRoPEDescriptor_t desc) {
#endif
#ifdef ENABLE_ASCEND_API
DELETE
(
INFINI_DEVICE_ASCEND
,
ascend
);
#endif
#ifdef ENABLE_MTHREADS_GPU
case
DevMthreadsGpu
:
{
return
musaDestroyRoPEDescriptor
((
RoPEMusaDescriptor_t
)
desc
);
}
#endif
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment