jerrrrry / infinicore / Commits

Commit f9d16628 (unverified)
Authored Sep 16, 2025 by PanZezhong1725, committed by GitHub on Sep 16, 2025

Merge pull request #429 from InfiniTensor/issue/428_merge_rope_and_rope_v2

Issue/428: Merge `rope_v2` into `rope`

Parents: 15ac0191, 9f0ae734

Changes: 34
Showing 20 changed files with 228 additions and 621 deletions (+228 / -621)
include/infiniop.h                                        +0    -1
include/infiniop/ops/rope.h                               +9    -1
include/infiniop/ops/rope_v2.h                            +0   -32
src/infiniop-test/src/ops/rope.cpp                        +11   -3
src/infiniop/ops/rope/ascend/rope_ascend.cc               +7    -2
src/infiniop/ops/rope/bang/rope_bang.mlu                  +7    -2
src/infiniop/ops/rope/cpu/rope_cpu.cc                     +5    -4
src/infiniop/ops/rope/cuda/kernel.cuh                     +54  -22
src/infiniop/ops/rope/kunlun/rope_kunlun.xpu              +7    -2
src/infiniop/ops/rope/metax/rope_metax.maca               +16   -8
src/infiniop/ops/rope/moore/rope_kernel_moore.h           +66  -34
src/infiniop/ops/rope/moore/rope_moore.mu                 +16   -8
src/infiniop/ops/rope/nvidia/rope_nvidia.cu               +16   -8
src/infiniop/ops/rope/operator.cc                         +4    -2
src/infiniop/ops/rope/rope.h                              +10   -4
src/infiniop/ops/rope_v2/ascend/rope_ascend.cc            +0   -50
src/infiniop/ops/rope_v2/ascend/rope_ascend.h             +0   -25
src/infiniop/ops/rope_v2/ascend/rope_ascend_kernel.cpp    +0  -280
src/infiniop/ops/rope_v2/bang/rope_bang.h                 +0    -8
src/infiniop/ops/rope_v2/bang/rope_bang.mlu               +0  -125
include/infiniop.h

@@ -15,7 +15,6 @@
 #include "infiniop/ops/relu.h"
 #include "infiniop/ops/rms_norm.h"
 #include "infiniop/ops/rope.h"
-#include "infiniop/ops/rope_v2.h"
 #include "infiniop/ops/softplus.h"
 #include "infiniop/ops/sub.h"
 #include "infiniop/ops/swiglu.h"
include/infiniop/ops/rope.h

@@ -3,6 +3,13 @@
 #include "../operator_descriptor.h"

+typedef enum {
+    INFINIOP_ROPE_ALGO_GPT_J = 0,    // GPT-J style RoPE algorithm (Interleave even and odd dimensions)
+    INFINIOP_ROPE_ALGO_GPT_NEOX = 1, // GPT-NeoX style RoPE algorithm (First half dimensions for sin, second half for cos)
+    // Count
+    INFINIOP_ROPE_ALGO_COUNT = 2,
+} infiniopRoPEAlgo_t;
+
 typedef struct InfiniopDescriptor *infiniopRoPEDescriptor_t;

 __C __export infiniStatus_t infiniopCreateRoPEDescriptor(

@@ -12,7 +19,8 @@ __C __export infiniStatus_t infiniopCreateRoPEDescriptor(
     infiniopTensorDescriptor_t x,
     infiniopTensorDescriptor_t pos_ids,
     infiniopTensorDescriptor_t sin_table,
-    infiniopTensorDescriptor_t cos_table);
+    infiniopTensorDescriptor_t cos_table,
+    infiniopRoPEAlgo_t algo);

 __C __export infiniStatus_t infiniopGetRoPEWorkspaceSize(
     infiniopRoPEDescriptor_t desc,
     size_t *size);
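For orientation, a minimal usage sketch of the extended create call (a sketch only: the handle and tensor descriptors are assumed to be set up elsewhere, and error handling is elided):

    infiniopRoPEDescriptor_t rope_desc;
    infiniStatus_t status = infiniopCreateRoPEDescriptor(
        handle, &rope_desc,
        y, x, pos_ids, sin_table, cos_table,
        INFINIOP_ROPE_ALGO_GPT_NEOX); // or INFINIOP_ROPE_ALGO_GPT_J
    // Note: per the backend diffs below, the Ascend, Bang and Kunlun backends
    // currently return INFINI_STATUS_NOT_IMPLEMENTED for any algo other than
    // INFINIOP_ROPE_ALGO_GPT_J.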
include/infiniop/ops/rope_v2.h  (deleted, 100644 → 0)

#ifndef __INFINIOP_ROPE_V2_API_H__
#define __INFINIOP_ROPE_V2_API_H__

#include "../operator_descriptor.h"

typedef struct InfiniopDescriptor *infiniopRoPEv2Descriptor_t;

__C __export infiniStatus_t infiniopCreateRoPEv2Descriptor(
    infiniopHandle_t handle,
    infiniopRoPEv2Descriptor_t *desc_ptr,
    infiniopTensorDescriptor_t y,
    infiniopTensorDescriptor_t x,
    infiniopTensorDescriptor_t pos_ids,
    infiniopTensorDescriptor_t sin_table,
    infiniopTensorDescriptor_t cos_table);

__C __export infiniStatus_t infiniopGetRoPEv2WorkspaceSize(
    infiniopRoPEv2Descriptor_t desc,
    size_t *size);

__C __export infiniStatus_t infiniopRoPEv2(
    infiniopRoPEv2Descriptor_t desc,
    void *workspace,
    size_t workspace_size,
    void *y,
    const void *x,
    void const *pos_ids,
    void const *sin_table,
    void const *cos_table,
    void *stream);

__C __export infiniStatus_t infiniopDestroyRoPEv2Descriptor(
    infiniopRoPEv2Descriptor_t desc);

#endif
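A rough migration sketch for callers of the removed v2 entry points (an assumption-labelled sketch: only infiniopCreateRoPEDescriptor and infiniopGetRoPEWorkspaceSize appear in this diff, so the execute/destroy names below are assumed to mirror the v2 ones, and the algo value that reproduces the old rope_v2 behaviour should be confirmed against the updated tests):

    /* before (rope_v2, removed by this commit) */
    infiniopCreateRoPEv2Descriptor(handle, &desc, y, x, pos_ids, sin_table, cos_table);
    infiniopGetRoPEv2WorkspaceSize(desc, &workspace_size);
    infiniopRoPEv2(desc, workspace, workspace_size, y_data, x_data, pos_data, sin_data, cos_data, stream);
    infiniopDestroyRoPEv2Descriptor(desc);

    /* after (merged rope, algorithm selected explicitly) */
    infiniopCreateRoPEDescriptor(handle, &desc, y, x, pos_ids, sin_table, cos_table, algo);
    infiniopGetRoPEWorkspaceSize(desc, &workspace_size);
    infiniopRoPE(desc, workspace, workspace_size, y_data, x_data, pos_data, sin_data, cos_data, stream);
    infiniopDestroyRoPEDescriptor(desc);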
src/infiniop-test/src/ops/rope.cpp

+#include "infiniop/ops/rope.h"
 #include "ops.hpp"
 #include "utils.hpp"
 #include <infinirt.h>

@@ -6,6 +7,8 @@
 namespace infiniop_test::rope {
 struct Test::Attributes {
+    infiniopRoPEAlgo_t algo;
     std::shared_ptr<Tensor> y;
     std::shared_ptr<Tensor> x;
     std::shared_ptr<Tensor> pos_ids;

@@ -21,7 +24,7 @@ std::shared_ptr<Test> Test::build(
     auto test = std::shared_ptr<Test>(new Test(rtol, atol));
     test->_attributes = new Attributes();
-    if (tensors.find("y") == tensors.end()
+    if (!check_names(attributes, Test::attribute_names()) || tensors.find("y") == tensors.end()
         || tensors.find("x") == tensors.end()
         || tensors.find("pos_ids") == tensors.end()
         || tensors.find("sin_table") == tensors.end()

@@ -30,6 +33,8 @@ std::shared_ptr<Test> Test::build(
         throw std::runtime_error("Invalid Test");
     }

+    test->_attributes->algo = *reinterpret_cast<infiniopRoPEAlgo_t *>(attributes["algo"].data());
+
     test->_attributes->y = tensors["y"];
     test->_attributes->x = tensors["x"];
     test->_attributes->pos_ids = tensors["pos_ids"];

@@ -43,6 +48,7 @@ std::shared_ptr<Test> Test::build(
 std::shared_ptr<infiniop_test::Result> Test::run(
     infiniopHandle_t handle, infiniDevice_t device, int device_id,
     size_t warm_ups, size_t iterations) {
     infiniopRoPEDescriptor_t op_desc;
+    infiniopRoPEAlgo_t algo = _attributes->algo;
     auto y = _attributes->y->to(device, device_id);
     auto x = _attributes->x->to(device, device_id);
     auto pos_ids = _attributes->pos_ids->to(device, device_id);

@@ -54,7 +60,8 @@ std::shared_ptr<infiniop_test::Result> Test::run(
             x->desc(), pos_ids->desc(),
-            sin_table->desc(), cos_table->desc()),
+            sin_table->desc(), cos_table->desc(),
+            algo),
         return TEST_FAILED(OP_CREATION_FAILED, "Failed to create op descriptor."));

     size_t workspace_size;

@@ -101,7 +108,7 @@ std::shared_ptr<infiniop_test::Result> Test::run(
 }

 std::vector<std::string> Test::attribute_names() {
-    return {};
+    return {"algo"};
 }

 std::vector<std::string> Test::tensor_names() {

@@ -120,6 +127,7 @@ std::string Test::toString() const {
     oss << "- pos_ids: " << _attributes->pos_ids->info() << std::endl;
     oss << "- sin_table: " << _attributes->sin_table->info() << std::endl;
     oss << "- cos_table: " << _attributes->cos_table->info() << std::endl;
+    oss << "- algo: " << _attributes->algo << std::endl;
     oss << std::scientific << std::setprecision(2);
     oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl;
     return oss.str();
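The new "algo" attribute is consumed as raw bytes and reinterpreted as an infiniopRoPEAlgo_t. As a hedged illustration only (the project's actual test-case generator is not part of this diff), attribute bytes compatible with that cast could be produced like this:

    #include <cstdint>
    #include <cstring>
    #include <vector>

    // Hypothetical helper: serialize the enum in native byte order, matching the
    // reinterpret_cast<infiniopRoPEAlgo_t *>(attributes["algo"].data()) read above.
    std::vector<uint8_t> encodeAlgoAttribute(infiniopRoPEAlgo_t algo) {
        std::vector<uint8_t> bytes(sizeof(infiniopRoPEAlgo_t));
        std::memcpy(bytes.data(), &algo, sizeof(algo));
        return bytes;
    }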
src/infiniop/ops/rope/ascend/rope_ascend.cc

@@ -13,11 +13,16 @@ infiniStatus_t Descriptor::create(
     infiniopTensorDescriptor_t x_desc,
     infiniopTensorDescriptor_t pos_desc,
     infiniopTensorDescriptor_t sin_desc,
-    infiniopTensorDescriptor_t cos_desc) {
+    infiniopTensorDescriptor_t cos_desc,
+    infiniopRoPEAlgo_t algo) {
     auto handle_ascned = reinterpret_cast<device::ascend::Handle *>(handle);
-    auto result = RoPEInfo::createRoPEInfo(y_desc, x_desc, pos_desc, sin_desc, cos_desc);
+    auto result = RoPEInfo::createRoPEInfo(y_desc, x_desc, pos_desc, sin_desc, cos_desc, algo);
     CHECK_RESULT(result);

+    if (algo != INFINIOP_ROPE_ALGO_GPT_J) {
+        return INFINI_STATUS_NOT_IMPLEMENTED;
+    }
+
     size_t workspace_size = 0;
     *desc_ptr = new Descriptor(
         std::move(result.take()),
         workspace_size,
         nullptr,
         handle_ascned->device, handle_ascned->device_id);

     return INFINI_STATUS_SUCCESS;
src/infiniop/ops/rope/bang/rope_bang.mlu

@@ -13,13 +13,18 @@ infiniStatus_t Descriptor::create(
     infiniopTensorDescriptor_t x_desc,
     infiniopTensorDescriptor_t pos_desc,
     infiniopTensorDescriptor_t sin_desc,
-    infiniopTensorDescriptor_t cos_desc) {
+    infiniopTensorDescriptor_t cos_desc,
+    infiniopRoPEAlgo_t algo) {
     auto handle = reinterpret_cast<device::bang::Handle *>(handle_);
-    auto info = RoPEInfo::createRoPEInfo(y_desc, x_desc, pos_desc, sin_desc, cos_desc);
+    auto info = RoPEInfo::createRoPEInfo(y_desc, x_desc, pos_desc, sin_desc, cos_desc, algo);
     CHECK_RESULT(info);

+    if (algo != INFINIOP_ROPE_ALGO_GPT_J) {
+        return INFINI_STATUS_NOT_IMPLEMENTED;
+    }
+
     // Create descriptor
     *desc_ptr = new Descriptor(
         info.take(),
src/infiniop/ops/rope/cpu/rope_cpu.cc

@@ -12,11 +12,12 @@ infiniStatus_t Descriptor::create(
     infiniopTensorDescriptor_t x_desc,
     infiniopTensorDescriptor_t pos_desc,
     infiniopTensorDescriptor_t sin_desc,
-    infiniopTensorDescriptor_t cos_desc) {
+    infiniopTensorDescriptor_t cos_desc,
+    infiniopRoPEAlgo_t algo) {
     auto handle = reinterpret_cast<device::cpu::Handle *>(handle_);
-    auto info = RoPEInfo::createRoPEInfo(y_desc, x_desc, pos_desc, sin_desc, cos_desc);
+    auto info = RoPEInfo::createRoPEInfo(y_desc, x_desc, pos_desc, sin_desc, cos_desc, algo);
     CHECK_RESULT(info);

     // Create descriptor

@@ -46,8 +47,8 @@ infiniStatus_t calculateRoPE(const RoPEInfo &info,
         size_t table_offset = pos_id * info.table_dim;

         for (size_t i = 0; i < info.table_dim; i++) {
-            size_t pos0 = 2 * i;
-            size_t pos1 = 2 * i + 1;
+            size_t pos0 = info.algo == infiniopRoPEAlgo_t::INFINIOP_ROPE_ALGO_GPT_J ? 2 * i : i;
+            size_t pos1 = info.algo == infiniopRoPEAlgo_t::INFINIOP_ROPE_ALGO_GPT_J ? 2 * i + 1 : i + info.table_dim;

             if constexpr (std::is_same<Tdata, fp16_t>::value || std::is_same<Tdata, bf16_t>::value) {
                 float x0 = utils::cast<float>(x[x_offset + pos0]),
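The CPU path makes the difference between the two algorithms explicit: GPT-J rotates interleaved pairs (2i, 2i+1), while GPT-NeoX rotates split-half pairs (i, i + table_dim). A minimal, self-contained host-side sketch (not project code) that prints the pairing for a small head dimension:

    #include <cstddef>
    #include <cstdio>

    int main() {
        const size_t table_dim = 4; // i.e. a head dimension of 8
        for (size_t i = 0; i < table_dim; ++i) {
            size_t gptj_pos0 = 2 * i, gptj_pos1 = 2 * i + 1;   // interleaved pairs
            size_t neox_pos0 = i, neox_pos1 = i + table_dim;   // first-half / second-half pairs
            std::printf("i=%zu  GPT-J: (%zu, %zu)  GPT-NeoX: (%zu, %zu)\n",
                        i, gptj_pos0, gptj_pos1, neox_pos0, neox_pos1);
        }
        return 0;
    }

Both pairings are then rotated the same way, as in the surrounding kernel code: y[pos0] = x[pos0]*cos(theta_i) - x[pos1]*sin(theta_i) and y[pos1] = x[pos0]*sin(theta_i) + x[pos1]*cos(theta_i).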
src/infiniop/ops/rope/cuda/kernel.cuh

 #ifndef __INFINIOP_ROPE_CUDA_KERNEL_CUH__
 #define __INFINIOP_ROPE_CUDA_KERNEL_CUH__

-template <typename Tdata, typename Tindex, typename Tangle>
+template <bool IsGPTJ, typename Tdata, typename Tindex, typename Tangle>
 __device__ void ropeThreadPerItemBlock(
     Tdata *y_,
     const Tdata *x_,

@@ -22,28 +22,60 @@ __device__ void ropeThreadPerItemBlock(
     for (size_t i = threadIdx.x; i < table_dim; i += blockDim.x) {
         Tangle sin__ = sin_table[table_offset + i],
                cos__ = cos_table[table_offset + i];

-        if constexpr (std::is_same<Tdata, half>::value) {
-            auto &y = reinterpret_cast<half2 &>(y_[y_offset + 2 * i]);
-            auto &x = reinterpret_cast<const half2 &>(x_[x_offset + 2 * i]);
-            Tangle y0 = x.x * cos__ - x.y * sin__,
-                   y1 = x.x * sin__ + x.y * cos__;
-            y = half2(y0, y1);
-        } else if constexpr (std::is_same<Tdata, cuda_bfloat16>::value) {
-            auto &y = reinterpret_cast<cuda_bfloat162 &>(y_[y_offset + 2 * i]);
-            auto &x = reinterpret_cast<const cuda_bfloat162 &>(x_[x_offset + 2 * i]);
-            Tangle x0 = __low2bfloat16(x);
-            Tangle x1 = __high2bfloat16(x);
-            Tangle y0 = x0 * cos__ - x1 * sin__;
-            Tangle y1 = x0 * sin__ + x1 * cos__;
-            y = __floats2bfloat162_rn(y0, y1);
-        } else {
-            Tangle x0 = x_[x_offset + 2 * i],
-                   x1 = x_[x_offset + 2 * i + 1];
-            y_[y_offset + 2 * i] = Tdata(x0 * cos__ - x1 * sin__);
-            y_[y_offset + 2 * i + 1] = Tdata(x0 * sin__ + x1 * cos__);
+        if constexpr (IsGPTJ) {
+            if constexpr (std::is_same<Tdata, half>::value) {
+                auto &y = reinterpret_cast<half2 &>(y_[y_offset + 2 * i]);
+                auto &x = reinterpret_cast<const half2 &>(x_[x_offset + 2 * i]);
+                Tangle y0 = x.x * cos__ - x.y * sin__,
+                       y1 = x.x * sin__ + x.y * cos__;
+                y = half2(y0, y1);
+            } else if constexpr (std::is_same<Tdata, cuda_bfloat16>::value) {
+                auto &y = reinterpret_cast<cuda_bfloat162 &>(y_[y_offset + 2 * i]);
+                auto &x = reinterpret_cast<const cuda_bfloat162 &>(x_[x_offset + 2 * i]);
+                Tangle x0 = __low2bfloat16(x);
+                Tangle x1 = __high2bfloat16(x);
+                Tangle y0 = x0 * cos__ - x1 * sin__;
+                Tangle y1 = x0 * sin__ + x1 * cos__;
+                y = __floats2bfloat162_rn(y0, y1);
+            } else {
+                Tangle x0 = x_[x_offset + 2 * i],
+                       x1 = x_[x_offset + 2 * i + 1];
+                y_[y_offset + 2 * i] = Tdata(x0 * cos__ - x1 * sin__);
+                y_[y_offset + 2 * i + 1] = Tdata(x0 * sin__ + x1 * cos__);
+            }
+        } else {
+            size_t pos0 = i;
+            size_t pos1 = i + table_dim;
+            if constexpr (std::is_same<Tdata, half>::value) {
+                Tangle x0 = __half2float(x_[x_offset + pos0]);
+                Tangle x1 = __half2float(x_[x_offset + pos1]);
+                Tangle y0 = x0 * cos__ - x1 * sin__;
+                Tangle y1 = x0 * sin__ + x1 * cos__;
+                y_[y_offset + pos0] = __float2half(y0);
+                y_[y_offset + pos1] = __float2half(y1);
+            } else if constexpr (std::is_same<Tdata, cuda_bfloat16>::value) {
+                Tangle x0 = __bfloat162float(x_[x_offset + pos0]);
+                Tangle x1 = __bfloat162float(x_[x_offset + pos1]);
+                Tangle y0 = x0 * cos__ - x1 * sin__;
+                Tangle y1 = x0 * sin__ + x1 * cos__;
+                y_[y_offset + pos0] = __float2bfloat16(y0);
+                y_[y_offset + pos1] = __float2bfloat16(y1);
+            } else {
+                Tangle x0 = x_[x_offset + pos0];
+                Tangle x1 = x_[x_offset + pos1];
+                y_[y_offset + pos0] = x0 * cos__ - x1 * sin__;
+                y_[y_offset + pos1] = x0 * sin__ + x1 * cos__;
+            }
         }
     }
 }
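IsGPTJ is a compile-time template parameter, so the algorithm branch is resolved when the kernel is instantiated rather than per element. In the GPT-J path the rotated pair occupies adjacent elements (2i, 2i+1), which is what allows the half2/cuda_bfloat162 vector accesses; the GPT-NeoX pair (i, i + table_dim) is strided and is therefore converted element by element. The backend launchers below map the runtime algo value onto this template parameter with an explicit if/else; a generic C++17 helper for the same mapping could look like this (hypothetical sketch, not part of this commit):

    #include <type_traits>
    #include <utility>

    // Invoke `launch` with a std::integral_constant carrying the compile-time bool,
    // so the callback can instantiate templates such as ropeThreadPerItemKernel<IsGPTJ>.
    template <typename F>
    void dispatchBool(bool is_gpt_j, F &&launch) {
        if (is_gpt_j) {
            std::forward<F>(launch)(std::integral_constant<bool, true>{});
        } else {
            std::forward<F>(launch)(std::integral_constant<bool, false>{});
        }
    }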
src/infiniop/ops/rope/kunlun/rope_kunlun.xpu

@@ -118,11 +118,16 @@ infiniStatus_t Descriptor::create(
     infiniopTensorDescriptor_t x_desc,
     infiniopTensorDescriptor_t pos_desc,
     infiniopTensorDescriptor_t sin_desc,
-    infiniopTensorDescriptor_t cos_desc) {
+    infiniopTensorDescriptor_t cos_desc,
+    infiniopRoPEAlgo_t algo) {

-    auto result = RoPEInfo::createRoPEInfo(y_desc, x_desc, pos_desc, sin_desc, cos_desc);
+    auto result = RoPEInfo::createRoPEInfo(y_desc, x_desc, pos_desc, sin_desc, cos_desc, algo);
     CHECK_RESULT(result);

+    if (algo != INFINIOP_ROPE_ALGO_GPT_J) {
+        return INFINI_STATUS_NOT_IMPLEMENTED;
+    }
+
     // Create descriptor
     *desc_ptr = new Descriptor(
         result.take(),
src/infiniop/ops/rope/metax/rope_metax.maca

@@ -5,7 +5,7 @@
 #include "../cuda/kernel.cuh"

-template <typename Tdata, typename Tindex, typename Tangle>
+template <bool IsGPTJ, typename Tdata, typename Tindex, typename Tangle>
 INFINIOP_METAX_KERNEL ropeThreadPerItemKernel(
     Tdata *y_,
     const Tdata *x_,

@@ -17,7 +17,7 @@ INFINIOP_METAX_KERNEL ropeThreadPerItemKernel(
     ptrdiff_t y_stride_nhead,
     ptrdiff_t x_stride_seqlen,
     ptrdiff_t x_stride_nhead) {
-    ropeThreadPerItemBlock(
+    ropeThreadPerItemBlock<IsGPTJ>(
         y_, x_, pos_ids,
         sin_table, cos_table,
         table_dim,

@@ -42,11 +42,12 @@ infiniStatus_t Descriptor::create(
     infiniopTensorDescriptor_t x_desc,
     infiniopTensorDescriptor_t pos_desc,
     infiniopTensorDescriptor_t sin_desc,
-    infiniopTensorDescriptor_t cos_desc) {
+    infiniopTensorDescriptor_t cos_desc,
+    infiniopRoPEAlgo_t algo) {
     auto handle = reinterpret_cast<device::metax::Handle *>(handle_);
-    auto info = RoPEInfo::createRoPEInfo(y_desc, x_desc, pos_desc, sin_desc, cos_desc);
+    auto info = RoPEInfo::createRoPEInfo(y_desc, x_desc, pos_desc, sin_desc, cos_desc, algo);
     CHECK_RESULT(info);

     // Create descriptor

@@ -72,10 +73,17 @@ infiniStatus_t calculateRoPE(const RoPEInfo &info,
     auto dimx = uint32_t(info.seqlen),
          dimy = uint32_t(info.nhead);
     int nthreads = std::max(int(info.table_dim), block_size);
-    ropeThreadPerItemKernel<<<dim3(dimx, dimy), nthreads, 0, stream>>>(
-        y, x, pos_ids, sin_table, cos_table, info.table_dim,
-        info.y_stride_seqlen, info.y_stride_nhead, info.x_stride_seqlen, info.x_stride_nhead);
+    bool is_gpt_j = info.algo == infiniopRoPEAlgo_t::INFINIOP_ROPE_ALGO_GPT_J;
+    if (is_gpt_j) {
+        ropeThreadPerItemKernel<true><<<dim3(dimx, dimy), nthreads, 0, stream>>>(
+            y, x, pos_ids, sin_table, cos_table, info.table_dim,
+            info.y_stride_seqlen, info.y_stride_nhead, info.x_stride_seqlen, info.x_stride_nhead);
+    } else {
+        ropeThreadPerItemKernel<false><<<dim3(dimx, dimy), nthreads, 0, stream>>>(
+            y, x, pos_ids, sin_table, cos_table, info.table_dim,
+            info.y_stride_seqlen, info.y_stride_nhead, info.x_stride_seqlen, info.x_stride_nhead);
+    }

     return INFINI_STATUS_SUCCESS;
 }
src/infiniop/ops/rope/moore/rope_kernel_moore.h

@@ -8,7 +8,7 @@
  * which ensuring code alignment across different hardware platforms.
  */
-template <typename Tdata, typename Tindex, typename Tangle>
+template <bool IsGPTJ, typename Tdata, typename Tindex, typename Tangle>
 __device__ void ropeThreadPerItemBlock(
     Tdata *y_,
     const Tdata *x_,

@@ -29,40 +29,72 @@ __device__ void ropeThreadPerItemBlock(
     for (size_t i = threadIdx.x; i < table_dim; i += blockDim.x) {
         Tangle sin__ = sin_table[table_offset + i],
                cos__ = cos_table[table_offset + i];
-        if constexpr (std::is_same<Tdata, half>::value) {
-            auto &y = reinterpret_cast<half2 &>(y_[y_offset + 2 * i]);
-            auto &x = reinterpret_cast<const half2 &>(x_[x_offset + 2 * i]);
-            Tangle y0 = x.x * cos__ - x.y * sin__,
-                   y1 = x.x * sin__ + x.y * cos__;
-            y = half2(y0, y1);
-        } else if constexpr (std::is_same<Tdata, cuda_bfloat16>::value) {
-            auto &y = reinterpret_cast<cuda_bfloat162 &>(y_[y_offset + 2 * i]);
-            auto &x = reinterpret_cast<const cuda_bfloat162 &>(x_[x_offset + 2 * i]);
-            /*
-             * The original code used CUDA-specific functions (__low2bfloat16, __high2bfloat16)
-             * to extract bfloat16 values from a packed variable.
-             *
-             * This code has been modified for the MUSA platform, which does not support
-             * these CUDA built-in functions. Instead, MUSA provides a different set of
-             * built-in functions (`__low2float`, `__high2float`) that directly convert
-             * the bfloat16 values to float.
-             *
-             * This change ensures cross-platform compatibility and resolves compilation errors.
-             */
-            Tangle x0 = __low2float(x);
-            Tangle x1 = __high2float(x);
-            Tangle y0 = x0 * cos__ - x1 * sin__;
-            Tangle y1 = x0 * sin__ + x1 * cos__;
-            y = __floats2bfloat162_rn(y0, y1);
-        } else {
-            Tangle x0 = x_[x_offset + 2 * i],
-                   x1 = x_[x_offset + 2 * i + 1];
-            y_[y_offset + 2 * i] = Tdata(x0 * cos__ - x1 * sin__);
-            y_[y_offset + 2 * i + 1] = Tdata(x0 * sin__ + x1 * cos__);
+        if constexpr (IsGPTJ) {
+            if constexpr (std::is_same<Tdata, half>::value) {
+                auto &y = reinterpret_cast<half2 &>(y_[y_offset + 2 * i]);
+                auto &x = reinterpret_cast<const half2 &>(x_[x_offset + 2 * i]);
+                Tangle y0 = x.x * cos__ - x.y * sin__,
+                       y1 = x.x * sin__ + x.y * cos__;
+                y = half2(y0, y1);
+            } else if constexpr (std::is_same<Tdata, cuda_bfloat16>::value) {
+                auto &y = reinterpret_cast<cuda_bfloat162 &>(y_[y_offset + 2 * i]);
+                auto &x = reinterpret_cast<const cuda_bfloat162 &>(x_[x_offset + 2 * i]);
+                /*
+                 * The original code used CUDA-specific functions (__low2bfloat16, __high2bfloat16)
+                 * to extract bfloat16 values from a packed variable.
+                 *
+                 * This code has been modified for the MUSA platform, which does not support
+                 * these CUDA built-in functions. Instead, MUSA provides a different set of
+                 * built-in functions (`__low2float`, `__high2float`) that directly convert
+                 * the bfloat16 values to float.
+                 *
+                 * This change ensures cross-platform compatibility and resolves compilation errors.
+                 */
+                Tangle x0 = __low2float(x);
+                Tangle x1 = __high2float(x);
+                Tangle y0 = x0 * cos__ - x1 * sin__;
+                Tangle y1 = x0 * sin__ + x1 * cos__;
+                y = __floats2bfloat162_rn(y0, y1);
+            } else {
+                Tangle x0 = x_[x_offset + 2 * i],
+                       x1 = x_[x_offset + 2 * i + 1];
+                y_[y_offset + 2 * i] = Tdata(x0 * cos__ - x1 * sin__);
+                y_[y_offset + 2 * i + 1] = Tdata(x0 * sin__ + x1 * cos__);
+            }
+        } else {
+            size_t pos0 = i;
+            size_t pos1 = i + table_dim;
+            if constexpr (std::is_same<Tdata, half>::value) {
+                Tangle x0 = __half2float(x_[x_offset + pos0]);
+                Tangle x1 = __half2float(x_[x_offset + pos1]);
+                Tangle y0 = x0 * cos__ - x1 * sin__;
+                Tangle y1 = x0 * sin__ + x1 * cos__;
+                y_[y_offset + pos0] = __float2half(y0);
+                y_[y_offset + pos1] = __float2half(y1);
+            } else if constexpr (std::is_same<Tdata, cuda_bfloat16>::value) {
+                Tangle x0 = __bfloat162float(x_[x_offset + pos0]);
+                Tangle x1 = __bfloat162float(x_[x_offset + pos1]);
+                Tangle y0 = x0 * cos__ - x1 * sin__;
+                Tangle y1 = x0 * sin__ + x1 * cos__;
+                y_[y_offset + pos0] = __float2bfloat16(y0);
+                y_[y_offset + pos1] = __float2bfloat16(y1);
+            } else {
+                Tangle x0 = x_[x_offset + pos0];
+                Tangle x1 = x_[x_offset + pos1];
+                y_[y_offset + pos0] = x0 * cos__ - x1 * sin__;
+                y_[y_offset + pos1] = x0 * sin__ + x1 * cos__;
+            }
         }
     }
 }
src/infiniop/ops/rope/moore/rope_moore.mu

@@ -5,7 +5,7 @@
 #include "rope_kernel_moore.h"

-template <typename Tdata, typename Tindex, typename Tangle>
+template <bool IsGPTJ, typename Tdata, typename Tindex, typename Tangle>
 INFINIOP_MOORE_KERNEL ropeThreadPerItemKernel(
     Tdata *y_,
     const Tdata *x_,

@@ -17,7 +17,7 @@ INFINIOP_MOORE_KERNEL ropeThreadPerItemKernel(
     ptrdiff_t y_stride_nhead,
     ptrdiff_t x_stride_seqlen,
     ptrdiff_t x_stride_nhead) {
-    ropeThreadPerItemBlock(
+    ropeThreadPerItemBlock<IsGPTJ>(
         y_, x_, pos_ids,
         sin_table, cos_table,
         table_dim,

@@ -42,11 +42,12 @@ infiniStatus_t Descriptor::create(
     infiniopTensorDescriptor_t x_desc,
     infiniopTensorDescriptor_t pos_desc,
     infiniopTensorDescriptor_t sin_desc,
-    infiniopTensorDescriptor_t cos_desc) {
+    infiniopTensorDescriptor_t cos_desc,
+    infiniopRoPEAlgo_t algo) {
     auto handle = reinterpret_cast<device::moore::Handle *>(handle_);
-    auto info = RoPEInfo::createRoPEInfo(y_desc, x_desc, pos_desc, sin_desc, cos_desc);
+    auto info = RoPEInfo::createRoPEInfo(y_desc, x_desc, pos_desc, sin_desc, cos_desc, algo);
     CHECK_RESULT(info);

     // Create descriptor

@@ -72,10 +73,17 @@ infiniStatus_t calculateRoPE(const RoPEInfo &info,
     auto dimx = uint32_t(info.seqlen),
          dimy = uint32_t(info.nhead);
     int nthreads = std::max(int(info.table_dim), block_size);
-    ropeThreadPerItemKernel<<<dim3(dimx, dimy), nthreads, 0, stream>>>(
-        y, x, pos_ids, sin_table, cos_table, info.table_dim,
-        info.y_stride_seqlen, info.y_stride_nhead, info.x_stride_seqlen, info.x_stride_nhead);
+    bool is_gpt_j = info.algo == infiniopRoPEAlgo_t::INFINIOP_ROPE_ALGO_GPT_J;
+    if (is_gpt_j) {
+        ropeThreadPerItemKernel<true><<<dim3(dimx, dimy), nthreads, 0, stream>>>(
+            y, x, pos_ids, sin_table, cos_table, info.table_dim,
+            info.y_stride_seqlen, info.y_stride_nhead, info.x_stride_seqlen, info.x_stride_nhead);
+    } else {
+        ropeThreadPerItemKernel<false><<<dim3(dimx, dimy), nthreads, 0, stream>>>(
+            y, x, pos_ids, sin_table, cos_table, info.table_dim,
+            info.y_stride_seqlen, info.y_stride_nhead, info.x_stride_seqlen, info.x_stride_nhead);
+    }

     return INFINI_STATUS_SUCCESS;
 }
src/infiniop/ops/rope/nvidia/rope_nvidia.cu

@@ -5,7 +5,7 @@
 #include "../cuda/kernel.cuh"

-template <typename Tdata, typename Tindex, typename Tangle>
+template <bool IsGPTJ, typename Tdata, typename Tindex, typename Tangle>
 INFINIOP_CUDA_KERNEL ropeThreadPerItemKernel(
     Tdata *y_,
     const Tdata *x_,

@@ -17,7 +17,7 @@ INFINIOP_CUDA_KERNEL ropeThreadPerItemKernel(
     ptrdiff_t y_stride_nhead,
     ptrdiff_t x_stride_seqlen,
     ptrdiff_t x_stride_nhead) {
-    ropeThreadPerItemBlock(
+    ropeThreadPerItemBlock<IsGPTJ>(
         y_, x_, pos_ids,
         sin_table, cos_table,
         table_dim,

@@ -42,11 +42,12 @@ infiniStatus_t Descriptor::create(
     infiniopTensorDescriptor_t x_desc,
     infiniopTensorDescriptor_t pos_desc,
     infiniopTensorDescriptor_t sin_desc,
-    infiniopTensorDescriptor_t cos_desc) {
+    infiniopTensorDescriptor_t cos_desc,
+    infiniopRoPEAlgo_t algo) {
     auto handle = reinterpret_cast<device::nvidia::Handle *>(handle_);
-    auto info = RoPEInfo::createRoPEInfo(y_desc, x_desc, pos_desc, sin_desc, cos_desc);
+    auto info = RoPEInfo::createRoPEInfo(y_desc, x_desc, pos_desc, sin_desc, cos_desc, algo);
     CHECK_RESULT(info);

     // Create descriptor

@@ -72,10 +73,17 @@ infiniStatus_t calculateRoPE(const RoPEInfo &info,
     auto dimx = uint32_t(info.seqlen),
          dimy = uint32_t(info.nhead);
     int nthreads = std::max(int(info.table_dim), block_size);
-    ropeThreadPerItemKernel<<<dim3(dimx, dimy), nthreads, 0, stream>>>(
-        y, x, pos_ids, sin_table, cos_table, info.table_dim,
-        info.y_stride_seqlen, info.y_stride_nhead, info.x_stride_seqlen, info.x_stride_nhead);
+    bool is_gpt_j = info.algo == infiniopRoPEAlgo_t::INFINIOP_ROPE_ALGO_GPT_J;
+    if (is_gpt_j) {
+        ropeThreadPerItemKernel<true><<<dim3(dimx, dimy), nthreads, 0, stream>>>(
+            y, x, pos_ids, sin_table, cos_table, info.table_dim,
+            info.y_stride_seqlen, info.y_stride_nhead, info.x_stride_seqlen, info.x_stride_nhead);
+    } else {
+        ropeThreadPerItemKernel<false><<<dim3(dimx, dimy), nthreads, 0, stream>>>(
+            y, x, pos_ids, sin_table, cos_table, info.table_dim,
+            info.y_stride_seqlen, info.y_stride_nhead, info.x_stride_seqlen, info.x_stride_nhead);
+    }

     return INFINI_STATUS_SUCCESS;
 }
src/infiniop/ops/rope/operator.cc

@@ -31,7 +31,8 @@ __C infiniStatus_t infiniopCreateRoPEDescriptor(
     infiniopTensorDescriptor_t x,
     infiniopTensorDescriptor_t pos_ids,
     infiniopTensorDescriptor_t sin_table,
-    infiniopTensorDescriptor_t cos_table) {
+    infiniopTensorDescriptor_t cos_table,
+    infiniopRoPEAlgo_t algo) {

 #define CREATE(CASE, NAMESPACE) \
     case CASE: \

@@ -42,7 +43,8 @@ __C infiniStatus_t infiniopCreateRoPEDescriptor(
             x, \
             pos_ids, \
             sin_table, \
-            cos_table)
+            cos_table, \
+            algo)

     switch (handle->device) {
 #ifdef ENABLE_CPU_API
src/infiniop/ops/rope/rope.h

@@ -4,6 +4,7 @@
 #include "../../../utils.h"
 #include "../../operator.h"
 #include "../../tensor.h"
+#include "infiniop/ops/rope.h"

 #define DESCRIPTOR(NAMESPACE) \
     \

@@ -37,7 +38,8 @@
             infiniopTensorDescriptor_t x_desc, \
             infiniopTensorDescriptor_t pos_desc, \
             infiniopTensorDescriptor_t sin_desc, \
-            infiniopTensorDescriptor_t cos_desc); \
+            infiniopTensorDescriptor_t cos_desc, \
+            infiniopRoPEAlgo_t algo); \
     \
         infiniStatus_t calculate( \
             void *workspace, \

@@ -63,15 +65,18 @@ public:
         y_stride_nhead,
         x_stride_seqlen,
         x_stride_nhead;
+    infiniopRoPEAlgo_t algo;

     static utils::Result<RoPEInfo> createRoPEInfo(
         infiniopTensorDescriptor_t y_desc,
         infiniopTensorDescriptor_t x_desc,
         infiniopTensorDescriptor_t pos_desc,
         infiniopTensorDescriptor_t sin_desc,
-        infiniopTensorDescriptor_t cos_desc) {
+        infiniopTensorDescriptor_t cos_desc,
+        infiniopRoPEAlgo_t algo) {
         CHECK_OR_RETURN(
-            y_desc != nullptr && pos_desc != nullptr && sin_desc != nullptr && cos_desc != nullptr,
+            y_desc != nullptr && pos_desc != nullptr && sin_desc != nullptr && cos_desc != nullptr
+                && algo < infiniopRoPEAlgo_t::INFINIOP_ROPE_ALGO_COUNT,
             INFINI_STATUS_NULL_POINTER);

         const infiniDtype_t data_type = y_desc->dtype();

@@ -118,6 +123,7 @@ public:
             y_desc->stride(1),
             x_desc->stride(0),
             x_desc->stride(1),
+            algo,
         });
     }
 };
src/infiniop/ops/rope_v2/ascend/rope_ascend.cc  (deleted, 100644 → 0)

#include "rope_ascend.h"
#include "../../../devices/ascend/common_ascend.h"

namespace op::rope::ascend {

Descriptor::~Descriptor() = default;

infiniStatus_t Descriptor::create(
    infiniopHandle_t handle,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t y_desc,
    infiniopTensorDescriptor_t x_desc,
    infiniopTensorDescriptor_t pos_desc,
    infiniopTensorDescriptor_t sin_desc,
    infiniopTensorDescriptor_t cos_desc) {
    auto handle_ascned = reinterpret_cast<device::ascend::Handle *>(handle);
    auto result = RoPEInfo::createRoPEInfo(y_desc, x_desc, pos_desc, sin_desc, cos_desc);
    CHECK_RESULT(result);

    size_t workspace_size = 0;
    *desc_ptr = new Descriptor(
        std::move(result.take()),
        workspace_size,
        nullptr,
        handle_ascned->device, handle_ascned->device_id);
    return INFINI_STATUS_SUCCESS;
}

infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    void *y,
    const void *x,
    const void *pos_ids,
    const void *sin_table,
    const void *cos_table,
    void *stream) const {
    CHECK_DTYPE(_info.data_type, INFINI_DTYPE_F32, INFINI_DTYPE_F16);

    auto data_type = _info.data_type;
    auto pos_type = _info.pos_type;
    auto seq_len = _info.seqlen;
    auto nhead = _info.nhead;
    auto dhead = _info.dhead;
    auto y_stride_seqlen = _info.y_stride_seqlen;
    auto y_stride_nhead = _info.y_stride_nhead;
    auto x_stride_seqlen = _info.x_stride_seqlen;
    auto x_stride_nhead = _info.x_stride_nhead;

    return rope_kernel_launch(
        y, (void *)x, (void *)pos_ids, (void *)sin_table, (void *)cos_table,
        seq_len, nhead, dhead, data_type, pos_type,
        y_stride_seqlen, y_stride_nhead,
        x_stride_seqlen, x_stride_nhead, stream);
}
} // namespace op::rope::ascend
src/infiniop/ops/rope_v2/ascend/rope_ascend.h  (deleted, 100644 → 0)

#ifndef __ACLNN_ROPE_H__
#define __ACLNN_ROPE_H__

#include "../rope.h"

extern "C" infiniStatus_t rope_kernel_launch(
    void *y,
    void *x,
    void *pos,
    void *sin,
    void *cos,
    size_t seq_len,
    size_t nhead,
    size_t dhead,
    infiniDtype_t data_type,
    infiniDtype_t pos_type,
    ptrdiff_t y_stride_seqlen,
    ptrdiff_t y_stride_nhead,
    ptrdiff_t x_stride_seqlen,
    ptrdiff_t x_stride_nhead,
    void *stream);

DESCRIPTOR(ascend)

#endif // __ACLNN_ROPE_H__
src/infiniop/ops/rope_v2/ascend/rope_ascend_kernel.cpp  (deleted, 100644 → 0)

#include "../../../devices/ascend/ascend_kernel_common.h"

using namespace AscendC;

template <typename T, typename U>
class RoPEKernel {
public:
    __aicore__ inline RoPEKernel() {}
    // Init op
    // pos position vector
    // x input tensor
    // y output tensor
    // tensor shape [nt, nh, dh]
    // make block_num = nh, tile_len = dh
    __aicore__ inline void init(GM_ADDR y, GM_ADDR x, GM_ADDR pos, GM_ADDR sin, GM_ADDR cos,
                                size_t dh,
                                ptrdiff_t st_ynt, ptrdiff_t st_ynh,
                                ptrdiff_t st_xnt, ptrdiff_t st_xnh);
    __aicore__ inline void process(size_t seq_len);

private:
    // Copy a tile into UB
    __aicore__ inline void copyIn(size_t i);
    __aicore__ inline void compute(size_t i);
    __aicore__ inline void copyOut(size_t i);

private:
    TPipe pipe;
    TQue<QuePosition::VECIN, BUFFER_NUM> _in_que;
    TQue<QuePosition::VECIN, BUFFER_NUM> _sin_que;
    TQue<QuePosition::VECIN, BUFFER_NUM> _cos_que;
    TQue<QuePosition::VECOUT, BUFFER_NUM> _out_que;
    TBuf<TPosition::VECCALC> _tmp_odd_buf;
    TBuf<TPosition::VECCALC> _tmp_even_buf;
    TBuf<TPosition::VECCALC> _tmp_odd_buf1;
    TBuf<TPosition::VECCALC> _tmp_odd_buf2;
    TBuf<TPosition::VECCALC> _tmp_even_buf1;
    TBuf<TPosition::VECCALC> _tmp_even_buf2;

    GlobalTensor<T> _x_gm, _y_gm;
    GlobalTensor<U> _p_gm;
    GlobalTensor<T> _sin_gm;
    GlobalTensor<T> _cos_gm;

    size_t _block_idx;
    size_t _tile_len;
    size_t _copy_len;
    size_t _half_copy_len;
    // stridey[_st_ynt, _st_ynh, 1]
    ptrdiff_t _st_ynt;
    ptrdiff_t _st_ynh;
    // stridex[_st_xnt, _st_xnh, 1]
    ptrdiff_t _st_xnt;
    ptrdiff_t _st_xnh;
};

template <typename T, typename U>
__aicore__ inline void RoPEKernel<T, U>::init(GM_ADDR y, GM_ADDR x, GM_ADDR pos,
                                              GM_ADDR sin, GM_ADDR cos,
                                              size_t dh,
                                              ptrdiff_t st_ynt, ptrdiff_t st_ynh,
                                              ptrdiff_t st_xnt, ptrdiff_t st_xnh) {
    this->_tile_len = dh;
    this->_st_ynt = st_ynt;
    this->_st_ynh = st_ynh;
    this->_st_xnt = st_xnt;
    this->_st_xnh = st_xnh;
    _copy_len = alignTileLen<T>(dh, BYTE_ALIGN);
    _half_copy_len = alignTileLen<T>(dh, BYTE_ALIGN);
    _block_idx = GetBlockIdx();

    // Init global buffer
    _x_gm.SetGlobalBuffer((__gm__ T *)x);
    _p_gm.SetGlobalBuffer((__gm__ U *)pos);
    _sin_gm.SetGlobalBuffer((__gm__ T *)sin);
    _cos_gm.SetGlobalBuffer((__gm__ T *)cos);
    _y_gm.SetGlobalBuffer((__gm__ T *)y);

    // Init Queue buffer
    pipe.InitBuffer(_in_que, BUFFER_NUM, _copy_len * sizeof(T));
    pipe.InitBuffer(_out_que, BUFFER_NUM, _tile_len * sizeof(T));
    pipe.InitBuffer(_sin_que, BUFFER_NUM, _half_copy_len * sizeof(T));
    pipe.InitBuffer(_cos_que, BUFFER_NUM, _half_copy_len * sizeof(T));
    pipe.InitBuffer(_tmp_odd_buf, _tile_len / 2 * sizeof(T));
    pipe.InitBuffer(_tmp_even_buf, _tile_len / 2 * sizeof(T));
    pipe.InitBuffer(_tmp_odd_buf1, _tile_len / 2 * sizeof(T));
    pipe.InitBuffer(_tmp_odd_buf2, _tile_len / 2 * sizeof(T));
    pipe.InitBuffer(_tmp_even_buf1, _tile_len / 2 * sizeof(T));
    pipe.InitBuffer(_tmp_even_buf2, _tile_len / 2 * sizeof(T));
}

template <typename T, typename U>
__aicore__ inline void RoPEKernel<T, U>::copyIn(size_t i) {
    LocalTensor<T> input_ub = _in_que.AllocTensor<T>();
    LocalTensor<T> sin_ub = _sin_que.AllocTensor<T>();
    LocalTensor<T> cos_ub = _cos_que.AllocTensor<T>();
    // Get idx of current tile in total input
    auto idx = i * _st_xnt + _block_idx * _st_xnh;
    // Copy tile current tile into UB
    DataCopy(input_ub, _x_gm[idx], _copy_len);
    // Copy sin cos tile
    auto pos_idx = _p_gm(i);
    DataCopy(sin_ub, _sin_gm[pos_idx * _tile_len / 2], _half_copy_len);
    DataCopy(cos_ub, _cos_gm[pos_idx * _tile_len / 2], _half_copy_len);
    // Push in operands
    _in_que.EnQue(input_ub);
    _sin_que.EnQue(sin_ub);
    _cos_que.EnQue(cos_ub);
}

template <typename T, typename U>
__aicore__ inline void RoPEKernel<T, U>::compute(size_t i) {
    LocalTensor<T> input_ub = _in_que.DeQue<T>();
    LocalTensor<T> sin_ub = _sin_que.DeQue<T>();
    LocalTensor<T> cos_ub = _cos_que.DeQue<T>();
    LocalTensor<T> output_ub = _out_que.AllocTensor<T>();
    LocalTensor<T> tmp_odd = _tmp_odd_buf.Get<T>();
    LocalTensor<T> tmp_even = _tmp_even_buf.Get<T>();
    LocalTensor<T> tmp_odd1 = _tmp_odd_buf1.Get<T>();
    LocalTensor<T> tmp_odd2 = _tmp_odd_buf2.Get<T>();
    LocalTensor<T> tmp_even1 = _tmp_even_buf1.Get<T>();
    LocalTensor<T> tmp_even2 = _tmp_even_buf2.Get<T>();

    // separate odd and even bit elements
    uint64_t rsvdCnt = 0;
    GatherMaskParams gMaskParams = {
        1,
        static_cast<uint16_t>((_tile_len * sizeof(T) + 255) / 256), // no more than 256(<=255)
        8,
        8,
    };
    GatherMask<T>(tmp_odd, input_ub, 1, false, 0, gMaskParams, rsvdCnt);
    GatherMask<T>(tmp_even, input_ub, 2, false, 0, gMaskParams, rsvdCnt);
    PipeBarrier<PIPE_V>();

    // compute odd bit elements
    // y_odd = x_odd * cos - x_even * sin
    Mul<T>(tmp_odd1, tmp_odd, cos_ub, _tile_len / 2);
    Mul<T>(tmp_odd2, tmp_even, sin_ub, _tile_len / 2);
    PipeBarrier<PIPE_V>();
    Sub<T>(tmp_odd1, tmp_odd1, tmp_odd2, _tile_len / 2);

    // compute even bit elements
    // y_even = x_odd * sin + x_even * cos
    Mul<T>(tmp_even1, tmp_odd, sin_ub, _tile_len / 2);
    Mul<T>(tmp_even2, tmp_even, cos_ub, _tile_len / 2);
    PipeBarrier<PIPE_V>();
    Add<T>(tmp_even1, tmp_even1, tmp_even2, _tile_len / 2);

    // combine odd and even bit elements
    for (uint32_t j = 0; j < _tile_len / 2; j += 1) {
        output_ub(j * 2) = tmp_odd1(j);
        output_ub(j * 2 + 1) = tmp_even1(j);
    }

    _out_que.EnQue<T>(output_ub);
    _in_que.FreeTensor(input_ub);
    _sin_que.FreeTensor(sin_ub);
    _cos_que.FreeTensor(cos_ub);
}

template <typename T, typename U>
__aicore__ inline void RoPEKernel<T, U>::copyOut(size_t i) {
    LocalTensor<T> output_ub = _out_que.DeQue<T>();
    auto idy = i * _st_ynt + _block_idx * _st_ynh;
    DataCopyExtParams params = {1, static_cast<uint32_t>(_tile_len * sizeof(T)), 0, 0, 0};
    DataCopyPad(_y_gm[idy], output_ub, params);
    _out_que.FreeTensor(output_ub);
}

template <typename T, typename U>
__aicore__ inline void RoPEKernel<T, U>::process(size_t seq_len) {
    for (size_t i = 0; i < seq_len; ++i) {
        copyIn(i);
        compute(i);
        copyOut(i);
    }
}

#define ROPE_KERNEL_INIT_ARGS y, x, pos, sin, cos, dhead, \
                              y_stride_seqlen, y_stride_nhead, \
                              x_stride_seqlen, x_stride_nhead

#define CASE_POSTYPE(POS_TYPE_ENUM, TYPE, POS_T) \
    case POS_TYPE_ENUM: { \
        RoPEKernel<TYPE, POS_T> op; \
        op.init(ROPE_KERNEL_INIT_ARGS); \
        op.process(seq_len); \
        break; \
    }

#define ROPE_KERNEL(TYPE, POSTYPE) \
    switch (POSTYPE) { \
        CASE_POSTYPE(INFINI_DTYPE_I8, TYPE, int8_t) \
        CASE_POSTYPE(INFINI_DTYPE_I16, TYPE, int16_t) \
        CASE_POSTYPE(INFINI_DTYPE_I32, TYPE, int32_t) \
        CASE_POSTYPE(INFINI_DTYPE_I64, TYPE, int64_t) \
        CASE_POSTYPE(INFINI_DTYPE_U8, TYPE, uint8_t) \
        CASE_POSTYPE(INFINI_DTYPE_U16, TYPE, uint16_t) \
        CASE_POSTYPE(INFINI_DTYPE_U32, TYPE, uint32_t) \
        CASE_POSTYPE(INFINI_DTYPE_U64, TYPE, uint64_t) \
    default: \
        break; \
    }

#define DEFINE_ROPE_KERNEL(KERNEL_NAME, TYPE) \
    __global__ __aicore__ void KERNEL_NAME(GM_ADDR y, \
                                           GM_ADDR x, \
                                           GM_ADDR pos, \
                                           GM_ADDR sin, \
                                           GM_ADDR cos, \
                                           size_t seq_len, \
                                           size_t dhead, \
                                           ptrdiff_t y_stride_seqlen, \
                                           ptrdiff_t y_stride_nhead, \
                                           ptrdiff_t x_stride_seqlen, \
                                           ptrdiff_t x_stride_nhead, \
                                           int32_t pos_type) { \
        ROPE_KERNEL(TYPE, pos_type) \
    }

DEFINE_ROPE_KERNEL(rope_kernel_float, float)
DEFINE_ROPE_KERNEL(rope_kernel_half, half)

#undef DEFINE_ROPE_KERNEL
#undef ROPE_KERNEL
#undef CASE_POSTYPE
#undef ROPE_KERNEL_INIT_ARGS

extern "C" infiniStatus_t rope_kernel_launch(
    void *y, void *x, void *pos, void *sin, void *cos,
    size_t seq_len, size_t nhead, size_t dhead,
    infiniDtype_t dtype, infiniDtype_t pos_type,
    ptrdiff_t y_stride_seqlen, ptrdiff_t y_stride_nhead,
    ptrdiff_t x_stride_seqlen, ptrdiff_t x_stride_nhead,
    void *stream) {

#define LAUNCH_ROPE_KERNEL(DTYPE_ENUM, KERNEL_NAME) \
    case DTYPE_ENUM: \
        KERNEL_NAME<<<nhead, nullptr, stream>>>(y, x, pos, sin, cos, \
                                                seq_len, \
                                                dhead, \
                                                y_stride_seqlen, \
                                                y_stride_nhead, \
                                                x_stride_seqlen, \
                                                x_stride_nhead, \
                                                pos_type); \
        return INFINI_STATUS_SUCCESS;

    switch (dtype) {
        LAUNCH_ROPE_KERNEL(INFINI_DTYPE_F16, rope_kernel_half)
        LAUNCH_ROPE_KERNEL(INFINI_DTYPE_F32, rope_kernel_float)
    default:
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
}
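For reference, the rotation this deleted kernel evaluates (per its own odd/even comments, written here with 0-based indices and the angle theta_i selected through pos_ids) is the interleaved form that the merged operator labels INFINIOP_ROPE_ALGO_GPT_J:

    y[2i]   = x[2i] * cos(theta_i) - x[2i+1] * sin(theta_i)
    y[2i+1] = x[2i] * sin(theta_i) + x[2i+1] * cos(theta_i)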
src/infiniop/ops/rope_v2/bang/rope_bang.h  (deleted, 100644 → 0)

#ifndef __INFINIOP_ROPE_BANG_H__
#define __INFINIOP_ROPE_BANG_H__

#include "../rope.h"

DESCRIPTOR(bang)

#endif // __INFINIOP_ROPE_BANG_H__
src/infiniop/ops/rope_v2/bang/rope_bang.mlu  (deleted, 100644 → 0)
#include "../../../devices/bang/common_bang.h"
#include "rope_bang.h"
#include "rope_bang_kernel.mlu"
namespace op::rope::bang {
Descriptor::~Descriptor() = default;
infiniStatus_t Descriptor::create(
infiniopHandle_t handle_,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t y_desc,
infiniopTensorDescriptor_t x_desc,
infiniopTensorDescriptor_t pos_desc,
infiniopTensorDescriptor_t sin_desc,
infiniopTensorDescriptor_t cos_desc) {
auto handle = reinterpret_cast<device::bang::Handle *>(handle_);
auto info = RoPEInfo::createRoPEInfo(y_desc, x_desc, pos_desc, sin_desc, cos_desc);
CHECK_RESULT(info);
// Create descriptor
*desc_ptr = new Descriptor(
info.take(),
0,
nullptr,
handle->device,
handle->device_id);
return INFINI_STATUS_SUCCESS;
}
template <typename Tdata, typename Tindex>
infiniStatus_t calculateRoPE(const RoPEInfo &info,
Tdata *y,
const Tdata *x,
const Tindex *pos_ids,
const Tdata *sin_table,
const Tdata *cos_table,
cnrtQueue_t queue) {
auto dimx = uint32_t(info.seqlen);
auto dimy = uint32_t(info.nhead);
auto table_dim = uint32_t(info.table_dim);
cnrtDim3_t k_dim;
cnrtFunctionType_t k_type;
// Configure kernel launch parameters
k_dim.x = 4;
k_dim.y = 1;
k_dim.z = 1;
k_type = CNRT_FUNC_TYPE_UNION1;
// Launch kernel
ropeKernel<<<k_dim, k_type, queue>>>(
y, x, pos_ids, sin_table, cos_table,
dimx, dimy, table_dim,
info.y_stride_seqlen, info.y_stride_nhead,
info.x_stride_seqlen, info.x_stride_nhead);
cnrtQueueSync(queue);
return INFINI_STATUS_SUCCESS;
}
#define CALCULATE_ROPE(TDATA, TINDEX) \
calculateRoPE(_info, \
(TDATA *)y, \
(const TDATA *)x, \
(const TINDEX *)pos_ids, \
(const TDATA *)sin_table, \
(const TDATA *)cos_table, \
(cnrtQueue_t)stream)
#define ROPE_TYPE(TDATA) \
switch (_info.pos_type) { \
case INFINI_DTYPE_U8: \
return CALCULATE_ROPE(TDATA, uint8_t); \
case INFINI_DTYPE_U16: \
return CALCULATE_ROPE(TDATA, uint16_t); \
case INFINI_DTYPE_U32: \
return CALCULATE_ROPE(TDATA, uint32_t); \
case INFINI_DTYPE_U64: \
return CALCULATE_ROPE(TDATA, uint64_t); \
case INFINI_DTYPE_I8: \
return CALCULATE_ROPE(TDATA, int8_t); \
case INFINI_DTYPE_I16: \
return CALCULATE_ROPE(TDATA, int16_t); \
case INFINI_DTYPE_I32: \
return CALCULATE_ROPE(TDATA, int32_t); \
case INFINI_DTYPE_I64: \
return CALCULATE_ROPE(TDATA, int64_t); \
default: \
return INFINI_STATUS_BAD_TENSOR_DTYPE; \
}
infiniStatus_t Descriptor::calculate(
void *workspace,
size_t workspace_size,
void *y,
const void *x,
const void *pos_ids,
const void *sin_table,
const void *cos_table,
void *stream) const {
switch (_info.data_type) {
case INFINI_DTYPE_F16:
ROPE_TYPE(half);
case INFINI_DTYPE_BF16:
ROPE_TYPE(bfloat16_t);
case INFINI_DTYPE_F32:
ROPE_TYPE(float);
default:
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
return INFINI_STATUS_SUCCESS;
}
#undef ROPE_TYPE
#undef CALCULATE_ROPE
} // namespace op::rope::bang