Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
jerrrrry
infinicore
Commits
ce2c4813
Commit
ce2c4813
authored
Jun 03, 2025
by
Catheriany
Browse files
issue/228: clip算子更新
parents
6bb801f6
6ca0e313
Changes
30
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
939 additions
and
94 deletions
+939
-94
include/infiniop.h
include/infiniop.h
+1
-0
include/infiniop/ops/clip.h
include/infiniop/ops/clip.h
+28
-0
include/infiniop/ops/mul.h
include/infiniop/ops/mul.h
+5
-5
src/infiniop-test/include/ops.hpp
src/infiniop-test/include/ops.hpp
+2
-0
src/infiniop-test/src/ops/clip.cpp
src/infiniop-test/src/ops/clip.cpp
+120
-0
src/infiniop/devices/ascend/CMakeLists.txt
src/infiniop/devices/ascend/CMakeLists.txt
+1
-2
src/infiniop/devices/ascend/ascend_kernel_common.h
src/infiniop/devices/ascend/ascend_kernel_common.h
+12
-3
src/infiniop/ops/attention/operator.cc
src/infiniop/ops/attention/operator.cc
+69
-64
src/infiniop/ops/causal_softmax/cuda/causal_softmax_kernel.cuh
...nfiniop/ops/causal_softmax/cuda/causal_softmax_kernel.cuh
+1
-1
src/infiniop/ops/clip/cpu/clip_cpu.cc
src/infiniop/ops/clip/cpu/clip_cpu.cc
+54
-0
src/infiniop/ops/clip/cpu/clip_cpu.h
src/infiniop/ops/clip/cpu/clip_cpu.h
+23
-0
src/infiniop/ops/clip/cuda/clip_cuda.cu
src/infiniop/ops/clip/cuda/clip_cuda.cu
+59
-0
src/infiniop/ops/clip/cuda/clip_cuda.cuh
src/infiniop/ops/clip/cuda/clip_cuda.cuh
+9
-0
src/infiniop/ops/clip/cuda/clip_cuda_internal.cuh
src/infiniop/ops/clip/cuda/clip_cuda_internal.cuh
+30
-0
src/infiniop/ops/clip/operator.cc
src/infiniop/ops/clip/operator.cc
+118
-0
src/infiniop/ops/gemm/ascend/gemm_ascend.cc
src/infiniop/ops/gemm/ascend/gemm_ascend.cc
+46
-15
src/infiniop/ops/random_sample/cuda/random_sample_cuda.cu
src/infiniop/ops/random_sample/cuda/random_sample_cuda.cu
+6
-4
src/infiniop/ops/rope/ascend/rope_ascend.cc
src/infiniop/ops/rope/ascend/rope_ascend.cc
+50
-0
src/infiniop/ops/rope/ascend/rope_ascend.h
src/infiniop/ops/rope/ascend/rope_ascend.h
+25
-0
src/infiniop/ops/rope/ascend/rope_ascend_kernel.cpp
src/infiniop/ops/rope/ascend/rope_ascend_kernel.cpp
+280
-0
No files found.
include/infiniop.h
View file @
ce2c4813
...
@@ -6,6 +6,7 @@
...
@@ -6,6 +6,7 @@
#include "infiniop/ops/attention.h"
#include "infiniop/ops/attention.h"
#include "infiniop/ops/avg_pool.h"
#include "infiniop/ops/avg_pool.h"
#include "infiniop/ops/causal_softmax.h"
#include "infiniop/ops/causal_softmax.h"
#include "infiniop/ops/clip.h"
#include "infiniop/ops/conv.h"
#include "infiniop/ops/conv.h"
#include "infiniop/ops/expand.h"
#include "infiniop/ops/expand.h"
#include "infiniop/ops/gemm.h"
#include "infiniop/ops/gemm.h"
...
...
include/infiniop/ops/clip.h
0 → 100644
View file @
ce2c4813
#ifndef __INFINIOP_CLIP_API_H__
#define __INFINIOP_CLIP_API_H__
#include "../operator_descriptor.h"
typedef
struct
InfiniopDescriptor
*
infiniopClipDescriptor_t
;
__C
__export
infiniStatus_t
infiniopCreateClipDescriptor
(
infiniopHandle_t
handle
,
infiniopClipDescriptor_t
*
desc_ptr
,
infiniopTensorDescriptor_t
y
,
infiniopTensorDescriptor_t
x
,
infiniopTensorDescriptor_t
min_val
,
infiniopTensorDescriptor_t
max_val
);
__C
__export
infiniStatus_t
infiniopGetClipWorkspaceSize
(
infiniopClipDescriptor_t
desc
,
size_t
*
size
);
__C
__export
infiniStatus_t
infiniopClip
(
infiniopClipDescriptor_t
desc
,
void
*
workspace
,
size_t
workspace_size
,
void
*
y
,
const
void
*
x
,
const
void
*
min_val
,
const
void
*
max_val
,
void
*
stream
);
__C
__export
infiniStatus_t
infiniopDestroyClipDescriptor
(
infiniopClipDescriptor_t
desc
);
#endif
include/infiniop/ops/mul.h
View file @
ce2c4813
...
@@ -6,10 +6,10 @@
...
@@ -6,10 +6,10 @@
typedef
struct
InfiniopDescriptor
*
infiniopMulDescriptor_t
;
typedef
struct
InfiniopDescriptor
*
infiniopMulDescriptor_t
;
__C
__export
infiniStatus_t
infiniopCreateMulDescriptor
(
infiniopHandle_t
handle
,
__C
__export
infiniStatus_t
infiniopCreateMulDescriptor
(
infiniopHandle_t
handle
,
infiniopMulDescriptor_t
*
desc_ptr
,
infiniopMulDescriptor_t
*
desc_ptr
,
infiniopTensorDescriptor_t
c
,
infiniopTensorDescriptor_t
c
,
infiniopTensorDescriptor_t
a
,
infiniopTensorDescriptor_t
a
,
infiniopTensorDescriptor_t
b
);
infiniopTensorDescriptor_t
b
);
__C
__export
infiniStatus_t
infiniopGetMulWorkspaceSize
(
infiniopMulDescriptor_t
desc
,
size_t
*
size
);
__C
__export
infiniStatus_t
infiniopGetMulWorkspaceSize
(
infiniopMulDescriptor_t
desc
,
size_t
*
size
);
...
@@ -20,7 +20,7 @@ __C __export infiniStatus_t infiniopMul(infiniopMulDescriptor_t desc,
...
@@ -20,7 +20,7 @@ __C __export infiniStatus_t infiniopMul(infiniopMulDescriptor_t desc,
const
void
*
a
,
const
void
*
a
,
const
void
*
b
,
const
void
*
b
,
void
*
stream
);
void
*
stream
);
__C
__export
infiniStatus_t
infiniopDestroyMulDescriptor
(
infiniopMulDescriptor_t
desc
);
__C
__export
infiniStatus_t
infiniopDestroyMulDescriptor
(
infiniopMulDescriptor_t
desc
);
#endif
#endif
src/infiniop-test/include/ops.hpp
View file @
ce2c4813
...
@@ -8,6 +8,7 @@
...
@@ -8,6 +8,7 @@
DECLARE_INFINIOP_TEST
(
gemm
)
DECLARE_INFINIOP_TEST
(
gemm
)
DECLARE_INFINIOP_TEST
(
random_sample
)
DECLARE_INFINIOP_TEST
(
random_sample
)
DECLARE_INFINIOP_TEST
(
mul
)
DECLARE_INFINIOP_TEST
(
mul
)
DECLARE_INFINIOP_TEST
(
clip
)
DECLARE_INFINIOP_TEST
(
swiglu
)
DECLARE_INFINIOP_TEST
(
swiglu
)
DECLARE_INFINIOP_TEST
(
add
)
DECLARE_INFINIOP_TEST
(
add
)
...
@@ -30,6 +31,7 @@ DECLARE_INFINIOP_TEST(add)
...
@@ -30,6 +31,7 @@ DECLARE_INFINIOP_TEST(add)
REGISTER_INFINIOP_TEST(random_sample) \
REGISTER_INFINIOP_TEST(random_sample) \
REGISTER_INFINIOP_TEST(add) \
REGISTER_INFINIOP_TEST(add) \
REGISTER_INFINIOP_TEST(mul) \
REGISTER_INFINIOP_TEST(mul) \
REGISTER_INFINIOP_TEST(clip) \
REGISTER_INFINIOP_TEST(swiglu) \
REGISTER_INFINIOP_TEST(swiglu) \
}
}
...
...
src/infiniop-test/src/ops/clip.cpp
0 → 100644
View file @
ce2c4813
#include "ops.hpp"
#include "utils.hpp"
#include <infinirt.h>
#include <iomanip>
#include <iostream>
namespace
infiniop_test
::
clip
{
struct
Test
::
Attributes
{
std
::
shared_ptr
<
Tensor
>
x
;
std
::
shared_ptr
<
Tensor
>
min_val
;
std
::
shared_ptr
<
Tensor
>
max_val
;
std
::
shared_ptr
<
Tensor
>
y
;
std
::
shared_ptr
<
Tensor
>
ans
;
};
std
::
shared_ptr
<
Test
>
Test
::
build
(
std
::
unordered_map
<
std
::
string
,
std
::
vector
<
uint8_t
>>
attributes
,
std
::
unordered_map
<
std
::
string
,
std
::
shared_ptr
<
Tensor
>>
tensors
,
double
rtol
,
double
atol
)
{
auto
test
=
std
::
shared_ptr
<
Test
>
(
new
Test
(
rtol
,
atol
));
test
->
_attributes
=
new
Attributes
();
if
(
tensors
.
find
(
"x"
)
==
tensors
.
end
()
||
tensors
.
find
(
"min_val"
)
==
tensors
.
end
()
||
tensors
.
find
(
"max_val"
)
==
tensors
.
end
()
||
tensors
.
find
(
"y"
)
==
tensors
.
end
()
||
tensors
.
find
(
"ans"
)
==
tensors
.
end
())
{
throw
std
::
runtime_error
(
"Invalid Test"
);
}
test
->
_attributes
->
x
=
tensors
[
"x"
];
test
->
_attributes
->
min_val
=
tensors
[
"min_val"
];
test
->
_attributes
->
max_val
=
tensors
[
"max_val"
];
test
->
_attributes
->
y
=
tensors
[
"y"
];
test
->
_attributes
->
ans
=
tensors
[
"ans"
];
return
test
;
}
std
::
shared_ptr
<
infiniop_test
::
Result
>
Test
::
run
(
infiniopHandle_t
handle
,
infiniDevice_t
device
,
int
device_id
,
size_t
warm_ups
,
size_t
iterations
)
{
infiniopClipDescriptor_t
op_desc
;
auto
x
=
_attributes
->
x
->
to
(
device
,
device_id
);
auto
min_val
=
_attributes
->
min_val
->
to
(
device
,
device_id
);
auto
max_val
=
_attributes
->
max_val
->
to
(
device
,
device_id
);
auto
y
=
_attributes
->
y
->
to
(
device
,
device_id
);
CHECK_OR
(
infiniopCreateClipDescriptor
(
handle
,
&
op_desc
,
y
->
desc
(),
x
->
desc
(),
min_val
->
desc
(),
max_val
->
desc
()),
return
TEST_FAILED
(
OP_CREATION_FAILED
,
"Failed to create clip descriptor."
));
size_t
workspace_size
;
CHECK_OR
(
infiniopGetClipWorkspaceSize
(
op_desc
,
&
workspace_size
),
return
TEST_FAILED
(
OP_CREATION_FAILED
,
"Failed to get workspace size."
));
void
*
workspace
;
CHECK_OR
(
infinirtMalloc
(
&
workspace
,
workspace_size
),
return
TEST_FAILED
(
OP_CREATION_FAILED
,
"Failed to allocate workspace."
));
CHECK_OR
(
infiniopClip
(
op_desc
,
workspace
,
workspace_size
,
y
->
data
(),
x
->
data
(),
min_val
->
data
(),
max_val
->
data
(),
nullptr
),
return
TEST_FAILED
(
OP_EXECUTION_FAILED
,
"Failed during execution."
));
try
{
allClose
(
y
,
_attributes
->
ans
,
_rtol
,
_atol
);
}
catch
(
const
std
::
exception
&
e
)
{
return
TEST_FAILED
(
RESULT_INCORRECT
,
e
.
what
());
}
double
elapsed_time
=
0.
;
elapsed_time
=
benchmark
(
[
=
]()
{
infiniopClip
(
op_desc
,
workspace
,
workspace_size
,
y
->
data
(),
x
->
data
(),
min_val
->
data
(),
max_val
->
data
(),
nullptr
);
},
warm_ups
,
iterations
);
infiniopDestroyClipDescriptor
(
op_desc
);
infinirtFree
(
workspace
);
return
TEST_PASSED
(
elapsed_time
);
}
std
::
vector
<
std
::
string
>
Test
::
attribute_names
()
{
return
{};
}
std
::
vector
<
std
::
string
>
Test
::
tensor_names
()
{
return
{
"x"
,
"min_val"
,
"max_val"
,
"y"
,
"ans"
};
}
std
::
vector
<
std
::
string
>
Test
::
output_names
()
{
return
{
"y"
};
}
std
::
string
Test
::
toString
()
const
{
std
::
ostringstream
oss
;
oss
<<
op_name
()
<<
std
::
endl
;
oss
<<
"- x: "
<<
_attributes
->
x
->
info
()
<<
std
::
endl
;
oss
<<
"- min_val: "
<<
_attributes
->
min_val
->
info
()
<<
std
::
endl
;
oss
<<
"- max_val: "
<<
_attributes
->
max_val
->
info
()
<<
std
::
endl
;
oss
<<
"- y: "
<<
_attributes
->
y
->
info
()
<<
std
::
endl
;
oss
<<
std
::
scientific
<<
std
::
setprecision
(
2
);
oss
<<
"- rtol="
<<
_rtol
<<
", atol="
<<
_atol
<<
std
::
endl
;
return
oss
.
str
();
}
Test
::~
Test
()
{
delete
_attributes
;
}
}
// namespace infiniop_test::clip
src/infiniop/devices/ascend/CMakeLists.txt
View file @
ce2c4813
...
@@ -23,10 +23,9 @@ include_directories(
...
@@ -23,10 +23,9 @@ include_directories(
${
CMAKE_SOURCE_DIR
}
/../../../../include/infiniop/
${
CMAKE_SOURCE_DIR
}
/../../../../include/infiniop/
)
)
ascendc_library
(
ascend_kernels STATIC
ascendc_library
(
ascend_kernels STATIC
../../ops/swiglu/ascend/swiglu_ascend_kernel.cpp
../../ops/swiglu/ascend/swiglu_ascend_kernel.cpp
#
../../ops/ro
tary_embedding/ascend/rotary_embedding
_kernel.cpp
../../ops/ro
pe/ascend/rope_ascend
_kernel.cpp
# ../../ops/random_sample/ascend/random_sample_kernel.cpp
# ../../ops/random_sample/ascend/random_sample_kernel.cpp
)
)
src/infiniop/devices/ascend/ascend_kernel_common.h
View file @
ce2c4813
...
@@ -4,8 +4,17 @@
...
@@ -4,8 +4,17 @@
#include "../../../../include/infinicore.h"
#include "../../../../include/infinicore.h"
#include "kernel_operator.h"
#include "kernel_operator.h"
constexpr
int32_t
BLOCK_NUM
=
8
;
constexpr
size_t
BLOCK_NUM
=
8
;
constexpr
int32_t
BUFFER_NUM
=
2
;
constexpr
size_t
BUFFER_NUM
=
2
;
constexpr
int32_t
BYTE_ALIGN
=
32
;
constexpr
size_t
BYTE_ALIGN
=
32
;
template
<
typename
T
>
__aicore__
inline
size_t
alignTileLen
(
size_t
tile_len
,
size_t
byte_align
)
{
size_t
bytes
=
tile_len
*
sizeof
(
T
);
size_t
aligned_bytes
=
(
bytes
%
byte_align
==
0
)
?
bytes
:
(
bytes
+
(
byte_align
-
bytes
%
byte_align
));
return
aligned_bytes
/
sizeof
(
T
);
}
#endif
#endif
src/infiniop/ops/attention/operator.cc
View file @
ce2c4813
...
@@ -20,15 +20,14 @@ struct InfiniopAttentionDescriptor {
...
@@ -20,15 +20,14 @@ struct InfiniopAttentionDescriptor {
infiniopGemmDescriptor_t
matmul_desc1
;
infiniopGemmDescriptor_t
matmul_desc1
;
infiniopGemmDescriptor_t
matmul_desc2
;
infiniopGemmDescriptor_t
matmul_desc2
;
infiniopCausalSoftmaxDescriptor_t
softmax_desc
;
infiniopCausalSoftmaxDescriptor_t
softmax_desc
;
uint64_t
workspace_size
;
size_t
workspace_size
;
uint64_t
rearranged_q_size
;
size_t
op_workspace_offset
;
uint64_t
matmul1_workspace_size
;
size_t
op_workspace_size
;
uint64_t
matmul1_tensor_size
;
size_t
q_cont_offset
;
uint64_t
matmul2_workspace_size
;
size_t
att_score_offset
;
uint64_t
matmul2_tensor_size
;
size_t
att_val_offset
;
uint64_t
softmax_workspace_size
;
size_t
k_cache_offset
;
uint64_t
k_cache_offset
;
size_t
v_cache_offset
;
uint64_t
v_cache_offset
;
float
qk_alpha
;
float
qk_alpha
;
};
};
...
@@ -40,7 +39,7 @@ __C __export infiniStatus_t infiniopCreateAttentionDescriptor(infiniopHandle_t h
...
@@ -40,7 +39,7 @@ __C __export infiniStatus_t infiniopCreateAttentionDescriptor(infiniopHandle_t h
infiniopTensorDescriptor_t
v_desc
,
infiniopTensorDescriptor_t
v_desc
,
infiniopTensorDescriptor_t
k_cache_desc
,
infiniopTensorDescriptor_t
k_cache_desc
,
infiniopTensorDescriptor_t
v_cache_desc
,
infiniopTensorDescriptor_t
v_cache_desc
,
uint64
_t
pos
)
{
size
_t
pos
)
{
if
(
out_desc
->
ndim
()
!=
3
||
q_desc
->
ndim
()
!=
3
||
k_desc
->
ndim
()
!=
3
||
v_desc
->
ndim
()
!=
3
||
k_cache_desc
->
ndim
()
!=
3
||
v_cache_desc
->
ndim
()
!=
3
)
{
if
(
out_desc
->
ndim
()
!=
3
||
q_desc
->
ndim
()
!=
3
||
k_desc
->
ndim
()
!=
3
||
v_desc
->
ndim
()
!=
3
||
k_cache_desc
->
ndim
()
!=
3
||
v_cache_desc
->
ndim
()
!=
3
)
{
return
INFINI_STATUS_BAD_TENSOR_SHAPE
;
return
INFINI_STATUS_BAD_TENSOR_SHAPE
;
}
}
...
@@ -53,13 +52,14 @@ __C __export infiniStatus_t infiniopCreateAttentionDescriptor(infiniopHandle_t h
...
@@ -53,13 +52,14 @@ __C __export infiniStatus_t infiniopCreateAttentionDescriptor(infiniopHandle_t h
return
INFINI_STATUS_BAD_TENSOR_STRIDES
;
return
INFINI_STATUS_BAD_TENSOR_STRIDES
;
}
}
uint64_t
n_q_head
=
q_desc
->
shape
()[
0
];
size_t
n_q_head
=
q_desc
->
shape
()[
0
];
uint64_t
seq_len
=
q_desc
->
shape
()[
1
];
size_t
seq_len
=
q_desc
->
shape
()[
1
];
uint64_t
head_dim
=
q_desc
->
shape
()[
2
];
size_t
head_dim
=
q_desc
->
shape
()[
2
];
uint64_t
hidden_size
=
n_q_head
*
head_dim
;
size_t
hidden_size
=
n_q_head
*
head_dim
;
uint64_t
n_kv_head
=
k_desc
->
shape
()[
0
];
size_t
n_kv_head
=
k_desc
->
shape
()[
0
];
uint64_t
total_seq_len
=
seq_len
+
pos
;
size_t
total_seq_len
=
seq_len
+
pos
;
uint64_t
n_group
=
n_q_head
/
n_kv_head
;
size_t
n_group
=
n_q_head
/
n_kv_head
;
size_t
alignment
=
256
;
if
(
out_desc
->
shape
()[
0
]
!=
seq_len
||
out_desc
->
shape
()[
1
]
!=
n_q_head
||
out_desc
->
shape
()[
2
]
!=
head_dim
)
{
if
(
out_desc
->
shape
()[
0
]
!=
seq_len
||
out_desc
->
shape
()[
1
]
!=
n_q_head
||
out_desc
->
shape
()[
2
]
!=
head_dim
)
{
return
INFINI_STATUS_BAD_PARAM
;
return
INFINI_STATUS_BAD_PARAM
;
...
@@ -98,12 +98,12 @@ __C __export infiniStatus_t infiniopCreateAttentionDescriptor(infiniopHandle_t h
...
@@ -98,12 +98,12 @@ __C __export infiniStatus_t infiniopCreateAttentionDescriptor(infiniopHandle_t h
CHECK_STATUS
(
infiniopCreateRearrangeDescriptor
(
handle
,
&
rearrange_desc_v
,
dst_v_desc
,
v_desc
));
CHECK_STATUS
(
infiniopCreateRearrangeDescriptor
(
handle
,
&
rearrange_desc_v
,
dst_v_desc
,
v_desc
));
infiniopRearrangeDescriptor_t
rearrange_desc_q
=
nullptr
;
infiniopRearrangeDescriptor_t
rearrange_desc_q
=
nullptr
;
uint64_t
rearranged_q
_size
=
0
;
size_t
q_cont
_size
=
0
;
infiniopTensorDescriptor_t
rearranged_q_desc
;
infiniopTensorDescriptor_t
rearranged_q_desc
;
// Rearrange q into contiguous
// Rearrange q into contiguous
if
(
!
q_desc
->
isContiguous
(
0
,
1
))
{
if
(
!
q_desc
->
isContiguous
(
0
,
1
))
{
CHECK_STATUS
(
infiniopCreateTensorDescriptor
(
&
rearranged_q_desc
,
3
,
q_desc
->
shape
().
data
(),
nullptr
,
q_desc
->
dtype
()));
CHECK_STATUS
(
infiniopCreateTensorDescriptor
(
&
rearranged_q_desc
,
3
,
q_desc
->
shape
().
data
(),
nullptr
,
q_desc
->
dtype
()));
rearranged_q_size
=
rearranged_q_desc
->
numel
()
*
infiniSizeOf
(
rearranged_q_desc
->
dtype
());
q_cont_size
=
utils
::
align
(
rearranged_q_desc
->
numel
()
*
infiniSizeOf
(
rearranged_q_desc
->
dtype
())
,
alignment
)
;
rearrange_desc_q
=
new
InfiniopDescriptor
;
rearrange_desc_q
=
new
InfiniopDescriptor
;
CHECK_STATUS
(
infiniopCreateRearrangeDescriptor
(
handle
,
&
rearrange_desc_q
,
rearranged_q_desc
,
q_desc
));
CHECK_STATUS
(
infiniopCreateRearrangeDescriptor
(
handle
,
&
rearrange_desc_q
,
rearranged_q_desc
,
q_desc
));
}
}
...
@@ -116,12 +116,12 @@ __C __export infiniStatus_t infiniopCreateAttentionDescriptor(infiniopHandle_t h
...
@@ -116,12 +116,12 @@ __C __export infiniStatus_t infiniopCreateAttentionDescriptor(infiniopHandle_t h
TRANSFORM_TENSOR_DESC
(
reshaped_q_desc
,
dimMerge
(
1
,
2
));
TRANSFORM_TENSOR_DESC
(
reshaped_q_desc
,
dimMerge
(
1
,
2
));
// full_k: [n_kv_head, head_dim, total_seq_len]
// full_k: [n_kv_head, head_dim, total_seq_len]
infiniopTensorDescriptor_t
full_k_desc
;
infiniopTensorDescriptor_t
full_k_desc
;
uint64
_t
full_k_shape
[
3
]
=
{
n_kv_head
,
total_seq_len
,
head_dim
};
size
_t
full_k_shape
[
3
]
=
{
n_kv_head
,
total_seq_len
,
head_dim
};
CHECK_STATUS
(
infiniopCreateTensorDescriptor
(
&
full_k_desc
,
3
,
full_k_shape
,
k_cache_desc
->
strides
().
data
(),
k_cache_desc
->
dtype
()));
CHECK_STATUS
(
infiniopCreateTensorDescriptor
(
&
full_k_desc
,
3
,
full_k_shape
,
k_cache_desc
->
strides
().
data
(),
k_cache_desc
->
dtype
()));
TRANSFORM_TENSOR_DESC
(
full_k_desc
,
dimPermute
({
0
,
2
,
1
}));
TRANSFORM_TENSOR_DESC
(
full_k_desc
,
dimPermute
({
0
,
2
,
1
}));
// qk: [n_kv_head, n_group * seq_len, total_seq_len]
// qk: [n_kv_head, n_group * seq_len, total_seq_len]
infiniopTensorDescriptor_t
qk_desc
;
infiniopTensorDescriptor_t
qk_desc
;
uint64
_t
qk_shape
[
3
]
=
{
n_kv_head
,
n_group
*
seq_len
,
total_seq_len
};
size
_t
qk_shape
[
3
]
=
{
n_kv_head
,
n_group
*
seq_len
,
total_seq_len
};
CHECK_STATUS
(
infiniopCreateTensorDescriptor
(
&
qk_desc
,
3
,
qk_shape
,
nullptr
,
q_desc
->
dtype
()));
CHECK_STATUS
(
infiniopCreateTensorDescriptor
(
&
qk_desc
,
3
,
qk_shape
,
nullptr
,
q_desc
->
dtype
()));
// matmul1_desc
// matmul1_desc
// qk_alpha
// qk_alpha
...
@@ -129,10 +129,11 @@ __C __export infiniStatus_t infiniopCreateAttentionDescriptor(infiniopHandle_t h
...
@@ -129,10 +129,11 @@ __C __export infiniStatus_t infiniopCreateAttentionDescriptor(infiniopHandle_t h
infiniopGemmDescriptor_t
matmul1_desc
;
infiniopGemmDescriptor_t
matmul1_desc
;
CHECK_STATUS
(
infiniopCreateGemmDescriptor
(
handle
,
&
matmul1_desc
,
qk_desc
,
reshaped_q_desc
,
full_k_desc
));
CHECK_STATUS
(
infiniopCreateGemmDescriptor
(
handle
,
&
matmul1_desc
,
qk_desc
,
reshaped_q_desc
,
full_k_desc
));
// matmul1 workspace size
// matmul1 workspace size
uint64
_t
matmul1_workspace_size
;
size
_t
matmul1_workspace_size
;
CHECK_STATUS
(
infiniopGetGemmWorkspaceSize
(
matmul1_desc
,
&
matmul1_workspace_size
));
CHECK_STATUS
(
infiniopGetGemmWorkspaceSize
(
matmul1_desc
,
&
matmul1_workspace_size
));
// matmul1 tensor size
matmul1_workspace_size
=
utils
::
align
(
matmul1_workspace_size
,
alignment
);
uint64_t
matmul1_tensor_size
=
qk_desc
->
numel
()
*
infiniSizeOf
(
qk_desc
->
dtype
());
// attention score tensor size
size_t
attn_score_size
=
utils
::
align
(
qk_desc
->
numel
()
*
infiniSizeOf
(
qk_desc
->
dtype
()),
alignment
);
// CausalSoftmax: softmax(qk)
// CausalSoftmax: softmax(qk)
// qk: [n_kv_head, n_group * seq_len, total_seq_len] -> [n_q_head, seq_len, total_seq_len]
// qk: [n_kv_head, n_group * seq_len, total_seq_len] -> [n_q_head, seq_len, total_seq_len]
...
@@ -141,8 +142,9 @@ __C __export infiniStatus_t infiniopCreateAttentionDescriptor(infiniopHandle_t h
...
@@ -141,8 +142,9 @@ __C __export infiniStatus_t infiniopCreateAttentionDescriptor(infiniopHandle_t h
infiniopCausalSoftmaxDescriptor_t
softmax_desc
;
infiniopCausalSoftmaxDescriptor_t
softmax_desc
;
CHECK_STATUS
(
infiniopCreateCausalSoftmaxDescriptor
(
handle
,
&
softmax_desc
,
qk_desc
,
qk_desc
));
CHECK_STATUS
(
infiniopCreateCausalSoftmaxDescriptor
(
handle
,
&
softmax_desc
,
qk_desc
,
qk_desc
));
// softmax workspace size
// softmax workspace size
uint64
_t
softmax_workspace_size
;
size
_t
softmax_workspace_size
;
CHECK_STATUS
(
infiniopGetCausalSoftmaxWorkspaceSize
(
softmax_desc
,
&
softmax_workspace_size
));
CHECK_STATUS
(
infiniopGetCausalSoftmaxWorkspaceSize
(
softmax_desc
,
&
softmax_workspace_size
));
softmax_workspace_size
=
utils
::
align
(
softmax_workspace_size
,
alignment
);
// Matmul2: softmax(qk) * full_v
// Matmul2: softmax(qk) * full_v
// softmax(qk): [n_q_head, seq_len, total_seq_len] -> [n_kv_head, n_group * seq_len, total_seq_len]
// softmax(qk): [n_q_head, seq_len, total_seq_len] -> [n_kv_head, n_group * seq_len, total_seq_len]
...
@@ -150,41 +152,44 @@ __C __export infiniStatus_t infiniopCreateAttentionDescriptor(infiniopHandle_t h
...
@@ -150,41 +152,44 @@ __C __export infiniStatus_t infiniopCreateAttentionDescriptor(infiniopHandle_t h
TRANSFORM_TENSOR_DESC
(
qk_desc
,
dimSplit
(
0
,
{
n_kv_head
,
n_group
}));
TRANSFORM_TENSOR_DESC
(
qk_desc
,
dimSplit
(
0
,
{
n_kv_head
,
n_group
}));
TRANSFORM_TENSOR_DESC
(
qk_desc
,
dimMerge
(
1
,
2
));
TRANSFORM_TENSOR_DESC
(
qk_desc
,
dimMerge
(
1
,
2
));
infiniopTensorDescriptor_t
full_v_desc
;
infiniopTensorDescriptor_t
full_v_desc
;
uint64
_t
full_v_shape
[
3
]
=
{
n_kv_head
,
total_seq_len
,
head_dim
};
size
_t
full_v_shape
[
3
]
=
{
n_kv_head
,
total_seq_len
,
head_dim
};
CHECK_STATUS
(
infiniopCreateTensorDescriptor
(
&
full_v_desc
,
3
,
full_v_shape
,
v_cache_desc
->
strides
().
data
(),
v_cache_desc
->
dtype
()));
CHECK_STATUS
(
infiniopCreateTensorDescriptor
(
&
full_v_desc
,
3
,
full_v_shape
,
v_cache_desc
->
strides
().
data
(),
v_cache_desc
->
dtype
()));
// temp_out: [n_kv_head, n_group * seq_len, head_dim]
// temp_out: [n_kv_head, n_group * seq_len, head_dim]
infiniopTensorDescriptor_t
temp_out
_desc
;
infiniopTensorDescriptor_t
att_val
_desc
;
uint64
_t
temp_out_shape
[
3
]
=
{
n_kv_head
,
n_group
*
seq_len
,
head_dim
};
size
_t
temp_out_shape
[
3
]
=
{
n_kv_head
,
n_group
*
seq_len
,
head_dim
};
CHECK_STATUS
(
infiniopCreateTensorDescriptor
(
&
temp_out
_desc
,
3
,
temp_out_shape
,
nullptr
,
q_desc
->
dtype
()));
CHECK_STATUS
(
infiniopCreateTensorDescriptor
(
&
att_val
_desc
,
3
,
temp_out_shape
,
nullptr
,
q_desc
->
dtype
()));
// matmul2_desc
// matmul2_desc
infiniopGemmDescriptor_t
matmul2_desc
;
infiniopGemmDescriptor_t
matmul2_desc
;
CHECK_STATUS
(
infiniopCreateGemmDescriptor
(
handle
,
&
matmul2_desc
,
temp_out
_desc
,
qk_desc
,
full_v_desc
));
CHECK_STATUS
(
infiniopCreateGemmDescriptor
(
handle
,
&
matmul2_desc
,
att_val
_desc
,
qk_desc
,
full_v_desc
));
// matmul2 workspace size
// matmul2 workspace size
uint64
_t
matmul2_workspace_size
;
size
_t
matmul2_workspace_size
;
CHECK_STATUS
(
infiniopGetGemmWorkspaceSize
(
matmul2_desc
,
&
matmul2_workspace_size
));
CHECK_STATUS
(
infiniopGetGemmWorkspaceSize
(
matmul2_desc
,
&
matmul2_workspace_size
));
// matmul2 tensor size
matmul2_workspace_size
=
utils
::
align
(
matmul2_workspace_size
,
alignment
);
uint64_t
matmul2_tensor_size
=
temp_out_desc
->
numel
()
*
infiniSizeOf
(
temp_out_desc
->
dtype
());
// attention value tensor size
size_t
att_val_size
=
utils
::
align
(
att_val_desc
->
numel
()
*
infiniSizeOf
(
att_val_desc
->
dtype
()),
alignment
);
// Rearrange temp_out into out
// Rearrange temp_out into out
// out: [seq_len, n_q_head, head_dim]
// out: [seq_len, n_q_head, head_dim]
// temp_out: [n_kv_head, n_group * seq_len, head_dim] -> [n_q_head, seq_len, head_dim] -> [seq_len, n_q_head, head_dim]
// temp_out: [n_kv_head, n_group * seq_len, head_dim] -> [n_q_head, seq_len, head_dim] -> [seq_len, n_q_head, head_dim]
TRANSFORM_TENSOR_DESC
(
temp_out
_desc
,
dimSplit
(
1
,
{
n_group
,
seq_len
}));
TRANSFORM_TENSOR_DESC
(
att_val
_desc
,
dimSplit
(
1
,
{
n_group
,
seq_len
}));
TRANSFORM_TENSOR_DESC
(
temp_out
_desc
,
dimMerge
(
0
,
1
));
TRANSFORM_TENSOR_DESC
(
att_val
_desc
,
dimMerge
(
0
,
1
));
TRANSFORM_TENSOR_DESC
(
temp_out
_desc
,
dimPermute
({
1
,
0
,
2
}));
TRANSFORM_TENSOR_DESC
(
att_val
_desc
,
dimPermute
({
1
,
0
,
2
}));
infiniopRearrangeDescriptor_t
rearrange_desc_out
;
infiniopRearrangeDescriptor_t
rearrange_desc_out
;
CHECK_STATUS
(
infiniopCreateRearrangeDescriptor
(
handle
,
&
rearrange_desc_out
,
out_desc
,
temp_out
_desc
));
CHECK_STATUS
(
infiniopCreateRearrangeDescriptor
(
handle
,
&
rearrange_desc_out
,
out_desc
,
att_val
_desc
));
// workspace size
// workspace size
uint64_t
workspace_size
=
rearranged_q_size
+
std
::
max
(
std
::
max
(
matmul1_workspace_size
+
matmul1_tensor_size
,
matmul1_tensor_size
+
softmax_workspace_size
),
matmul1_tensor_size
+
matmul2_workspace_size
+
matmul2_tensor_size
);
size_t
op_workspace_size
=
utils
::
align
(
std
::
max
(
std
::
max
(
matmul1_workspace_size
,
matmul2_workspace_size
),
softmax_workspace_size
),
alignment
);
size_t
temp_tensors_size
=
attn_score_size
+
std
::
max
(
q_cont_size
,
att_val_size
);
size_t
workspace_size
=
temp_tensors_size
+
op_workspace_size
;
// k_cache_offset
// k_cache_offset
uint64
_t
k_cache_offset
=
0
;
size
_t
k_cache_offset
=
0
;
if
(
pos
>
0
)
{
if
(
pos
>
0
)
{
k_cache_offset
=
pos
*
k_cache_desc
->
getByteStrides
()[
1
];
k_cache_offset
=
pos
*
k_cache_desc
->
getByteStrides
()[
1
];
}
}
// v_cache_offset
// v_cache_offset
uint64
_t
v_cache_offset
=
0
;
size
_t
v_cache_offset
=
0
;
if
(
pos
>
0
)
{
if
(
pos
>
0
)
{
v_cache_offset
=
pos
*
v_cache_desc
->
getByteStrides
()[
1
];
v_cache_offset
=
pos
*
v_cache_desc
->
getByteStrides
()[
1
];
}
}
...
@@ -200,12 +205,11 @@ __C __export infiniStatus_t infiniopCreateAttentionDescriptor(infiniopHandle_t h
...
@@ -200,12 +205,11 @@ __C __export infiniStatus_t infiniopCreateAttentionDescriptor(infiniopHandle_t h
matmul2_desc
,
matmul2_desc
,
softmax_desc
,
softmax_desc
,
workspace_size
,
workspace_size
,
rearranged_q_size
,
temp_tensors_size
,
matmul1_workspace_size
,
op_workspace_size
,
matmul1_tensor_size
,
attn_score_size
,
matmul2_workspace_size
,
0
,
matmul2_tensor_size
,
attn_score_size
,
softmax_workspace_size
,
k_cache_offset
,
k_cache_offset
,
v_cache_offset
,
v_cache_offset
,
1.
f
/
std
::
sqrt
(
float
(
head_dim
)),
1.
f
/
std
::
sqrt
(
float
(
head_dim
)),
...
@@ -214,14 +218,14 @@ __C __export infiniStatus_t infiniopCreateAttentionDescriptor(infiniopHandle_t h
...
@@ -214,14 +218,14 @@ __C __export infiniStatus_t infiniopCreateAttentionDescriptor(infiniopHandle_t h
return
INFINI_STATUS_SUCCESS
;
return
INFINI_STATUS_SUCCESS
;
}
}
__C
__export
infiniStatus_t
infiniopGetAttentionWorkspaceSize
(
infiniopAttentionDescriptor_t
desc
,
uint64
_t
*
size
)
{
__C
__export
infiniStatus_t
infiniopGetAttentionWorkspaceSize
(
infiniopAttentionDescriptor_t
desc
,
size
_t
*
size
)
{
*
size
=
((
InfiniopAttentionDescriptor
*
)
desc
)
->
workspace_size
;
*
size
=
((
InfiniopAttentionDescriptor
*
)
desc
)
->
workspace_size
;
return
INFINI_STATUS_SUCCESS
;
return
INFINI_STATUS_SUCCESS
;
}
}
__C
__export
infiniStatus_t
infiniopAttention
(
infiniopAttentionDescriptor_t
desc_
,
__C
__export
infiniStatus_t
infiniopAttention
(
infiniopAttentionDescriptor_t
desc_
,
void
*
workspace
,
void
*
workspace
_
,
uint64
_t
workspace_size
,
size
_t
workspace_size
_
,
void
*
out
,
void
*
out
,
void
const
*
q
,
void
const
*
q
,
void
const
*
k
,
void
const
*
k
,
...
@@ -230,11 +234,14 @@ __C __export infiniStatus_t infiniopAttention(infiniopAttentionDescriptor_t desc
...
@@ -230,11 +234,14 @@ __C __export infiniStatus_t infiniopAttention(infiniopAttentionDescriptor_t desc
void
*
v_cache
,
void
*
v_cache
,
void
*
stream
)
{
void
*
stream
)
{
auto
desc
=
(
InfiniopAttentionDescriptor
*
)
desc_
;
auto
desc
=
(
InfiniopAttentionDescriptor
*
)
desc_
;
void
*
workspace_
=
workspace
;
if
(
workspace_size_
<
desc
->
workspace_size
)
{
if
(
workspace_size
<
desc
->
workspace_size
)
{
return
INFINI_STATUS_INSUFFICIENT_WORKSPACE
;
// STATUS_MEMORY_NOT_ALLOCATED
return
INFINI_STATUS_INSUFFICIENT_WORKSPACE
;
// STATUS_MEMORY_NOT_ALLOCATED
}
}
void
*
workspace
=
(
char
*
)
workspace_
+
desc
->
op_workspace_offset
;
size_t
workspace_size
=
desc
->
op_workspace_size
;
void
*
att_score
=
(
char
*
)
workspace_
+
desc
->
att_score_offset
;
void
*
att_val
=
(
char
*
)
workspace_
+
desc
->
att_val_offset
;
void
const
*
q_
=
q
;
// concat k and v to k_cache and v_cache
// concat k and v to k_cache and v_cache
CHECK_STATUS
(
infiniopRearrange
(
desc
->
rearrange_desc_k
,
CHECK_STATUS
(
infiniopRearrange
(
desc
->
rearrange_desc_k
,
(
char
*
)
k_cache
+
desc
->
k_cache_offset
,
k
,
stream
));
(
char
*
)
k_cache
+
desc
->
k_cache_offset
,
k
,
stream
));
...
@@ -243,28 +250,26 @@ __C __export infiniStatus_t infiniopAttention(infiniopAttentionDescriptor_t desc
...
@@ -243,28 +250,26 @@ __C __export infiniStatus_t infiniopAttention(infiniopAttentionDescriptor_t desc
(
char
*
)
v_cache
+
desc
->
v_cache_offset
,
v
,
stream
));
(
char
*
)
v_cache
+
desc
->
v_cache_offset
,
v
,
stream
));
// rearrange q into contiguous
// rearrange q into contiguous
void
const
*
_q
=
q
;
if
(
desc
->
rearrange_desc_q
)
{
if
(
desc
->
rearrange_desc_q
)
{
CHECK_STATUS
(
infiniopRearrange
(
desc
->
rearrange_desc_q
,
(
char
*
)
workspace_
,
q
,
stream
))
;
void
*
q_cont
=
(
char
*
)
workspace_
+
desc
->
q_cont_offset
;
_q
=
workspace_
;
CHECK_STATUS
(
infiniopRearrange
(
desc
->
rearrange_desc_q
,
q_cont
,
q
,
stream
))
;
workspace_
=
(
char
*
)
workspace_
+
desc
->
rearranged_q_size
;
q_
=
q_cont
;
}
}
// matmul1: q * full_k
// matmul1: q * full_k
CHECK_STATUS
(
infiniopGemm
(
desc
->
matmul_desc1
,
CHECK_STATUS
(
infiniopGemm
(
desc
->
matmul_desc1
,
(
char
*
)
workspace
_
+
desc
->
matmul1_tensor_size
,
workspace_size
-
desc
->
matmul1_tensor
_size
,
workspace
,
workspace
_size
,
workspace_
,
_q
,
k_cache
,
desc
->
qk_alpha
,
0.0
,
stream
));
att_score
,
q_
,
k_cache
,
desc
->
qk_alpha
,
0.0
,
stream
));
// softmax(qk)
// softmax(qk)
CHECK_STATUS
(
infiniopCausalSoftmax
(
desc
->
softmax_desc
,
CHECK_STATUS
(
infiniopCausalSoftmax
(
desc
->
softmax_desc
,
(
char
*
)
workspace
_
+
desc
->
matmul1_tensor_size
,
workspace_size
-
desc
->
matmul1_tensor
_size
,
workspace
,
workspace
_size
,
workspace_
,
workspace_
,
stream
));
att_score
,
att_score
,
stream
));
// matmul2: softmax(qk) * full_v
// matmul2: softmax(qk) * full_v
CHECK_STATUS
(
infiniopGemm
(
desc
->
matmul_desc2
,
CHECK_STATUS
(
infiniopGemm
(
desc
->
matmul_desc2
,
(
char
*
)
workspace_
+
desc
->
matmul1_tensor_size
+
desc
->
matmul2_tensor_size
,
workspace
,
workspace_size
,
workspace_size
-
desc
->
matmul1_tensor_size
-
desc
->
matmul2_tensor_size
,
att_val
,
att_score
,
v_cache
,
1.0
,
0.0
,
stream
));
(
char
*
)
workspace_
+
desc
->
matmul1_tensor_size
,
workspace_
,
v_cache
,
1.0
,
0.0
,
stream
));
// rearrange out
// rearrange out
CHECK_STATUS
(
infiniopRearrange
(
desc
->
rearrange_desc_out
,
out
,
(
char
*
)
workspace_
+
desc
->
matmul1_tensor_size
,
stream
));
CHECK_STATUS
(
infiniopRearrange
(
desc
->
rearrange_desc_out
,
out
,
att_val
,
stream
));
return
INFINI_STATUS_SUCCESS
;
return
INFINI_STATUS_SUCCESS
;
}
}
...
...
src/infiniop/ops/causal_softmax/cuda/causal_softmax_kernel.cuh
View file @
ce2c4813
...
@@ -18,7 +18,7 @@ INFINIOP_CUDA_KERNEL causalSoftmax(
...
@@ -18,7 +18,7 @@ INFINIOP_CUDA_KERNEL causalSoftmax(
// [Reduce] Find max value in each row and store in shared memory
// [Reduce] Find max value in each row and store in shared memory
__shared__
Tdata
max_
;
__shared__
Tdata
max_
;
Tdata
max_0
=
op
::
common_cuda
::
reduce_op
::
max
<
BLOCK_SIZE
,
Tdata
>
(
x
,
width
);
Tdata
max_0
=
op
::
common_cuda
::
reduce_op
::
max
<
BLOCK_SIZE
,
Tdata
>
(
x
,
width
-
height
+
1
+
blockIdx
.
x
);
if
(
threadIdx
.
x
==
0
)
{
if
(
threadIdx
.
x
==
0
)
{
max_
=
max_0
;
max_
=
max_0
;
}
}
...
...
src/infiniop/ops/clip/cpu/clip_cpu.cc
0 → 100644
View file @
ce2c4813
#include "clip_cpu.h"

namespace op::clip::cpu {

Descriptor::~Descriptor() = default;

// Creates a CPU clip descriptor.
//
// Validates that the output dtype is one of the supported float types and
// that the input, min and max tensors all have the output's shape, then
// builds the generic elementwise CPU descriptor.
//
// input_desc_vec is ordered {x, min_val, max_val}.
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle_,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t out_desc,
    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
    auto handle = reinterpret_cast<device::cpu::Handle *>(handle_);
    auto dtype = out_desc->dtype();

    const auto &in_desc = input_desc_vec.at(0);
    const auto &min_desc = input_desc_vec.at(1);
    const auto &max_desc = input_desc_vec.at(2);
    const auto &out_shape = out_desc->shape();
    const auto &in_shape = in_desc->shape();
    const auto &min_shape = min_desc->shape();
    const auto &max_shape = max_desc->shape();

    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64);

    // min/max bounds are full tensors, so every shape must match the output.
    CHECK_SAME_SHAPE(out_shape, in_shape);
    CHECK_SAME_SHAPE(out_shape, min_shape);
    CHECK_SAME_SHAPE(out_shape, max_shape);

    CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec);

    return INFINI_STATUS_SUCCESS;
}

// Runs clip on CPU: output[i] = max(min(x[i], max_val[i]), min_val[i])
// (see ClipOp in clip_cpu.h). workspace/workspace_size/stream are unused
// on the CPU path.
infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    void *output,
    std::vector<const void *> inputs,
    void *stream) const {
    switch (_dtype) {
    case INFINI_DTYPE_F16:
        return _device_info->calculate<ClipOp, fp16_t>(_info, output, inputs, stream);
    case INFINI_DTYPE_F32:
        return _device_info->calculate<ClipOp, float>(_info, output, inputs, stream);
    case INFINI_DTYPE_F64:
        return _device_info->calculate<ClipOp, double>(_info, output, inputs, stream);
    default:
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
    // Fix: removed the unreachable `return INFINI_STATUS_SUCCESS;` that
    // followed the switch — every case (including default) already returns.
}

} // namespace op::clip::cpu
src/infiniop/ops/clip/cpu/clip_cpu.h
0 → 100644
View file @
ce2c4813
#ifndef __CLIP_CPU_H__
#define __CLIP_CPU_H__
#include "../../../elementwise/cpu/elementwise_cpu.h"
#include "infiniop/ops/clip.h"
ELEMENTWISE_DESCRIPTOR
(
clip
,
cpu
)
namespace
op
::
clip
::
cpu
{
// Element-wise clamp functor used by the CPU clip kernel.
struct ClipOp {
    // clip consumes three operands per element: value, lower bound, upper bound.
    static constexpr size_t num_inputs = 3;

    // Returns x limited to [min_val, max_val]. The upper bound is applied
    // first, i.e. the result is max(min(x, max_val), min_val).
    template <typename T>
    T operator()(const T &x, const T &min_val, const T &max_val) const {
        const T upper_clamped = std::min(x, max_val);
        return std::max(upper_clamped, min_val);
    }
};
}
// namespace op::clip::cpu
#endif // __CLIP_CPU_H__
src/infiniop/ops/clip/cuda/clip_cuda.cu
0 → 100644
View file @
ce2c4813
#include "clip_cuda.cuh"
#include "clip_cuda_internal.cuh"

namespace op::clip::cuda {

Descriptor::~Descriptor() = default;

// Creates a CUDA clip descriptor.
//
// Validates dtype (fp16/fp32/fp64) and that input, min and max tensors all
// share the output's shape, then builds the generic elementwise CUDA
// descriptor. input_desc_vec is ordered {x, min_val, max_val}.
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle_,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t out_desc,
    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
    auto handle = reinterpret_cast<device::cuda::Handle *>(handle_);
    auto dtype = out_desc->dtype();

    const auto &in_desc = input_desc_vec.at(0);
    const auto &min_desc = input_desc_vec.at(1);
    const auto &max_desc = input_desc_vec.at(2);
    const auto &out_shape = out_desc->shape();
    const auto &in_shape = in_desc->shape();
    const auto &min_shape = min_desc->shape();
    const auto &max_shape = max_desc->shape();

    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64);

    // min/max bounds are full tensors, so every shape must match the output.
    CHECK_SAME_SHAPE(out_shape, in_shape);
    CHECK_SAME_SHAPE(out_shape, min_shape);
    CHECK_SAME_SHAPE(out_shape, max_shape);

    CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec);

    return INFINI_STATUS_SUCCESS;
}

// Launches the elementwise clip kernel (block size 256) on the given stream.
// Rejects calls whose caller-provided workspace is smaller than the size
// reported at descriptor creation.
infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    void *output,
    std::vector<const void *> inputs,
    void *stream) const {
    if (workspace_size < _workspace_size) {
        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
    }

    switch (_dtype) {
    case INFINI_DTYPE_F16:
        return _device_info->calculate<256, ClipOp, half>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_F32:
        return _device_info->calculate<256, ClipOp, float>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_F64:
        return _device_info->calculate<256, ClipOp, double>(_info, workspace, output, inputs, stream);
    default:
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
    // Fix: removed the unreachable `return INFINI_STATUS_SUCCESS;` that
    // followed the switch — every case (including default) already returns.
}

} // namespace op::clip::cuda
src/infiniop/ops/clip/cuda/clip_cuda.cuh
0 → 100644
View file @
ce2c4813
#ifndef __CLIP_CUDA_API_H__
#define __CLIP_CUDA_API_H__
#include "../../../elementwise/cuda/elementwise_cuda_api.cuh"
#include "infiniop/ops/clip.h"
ELEMENTWISE_DESCRIPTOR
(
clip
,
cuda
)
#endif // __CLIP_CUDA_API_H__
src/infiniop/ops/clip/cuda/clip_cuda_internal.cuh
0 → 100644
View file @
ce2c4813
#ifndef __CLIP_CUDA_H__
#define __CLIP_CUDA_H__
#include "../../../elementwise/cuda/elementwise_cuda.cuh"
#include <cuda_fp16.h>
namespace
op
::
clip
::
cuda
{
// Device-side clamp functor for the CUDA clip kernel.
//
// Dispatches at compile time to a type-specific intrinsic for
// half2 / half / float / double, and falls back to std::max/std::min for
// any other element type. Result is max(min(x, max_val), min_val).
struct ClipOp {
    // clip consumes three operands per element: value, lower bound, upper bound.
    static constexpr size_t num_inputs = 3;

    template <typename T>
    __device__ __forceinline__ T operator()(const T &x, const T &min_val, const T &max_val) const {
        if constexpr (std::is_same_v<T, half2>) {
            // packed pair of halves, clamped lane-wise
            return __hmax2(__hmin2(x, max_val), min_val);
        } else if constexpr (std::is_same_v<T, half>) {
            return __hmax(__hmin(x, max_val), min_val);
        } else if constexpr (std::is_same_v<T, float>) {
            return fmaxf(fminf(x, max_val), min_val);
        } else if constexpr (std::is_same_v<T, double>) {
            return fmax(fmin(x, max_val), min_val);
        } else {
            return std::max(std::min(x, max_val), min_val);
        }
    }
};
}
// namespace op::clip::cuda
#endif // __CLIP_CUDA_H__
src/infiniop/ops/clip/operator.cc
0 → 100644
View file @
ce2c4813
#include "../../operator.h"
#include "../../handle.h"
#include "infiniop/ops/clip.h"
#ifdef ENABLE_CPU_API
#include "cpu/clip_cpu.h"
#endif
#ifdef ENABLE_CUDA_API
#include "cuda/clip_cuda.cuh"
#endif
// Public C API: create a clip descriptor for the device backend selected by
// the handle. y is the output; inputs are passed as {x, min_val, max_val}.
__C infiniStatus_t infiniopCreateClipDescriptor(
    infiniopHandle_t handle,
    infiniopClipDescriptor_t *desc_ptr,
    infiniopTensorDescriptor_t y,
    infiniopTensorDescriptor_t x,
    infiniopTensorDescriptor_t min_val,
    infiniopTensorDescriptor_t max_val) {

// Forward creation to the backend namespace for one device enum value.
#define CREATE(CASE, NAMESPACE)                                             \
    case CASE:                                                              \
        return op::clip::NAMESPACE::Descriptor::create(                     \
            handle,                                                         \
            reinterpret_cast<op::clip::NAMESPACE::Descriptor **>(desc_ptr), \
            y,                                                              \
            {x, min_val, max_val})

    switch (handle->device) {
#ifdef ENABLE_CPU_API
        CREATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_CUDA_API
        CREATE(INFINI_DEVICE_NVIDIA, cuda);
#endif
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }

#undef CREATE
}
// Public C API: report the workspace size the clip descriptor requires.
__C infiniStatus_t infiniopGetClipWorkspaceSize(infiniopClipDescriptor_t desc, size_t *size) {

// Fetch the backend descriptor's workspace requirement.
#define GET(CASE, NAMESPACE)                                                                \
    case CASE:                                                                              \
        *size = reinterpret_cast<op::clip::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
        return INFINI_STATUS_SUCCESS;

    switch (desc->device_type) {
#ifdef ENABLE_CPU_API
        GET(INFINI_DEVICE_CPU, cpu)
#endif
#ifdef ENABLE_CUDA_API
        GET(INFINI_DEVICE_NVIDIA, cuda)
#endif
    // Fix: the unsupported-device return now lives in a `default:` case,
    // matching the Create/Calculate/Destroy dispatchers in this file
    // (behavior unchanged; previously it dangled after the switch).
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }

#undef GET
}
// Public C API: execute clip, writing y[i] = clamp(x[i], min_val[i], max_val[i])
// on the backend the descriptor was created for.
__C infiniStatus_t infiniopClip(
    infiniopClipDescriptor_t desc,
    void *workspace,
    size_t workspace_size,
    void *y,
    const void *x,
    const void *min_val,
    const void *max_val,
    void *stream) {

// Forward execution to the backend namespace for one device enum value.
#define CALCULATE(CASE, NAMESPACE)                                    \
    case CASE:                                                        \
        return reinterpret_cast<const op::clip::NAMESPACE::Descriptor *>(desc) \
            ->calculate(workspace, workspace_size, y, {x, min_val, max_val}, stream)

    switch (desc->device_type) {
#ifdef ENABLE_CPU_API
        CALCULATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_CUDA_API
        CALCULATE(INFINI_DEVICE_NVIDIA, cuda);
#endif
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }

#undef CALCULATE
}
// Public C API: free a clip descriptor previously returned by
// infiniopCreateClipDescriptor.
__C infiniStatus_t infiniopDestroyClipDescriptor(infiniopClipDescriptor_t desc) {

// Delete through the backend's concrete descriptor type so the right
// destructor runs.
#define DELETE(CASE, NAMESPACE)                                              \
    case CASE:                                                               \
        delete reinterpret_cast<const op::clip::NAMESPACE::Descriptor *>(desc); \
        return INFINI_STATUS_SUCCESS;

    switch (desc->device_type) {
#ifdef ENABLE_CPU_API
        DELETE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_CUDA_API
        DELETE(INFINI_DEVICE_NVIDIA, cuda);
#endif
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }

#undef DELETE
}
src/infiniop/ops/gemm/ascend/gemm_ascend.cc
View file @
ce2c4813
...
@@ -3,6 +3,26 @@
...
@@ -3,6 +3,26 @@
#include <aclnnop/aclnn_matmul.h>
#include <aclnnop/aclnn_matmul.h>
#include <aclnnop/level2/aclnn_gemm.h>
#include <aclnnop/level2/aclnn_gemm.h>
#include <cstring>
#include <unordered_map>
// Custom hash function for alpha/beta std::pair<float, float> keys.
//
// unordered_map requires that keys comparing equal under the equality
// predicate hash equally. FloatPairEqual compares with operator==, under
// which -0.0f == +0.0f even though their bit patterns differ, so signed
// zeros are normalized to +0.0f before hashing (fix).
struct FloatPairHash {
    size_t operator()(const std::pair<float, float> &p) const {
        // Normalize -0.0f to +0.0f so equal keys produce equal hashes.
        const float first = (p.first == 0.0f) ? 0.0f : p.first;
        const float second = (p.second == 0.0f) ? 0.0f : p.second;
        // Pack both 32-bit patterns into one 64-bit value and hash that.
        // memcpy avoids strict-aliasing UB from pointer-cast type punning.
        uint64_t combined;
        std::memcpy(reinterpret_cast<char *>(&combined), &first, sizeof(float));
        std::memcpy(reinterpret_cast<char *>(&combined) + sizeof(float), &second, sizeof(float));
        return std::hash<uint64_t>()(combined);
    }
};
// Equality predicate for alpha/beta pairs: component-wise operator==.
struct FloatPairEqual {
    bool operator()(const std::pair<float, float> &a,
                    const std::pair<float, float> &b) const {
        const bool first_match = (a.first == b.first);
        const bool second_match = (a.second == b.second);
        return first_match && second_match;
    }
};
namespace
op
::
gemm
::
ascend
{
namespace
op
::
gemm
::
ascend
{
struct
Descriptor
::
Opaque
{
struct
Descriptor
::
Opaque
{
...
@@ -11,11 +31,17 @@ struct Descriptor::Opaque {
...
@@ -11,11 +31,17 @@ struct Descriptor::Opaque {
// see doc:
// see doc:
// https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/80RC3alpha002/apiref/appdevgapi/context/aclnnBatchMatMul.md
// https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/80RC3alpha002/apiref/appdevgapi/context/aclnnBatchMatMul.md
int8_t
mt
;
int8_t
mt
;
// alpha&beta hashmap
std
::
unordered_map
<
std
::
pair
<
float
,
float
>
,
aclOpExecutor
*
,
FloatPairHash
,
FloatPairEqual
>
lookup
;
~
Opaque
()
{
~
Opaque
()
{
delete
c
;
delete
c
;
delete
a
;
delete
a
;
delete
b
;
delete
b
;
for
(
auto
&
item
:
lookup
)
{
aclDestroyAclOpExecutor
(
item
.
second
);
}
lookup
.
clear
();
}
}
};
};
...
@@ -54,15 +80,16 @@ infiniStatus_t Descriptor::create(
...
@@ -54,15 +80,16 @@ infiniStatus_t Descriptor::create(
ta
=
a
->
tensor
,
ta
=
a
->
tensor
,
tb
=
b
->
tensor
;
tb
=
b
->
tensor
;
std
::
unordered_map
<
std
::
pair
<
float
,
float
>
,
aclOpExecutor
*
,
FloatPairHash
,
FloatPairEqual
>
lookup
;
aclOpExecutor
*
executor
=
nullptr
;
aclOpExecutor
*
executor
=
nullptr
;
size_t
workspace_size
=
0
;
size_t
workspace_size
=
0
;
// aclnnGemm support C = alpha * A @ B + beta * C
// see
// https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/80RC3alpha003/apiref/aolapi/context/aclnnGemm.md
// use alpha = 0.5, beta = 0.5 temporarily
int8_t
mt
=
1
;
int8_t
mt
=
1
;
CHECK_ACL
(
aclnnGemmGetWorkspaceSize
(
ta
,
tb
,
tc
,
.5
,
.5
,
0
,
0
,
tc
,
mt
,
&
workspace_size
,
&
executor
));
CHECK_ACL
(
aclnnGemmGetWorkspaceSize
(
ta
,
tb
,
tc
,
1.
,
0.
,
0
,
0
,
tc
,
mt
,
&
workspace_size
,
&
executor
));
CHECK_ACL
(
aclSetAclOpExecutorRepeatable
(
executor
));
lookup
[
std
::
make_pair
(
1.0
f
,
0.0
f
)]
=
executor
;
CHECK_ACL
(
aclnnGemmGetWorkspaceSize
(
ta
,
tb
,
tc
,
1.
,
1.
,
0
,
0
,
tc
,
mt
,
&
workspace_size
,
&
executor
));
CHECK_ACL
(
aclSetAclOpExecutorRepeatable
(
executor
));
lookup
[
std
::
make_pair
(
1.0
f
,
1.0
f
)]
=
executor
;
*
desc_ptr
=
new
Descriptor
(
*
desc_ptr
=
new
Descriptor
(
dtype
,
info
,
workspace_size
,
dtype
,
info
,
workspace_size
,
...
@@ -71,11 +98,9 @@ infiniStatus_t Descriptor::create(
...
@@ -71,11 +98,9 @@ infiniStatus_t Descriptor::create(
a
,
a
,
b
,
b
,
mt
,
mt
,
},
std
::
move
(
lookup
)
},
handle
->
device
,
handle
->
device_id
);
handle
->
device
,
handle
->
device_id
);
aclDestroyAclOpExecutor
(
executor
);
return
INFINI_STATUS_SUCCESS
;
return
INFINI_STATUS_SUCCESS
;
}
}
...
@@ -93,16 +118,22 @@ infiniStatus_t Descriptor::calculate(
...
@@ -93,16 +118,22 @@ infiniStatus_t Descriptor::calculate(
ta
=
_opaque
->
a
->
tensor
,
ta
=
_opaque
->
a
->
tensor
,
tb
=
_opaque
->
b
->
tensor
;
tb
=
_opaque
->
b
->
tensor
;
size_t
workspace_size
=
0
;
size_t
workspace_size
=
_workspace_size
;
aclOpExecutor
*
executor
=
nullptr
;
aclOpExecutor
*
executor
;
auto
key
=
std
::
make_pair
(
alpha
,
beta
);
if
(
_opaque
->
lookup
.
find
(
key
)
!=
_opaque
->
lookup
.
end
())
{
executor
=
_opaque
->
lookup
[
key
];
}
else
{
CHECK_ACL
(
aclnnGemmGetWorkspaceSize
(
ta
,
tb
,
tc
,
alpha
,
beta
,
0
,
0
,
tc
,
_opaque
->
mt
,
&
workspace_size
,
&
executor
));
CHECK_ACL
(
aclSetAclOpExecutorRepeatable
(
executor
));
_opaque
->
lookup
[
key
]
=
executor
;
}
CHECK_ACL
(
aclnnGemmGetWorkspaceSize
(
ta
,
tb
,
tc
,
alpha
,
beta
,
0
,
0
,
tc
,
_opaque
->
mt
,
&
workspace_size
,
&
executor
));
if
(
workspaceSize_
<
workspace_size
)
{
if
(
workspaceSize_
<
workspace_size
)
{
return
INFINI_STATUS_INSUFFICIENT_WORKSPACE
;
return
INFINI_STATUS_INSUFFICIENT_WORKSPACE
;
}
}
CHECK_ACL
(
aclSetAclOpExecutorRepeatable
(
executor
));
auto
unit
=
infiniSizeOf
(
_dtype
);
auto
unit
=
infiniSizeOf
(
_dtype
);
for
(
size_t
i
=
0
;
i
<
_info
.
batch
;
++
i
)
{
for
(
size_t
i
=
0
;
i
<
_info
.
batch
;
++
i
)
{
...
...
src/infiniop/ops/random_sample/cuda/random_sample_cuda.cu
View file @
ce2c4813
...
@@ -26,10 +26,12 @@ infiniStatus_t Descriptor::create(
...
@@ -26,10 +26,12 @@ infiniStatus_t Descriptor::create(
auto
info
=
result
.
take
();
auto
info
=
result
.
take
();
size_t
workspace_size
;
size_t
workspace_size
;
#define CASE_P(CASE, Tidx, Tval) \
#define CASE_P(CASE, Tidx, Tval) \
case CASE: \
case CASE: { \
workspace_size = calculateWorkspace<Tidx, Tval>(info.n); \
auto workspace_result = calculateWorkspace<Tidx, Tval>(info.n); \
break
CHECK_RESULT(workspace_result); \
workspace_size = workspace_result.take(); \
} break
#define CASE_I(CASE, Tidx) \
#define CASE_I(CASE, Tidx) \
case CASE: \
case CASE: \
...
...
src/infiniop/ops/rope/ascend/rope_ascend.cc
0 → 100644
View file @
ce2c4813
#include "rope_ascend.h"
#include "../../../devices/ascend/common_ascend.h"

namespace op::rope::ascend {

Descriptor::~Descriptor() = default;

// Builds a RoPE descriptor for the Ascend backend.
//
// Validates the tensor descriptors via RoPEInfo and records the device
// identity. This backend needs no workspace and no opaque state.
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t y_desc,
    infiniopTensorDescriptor_t x_desc,
    infiniopTensorDescriptor_t pos_desc,
    infiniopTensorDescriptor_t sin_desc,
    infiniopTensorDescriptor_t cos_desc) {
    auto ascend_handle = reinterpret_cast<device::ascend::Handle *>(handle);

    auto info_result = RoPEInfo::createRoPEInfo(y_desc, x_desc, pos_desc, sin_desc, cos_desc);
    CHECK_RESULT(info_result);

    size_t workspace_size = 0; // kernel runs entirely in on-chip buffers
    *desc_ptr = new Descriptor(
        info_result.take(),
        workspace_size,
        nullptr,
        ascend_handle->device,
        ascend_handle->device_id);

    return INFINI_STATUS_SUCCESS;
}

// Applies rotary position embedding on the Ascend device by launching the
// kernel defined in rope_ascend_kernel.cpp. Only fp16/fp32 data types are
// supported by the kernel.
infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    void *y,
    const void *x,
    const void *pos_ids,
    const void *sin_table,
    const void *cos_table,
    void *stream) const {
    CHECK_DTYPE(_info.data_type, INFINI_DTYPE_F32, INFINI_DTYPE_F16);

    // The C launcher takes non-const pointers; the inputs are not written.
    return rope_kernel_launch(
        y,
        const_cast<void *>(x),
        const_cast<void *>(pos_ids),
        const_cast<void *>(sin_table),
        const_cast<void *>(cos_table),
        _info.seqlen,
        _info.nhead,
        _info.dhead,
        _info.data_type,
        _info.pos_type,
        _info.y_stride_seqlen,
        _info.y_stride_nhead,
        _info.x_stride_seqlen,
        _info.x_stride_nhead,
        stream);
}

} // namespace op::rope::ascend
src/infiniop/ops/rope/ascend/rope_ascend.h
0 → 100644
View file @
ce2c4813
#ifndef __ACLNN_ROPE_H__
#define __ACLNN_ROPE_H__

#include "../rope.h"

// Launches the Ascend RoPE kernel (defined in rope_ascend_kernel.cpp).
// C linkage so the host-side .cc and the AscendC-compiled kernel file link.
//
// y/x:                output/input tensors, shape [seq_len, nhead, dhead]
//                     (strides below are in elements)
// pos:                position-index vector, element type given by pos_type
// sin/cos:            precomputed sin/cos tables indexed by position
// data_type:          element type of y/x/sin/cos (fp16 or fp32 supported)
// stream:             device stream the kernel is queued on
extern "C" infiniStatus_t rope_kernel_launch(
    void *y,
    void *x,
    void *pos,
    void *sin,
    void *cos,
    size_t seq_len,
    size_t nhead,
    size_t dhead,
    infiniDtype_t data_type,
    infiniDtype_t pos_type,
    ptrdiff_t y_stride_seqlen,
    ptrdiff_t y_stride_nhead,
    ptrdiff_t x_stride_seqlen,
    ptrdiff_t x_stride_nhead,
    void *stream);

// Declares op::rope::ascend::Descriptor via the shared DESCRIPTOR macro.
DESCRIPTOR(ascend)

#endif // __ACLNN_ROPE_H__
src/infiniop/ops/rope/ascend/rope_ascend_kernel.cpp
0 → 100644
View file @
ce2c4813
#include "../../../devices/ascend/ascend_kernel_common.h"
using
namespace
AscendC
;
// RoPE (rotary position embedding) kernel for Ascend NPU, written with the
// AscendC programming model. T is the data element type (half/float); U is
// the integer type of the position-index tensor.
template <typename T, typename U>
class RoPEKernel {
public:
    __aicore__ inline RoPEKernel() {}
    // Init op
    // pos position vector
    // x input tensor
    // y output tensor
    // tensor shape [nt, nh, dh]
    // make block_num = nh, tile_len = dh
    // (each AI core handles one head; each tile is one dh-length row)
    __aicore__ inline void init(GM_ADDR y, GM_ADDR x, GM_ADDR pos, GM_ADDR sin, GM_ADDR cos,
                                size_t dh,
                                ptrdiff_t st_ynt, ptrdiff_t st_ynh,
                                ptrdiff_t st_xnt, ptrdiff_t st_xnh);
    // Iterates copyIn -> compute -> copyOut over all seq_len tiles.
    __aicore__ inline void process(size_t seq_len);

private:
    // Copy a tile into UB
    __aicore__ inline void copyIn(size_t i);
    // Rotate tile i: split odd/even lanes, apply sin/cos, re-interleave.
    __aicore__ inline void compute(size_t i);
    // Write rotated tile i back to global memory.
    __aicore__ inline void copyOut(size_t i);

private:
    TPipe pipe;
    // Input-side queues (x tile plus the sin/cos rows for this position).
    TQue<QuePosition::VECIN, BUFFER_NUM> _in_que;
    TQue<QuePosition::VECIN, BUFFER_NUM> _sin_que;
    TQue<QuePosition::VECIN, BUFFER_NUM> _cos_que;
    TQue<QuePosition::VECOUT, BUFFER_NUM> _out_que;
    // Scratch buffers for the odd/even-lane arithmetic in compute().
    TBuf<TPosition::VECCALC> _tmp_odd_buf;
    TBuf<TPosition::VECCALC> _tmp_even_buf;
    TBuf<TPosition::VECCALC> _tmp_odd_buf1;
    TBuf<TPosition::VECCALC> _tmp_odd_buf2;
    TBuf<TPosition::VECCALC> _tmp_even_buf1;
    TBuf<TPosition::VECCALC> _tmp_even_buf2;
    GlobalTensor<T> _x_gm, _y_gm;
    GlobalTensor<U> _p_gm;   // position indices
    GlobalTensor<T> _sin_gm; // sin table
    GlobalTensor<T> _cos_gm; // cos table
    size_t _block_idx;       // head handled by this AI core
    size_t _tile_len;        // dh, elements per tile
    size_t _copy_len;        // dh rounded up to the DataCopy alignment
    size_t _half_copy_len;   // NOTE(review): set equal to _copy_len in init() — confirm intent
    // stridey[_st_ynt, _st_ynh, 1]
    ptrdiff_t _st_ynt;
    ptrdiff_t _st_ynh;
    // stridex[_st_xnt, _st_xnh, 1]
    ptrdiff_t _st_xnt;
    ptrdiff_t _st_xnh;
};
// One-time per-core setup: record shapes/strides, bind global-memory
// tensors, and carve UB queue/scratch buffers.
template <typename T, typename U>
__aicore__ inline void RoPEKernel<T, U>::init(GM_ADDR y, GM_ADDR x, GM_ADDR pos,
                                              GM_ADDR sin, GM_ADDR cos,
                                              size_t dh,
                                              ptrdiff_t st_ynt, ptrdiff_t st_ynh,
                                              ptrdiff_t st_xnt, ptrdiff_t st_xnh) {
    this->_tile_len = dh;
    this->_st_ynt = st_ynt;
    this->_st_ynh = st_ynh;
    this->_st_xnt = st_xnt;
    this->_st_xnh = st_xnh;
    _copy_len = alignTileLen<T>(dh, BYTE_ALIGN);
    // NOTE(review): despite the name this is the *full* dh aligned length,
    // identical to _copy_len, while copyIn addresses the sin/cos tables in
    // dh/2-element steps — confirm whether dh/2 was intended here.
    _half_copy_len = alignTileLen<T>(dh, BYTE_ALIGN);
    _block_idx = GetBlockIdx();
    // Init global buffer
    _x_gm.SetGlobalBuffer((__gm__ T *)x);
    _p_gm.SetGlobalBuffer((__gm__ U *)pos);
    _sin_gm.SetGlobalBuffer((__gm__ T *)sin);
    _cos_gm.SetGlobalBuffer((__gm__ T *)cos);
    _y_gm.SetGlobalBuffer((__gm__ T *)y);
    // Init Queue buffer
    pipe.InitBuffer(_in_que, BUFFER_NUM, _copy_len * sizeof(T));
    pipe.InitBuffer(_out_que, BUFFER_NUM, _tile_len * sizeof(T));
    pipe.InitBuffer(_sin_que, BUFFER_NUM, _half_copy_len * sizeof(T));
    pipe.InitBuffer(_cos_que, BUFFER_NUM, _half_copy_len * sizeof(T));
    // Half-tile scratch: odd/even lanes each hold dh/2 elements.
    pipe.InitBuffer(_tmp_odd_buf, _tile_len / 2 * sizeof(T));
    pipe.InitBuffer(_tmp_even_buf, _tile_len / 2 * sizeof(T));
    pipe.InitBuffer(_tmp_odd_buf1, _tile_len / 2 * sizeof(T));
    pipe.InitBuffer(_tmp_odd_buf2, _tile_len / 2 * sizeof(T));
    pipe.InitBuffer(_tmp_even_buf1, _tile_len / 2 * sizeof(T));
    pipe.InitBuffer(_tmp_even_buf2, _tile_len / 2 * sizeof(T));
}
// Stage tile i (one dh-length row of x for this core's head) plus the
// sin/cos rows selected by pos[i] into UB.
template <typename T, typename U>
__aicore__ inline void RoPEKernel<T, U>::copyIn(size_t i) {
    LocalTensor<T> input_ub = _in_que.AllocTensor<T>();
    LocalTensor<T> sin_ub = _sin_que.AllocTensor<T>();
    LocalTensor<T> cos_ub = _cos_que.AllocTensor<T>();
    // Get idx of current tile in total input
    auto idx = i * _st_xnt + _block_idx * _st_xnh;
    // Copy tile current tile into UB
    DataCopy(input_ub, _x_gm[idx], _copy_len);
    // Copy sin cos tile
    // Tables are addressed in rows of dh/2 elements by position index.
    auto pos_idx = _p_gm(i);
    // NOTE(review): _half_copy_len equals the aligned *full* dh (see init),
    // so this copies dh elements from a dh/2-strided table — confirm.
    DataCopy(sin_ub, _sin_gm[pos_idx * _tile_len / 2], _half_copy_len);
    DataCopy(cos_ub, _cos_gm[pos_idx * _tile_len / 2], _half_copy_len);
    // Push in operands
    _in_que.EnQue(input_ub);
    _sin_que.EnQue(sin_ub);
    _cos_que.EnQue(cos_ub);
}
// Rotate one tile: de-interleave odd/even lanes with GatherMask, apply the
// RoPE rotation with vector Mul/Sub/Add, then re-interleave into the output
// tile. PipeBarrier<PIPE_V> separates dependent vector stages.
template <typename T, typename U>
__aicore__ inline void RoPEKernel<T, U>::compute(size_t i) {
    LocalTensor<T> input_ub = _in_que.DeQue<T>();
    LocalTensor<T> sin_ub = _sin_que.DeQue<T>();
    LocalTensor<T> cos_ub = _cos_que.DeQue<T>();
    LocalTensor<T> output_ub = _out_que.AllocTensor<T>();
    LocalTensor<T> tmp_odd = _tmp_odd_buf.Get<T>();
    LocalTensor<T> tmp_even = _tmp_even_buf.Get<T>();
    LocalTensor<T> tmp_odd1 = _tmp_odd_buf1.Get<T>();
    LocalTensor<T> tmp_odd2 = _tmp_odd_buf2.Get<T>();
    LocalTensor<T> tmp_even1 = _tmp_even_buf1.Get<T>();
    LocalTensor<T> tmp_even2 = _tmp_even_buf2.Get<T>();
    // separate odd and even bit elements
    uint64_t rsvdCnt = 0;
    GatherMaskParams gMaskParams = {
        1,
        static_cast<uint16_t>((_tile_len * sizeof(T) + 255) / 256),
        // no more than 256(<=255)
        8,
        8,
    };
    // pattern 1 selects even-indexed lanes, pattern 2 odd-indexed lanes
    GatherMask<T>(tmp_odd, input_ub, 1, false, 0, gMaskParams, rsvdCnt);
    GatherMask<T>(tmp_even, input_ub, 2, false, 0, gMaskParams, rsvdCnt);
    PipeBarrier<PIPE_V>();
    // compute odd bit elements
    // y_odd = x_odd * cos - x_even * sin
    Mul<T>(tmp_odd1, tmp_odd, cos_ub, _tile_len / 2);
    Mul<T>(tmp_odd2, tmp_even, sin_ub, _tile_len / 2);
    PipeBarrier<PIPE_V>();
    Sub<T>(tmp_odd1, tmp_odd1, tmp_odd2, _tile_len / 2);
    // compute even bit elements
    // y_even = x_odd * sin + x_even * cos
    Mul<T>(tmp_even1, tmp_odd, sin_ub, _tile_len / 2);
    Mul<T>(tmp_even2, tmp_even, cos_ub, _tile_len / 2);
    PipeBarrier<PIPE_V>();
    Add<T>(tmp_even1, tmp_even1, tmp_even2, _tile_len / 2);
    // combine odd and even bit elements
    // (scalar element-wise re-interleave of the two half-tiles)
    for (uint32_t j = 0; j < _tile_len / 2; j += 1) {
        output_ub(j * 2) = tmp_odd1(j);
        output_ub(j * 2 + 1) = tmp_even1(j);
    }
    _out_que.EnQue<T>(output_ub);
    _in_que.FreeTensor(input_ub);
    _sin_que.FreeTensor(sin_ub);
    _cos_que.FreeTensor(cos_ub);
}
// Write the rotated tile i back to y at this core's head offset.
// DataCopyPad is used so the exact _tile_len * sizeof(T) bytes are written
// even when dh is not DataCopy-aligned.
template <typename T, typename U>
__aicore__ inline void RoPEKernel<T, U>::copyOut(size_t i) {
    LocalTensor<T> output_ub = _out_que.DeQue<T>();
    auto idy = i * _st_ynt + _block_idx * _st_ynh;
    DataCopyExtParams params = {1, static_cast<uint32_t>(_tile_len * sizeof(T)), 0, 0, 0};
    DataCopyPad(_y_gm[idy], output_ub, params);
    _out_que.FreeTensor(output_ub);
}
// Main loop: stream every sequence position through the
// copyIn -> compute -> copyOut pipeline.
template <typename T, typename U>
__aicore__ inline void RoPEKernel<T, U>::process(size_t seq_len) {
    for (size_t i = 0; i < seq_len; ++i) {
        copyIn(i);
        compute(i);
        copyOut(i);
    }
}
// Argument pack forwarded to RoPEKernel::init by every instantiation below.
#define ROPE_KERNEL_INIT_ARGS y, x, pos, sin, cos, dhead, \
                              y_stride_seqlen, y_stride_nhead, \
                              x_stride_seqlen, x_stride_nhead

// One switch case: instantiate and run RoPEKernel<TYPE, POS_T> for a given
// position-index dtype.
#define CASE_POSTYPE(POS_TYPE_ENUM, TYPE, POS_T) \
    case POS_TYPE_ENUM: {                        \
        RoPEKernel<TYPE, POS_T> op;              \
        op.init(ROPE_KERNEL_INIT_ARGS);          \
        op.process(seq_len);                     \
        break;                                   \
    }

// Runtime dispatch on the position tensor's integer dtype.
// NOTE(review): an unrecognized pos_type silently does nothing (default: break).
#define ROPE_KERNEL(TYPE, POSTYPE)                     \
    switch (POSTYPE) {                                 \
        CASE_POSTYPE(INFINI_DTYPE_I8, TYPE, int8_t)    \
        CASE_POSTYPE(INFINI_DTYPE_I16, TYPE, int16_t)  \
        CASE_POSTYPE(INFINI_DTYPE_I32, TYPE, int32_t)  \
        CASE_POSTYPE(INFINI_DTYPE_I64, TYPE, int64_t)  \
        CASE_POSTYPE(INFINI_DTYPE_U8, TYPE, uint8_t)   \
        CASE_POSTYPE(INFINI_DTYPE_U16, TYPE, uint16_t) \
        CASE_POSTYPE(INFINI_DTYPE_U32, TYPE, uint32_t) \
        CASE_POSTYPE(INFINI_DTYPE_U64, TYPE, uint64_t) \
    default:                                           \
        break;                                         \
    }

// Defines one __global__ entry point per data type; pos_type is dispatched
// at runtime inside the kernel via ROPE_KERNEL.
#define DEFINE_ROPE_KERNEL(KERNEL_NAME, TYPE)                     \
    __global__ __aicore__ void KERNEL_NAME(GM_ADDR y,             \
                                           GM_ADDR x,             \
                                           GM_ADDR pos,           \
                                           GM_ADDR sin,           \
                                           GM_ADDR cos,           \
                                           size_t seq_len,        \
                                           size_t dhead,          \
                                           ptrdiff_t y_stride_seqlen, \
                                           ptrdiff_t y_stride_nhead,  \
                                           ptrdiff_t x_stride_seqlen, \
                                           ptrdiff_t x_stride_nhead,  \
                                           int32_t pos_type) {    \
        ROPE_KERNEL(TYPE, pos_type)                               \
    }

DEFINE_ROPE_KERNEL(rope_kernel_float, float)
DEFINE_ROPE_KERNEL(rope_kernel_half, half)

#undef DEFINE_ROPE_KERNEL
#undef ROPE_KERNEL
#undef CASE_POSTYPE
#undef ROPE_KERNEL_INIT_ARGS
// Host-side launcher: picks the kernel instantiation matching the data
// dtype and enqueues it on `stream` with one block per attention head.
// Returns INFINI_STATUS_BAD_TENSOR_DTYPE for dtypes other than fp16/fp32.
extern "C" infiniStatus_t rope_kernel_launch(
    void *y,
    void *x,
    void *pos,
    void *sin,
    void *cos,
    size_t seq_len,
    size_t nhead,
    size_t dhead,
    infiniDtype_t dtype,
    infiniDtype_t pos_type,
    ptrdiff_t y_stride_seqlen,
    ptrdiff_t y_stride_nhead,
    ptrdiff_t x_stride_seqlen,
    ptrdiff_t x_stride_nhead,
    void *stream) {

// Launch with grid = nhead blocks (RoPEKernel assumes block_num == nhead).
#define LAUNCH_ROPE_KERNEL(DTYPE_ENUM, KERNEL_NAME)                  \
    case DTYPE_ENUM:                                                 \
        KERNEL_NAME<<<nhead, nullptr, stream>>>(y, x, pos, sin, cos, \
                                                seq_len,             \
                                                dhead,               \
                                                y_stride_seqlen,     \
                                                y_stride_nhead,      \
                                                x_stride_seqlen,     \
                                                x_stride_nhead,      \
                                                pos_type);           \
        return INFINI_STATUS_SUCCESS;

    switch (dtype) {
        LAUNCH_ROPE_KERNEL(INFINI_DTYPE_F16, rope_kernel_half)
        LAUNCH_ROPE_KERNEL(INFINI_DTYPE_F32, rope_kernel_float)
    default:
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
}
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment