jerrrrry / infinicore · Commits · e13ad8f9

Commit e13ad8f9, authored Dec 30, 2025 by PanZezhong

issue/847 correct cache_lens naming

parent 31c0af3f

Showing 6 changed files with 31 additions and 31 deletions (+31 -31).
include/infinicore/ops/paged_attention.hpp                        +3 -3
python/infinicore/ops/paged_attention.py                          +3 -3
src/infinicore/ops/paged_attention/paged_attention.cc             +7 -7
src/infinicore/ops/paged_attention/paged_attention_infiniop.cc    +4 -4
src/infinicore/pybind11/ops/paged_attention.hpp                   +6 -6
test/infinicore/ops/paged_attention.py                            +8 -8
include/infinicore/ops/paged_attention.hpp

@@ -9,10 +9,10 @@ namespace infinicore::op {
 class PagedAttention {
 public:
     using schema = void (*)(Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, std::optional<Tensor>, float);
-    static void execute(Tensor out, Tensor q, Tensor k_cache, Tensor v_cache, Tensor block_tables, Tensor seq_lens, std::optional<Tensor> alibi_slopes, float);
+    static void execute(Tensor out, Tensor q, Tensor k_cache, Tensor v_cache, Tensor block_tables, Tensor cache_lens, std::optional<Tensor> alibi_slopes, float);
     static common::OpDispatcher<schema> &dispatcher();
 };

-Tensor paged_attention(Tensor q, Tensor k_cache, Tensor v_cache, Tensor block_tables, Tensor seq_lens, std::optional<Tensor> alibi_slopes, float scale);
-void paged_attention_(Tensor out, Tensor q, Tensor k_cache, Tensor v_cache, Tensor block_tables, Tensor seq_lens, std::optional<Tensor> alibi_slopes, float scale);
+Tensor paged_attention(Tensor q, Tensor k_cache, Tensor v_cache, Tensor block_tables, Tensor cache_lens, std::optional<Tensor> alibi_slopes, float scale);
+void paged_attention_(Tensor out, Tensor q, Tensor k_cache, Tensor v_cache, Tensor block_tables, Tensor cache_lens, std::optional<Tensor> alibi_slopes, float scale);

 } // namespace infinicore::op
python/infinicore/ops/paged_attention.py

@@ -7,7 +7,7 @@ def paged_attention(
     k_cache: Tensor,
     v_cache: Tensor,
     block_tables: Tensor,
-    seq_lens: Tensor,
+    cache_lens: Tensor,
     alibi_slopes: Tensor | None = None,
     scale: float = 1.0,
     *,
@@ -20,7 +20,7 @@ def paged_attention(
         k_cache._underlying,
         v_cache._underlying,
         block_tables._underlying,
-        seq_lens._underlying,
+        cache_lens._underlying,
         alibi_slopes._underlying if alibi_slopes is not None else None,
         scale,
     )
@@ -32,7 +32,7 @@ def paged_attention(
         k_cache._underlying,
         v_cache._underlying,
         block_tables._underlying,
-        seq_lens._underlying,
+        cache_lens._underlying,
         alibi_slopes._underlying if alibi_slopes is not None else None,
         scale,
     )
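For orientation, here is a minimal usage sketch of the renamed Python entry point. It is not part of this commit: the cache shapes in the comments are taken from the test file further down, and the shape of q (one query token per sequence) is an assumption.

# Hedged usage sketch (not repository code): calling the wrapper with the renamed argument.
from infinicore.ops.paged_attention import paged_attention

def decode_step(q, k_cache, v_cache, block_tables, cache_lens, scale):
    # q:                (num_seqs, num_heads, head_size)  -- assumed: one query token per sequence
    # k_cache, v_cache: (num_blocks, num_kv_heads, block_size, head_size), per the test file
    # block_tables:     (num_seqs, max_blocks_per_seq), int64
    # cache_lens:       (num_seqs,), int64 -- number of valid cached tokens per sequence
    return paged_attention(
        q, k_cache, v_cache, block_tables, cache_lens,
        alibi_slopes=None,
        scale=scale,
    )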
src/infinicore/ops/paged_attention/paged_attention.cc

@@ -9,20 +9,20 @@ common::OpDispatcher<PagedAttention::schema> &PagedAttention::dispatcher() {
     return dispatcher_;
 };

-void PagedAttention::execute(Tensor out, Tensor q, Tensor k_cache, Tensor v_cache, Tensor block_tables, Tensor seq_lens, std::optional<Tensor> alibi_slopes, float scale) {
-    INFINICORE_ASSERT_TENSORS_SAME_DEVICE(out, q, k_cache, v_cache, block_tables, seq_lens);
+void PagedAttention::execute(Tensor out, Tensor q, Tensor k_cache, Tensor v_cache, Tensor block_tables, Tensor cache_lens, std::optional<Tensor> alibi_slopes, float scale) {
+    INFINICORE_ASSERT_TENSORS_SAME_DEVICE(out, q, k_cache, v_cache, block_tables, cache_lens);
     infinicore::context::setDevice(out->device());
-    dispatcher().lookup(out->device().getType())(out, q, k_cache, v_cache, block_tables, seq_lens, alibi_slopes, scale);
+    dispatcher().lookup(out->device().getType())(out, q, k_cache, v_cache, block_tables, cache_lens, alibi_slopes, scale);
 }

-Tensor paged_attention(Tensor q, Tensor k_cache, Tensor v_cache, Tensor block_tables, Tensor seq_lens, std::optional<Tensor> alibi_slopes, float scale) {
+Tensor paged_attention(Tensor q, Tensor k_cache, Tensor v_cache, Tensor block_tables, Tensor cache_lens, std::optional<Tensor> alibi_slopes, float scale) {
     auto out = Tensor::empty(q->shape(), q->dtype(), q->device());
-    paged_attention_(out, q, k_cache, v_cache, block_tables, seq_lens, alibi_slopes, scale);
+    paged_attention_(out, q, k_cache, v_cache, block_tables, cache_lens, alibi_slopes, scale);
     return out;
 }

-void paged_attention_(Tensor out, Tensor q, Tensor k_cache, Tensor v_cache, Tensor block_tables, Tensor seq_lens, std::optional<Tensor> alibi_slopes, float scale) {
-    PagedAttention::execute(out, q, k_cache, v_cache, block_tables, seq_lens, alibi_slopes, scale);
+void paged_attention_(Tensor out, Tensor q, Tensor k_cache, Tensor v_cache, Tensor block_tables, Tensor cache_lens, std::optional<Tensor> alibi_slopes, float scale) {
+    PagedAttention::execute(out, q, k_cache, v_cache, block_tables, cache_lens, alibi_slopes, scale);
 }

 } // namespace infinicore::op
src/infinicore/ops/paged_attention/paged_attention_infiniop.cc

@@ -15,8 +15,8 @@ thread_local common::OpCache<size_t, infiniopPagedAttentionDescriptor_t> caches(
     }
 });

-void calculate(Tensor out, Tensor q, Tensor k_cache, Tensor v_cache, Tensor block_tables, Tensor seq_lens, std::optional<Tensor> alibi_slopes, float scale) {
-    size_t seed = hash_combine(out, q, k_cache, v_cache, block_tables, seq_lens);
+void calculate(Tensor out, Tensor q, Tensor k_cache, Tensor v_cache, Tensor block_tables, Tensor cache_lens, std::optional<Tensor> alibi_slopes, float scale) {
+    size_t seed = hash_combine(out, q, k_cache, v_cache, block_tables, cache_lens);
     auto device = context::getDevice();
     auto &cache = caches.getCache(device);
@@ -27,7 +27,7 @@ void calculate(Tensor out, Tensor q, Tensor k_cache, Tensor v_cache, Tensor bloc
     if (!desc_opt) {
-        INFINICORE_CHECK_ERROR(infiniopCreatePagedAttentionDescriptor(context::getInfiniopHandle(device), &desc, out->desc(), q->desc(), k_cache->desc(), v_cache->desc(), block_tables->desc(), seq_lens->desc(), alibi_slopes.has_value() ? alibi_slopes.value()->desc() : nullptr, scale));
+        INFINICORE_CHECK_ERROR(infiniopCreatePagedAttentionDescriptor(context::getInfiniopHandle(device), &desc, out->desc(), q->desc(), k_cache->desc(), v_cache->desc(), block_tables->desc(), cache_lens->desc(), alibi_slopes.has_value() ? alibi_slopes.value()->desc() : nullptr, scale));
         cache.put(seed, desc);
@@ -41,7 +41,7 @@ void calculate(Tensor out, Tensor q, Tensor k_cache, Tensor v_cache, Tensor bloc
-    INFINICORE_CHECK_ERROR(infiniopPagedAttention(desc, workspace->data(), workspace_size, out->data(), q->data(), k_cache->data(), v_cache->data(), block_tables->data(), seq_lens->data(), alibi_slopes.has_value() ? alibi_slopes.value()->data() : nullptr, context::getStream()));
+    INFINICORE_CHECK_ERROR(infiniopPagedAttention(desc, workspace->data(), workspace_size, out->data(), q->data(), k_cache->data(), v_cache->data(), block_tables->data(), cache_lens->data(), alibi_slopes.has_value() ? alibi_slopes.value()->data() : nullptr, context::getStream()));
 }
src/infinicore/pybind11/ops/paged_attention.hpp

@@ -8,21 +8,21 @@ namespace py = pybind11;
 namespace infinicore::ops {

-Tensor py_paged_attention(Tensor q, Tensor k_cache, Tensor v_cache, Tensor block_tables, Tensor seq_lens, pybind11::object alibi_slopes, float scale) {
+Tensor py_paged_attention(Tensor q, Tensor k_cache, Tensor v_cache, Tensor block_tables, Tensor cache_lens, pybind11::object alibi_slopes, float scale) {
     std::optional<Tensor> alibi_slopes_tensor = std::nullopt;
     if (!alibi_slopes.is_none()) {
         alibi_slopes_tensor = alibi_slopes.cast<Tensor>();
     }
-    return op::paged_attention(q, k_cache, v_cache, block_tables, seq_lens, alibi_slopes_tensor, scale);
+    return op::paged_attention(q, k_cache, v_cache, block_tables, cache_lens, alibi_slopes_tensor, scale);
 }

-void py_paged_attention_(Tensor out, Tensor q, Tensor k_cache, Tensor v_cache, Tensor block_tables, Tensor seq_lens, pybind11::object alibi_slopes, float scale) {
+void py_paged_attention_(Tensor out, Tensor q, Tensor k_cache, Tensor v_cache, Tensor block_tables, Tensor cache_lens, pybind11::object alibi_slopes, float scale) {
     std::optional<Tensor> alibi_slopes_tensor = std::nullopt;
     if (!alibi_slopes.is_none()) {
         alibi_slopes_tensor = alibi_slopes.cast<Tensor>();
     }
-    op::paged_attention_(out, q, k_cache, v_cache, block_tables, seq_lens, alibi_slopes_tensor, scale);
+    op::paged_attention_(out, q, k_cache, v_cache, block_tables, cache_lens, alibi_slopes_tensor, scale);
 }

 inline void bind_paged_attention(py::module &m) {
@@ -32,7 +32,7 @@ inline void bind_paged_attention(py::module &m) {
           py::arg("k_cache"),
           py::arg("v_cache"),
           py::arg("block_tables"),
-          py::arg("seq_lens"),
+          py::arg("cache_lens"),
           py::arg("alibi_slopes"),
           py::arg("scale"),
           R"doc(Paged attention of query and key cache tensors.)doc");
@@ -44,7 +44,7 @@ inline void bind_paged_attention(py::module &m) {
           py::arg("k_cache"),
           py::arg("v_cache"),
           py::arg("block_tables"),
-          py::arg("seq_lens"),
+          py::arg("cache_lens"),
           py::arg("alibi_slopes"),
           py::arg("scale"),
           R"doc(In-place paged attention of query and key cache tensors.)doc");
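Because py::arg names are the keyword names visible from Python, this rename also changes the keyword that callers may pass. A minimal hedged sketch against the high-level wrapper, whose parameter was renamed the same way: positional calls are unaffected, only keyword callers need updating.

# Hedged sketch (not repository code): the length tensor's keyword is now cache_lens.
from infinicore.ops.paged_attention import paged_attention

def call_by_keyword(q, k_cache, v_cache, block_tables, lens, scale):
    # Before this commit the keyword was seq_lens=...; that spelling now raises TypeError.
    return paged_attention(q, k_cache, v_cache, block_tables,
                           cache_lens=lens, alibi_slopes=None, scale=scale)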
test/infinicore/ops/paged_attention.py

@@ -62,7 +62,7 @@ def parse_test_cases():
    max_blocks_per_seq = (max_seq_len + block_size - 1) // block_size
    num_blocks = num_seqs * max_blocks_per_seq  # A reasonable number for testing

-    seq_lens_torch = torch.randint(1, max_seq_len, (num_seqs,), dtype=torch.int64)
+    cache_lens_torch = torch.randint(1, max_seq_len, (num_seqs,), dtype=torch.int64)
    block_tables = torch.arange(
        0,
        num_seqs * max_blocks_per_seq,
        dtype=torch.int64,
@@ -75,7 +75,7 @@ def parse_test_cases():
    v_cache_shape = (num_blocks, num_kv_heads, block_size, head_size)
    block_tables_shape = block_tables.shape
-    seq_lens_shape = seq_lens_torch.shape
+    cache_lens_shape = cache_lens_torch.shape

    # Generate test cases for all data types
    for dtype in _TENSOR_DTYPES:
@@ -91,10 +91,10 @@ def parse_test_cases():
            set_tensor=block_tables,
            dtype=infinicore.int64,
        )
-        seq_lens_spec = TensorSpec.from_tensor(
-            seq_lens_shape,
+        cache_lens_spec = TensorSpec.from_tensor(
+            cache_lens_shape,
            init_mode=TensorInitializer.MANUAL,
-            set_tensor=seq_lens_torch,
+            set_tensor=cache_lens_torch,
            dtype=infinicore.int64,
        )
@@ -108,7 +108,7 @@ def parse_test_cases():
                k_cache_spec,
                v_cache_spec,
                block_tables_spec,
-                seq_lens_spec,
+                cache_lens_spec,
            ],
            kwargs={"alibi_slopes": None, "scale": scale},
            output_spec=None,
@@ -132,7 +132,7 @@ def ref_masked_attention(query, key, value, scale, attn_mask=None):
 def ref_single_query_cached_kv_attention(
-    query, key_cache, value_cache, block_tables, seq_lens, alibi_slopes, scale
+    query, key_cache, value_cache, block_tables, cache_lens, alibi_slopes, scale
 ):
    # Reference implementation for paged attention, iterating through each sequence.
    output = torch.empty_like(query)
@@ -143,7 +143,7 @@ def ref_single_query_cached_kv_attention(
    for i in range(num_seqs):
        q = query[i].unsqueeze(0)
-        seq_len = seq_lens[i].item()
+        seq_len = cache_lens[i].item()
        block_table = block_tables[i]
        keys_lst, values_lst = [], []
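To make the meaning of the renamed tensor concrete: for each sequence i, the reference in the test gathers the first cache_lens[i] cached tokens through block_tables[i] and attends the single query row against them. The following is a self-contained, hedged torch re-statement of that loop, not the repository's code; k_cache is assumed to share v_cache's (num_blocks, num_kv_heads, block_size, head_size) layout, and num_kv_heads is assumed equal to num_heads (no GQA head repetition).

import torch

def ref_paged_attention(query, key_cache, value_cache, block_tables, cache_lens, scale):
    # query:                  (num_seqs, num_heads, head_size)
    # key_cache, value_cache: (num_blocks, num_kv_heads, block_size, head_size)  -- assumed layout
    # block_tables:           (num_seqs, max_blocks_per_seq), int64
    # cache_lens:             (num_seqs,), int64 -- valid cached tokens per sequence
    num_seqs = query.shape[0]
    block_size = key_cache.shape[2]
    output = torch.empty_like(query)
    for i in range(num_seqs):
        q = query[i].unsqueeze(0)                       # (1, num_heads, head_size)
        cache_len = int(cache_lens[i].item())
        block_table = block_tables[i]
        keys, values = [], []
        for pos in range(cache_len):                    # walk the first cache_len cached positions
            block = int(block_table[pos // block_size].item())
            offset = pos % block_size
            keys.append(key_cache[block, :, offset, :])      # (num_kv_heads, head_size)
            values.append(value_cache[block, :, offset, :])
        k = torch.stack(keys)                           # (cache_len, num_heads, head_size)
        v = torch.stack(values)
        attn = torch.einsum("qhd,khd->hqk", q.float(), k.float()) * scale
        probs = torch.softmax(attn, dim=-1)
        out = torch.einsum("hqk,khd->qhd", probs, v.float())
        output[i] = out.squeeze(0).to(query.dtype)
    return output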