Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
56e19d7e
Unverified
Commit
56e19d7e
authored
Apr 09, 2026
by
Wentao Ye
Committed by
GitHub
Apr 09, 2026
Browse files
[Model Runner V2] Fix flex attention kv blocks calculation issue (#39353)
Signed-off-by:
yewentao256
<
zhyanwentao@126.com
>
parent
9036d4c4
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
6 additions
and
9 deletions
+6
-9
vllm/v1/attention/backends/flex_attention.py
vllm/v1/attention/backends/flex_attention.py
+6
-9
No files found.
vllm/v1/attention/backends/flex_attention.py
View file @
56e19d7e
...
@@ -750,11 +750,11 @@ class FlexAttentionMetadataBuilder(AttentionMetadataBuilder[FlexAttentionMetadat
...
@@ -750,11 +750,11 @@ class FlexAttentionMetadataBuilder(AttentionMetadataBuilder[FlexAttentionMetadat
self
.
max_model_len
=
self
.
model_config
.
max_model_len
self
.
max_model_len
=
self
.
model_config
.
max_model_len
max_num_seqs
=
vllm_config
.
scheduler_config
.
max_num_seqs
max_num_seqs
=
vllm_config
.
scheduler_config
.
max_num_seqs
max_num_batched_tokens
=
vllm_config
.
scheduler_config
.
max_num_batched_tokens
max_num_batched_tokens
=
vllm_config
.
scheduler_config
.
max_num_batched_tokens
self
.
max_num_q
_block
=
(
self
.
max_num_q
uery_groups
=
cdiv
(
max_num_batched_tokens
,
self
.
q_block_size
)
self
.
max_model_len
+
self
.
q_
block_size
-
1
max_num_pages_per_seq
=
cdiv
(
self
.
max_model_len
,
self
.
block_size
)
)
//
self
.
q_block_size
self
.
max_num_kv_indices
=
self
.
q_block_size
*
max_num_pages_per_seq
self
.
persistent_kv_num_blocks
=
torch
.
empty
(
self
.
persistent_kv_num_blocks
=
torch
.
empty
(
self
.
max_num_q
_block
,
dtype
=
torch
.
int32
,
device
=
device
self
.
max_num_q
uery_groups
,
dtype
=
torch
.
int32
,
device
=
device
)
)
self
.
persistent_offset_tensor
=
torch
.
empty
(
self
.
persistent_offset_tensor
=
torch
.
empty
(
max_num_seqs
,
dtype
=
torch
.
int32
,
device
=
device
max_num_seqs
,
dtype
=
torch
.
int32
,
device
=
device
...
@@ -828,12 +828,9 @@ class FlexAttentionMetadataBuilder(AttentionMetadataBuilder[FlexAttentionMetadat
...
@@ -828,12 +828,9 @@ class FlexAttentionMetadataBuilder(AttentionMetadataBuilder[FlexAttentionMetadat
)
)
if
self
.
persistent_kv_indices
is
None
:
if
self
.
persistent_kv_indices
is
None
:
max_num_kv_block
=
(
self
.
max_model_len
+
self
.
kv_block_size
-
1
)
//
self
.
kv_block_size
self
.
persistent_kv_indices
=
torch
.
empty
(
self
.
persistent_kv_indices
=
torch
.
empty
(
self
.
max_
model_len
,
self
.
max_
num_query_groups
,
max_num_kv_
block
,
self
.
max_num_kv_
indices
,
dtype
=
torch
.
int32
,
dtype
=
torch
.
int32
,
device
=
self
.
device
,
device
=
self
.
device
,
)
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment