Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
2461ea9d
Commit
2461ea9d
authored
Dec 23, 2025
by
zhuwenwen
Browse files
update flash_mla_with_kvcache
parent
8bfd0bde
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
27 additions
and
13 deletions
+27
-13
vllm/v1/attention/backends/mla/flashmla.py
vllm/v1/attention/backends/mla/flashmla.py
+27
-13
No files found.
vllm/v1/attention/backends/mla/flashmla.py
View file @
2461ea9d
...
...
@@ -33,6 +33,7 @@ from vllm.v1.attention.backends.utils import (
reshape_query_for_spec_decode
,
)
from
vllm.v1.kv_cache_interface
import
AttentionSpec
from
vllm.platforms
import
current_platform
logger
=
init_logger
(
__name__
)
...
...
@@ -298,6 +299,19 @@ class FlashMLAImpl(MLACommonImpl[FlashMLAMetadata]):
# zeros of length B+1
num_splits
=
torch
.
zeros
((
B
+
1
,),
dtype
=
dtype
,
device
=
device
)
if
current_platform
.
is_rocm
():
o
,
lse
=
flash_mla_with_kvcache
(
q
=
q
,
k_cache
=
kv_c_and_k_pe_cache
.
unsqueeze
(
-
2
),
# Add head dim of 1
block_table
=
attn_metadata
.
decode
.
block_table
,
cache_seqlens
=
attn_metadata
.
decode
.
seq_lens
,
head_dim_v
=
self
.
kv_lora_rank
,
tile_scheduler_metadata
=
tile_scheduler_metadata
,
num_splits
=
num_splits
,
softmax_scale
=
self
.
scale
,
causal
=
True
,
)
else
:
o
,
lse
=
flash_mla_with_kvcache
(
q
=
q
,
k_cache
=
kv_c_and_k_pe_cache
.
unsqueeze
(
-
2
),
# Add head dim of 1
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment