Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
2461ea9d
Commit
2461ea9d
authored
Dec 23, 2025
by
zhuwenwen
Browse files
update flash_mla_with_kvcache
parent
8bfd0bde
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
27 additions
and
13 deletions
+27
-13
vllm/v1/attention/backends/mla/flashmla.py
vllm/v1/attention/backends/mla/flashmla.py
+27
-13
No files found.
vllm/v1/attention/backends/mla/flashmla.py
View file @
2461ea9d
...
@@ -33,6 +33,7 @@ from vllm.v1.attention.backends.utils import (
...
@@ -33,6 +33,7 @@ from vllm.v1.attention.backends.utils import (
reshape_query_for_spec_decode
,
reshape_query_for_spec_decode
,
)
)
from
vllm.v1.kv_cache_interface
import
AttentionSpec
from
vllm.v1.kv_cache_interface
import
AttentionSpec
from
vllm.platforms
import
current_platform
logger
=
init_logger
(
__name__
)
logger
=
init_logger
(
__name__
)
...
@@ -298,19 +299,32 @@ class FlashMLAImpl(MLACommonImpl[FlashMLAMetadata]):
...
@@ -298,19 +299,32 @@ class FlashMLAImpl(MLACommonImpl[FlashMLAMetadata]):
# zeros of length B+1
# zeros of length B+1
num_splits
=
torch
.
zeros
((
B
+
1
,),
dtype
=
dtype
,
device
=
device
)
num_splits
=
torch
.
zeros
((
B
+
1
,),
dtype
=
dtype
,
device
=
device
)
o
,
lse
=
flash_mla_with_kvcache
(
if
current_platform
.
is_rocm
():
q
=
q
,
o
,
lse
=
flash_mla_with_kvcache
(
k_cache
=
kv_c_and_k_pe_cache
.
unsqueeze
(
-
2
),
# Add head dim of 1
q
=
q
,
block_table
=
attn_metadata
.
decode
.
block_table
,
k_cache
=
kv_c_and_k_pe_cache
.
unsqueeze
(
-
2
),
# Add head dim of 1
cache_seqlens
=
attn_metadata
.
decode
.
seq_lens
,
block_table
=
attn_metadata
.
decode
.
block_table
,
head_dim_v
=
self
.
kv_lora_rank
,
cache_seqlens
=
attn_metadata
.
decode
.
seq_lens
,
tile_scheduler_metadata
=
tile_scheduler_metadata
,
head_dim_v
=
self
.
kv_lora_rank
,
num_splits
=
num_splits
,
tile_scheduler_metadata
=
tile_scheduler_metadata
,
softmax_scale
=
self
.
scale
,
num_splits
=
num_splits
,
causal
=
True
,
softmax_scale
=
self
.
scale
,
descale_q
=
layer
.
_q_scale
.
reshape
(
1
),
causal
=
True
,
descale_k
=
layer
.
_k_scale
.
reshape
(
1
),
)
)
else
:
o
,
lse
=
flash_mla_with_kvcache
(
q
=
q
,
k_cache
=
kv_c_and_k_pe_cache
.
unsqueeze
(
-
2
),
# Add head dim of 1
block_table
=
attn_metadata
.
decode
.
block_table
,
cache_seqlens
=
attn_metadata
.
decode
.
seq_lens
,
head_dim_v
=
self
.
kv_lora_rank
,
tile_scheduler_metadata
=
tile_scheduler_metadata
,
num_splits
=
num_splits
,
softmax_scale
=
self
.
scale
,
causal
=
True
,
descale_q
=
layer
.
_q_scale
.
reshape
(
1
),
descale_k
=
layer
.
_k_scale
.
reshape
(
1
),
)
o
=
reshape_attn_output_for_spec_decode
(
o
)
o
=
reshape_attn_output_for_spec_decode
(
o
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment