Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
787c2557
"vscode:/vscode.git/clone" did not exist on "15b1511a15dfb1d56048847da755213632c07b29"
Commit
787c2557
authored
Sep 10, 2025
by
zhuwenwen
Browse files
support cascade_attention
parent
533af8ef
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
100 additions
and
24 deletions
+100
-24
vllm/v1/attention/backends/flash_attn.py
vllm/v1/attention/backends/flash_attn.py
+100
-24
No files found.
vllm/v1/attention/backends/flash_attn.py
View file @
787c2557
...
@@ -655,6 +655,7 @@ class FlashAttentionImpl(AttentionImpl):
...
@@ -655,6 +655,7 @@ class FlashAttentionImpl(AttentionImpl):
return
output
return
output
# Cascade attention (rare case).
# Cascade attention (rare case).
if
not
current_platform
.
is_rocm
():
cascade_attention
(
cascade_attention
(
output
[:
num_actual_tokens
],
output
[:
num_actual_tokens
],
query
[:
num_actual_tokens
],
query
[:
num_actual_tokens
],
...
@@ -679,6 +680,31 @@ class FlashAttentionImpl(AttentionImpl):
...
@@ -679,6 +680,31 @@ class FlashAttentionImpl(AttentionImpl):
k_descale
=
layer
.
_k_scale
,
k_descale
=
layer
.
_k_scale
,
v_descale
=
layer
.
_v_scale
,
v_descale
=
layer
.
_v_scale
,
)
)
else
:
cascade_attention
(
output
[:
num_actual_tokens
],
query
[:
num_actual_tokens
],
key_cache
,
value_cache
,
cu_query_lens
=
attn_metadata
.
query_start_loc
,
max_query_len
=
attn_metadata
.
max_query_len
,
cu_prefix_query_lens
=
attn_metadata
.
cu_prefix_query_lens
,
prefix_kv_lens
=
attn_metadata
.
prefix_kv_lens
,
suffix_kv_lens
=
attn_metadata
.
suffix_kv_lens
,
max_kv_len
=
attn_metadata
.
max_seq_len
,
softmax_scale
=
self
.
scale
,
alibi_slopes
=
self
.
alibi_slopes
,
sliding_window
=
self
.
sliding_window
,
logits_soft_cap
=
self
.
logits_soft_cap
,
block_table
=
attn_metadata
.
block_table
,
common_prefix_len
=
attn_metadata
.
common_prefix_len
,
fa_version
=
2
,
#self.vllm_flash_attn_version,
prefix_scheduler_metadata
=
attn_metadata
.
prefix_scheduler_metadata
,
suffix_scheduler_metadata
=
attn_metadata
.
scheduler_metadata
,
# q_descale=layer._q_scale,
# k_descale=layer._k_scale,
# v_descale=layer._v_scale,
)
return
output
return
output
def
_forward_encoder_attention
(
def
_forward_encoder_attention
(
...
@@ -869,6 +895,31 @@ def cascade_attention(
...
@@ -869,6 +895,31 @@ def cascade_attention(
v_descale
=
v_descale
.
expand
(
descale_shape
)
v_descale
=
v_descale
.
expand
(
descale_shape
)
if
v_descale
is
not
None
else
None
,
if
v_descale
is
not
None
else
None
,
)
)
else
:
prefix_output
,
prefix_lse
,
_
=
vllm_flash_attn_varlen_func
(
q
=
query
,
k
=
key_cache
,
v
=
value_cache
,
cu_seqlens_q
=
cu_prefix_query_lens
,
seqused_k
=
prefix_kv_lens
,
max_seqlen_q
=
num_tokens
,
max_seqlen_k
=
common_prefix_len
,
softmax_scale
=
softmax_scale
,
causal
=
False
,
window_size
=
sliding_window
,
block_table
=
block_table
[:
1
],
softcap
=
logits_soft_cap
,
return_softmax_lse
=
True
,
scheduler_metadata
=
prefix_scheduler_metadata
,
# fa_version=fa_version,
# q_descale=q_descale.expand(descale_shape)
# if q_descale is not None else None,
# k_descale=k_descale.expand(descale_shape)
# if k_descale is not None else None,
# v_descale=v_descale.expand(descale_shape)
# if v_descale is not None else None,
is_prefix_cache
=
True
,
)
descale_shape
=
(
cu_query_lens
.
shape
[
0
]
-
1
,
key_cache
.
shape
[
-
2
])
descale_shape
=
(
cu_query_lens
.
shape
[
0
]
-
1
,
key_cache
.
shape
[
-
2
])
...
@@ -897,6 +948,31 @@ def cascade_attention(
...
@@ -897,6 +948,31 @@ def cascade_attention(
v_descale
=
v_descale
.
expand
(
descale_shape
)
v_descale
=
v_descale
.
expand
(
descale_shape
)
if
v_descale
is
not
None
else
None
,
if
v_descale
is
not
None
else
None
,
)
)
else
:
suffix_output
,
suffix_lse
,
_
=
vllm_flash_attn_varlen_func
(
q
=
query
,
k
=
key_cache
,
v
=
value_cache
,
cu_seqlens_q
=
cu_query_lens
,
seqused_k
=
suffix_kv_lens
,
max_seqlen_q
=
max_query_len
,
max_seqlen_k
=
max_kv_len
-
common_prefix_len
,
softmax_scale
=
softmax_scale
,
causal
=
True
,
window_size
=
sliding_window
,
block_table
=
block_table
[:,
num_common_kv_blocks
:],
softcap
=
logits_soft_cap
,
return_softmax_lse
=
True
,
scheduler_metadata
=
suffix_scheduler_metadata
,
# fa_version=fa_version,
# q_descale=q_descale.expand(descale_shape)
# if q_descale is not None else None,
# k_descale=k_descale.expand(descale_shape)
# if k_descale is not None else None,
# v_descale=v_descale.expand(descale_shape)
# if v_descale is not None else None,
is_prefix_cache
=
True
,
)
# Merge prefix and suffix outputs, and store the result in output.
# Merge prefix and suffix outputs, and store the result in output.
merge_attn_states
(
output
,
prefix_output
,
prefix_lse
,
suffix_output
,
merge_attn_states
(
output
,
prefix_output
,
prefix_lse
,
suffix_output
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment