Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
de3869bb
Unverified
Commit de3869bb authored Feb 07, 2026 by Rohan Potdar
Committed by GitHub Feb 07, 2026
Browse files
move checks out of `unified_kv_cache_update` custom op (#33943)
Signed-off-by: Rohan138 <rohanpotdar138@gmail.com>
parent
ce9b3cd3
Changes
7
Show whitespace changes
Inline
Side-by-side
Showing 7 changed files with 80 additions and 101 deletions
+80
-101
vllm/model_executor/layers/attention/attention.py
vllm/model_executor/layers/attention/attention.py
+14
-6
vllm/model_executor/layers/attention/cross_attention.py
vllm/model_executor/layers/attention/cross_attention.py
+3
-0
vllm/model_executor/models/whisper_causal.py
vllm/model_executor/models/whisper_causal.py
+3
-0
vllm/v1/attention/backends/flash_attn.py
vllm/v1/attention/backends/flash_attn.py
+0
-10
vllm/v1/attention/backends/rocm_aiter_unified_attn.py
vllm/v1/attention/backends/rocm_aiter_unified_attn.py
+11
-20
vllm/v1/attention/backends/rocm_attn.py
vllm/v1/attention/backends/rocm_attn.py
+32
-42
vllm/v1/attention/backends/triton_attn.py
vllm/v1/attention/backends/triton_attn.py
+17
-23
No files found.
vllm/model_executor/layers/attention/attention.py
View file @
de3869bb
...
...
@@ -422,9 +422,15 @@ class Attention(nn.Module, AttentionLayerBase):
key
=
key
.
view
(
-
1
,
self
.
num_kv_heads
,
self
.
head_size
)
if
value
is
not
None
:
value
=
value
.
view
(
-
1
,
self
.
num_kv_heads
,
self
.
head_size_v
)
if
self
.
use_direct_call
:
kv_cache_dummy_dep
=
None
if
not
self
.
attn_backend
.
forward_includes_kv_cache_update
:
if
self
.
use_direct_call
:
# Skip this if sharing KV cache with an earlier attention layer.
if
(
not
self
.
attn_backend
.
forward_includes_kv_cache_update
and
self
.
kv_sharing_target_layer_name
is
None
and
key
is
not
None
and
value
is
not
None
):
kv_cache_dummy_dep
=
unified_kv_cache_update
(
key
,
value
,
self
.
layer_name
)
...
...
@@ -437,10 +443,12 @@ class Attention(nn.Module, AttentionLayerBase):
kv_cache_dummy_dep
=
kv_cache_dummy_dep
,
)
else
:
kv_cache_dummy_dep
=
None
if
not
self
.
attn_backend
.
forward_includes_kv_cache_update
and
(
# torch can only dispatch custom op if a tensor is passed
key
is
not
None
or
value
is
not
None
# Skip this if sharing KV cache with an earlier attention layer.
if
(
not
self
.
attn_backend
.
forward_includes_kv_cache_update
and
self
.
kv_sharing_target_layer_name
is
None
and
key
is
not
None
and
value
is
not
None
):
kv_cache_dummy_dep
=
torch
.
ops
.
vllm
.
unified_kv_cache_update
(
key
,
value
,
self
.
layer_name
...
...
vllm/model_executor/layers/attention/cross_attention.py
View file @
de3869bb
...
...
@@ -136,6 +136,9 @@ def create_cross_attention_backend(
if
(
not
underlying_attn_backend
.
forward_includes_kv_cache_update
and
attn_metadata
is
not
None
and
layer
.
kv_sharing_target_layer_name
is
None
and
key
is
not
None
and
value
is
not
None
):
self
.
do_kv_cache_update
(
layer
,
key
,
value
,
kv_cache
,
attn_metadata
.
slot_mapping
...
...
vllm/model_executor/models/whisper_causal.py
View file @
de3869bb
...
...
@@ -172,6 +172,9 @@ def create_whisper_attention_backend_with_block_pooling(
if
(
not
underlying_attn_backend
.
forward_includes_kv_cache_update
and
attn_metadata
is
not
None
and
layer
.
kv_sharing_target_layer_name
is
None
and
key
is
not
None
and
value
is
not
None
):
self
.
do_kv_cache_update
(
layer
,
key
,
value
,
kv_cache
,
attn_metadata
.
slot_mapping
...
...
vllm/v1/attention/backends/flash_attn.py
View file @
de3869bb
...
...
@@ -771,16 +771,6 @@ class FlashAttentionImpl(AttentionImpl):
# we use direct Q, K, V tensors without caching
return
# key and value may be None in the case of cross attention. They are
# calculated once based on the output from the encoder and then cached
# in KV cache.
if
(
self
.
kv_sharing_target_layer_name
is
not
None
or
key
is
None
or
value
is
None
):
return
key_cache
,
value_cache
=
kv_cache
.
unbind
(
0
)
# Reshape the input keys and values and store them in the cache.
...
...
vllm/v1/attention/backends/rocm_aiter_unified_attn.py
View file @
de3869bb
...
...
@@ -196,16 +196,7 @@ class RocmAiterUnifiedAttentionImpl(RocmAttentionImpl):
):
key_cache
,
value_cache
=
kv_cache
.
unbind
(
0
)
# key and value may be None in the case of cross attention. They are
# calculated once based on the output from the encoder and then cached
# in KV cache.
if
(
self
.
kv_sharing_target_layer_name
is
None
and
key
is
not
None
and
value
is
not
None
):
# Reshape the input keys and values and store them in the cache.
# Skip this if sharing KV cache with an earlier attention layer.
ops
.
reshape_and_cache_flash
(
key
,
value
,
...
...
vllm/v1/attention/backends/rocm_attn.py
View file @
de3869bb
...
...
@@ -383,17 +383,7 @@ class RocmAttentionImpl(AttentionImpl):
kv_cache
,
self
.
num_kv_heads
,
self
.
head_size
)
# key and value may be None in the case of cross attention. They are
# calculated once based on the output from the encoder and then cached
# in KV cache.
if
(
self
.
kv_sharing_target_layer_name
is
None
and
key
is
not
None
and
value
is
not
None
):
# Reshape the input keys and values and store them in the cache.
# Skip this if sharing KV cache with an earlier attention layer.
# Get the actual block_size from value_cache
# value_cache shape: [num_blocks, num_heads, head_size, block_size]
block_size
=
value_cache
.
shape
[
3
]
...
...
vllm/v1/attention/backends/triton_attn.py
View file @
de3869bb
...
...
@@ -579,13 +579,7 @@ class TritonAttentionImpl(AttentionImpl):
# For decoder and cross-attention, use KV cache as before
key_cache
,
value_cache
=
kv_cache
.
unbind
(
1
)
if
(
self
.
kv_sharing_target_layer_name
is
None
and
key
is
not
None
and
value
is
not
None
):
# Reshape the input keys and values and store them in the cache.
# Skip this if sharing KV cache with an earlier attention layer.
if
self
.
kv_cache_dtype
.
startswith
(
"fp8"
):
key_cache
=
key_cache
.
view
(
self
.
fp8_dtype
)
value_cache
=
value_cache
.
view
(
self
.
fp8_dtype
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please register or sign in to comment