Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
c59a132f
Unverified
Commit
c59a132f
authored
Mar 23, 2026
by
Wentao Ye
Committed by
GitHub
Mar 23, 2026
Browse files
[V0 Deprecation] Refactor kv cache from list to element (#37487)
Signed-off-by:
yewentao256
<
zhyanwentao@126.com
>
parent
de99d91e
Changes
27
Show whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
15 additions
and
17 deletions
+15
-17
vllm/model_executor/models/olmo_hybrid.py
vllm/model_executor/models/olmo_hybrid.py
+1
-1
vllm/model_executor/models/plamo2.py
vllm/model_executor/models/plamo2.py
+1
-1
vllm/model_executor/models/qwen3_next.py
vllm/model_executor/models/qwen3_next.py
+2
-2
vllm/v1/attention/backends/rocm_aiter_fa.py
vllm/v1/attention/backends/rocm_aiter_fa.py
+3
-7
vllm/v1/worker/gpu_model_runner.py
vllm/v1/worker/gpu_model_runner.py
+4
-1
vllm/v1/worker/mamba_utils.py
vllm/v1/worker/mamba_utils.py
+1
-1
vllm/v1/worker/utils.py
vllm/v1/worker/utils.py
+3
-4
No files found.
vllm/model_executor/models/olmo_hybrid.py
View file @
c59a132f
...
...
@@ -428,7 +428,7 @@ class OlmoHybridGatedDeltaNet(nn.Module, MambaBase):
non_spec_token_indx
=
attn_metadata
.
non_spec_token_indx
spec_state_indices_tensor
=
attn_metadata
.
spec_state_indices_tensor
non_spec_state_indices_tensor
=
attn_metadata
.
non_spec_state_indices_tensor
self_kv_cache
=
self
.
kv_cache
[
0
]
self_kv_cache
=
self
.
kv_cache
conv_state
=
self_kv_cache
[
0
].
transpose
(
-
1
,
-
2
)
ssm_state
=
self_kv_cache
[
1
]
num_actual_tokens
=
attn_metadata
.
num_actual_tokens
...
...
vllm/model_executor/models/plamo2.py
View file @
c59a132f
...
...
@@ -262,7 +262,7 @@ class Plamo2MambaMixer(MambaBase, PluggableLayer):
assert
isinstance
(
attn_metadata
,
dict
)
attn_metadata
=
attn_metadata
[
self
.
prefix
]
assert
isinstance
(
attn_metadata
,
Mamba2AttentionMetadata
)
self_kv_cache
=
self
.
kv_cache
[
0
]
self_kv_cache
=
self
.
kv_cache
# conv_state = (..., dim, width-1) yet contiguous along 'dim'
conv_state
=
self_kv_cache
[
0
].
transpose
(
-
1
,
-
2
)
ssm_state
=
self_kv_cache
[
1
]
...
...
vllm/model_executor/models/qwen3_next.py
View file @
c59a132f
...
...
@@ -858,7 +858,7 @@ class Qwen3NextGatedDeltaNet(nn.Module, MambaBase):
non_spec_token_indx
=
attn_metadata
.
non_spec_token_indx
spec_state_indices_tensor
=
attn_metadata
.
spec_state_indices_tensor
# noqa: E501
non_spec_state_indices_tensor
=
attn_metadata
.
non_spec_state_indices_tensor
# noqa: E501
self_kv_cache
=
self
.
kv_cache
[
0
]
self_kv_cache
=
self
.
kv_cache
conv_state
=
self_kv_cache
[
0
].
transpose
(
-
1
,
-
2
)
ssm_state
=
self_kv_cache
[
1
]
num_actual_tokens
=
attn_metadata
.
num_actual_tokens
...
...
@@ -1046,7 +1046,7 @@ class Qwen3NextGatedDeltaNet(nn.Module, MambaBase):
Core attention computation with a packed non-spec decode fast path.
"""
non_spec_state_indices_tensor
=
attn_metadata
.
non_spec_state_indices_tensor
# noqa: E501
self_kv_cache
=
self
.
kv_cache
[
0
]
self_kv_cache
=
self
.
kv_cache
conv_state
=
self_kv_cache
[
0
].
transpose
(
-
1
,
-
2
)
ssm_state
=
self_kv_cache
[
1
]
num_actual_tokens
=
attn_metadata
.
num_actual_tokens
...
...
vllm/v1/attention/backends/rocm_aiter_fa.py
View file @
c59a132f
...
...
@@ -481,13 +481,9 @@ class AiterFlashAttentionMetadataBuilder(
):
layers
=
get_layers_from_vllm_config
(
self
.
vllm_config
,
Attention
)
first_layer_name
=
[
k
for
k
in
layers
][
0
]
kv_cache_shape
=
(
self
.
vllm_config
.
compilation_config
.
static_forward_context
[
kv_cache_shape
=
self
.
vllm_config
.
compilation_config
.
static_forward_context
[
first_layer_name
]
.
kv_cache
[
0
]
.
shape
)
].
kv_cache
.
shape
num_blocks
=
kv_cache_shape
[
1
]
self
.
scale
=
torch
.
ones
(
[
num_blocks
,
self
.
num_heads_kv
,
self
.
block_size
],
...
...
vllm/v1/worker/gpu_model_runner.py
View file @
c59a132f
...
...
@@ -5830,7 +5830,10 @@ class GPUModelRunner(
for
layer
in
self
.
compilation_config
.
static_forward_context
.
values
():
if
hasattr
(
layer
,
"kv_cache"
):
layer
.
kv_cache
=
[]
kv_cache
=
layer
.
kv_cache
layer
.
kv_cache
=
(
torch
.
tensor
([])
if
isinstance
(
kv_cache
,
torch
.
Tensor
)
else
[]
)
gc
.
collect
()
torch
.
accelerator
.
empty_cache
()
...
...
vllm/v1/worker/mamba_utils.py
View file @
c59a132f
...
...
@@ -119,7 +119,7 @@ def collect_mamba_copy_meta(
layer_names
=
kv_cache_config
.
kv_cache_groups
[
mamba_group_id
].
layer_names
for
layer_name
in
layer_names
:
attention
=
forward_context
[
layer_name
]
kv_caches
:
list
[
torch
.
Tensor
]
=
attention
.
kv_cache
[
0
]
kv_caches
:
list
[
torch
.
Tensor
]
=
attention
.
kv_cache
for
state
,
state_copy_func
in
zip
(
kv_caches
,
mamba_state_copy_funcs
):
copy_spec
=
state_copy_func
(
state
,
block_ids
,
src_block_idx
,
accept_token_bias
+
1
...
...
vllm/v1/worker/utils.py
View file @
c59a132f
...
...
@@ -136,8 +136,8 @@ class KVBlockZeroer:
for
layer_name
in
group
.
layer_names
:
if
layer_name
in
runner_only_attn_layers
:
continue
kv
=
static_forward_context
[
layer_name
].
kv_cache
[
0
]
if
isinstance
(
kv
,
list
):
kv
=
static_forward_context
[
layer_name
].
kv_cache
if
not
isinstance
(
kv
,
torch
.
Tensor
):
continue
dp
=
kv
.
data_ptr
()
if
dp
in
seen_ptrs
:
...
...
@@ -510,8 +510,7 @@ def bind_kv_cache(
# Bind kv_caches to forward context
for
layer_name
,
kv_cache
in
kv_caches
.
items
():
# NOTE: Keep list wrapper for layers that index kv_cache by engine slot.
forward_context
[
layer_name
].
kv_cache
=
[
kv_cache
]
forward_context
[
layer_name
].
kv_cache
=
kv_cache
def
is_residual_scattered_for_sp
(
...
...
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment