Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
0d81a1fe
Unverified
Commit
0d81a1fe
authored
Mar 18, 2026
by
Wentao Ye
Committed by
GitHub
Mar 18, 2026
Browse files
[V0 Deprecation] Deprecate virtual engine (#37195)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
parent
6ae4c8d6
Changes
23
Hide whitespace changes
Inline
Side-by-side
Showing 3 changed files with 4 additions and 6 deletions
+4
-6
vllm/model_executor/models/plamo2.py
vllm/model_executor/models/plamo2.py
+1
-1
vllm/model_executor/models/qwen3_next.py
vllm/model_executor/models/qwen3_next.py
+2
-4
vllm/v1/worker/utils.py
vllm/v1/worker/utils.py
+1
-1
No files found.
vllm/model_executor/models/plamo2.py
View file @
0d81a1fe
...
@@ -262,7 +262,7 @@ class Plamo2MambaMixer(MambaBase, PluggableLayer):
...
@@ -262,7 +262,7 @@ class Plamo2MambaMixer(MambaBase, PluggableLayer):
assert
isinstance
(
attn_metadata
,
dict
)
assert
isinstance
(
attn_metadata
,
dict
)
attn_metadata
=
attn_metadata
[
self
.
prefix
]
attn_metadata
=
attn_metadata
[
self
.
prefix
]
assert
isinstance
(
attn_metadata
,
Mamba2AttentionMetadata
)
assert
isinstance
(
attn_metadata
,
Mamba2AttentionMetadata
)
self_kv_cache
=
self
.
kv_cache
[
forward_context
.
virtual_engine
]
self_kv_cache
=
self
.
kv_cache
[
0
]
# conv_state = (..., dim, width-1) yet contiguous along 'dim'
# conv_state = (..., dim, width-1) yet contiguous along 'dim'
conv_state
=
self_kv_cache
[
0
].
transpose
(
-
1
,
-
2
)
conv_state
=
self_kv_cache
[
0
].
transpose
(
-
1
,
-
2
)
ssm_state
=
self_kv_cache
[
1
]
ssm_state
=
self_kv_cache
[
1
]
...
...
vllm/model_executor/models/qwen3_next.py
View file @
0d81a1fe
...
@@ -842,7 +842,6 @@ class Qwen3NextGatedDeltaNet(nn.Module, MambaBase):
...
@@ -842,7 +842,6 @@ class Qwen3NextGatedDeltaNet(nn.Module, MambaBase):
a
=
a
,
a
=
a
,
core_attn_out
=
core_attn_out
,
core_attn_out
=
core_attn_out
,
attn_metadata
=
attn_metadata
,
attn_metadata
=
attn_metadata
,
virtual_engine
=
forward_context
.
virtual_engine
,
)
)
has_initial_state
=
attn_metadata
.
has_initial_state
has_initial_state
=
attn_metadata
.
has_initial_state
...
@@ -853,7 +852,7 @@ class Qwen3NextGatedDeltaNet(nn.Module, MambaBase):
...
@@ -853,7 +852,7 @@ class Qwen3NextGatedDeltaNet(nn.Module, MambaBase):
non_spec_token_indx
=
attn_metadata
.
non_spec_token_indx
non_spec_token_indx
=
attn_metadata
.
non_spec_token_indx
spec_state_indices_tensor
=
attn_metadata
.
spec_state_indices_tensor
# noqa: E501
spec_state_indices_tensor
=
attn_metadata
.
spec_state_indices_tensor
# noqa: E501
non_spec_state_indices_tensor
=
attn_metadata
.
non_spec_state_indices_tensor
# noqa: E501
non_spec_state_indices_tensor
=
attn_metadata
.
non_spec_state_indices_tensor
# noqa: E501
self_kv_cache
=
self
.
kv_cache
[
forward_context
.
virtual_engine
]
self_kv_cache
=
self
.
kv_cache
[
0
]
conv_state
=
self_kv_cache
[
0
].
transpose
(
-
1
,
-
2
)
conv_state
=
self_kv_cache
[
0
].
transpose
(
-
1
,
-
2
)
ssm_state
=
self_kv_cache
[
1
]
ssm_state
=
self_kv_cache
[
1
]
num_actual_tokens
=
attn_metadata
.
num_actual_tokens
num_actual_tokens
=
attn_metadata
.
num_actual_tokens
...
@@ -1036,13 +1035,12 @@ class Qwen3NextGatedDeltaNet(nn.Module, MambaBase):
...
@@ -1036,13 +1035,12 @@ class Qwen3NextGatedDeltaNet(nn.Module, MambaBase):
a
:
torch
.
Tensor
,
a
:
torch
.
Tensor
,
core_attn_out
:
torch
.
Tensor
,
core_attn_out
:
torch
.
Tensor
,
attn_metadata
:
GDNAttentionMetadata
,
attn_metadata
:
GDNAttentionMetadata
,
virtual_engine
:
int
,
):
):
"""
"""
Core attention computation with a packed non-spec decode fast path.
Core attention computation with a packed non-spec decode fast path.
"""
"""
non_spec_state_indices_tensor
=
attn_metadata
.
non_spec_state_indices_tensor
# noqa: E501
non_spec_state_indices_tensor
=
attn_metadata
.
non_spec_state_indices_tensor
# noqa: E501
self_kv_cache
=
self
.
kv_cache
[
virtual_engine
]
self_kv_cache
=
self
.
kv_cache
[
0
]
conv_state
=
self_kv_cache
[
0
].
transpose
(
-
1
,
-
2
)
conv_state
=
self_kv_cache
[
0
].
transpose
(
-
1
,
-
2
)
ssm_state
=
self_kv_cache
[
1
]
ssm_state
=
self_kv_cache
[
1
]
num_actual_tokens
=
attn_metadata
.
num_actual_tokens
num_actual_tokens
=
attn_metadata
.
num_actual_tokens
...
...
vllm/v1/worker/utils.py
View file @
0d81a1fe
...
@@ -510,7 +510,7 @@ def bind_kv_cache(
...
@@ -510,7 +510,7 @@ def bind_kv_cache(
# Bind kv_caches to forward context
# Bind kv_caches to forward context
for
layer_name
,
kv_cache
in
kv_caches
.
items
():
for
layer_name
,
kv_cache
in
kv_caches
.
items
():
# NOTE: Use list because of v0 PP virtual engine.
# NOTE: Keep list wrapper for layers that index kv_cache by engine slot.
forward_context
[
layer_name
].
kv_cache
=
[
kv_cache
]
forward_context
[
layer_name
].
kv_cache
=
[
kv_cache
]
...
...
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment