Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
de120bc9
Unverified
Commit
de120bc9
authored
Nov 12, 2025
by
Canlin Guo
Committed by
GitHub
Nov 11, 2025
Browse files
[V0 deprecation] Clean up num_prefill_tokens logic for V0 (#28203)
Signed-off-by:
gcanlin
<
canlinguosdu@gmail.com
>
parent
4228be79
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
3 additions
and
15 deletions
+3
-15
vllm/forward_context.py
vllm/forward_context.py
+3
-15
No files found.
vllm/forward_context.py
View file @
de120bc9
...
...
@@ -5,7 +5,7 @@ import time
from
collections
import
defaultdict
from
contextlib
import
contextmanager
from
dataclasses
import
dataclass
from
typing
import
TYPE_CHECKING
,
Any
,
NamedTuple
,
Union
from
typing
import
TYPE_CHECKING
,
Any
,
NamedTuple
import
torch
...
...
@@ -185,18 +185,13 @@ class ForwardContext:
# copy from vllm_config.compilation_config.static_forward_context
no_compile_layers
:
dict
[
str
,
Any
]
"""
Type AttentionMetadata for v0,
Type Dict[str, AttentionMetadata] for v1, map from layer_name of each
attention layer to its attention metadata
Type List[Dict[str, AttentionMetadata]] for DBO. List of size two, one
for each microbatch.
Set dynamically for each forward pass
"""
attn_metadata
:
Union
[
"AttentionMetadata"
,
dict
[
str
,
"AttentionMetadata"
],
list
[
dict
[
str
,
"AttentionMetadata"
]],
]
attn_metadata
:
dict
[
str
,
"AttentionMetadata"
]
|
list
[
dict
[
str
,
"AttentionMetadata"
]]
# TODO: remove after making all virtual_engines share the same kv cache
virtual_engine
:
int
# set dynamically for each forward pass
# set dynamically for each forward pass
...
...
@@ -324,13 +319,6 @@ def set_forward_context(
finally
:
global
last_logging_time
,
batchsize_logging_interval
if
need_to_track_batchsize
:
if
hasattr
(
attn_metadata
,
"num_prefill_tokens"
):
# for v0 attention backends
batchsize
=
(
attn_metadata
.
num_prefill_tokens
+
attn_metadata
.
num_decode_tokens
)
else
:
# for v1 attention backends
batchsize
=
num_tokens
# we use synchronous scheduling right now,
# adding a sync point here should not affect
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment