[V0 deprecation] Clean up num_prefill_tokens logic for V0 (#28203)

Signed-off-by: gcanlin <canlinguosdu@gmail.com>

[V0 deprecation] Clean up num_prefill_tokens logic for V0 (#28203)
Signed-off-by: gcanlin <canlinguosdu@gmail.com>
de120bc9 · Canlin Guo · GitHub · 4228be79 · de120bc9
Unverified commit de120bc9, authored Nov 12, 2025 by Canlin Guo; committed by GitHub on Nov 11, 2025.
Show whitespace changes
Inline Side-by-side

Showing with 3 additions and 15 deletions

vllm/forward_context.py → vllm/forward_context.py (+3 -15)

No files found.
--- a/vllm/forward_context.py
+++ b/vllm/forward_context.py
@@ -5,7 +5,7 @@ import time
 from collections import defaultdict
 from contextlib import contextmanager
 from dataclasses import dataclass
-from typing import TYPE_CHECKING, Any, NamedTuple, Union
+from typing import TYPE_CHECKING, Any, NamedTuple

 import torch

@@ -185,18 +185,13 @@ class ForwardContext:
    # copy from vllm_config.compilation_config.static_forward_context
    no_compile_layers: dict[str, Any]
    """
-    Type AttentionMetadata for v0, 
    Type Dict[str, AttentionMetadata] for v1, map from layer_name of each 
    attention layer to its attention metadata
    Type List[Dict[str, AttentionMetadata]] for DBO. List of size two, one
    for each microbatch.
    Set dynamically for each forward pass
    """
-    attn_metadata: Union[
-        "AttentionMetadata",
-        dict[str, "AttentionMetadata"],
-        list[dict[str, "AttentionMetadata"]],
-    ]
+    attn_metadata: dict[str, "AttentionMetadata"] | list[dict[str, "AttentionMetadata"]]
    # TODO: remove after making all virtual_engines share the same kv cache
    virtual_engine: int  # set dynamically for each forward pass
    # set dynamically for each forward pass
@@ -324,13 +319,6 @@ def set_forward_context(
    finally:
        global last_logging_time, batchsize_logging_interval
        if need_to_track_batchsize:
-            if hasattr(attn_metadata, "num_prefill_tokens"):
-                # for v0 attention backends
-                batchsize = (
-                    attn_metadata.num_prefill_tokens + attn_metadata.num_decode_tokens
-                )
-            else:
-                # for v1 attention backends
            batchsize = num_tokens
            # we use synchronous scheduling right now,
            # adding a sync point here should not affect