[Model Runner V2] Use block table apis for capture inputs (#35671)

Signed-off-by: Woosuk Kwon <woosuk@inferact.ai>

[Model Runner V2] Use block table apis for capture inputs (#35671)
Signed-off-by: Woosuk Kwon <woosuk@inferact.ai>
72f4d162 · Woosuk Kwon · GitHub · 5a435507 · 72f4d162 · 72f4d162
Unverified Commit 72f4d162 authored Mar 01, 2026 by Woosuk Kwon Committed by GitHub Mar 01, 2026
Show whitespace changes
Inline Side-by-side

Showing 2 changed files with 13 additions and 2 deletions

vllm/v1/worker/gpu/block_table.py +11 -0

vllm/v1/worker/gpu/cudagraph_utils.py +2 -2

No files found.
--- a/vllm/v1/worker/gpu/block_table.py
+++ b/vllm/v1/worker/gpu/block_table.py
@@ -119,6 +119,10 @@ class BlockTables:
        return tuple(block_table[:num_reqs] for block_table in self.input_block_tables)

    def get_dummy_block_tables(self, num_reqs: int) -> tuple[torch.Tensor, ...]:
+        # NOTE(woosuk): The output may be used for CUDA graph capture.
+        # Therefore, this method must return the persistent tensor
+        # with the same memory address as that used during the model's forward pass,
+        # rather than allocating a new tensor.
        return tuple(block_table[:num_reqs] for block_table in self.input_block_tables)

    def compute_slot_mappings(
@@ -150,7 +154,14 @@ class BlockTables:
        return self.slot_mappings[:, :num_tokens]

    def get_dummy_slot_mappings(self, num_tokens: int) -> torch.Tensor:
+        # Fill the entire slot_mappings tensor, not just the first `num_tokens` entries.
+        # This is because the padding logic is complex and kernels may access beyond
+        # the requested range.
        self.slot_mappings.fill_(PAD_SLOT_ID)
+        # NOTE(woosuk): The output may be used for CUDA graph capture.
+        # Therefore, this method must return the persistent tensor
+        # with the same memory address as that used during the model's forward pass,
+        # rather than allocating a new tensor.
        return self.slot_mappings[:, :num_tokens]



--- a/vllm/v1/worker/gpu/cudagraph_utils.py
+++ b/vllm/v1/worker/gpu/cudagraph_utils.py
@@ -420,8 +420,8 @@ def prepare_inputs_to_capture(
    input_buffers.dcp_local_seq_lens[:num_reqs] = num_tokens
    input_buffers.dcp_local_seq_lens[num_reqs:] = 0

-    input_block_tables = [x[:num_reqs] for x in block_tables.input_block_tables]
-    slot_mappings = block_tables.slot_mappings[:, :num_tokens]
+    input_block_tables = block_tables.get_dummy_block_tables(num_reqs)
+    slot_mappings = block_tables.get_dummy_slot_mappings(num_tokens)
    slot_mappings_by_layer = build_slot_mappings_by_layer(
        slot_mappings, kv_cache_config
    )