[BUGFIX] Fix `_dummy_run` missing `prepare_inputs_event` synchronization (#34866)

Signed-off-by: Vadim Gimpelson <vadim.gimpelson@gmail.com>

[BUGFIX] Fix `_dummy_run` missing `prepare_inputs_event` synchronization (#34866)
Signed-off-by: Vadim Gimpelson <vadim.gimpelson@gmail.com>
59965aff · Vadim Gimpelson · GitHub · b1c4f0b2 · 59965aff
Unverified commit 59965aff, authored Feb 20, 2026 by Vadim Gimpelson; committed by GitHub on Feb 20, 2026.
Show whitespace changes
Inline Side-by-side

Showing 1 changed file with 31 additions and 26 deletions

vllm/v1/worker/gpu_model_runner.py vllm/v1/worker/gpu_model_runner.py +31 -26

No files found.
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -4771,8 +4771,13 @@ class GPUModelRunner(
            ubatch_slices=ubatch_slices_padded,
        )

-        # If force_attention is True, we always capture attention. Otherwise,
-        # it only happens for cudagraph_runtime_mode=FULL.
+        # _dummy_run shares pinned CPU buffers (seq_lens, query_start_loc,
+        # etc.) with execute_model.  It must participate in the same event
+        # protocol so that back-to-back dummy/real steps don't overwrite
+        # pinned memory while a prior non_blocking H2D DMA is still reading.
+        with self.synchronize_input_prep():
+            # If force_attention is True, we always capture attention.
+            # Otherwise, it only happens for cudagraph_runtime_mode=FULL.
            if force_attention or cudagraph_runtime_mode == CUDAGraphMode.FULL:
                if create_mixed_batch:
                    # In the mixed batch mode (used for FI warmup), we use
@@ -4795,7 +4800,7 @@ class GPUModelRunner(
                    num_tokens_padded=num_tokens_padded if pad_attn else None,
                    num_reqs=num_reqs_padded,
                    max_query_len=max_query_len,
-                ubatch_slices=ubatch_slices_padded if pad_attn else ubatch_slices,
+                    ubatch_slices=(ubatch_slices_padded if pad_attn else ubatch_slices),
                    for_cudagraph_capture=is_graph_capturing,
                    slot_mappings=slot_mappings_by_group,
                )