[BugFix] Re-fix async multimodal cpu tensor race condition (#31373)

Signed-off-by: Nick Hill <nickhill123@gmail.com> Signed-off-by: njhill <nickhill123@gmail.com>

[BugFix] Re-fix async multimodal cpu tensor race condition (#31373)
Signed-off-by: Nick Hill <nickhill123@gmail.com> Signed-off-by: njhill <nickhill123@gmail.com>
094fcce2 · Nick Hill · GitHub · 573dd0e6 · 094fcce2
Unverified Commit 094fcce2 authored Dec 28, 2025 by Nick Hill Committed by GitHub Dec 28, 2025
Show whitespace changes
Inline Side-by-side

Showing with 112 additions and 114 deletions

vllm/v1/worker/gpu_model_runner.py vllm/v1/worker/gpu_model_runner.py +112 -114

No files found.
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -3058,8 +3058,10 @@ class GPUModelRunner(
            scheduler_output = deepcopy(scheduler_output)

        num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens
-        with record_function_or_nullcontext("gpu_model_runner: preprocess"):
-            with self.synchronize_input_prep():
+        with (
+            record_function_or_nullcontext("gpu_model_runner: preprocess"),
+            self.synchronize_input_prep(),
+        ):
            # Update persistent batch states.
            self._update_states(scheduler_output)

@@ -3087,9 +3089,8 @@ class GPUModelRunner(
                if not has_kv_transfer_group():
                    # Return empty ModelRunnerOutput if no work to do.
                    return EMPTY_MODEL_RUNNER_OUTPUT
-                    return self.kv_connector_no_forward(
-                        scheduler_output, self.vllm_config
-                    )
+                return self.kv_connector_no_forward(scheduler_output, self.vllm_config)
+
            if self.cache_config.kv_sharing_fast_prefill:
                assert not self.num_prompt_logprobs, (
                    "--kv-sharing-fast-prefill produces incorrect "
@@ -3104,10 +3105,7 @@ class GPUModelRunner(
            max_num_scheduled_tokens = int(num_scheduled_tokens_np.max())
            num_tokens_unpadded = scheduler_output.total_num_scheduled_tokens

-                (
-                    logits_indices,
-                    spec_decode_metadata,
-                ) = self._prepare_inputs(
+            logits_indices, spec_decode_metadata = self._prepare_inputs(
                scheduler_output,
                num_scheduled_tokens_np,
            )
@@ -3169,7 +3167,7 @@ class GPUModelRunner(
            use_spec_decode = len(scheduler_output.scheduled_spec_decode_tokens) > 0
            ubatch_slices_attn = ubatch_slices_padded if pad_attn else ubatch_slices

-                (attn_metadata, spec_decode_common_attn_metadata) = (
+            attn_metadata, spec_decode_common_attn_metadata = (
                self._build_attention_metadata(
                    num_tokens=num_tokens_unpadded,
                    num_tokens_padded=num_tokens_padded if pad_attn else None,