[BugFix][MRV2] Fix cuda event reuse race (#39115)

Signed-off-by: Nick Hill <nickhill123@gmail.com>

[BugFix][MRV2] Fix cuda event reuse race (#39115)
Signed-off-by: Nick Hill <nickhill123@gmail.com>
62095e82 · Nick Hill · GitHub · b2b2c523 · 62095e82 · 62095e82
Unverified commit 62095e82, authored Apr 06, 2026 by Nick Hill; committed by GitHub on Apr 07, 2026.
Hide whitespace changes
Inline Side-by-side

Showing 2 changed files with 2 additions and 7 deletions

vllm/v1/worker/gpu/async_utils.py (+2 -4)

vllm/v1/worker/gpu/model_runner.py (+0 -3)

No files found.
--- a/vllm/v1/worker/gpu/async_utils.py
+++ b/vllm/v1/worker/gpu/async_utils.py
@@ -17,7 +17,6 @@ class AsyncOutput(AsyncModelRunnerOutput):
        num_sampled_tokens: torch.Tensor,
        main_stream: torch.cuda.Stream,
        copy_stream: torch.cuda.Stream,
-        copy_event: torch.cuda.Event,
    ):
        # NOTE(woosuk): We must retain references to the GPU tensors,
        # as the copy operations are performed on a different CUDA stream than
@@ -25,7 +24,7 @@ class AsyncOutput(AsyncModelRunnerOutput):
        self.model_runner_output = model_runner_output
        self.sampler_output = sampler_output
        self.num_sampled_tokens = num_sampled_tokens
-        self.copy_event = copy_event
+        self.copy_event = torch.cuda.Event()
        with stream(copy_stream, main_stream):
            copy_stream.wait_stream(main_stream)
@@ -78,12 +77,11 @@ class AsyncPoolingOutput(AsyncModelRunnerOutput):
        is_valid: torch.Tensor | None,
        main_stream: torch.cuda.Stream,
        copy_stream: torch.cuda.Stream,
-        copy_event: torch.cuda.Event,
    ):
        self.model_runner_output = model_runner_output
        self.pooler_output = pooler_output
        self.is_valid = is_valid
-        self.copy_event = copy_event
+        self.copy_event = torch.cuda.Event()
        with stream(copy_stream, main_stream):
            copy_stream.wait_stream(main_stream)

--- a/vllm/v1/worker/gpu/model_runner.py
+++ b/vllm/v1/worker/gpu/model_runner.py
@@ -130,7 +130,6 @@ class GPUModelRunner(LoRAModelRunnerMixin):
        self.use_async_scheduling = self.scheduler_config.async_scheduling
        self.output_copy_stream = torch.cuda.Stream(self.device)
-        self.output_copy_event = torch.cuda.Event()
        # Pipeline parallelism.
        self.use_pp = self.parallel_config.pipeline_parallel_size > 1
@@ -1180,7 +1179,6 @@ class GPUModelRunner(LoRAModelRunnerMixin):
            num_sampled_tokens=num_sampled,
            main_stream=self.main_stream,
            copy_stream=self.output_copy_stream,
-            copy_event=self.output_copy_event,
        )
        mm_inputs: tuple[list[torch.Tensor], torch.Tensor] | None = None
@@ -1270,7 +1268,6 @@ class GPUModelRunner(LoRAModelRunnerMixin):
            is_valid=is_valid,
            main_stream=self.main_stream,
            copy_stream=self.output_copy_stream,
-            copy_event=self.output_copy_event,
        )
        self.postprocess_pool(input_batch)