Unverified commit 62095e82, authored by Nick Hill, committed by GitHub
Browse files

[BugFix][MRV2] Fix cuda event reuse race (#39115)


Signed-off-by: Nick Hill <nickhill123@gmail.com>
parent b2b2c523
...@@ -17,7 +17,6 @@ class AsyncOutput(AsyncModelRunnerOutput): ...@@ -17,7 +17,6 @@ class AsyncOutput(AsyncModelRunnerOutput):
num_sampled_tokens: torch.Tensor, num_sampled_tokens: torch.Tensor,
main_stream: torch.cuda.Stream, main_stream: torch.cuda.Stream,
copy_stream: torch.cuda.Stream, copy_stream: torch.cuda.Stream,
copy_event: torch.cuda.Event,
): ):
# NOTE(woosuk): We must retain references to the GPU tensors, # NOTE(woosuk): We must retain references to the GPU tensors,
# as the copy operations are performed on a different CUDA stream than # as the copy operations are performed on a different CUDA stream than
...@@ -25,7 +24,7 @@ class AsyncOutput(AsyncModelRunnerOutput): ...@@ -25,7 +24,7 @@ class AsyncOutput(AsyncModelRunnerOutput):
self.model_runner_output = model_runner_output self.model_runner_output = model_runner_output
self.sampler_output = sampler_output self.sampler_output = sampler_output
self.num_sampled_tokens = num_sampled_tokens self.num_sampled_tokens = num_sampled_tokens
self.copy_event = copy_event self.copy_event = torch.cuda.Event()
with stream(copy_stream, main_stream): with stream(copy_stream, main_stream):
copy_stream.wait_stream(main_stream) copy_stream.wait_stream(main_stream)
...@@ -78,12 +77,11 @@ class AsyncPoolingOutput(AsyncModelRunnerOutput): ...@@ -78,12 +77,11 @@ class AsyncPoolingOutput(AsyncModelRunnerOutput):
is_valid: torch.Tensor | None, is_valid: torch.Tensor | None,
main_stream: torch.cuda.Stream, main_stream: torch.cuda.Stream,
copy_stream: torch.cuda.Stream, copy_stream: torch.cuda.Stream,
copy_event: torch.cuda.Event,
): ):
self.model_runner_output = model_runner_output self.model_runner_output = model_runner_output
self.pooler_output = pooler_output self.pooler_output = pooler_output
self.is_valid = is_valid self.is_valid = is_valid
self.copy_event = copy_event self.copy_event = torch.cuda.Event()
with stream(copy_stream, main_stream): with stream(copy_stream, main_stream):
copy_stream.wait_stream(main_stream) copy_stream.wait_stream(main_stream)
......
...@@ -130,7 +130,6 @@ class GPUModelRunner(LoRAModelRunnerMixin): ...@@ -130,7 +130,6 @@ class GPUModelRunner(LoRAModelRunnerMixin):
self.use_async_scheduling = self.scheduler_config.async_scheduling self.use_async_scheduling = self.scheduler_config.async_scheduling
self.output_copy_stream = torch.cuda.Stream(self.device) self.output_copy_stream = torch.cuda.Stream(self.device)
self.output_copy_event = torch.cuda.Event()
# Pipeline parallelism. # Pipeline parallelism.
self.use_pp = self.parallel_config.pipeline_parallel_size > 1 self.use_pp = self.parallel_config.pipeline_parallel_size > 1
...@@ -1180,7 +1179,6 @@ class GPUModelRunner(LoRAModelRunnerMixin): ...@@ -1180,7 +1179,6 @@ class GPUModelRunner(LoRAModelRunnerMixin):
num_sampled_tokens=num_sampled, num_sampled_tokens=num_sampled,
main_stream=self.main_stream, main_stream=self.main_stream,
copy_stream=self.output_copy_stream, copy_stream=self.output_copy_stream,
copy_event=self.output_copy_event,
) )
mm_inputs: tuple[list[torch.Tensor], torch.Tensor] | None = None mm_inputs: tuple[list[torch.Tensor], torch.Tensor] | None = None
...@@ -1270,7 +1268,6 @@ class GPUModelRunner(LoRAModelRunnerMixin): ...@@ -1270,7 +1268,6 @@ class GPUModelRunner(LoRAModelRunnerMixin):
is_valid=is_valid, is_valid=is_valid,
main_stream=self.main_stream, main_stream=self.main_stream,
copy_stream=self.output_copy_stream, copy_stream=self.output_copy_stream,
copy_event=self.output_copy_event,
) )
self.postprocess_pool(input_batch) self.postprocess_pool(input_batch)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment