Unverified commit ff4810ba, authored by Nick Hill, committed by GitHub
Browse files

[Minor] Group async_scheduling related fields in model runner init (#26736)


Signed-off-by: Nick Hill <nhill@redhat.com>
parent 9d696492
......@@ -375,9 +375,15 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
)
self.use_async_scheduling = self.scheduler_config.async_scheduling
self.async_output_copy_stream = (
torch.cuda.Stream() if self.use_async_scheduling else None
)
# Separate cuda stream for overlapping transfer of sampled token ids from
# GPU to CPU when async scheduling is enabled.
self.async_output_copy_stream: torch.cuda.Stream | None = None
# cuda event to synchronize use of reused CPU tensors between steps
# when async scheduling is enabled.
self.prepare_inputs_event: torch.cuda.Event | None = None
if self.use_async_scheduling:
self.async_output_copy_stream = torch.cuda.Stream()
self.prepare_inputs_event = torch.cuda.Event()
# TODO(woosuk): Provide an option to tune the max cudagraph batch size.
# The convention is different.
......@@ -444,14 +450,6 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
(3, self.max_num_tokens + 1), dtype=torch.int64
)
# CUDA event to synchronize use of reused CPU tensors between steps
# when async scheduling is enabled.
self.prepare_inputs_event: torch.cuda.Event | None = None
if self.use_async_scheduling:
self.prepare_inputs_event = torch.cuda.Event()
# Start in a completed state.
self.prepare_inputs_event.record(torch.cuda.default_stream())
# None in the first PP rank. The rest are set after load_model.
self.intermediate_tensors: IntermediateTensors | None = None
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment