Commit 99863602 authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge branch 'v0.9.2-dev-maxiao2' into 'v0.9.2-dev'

fix pd send async performance

See merge request dcutoolkit/deeplearing/vllm!224
parents 9b491fbd a92daffa
...@@ -107,6 +107,9 @@ class GPUModelRunner(LoRAModelRunnerMixin): ...@@ -107,6 +107,9 @@ class GPUModelRunner(LoRAModelRunnerMixin):
self.speculative_config = vllm_config.speculative_config self.speculative_config = vllm_config.speculative_config
self.prompt_adapter_config = vllm_config.prompt_adapter_config self.prompt_adapter_config = vllm_config.prompt_adapter_config
self.observability_config = vllm_config.observability_config self.observability_config = vllm_config.observability_config
if envs.VLLM_P2P_ASYNC:
self.p2p_event = torch.cuda.Event(enable_timing=False)
self.p2p_stream = torch.cuda.Stream()
from vllm.model_executor.models.utils import set_cpu_offload_max_bytes from vllm.model_executor.models.utils import set_cpu_offload_max_bytes
set_cpu_offload_max_bytes( set_cpu_offload_max_bytes(
...@@ -1295,7 +1298,7 @@ class GPUModelRunner(LoRAModelRunnerMixin): ...@@ -1295,7 +1298,7 @@ class GPUModelRunner(LoRAModelRunnerMixin):
scheduler_output: "SchedulerOutput", scheduler_output: "SchedulerOutput",
intermediate_tensors: Optional[IntermediateTensors] = None, intermediate_tensors: Optional[IntermediateTensors] = None,
) -> Union[ModelRunnerOutput, IntermediateTensors]: ) -> Union[ModelRunnerOutput, IntermediateTensors]:
# profile.StartTracer()
self._update_states(scheduler_output) self._update_states(scheduler_output)
if not scheduler_output.total_num_scheduled_tokens: if not scheduler_output.total_num_scheduled_tokens:
if not has_kv_transfer_group(): if not has_kv_transfer_group():
...@@ -1381,6 +1384,34 @@ class GPUModelRunner(LoRAModelRunnerMixin): ...@@ -1381,6 +1384,34 @@ class GPUModelRunner(LoRAModelRunnerMixin):
num_tokens_across_dp, input_ids, positions, num_tokens_across_dp, input_ids, positions,
inputs_embeds, scheduler_output, intermediate_tensors, inputs_embeds, scheduler_output, intermediate_tensors,
skip_cuda_graphs) skip_cuda_graphs)
elif envs.VLLM_P2P_ASYNC:
self.p2p_event.record()
current_stream = torch.cuda.current_stream()
with torch.cuda.stream(self.p2p_stream):
self.p2p_stream.wait_event(self.p2p_event)
with set_forward_context(
attn_metadata,
self.vllm_config,
num_tokens=num_input_tokens,
num_tokens_across_dp=num_tokens_across_dp,
skip_cuda_graphs=skip_cuda_graphs,
):
self.maybe_setup_kv_connector(scheduler_output)
model_output = self.model(
input_ids=input_ids,
positions=positions,
intermediate_tensors=intermediate_tensors,
inputs_embeds=inputs_embeds,
)
self.maybe_wait_for_kv_save()
finished_sending, finished_recving = (
self.get_finished_kv_transfers(scheduler_output))
self.p2p_event.record()
current_stream.wait_event(self.p2p_event)
else: else:
# Run the model. # Run the model.
# Use persistent buffers for CUDA graphs. # Use persistent buffers for CUDA graphs.
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment