Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
1707bebe
Commit
1707bebe
authored
Oct 11, 2025
by
maxiao1
Browse files
fix pd send async performance
parent
e42a922c
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
33 additions
and
2 deletions
+33
-2
vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py
...ted/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py
+1
-1
vllm/v1/worker/gpu_model_runner.py
vllm/v1/worker/gpu_model_runner.py
+32
-1
No files found.
vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py
View file @
1707bebe
...
@@ -294,7 +294,7 @@ class P2pNcclConnector(KVConnectorBase_V1):
...
@@ -294,7 +294,7 @@ class P2pNcclConnector(KVConnectorBase_V1):
slot_mapping
=
request
.
slot_mapping
slot_mapping
=
request
.
slot_mapping
if
request
.
slot_mapping_device
is
None
:
if
request
.
slot_mapping_device
is
None
:
request
.
slot_mapping_device
=
\
request
.
slot_mapping_device
=
\
request
.
slot_mapping
.
pin_memory
().
to
(
device
=
kv_layer
.
device
,
non_blocking
=
True
)
request
.
slot_mapping
.
pin_memory
().
to
(
device
=
kv_layer
.
device
,
non_blocking
=
True
)
slot_mapping
=
request
.
slot_mapping_device
slot_mapping
=
request
.
slot_mapping_device
kv_cache
=
extract_kv_from_layer
(
kv_layer
,
slot_mapping
)
kv_cache
=
extract_kv_from_layer
(
kv_layer
,
slot_mapping
)
tbo_evt
=
torch
.
cuda
.
Event
(
enable_timing
=
False
)
tbo_evt
=
torch
.
cuda
.
Event
(
enable_timing
=
False
)
...
...
vllm/v1/worker/gpu_model_runner.py
View file @
1707bebe
...
@@ -107,6 +107,9 @@ class GPUModelRunner(LoRAModelRunnerMixin):
...
@@ -107,6 +107,9 @@ class GPUModelRunner(LoRAModelRunnerMixin):
self
.
speculative_config
=
vllm_config
.
speculative_config
self
.
speculative_config
=
vllm_config
.
speculative_config
self
.
prompt_adapter_config
=
vllm_config
.
prompt_adapter_config
self
.
prompt_adapter_config
=
vllm_config
.
prompt_adapter_config
self
.
observability_config
=
vllm_config
.
observability_config
self
.
observability_config
=
vllm_config
.
observability_config
if
envs
.
VLLM_P2P_ASYNC
:
self
.
p2p_event
=
torch
.
cuda
.
Event
(
enable_timing
=
False
)
self
.
p2p_stream
=
torch
.
cuda
.
Stream
()
from
vllm.model_executor.models.utils
import
set_cpu_offload_max_bytes
from
vllm.model_executor.models.utils
import
set_cpu_offload_max_bytes
set_cpu_offload_max_bytes
(
set_cpu_offload_max_bytes
(
...
@@ -1295,7 +1298,7 @@ class GPUModelRunner(LoRAModelRunnerMixin):
...
@@ -1295,7 +1298,7 @@ class GPUModelRunner(LoRAModelRunnerMixin):
scheduler_output
:
"SchedulerOutput"
,
scheduler_output
:
"SchedulerOutput"
,
intermediate_tensors
:
Optional
[
IntermediateTensors
]
=
None
,
intermediate_tensors
:
Optional
[
IntermediateTensors
]
=
None
,
)
->
Union
[
ModelRunnerOutput
,
IntermediateTensors
]:
)
->
Union
[
ModelRunnerOutput
,
IntermediateTensors
]:
profile
.
StartTracer
()
self
.
_update_states
(
scheduler_output
)
self
.
_update_states
(
scheduler_output
)
if
not
scheduler_output
.
total_num_scheduled_tokens
:
if
not
scheduler_output
.
total_num_scheduled_tokens
:
if
not
has_kv_transfer_group
():
if
not
has_kv_transfer_group
():
...
@@ -1381,6 +1384,34 @@ class GPUModelRunner(LoRAModelRunnerMixin):
...
@@ -1381,6 +1384,34 @@ class GPUModelRunner(LoRAModelRunnerMixin):
num_tokens_across_dp
,
input_ids
,
positions
,
num_tokens_across_dp
,
input_ids
,
positions
,
inputs_embeds
,
scheduler_output
,
intermediate_tensors
,
inputs_embeds
,
scheduler_output
,
intermediate_tensors
,
skip_cuda_graphs
)
skip_cuda_graphs
)
elif
envs
.
VLLM_P2P_ASYNC
:
self
.
p2p_event
.
record
()
current_stream
=
torch
.
cuda
.
current_stream
()
with
torch
.
cuda
.
stream
(
self
.
p2p_stream
):
self
.
p2p_stream
.
wait_event
(
self
.
p2p_event
)
with
set_forward_context
(
attn_metadata
,
self
.
vllm_config
,
num_tokens
=
num_input_tokens
,
num_tokens_across_dp
=
num_tokens_across_dp
,
skip_cuda_graphs
=
skip_cuda_graphs
,
):
self
.
maybe_setup_kv_connector
(
scheduler_output
)
model_output
=
self
.
model
(
input_ids
=
input_ids
,
positions
=
positions
,
intermediate_tensors
=
intermediate_tensors
,
inputs_embeds
=
inputs_embeds
,
)
self
.
maybe_wait_for_kv_save
()
finished_sending
,
finished_recving
=
(
self
.
get_finished_kv_transfers
(
scheduler_output
))
self
.
p2p_event
.
record
()
current_stream
.
wait_event
(
self
.
p2p_event
)
else
:
else
:
# Run the model.
# Run the model.
# Use persistent buffers for CUDA graphs.
# Use persistent buffers for CUDA graphs.
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment