Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
f8985b96
Commit
f8985b96
authored
Sep 17, 2025
by
lizhigong
Browse files
fix the performance issue of tbo pd separation
parent
5eec6110
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
20 additions
and
3 deletions
+20
-3
vllm/attention/layer.py
vllm/attention/layer.py
+9
-3
vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py
...ted/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py
+3
-0
vllm/two_batch_overlap/v1/two_batch_overlap_v1.py
vllm/two_batch_overlap/v1/two_batch_overlap_v1.py
+8
-0
No files found.
vllm/attention/layer.py
View file @
f8985b96
...
...
@@ -7,6 +7,7 @@ import torch
import
torch.nn
as
nn
import
torch.nn.functional
as
F
from
vllm.two_batch_overlap.v1.two_batch_overlap_v1
import
tbo_maybe_save_kv_layer_to_connector
import
vllm.envs
as
envs
from
vllm.attention
import
AttentionType
from
vllm.attention.selector
import
backend_name_to_enum
,
get_attn_backend
...
...
@@ -412,7 +413,10 @@ def unified_attention(
output
=
self
.
impl
.
forward
(
self
,
query
,
key
,
value
,
kv_cache
,
attn_metadata
)
maybe_save_kv_layer_to_connector
(
layer_name
,
kv_cache
)
if
envs
.
VLLM_ENABLE_TBO
:
tbo_maybe_save_kv_layer_to_connector
(
layer_name
,
kv_cache
)
else
:
maybe_save_kv_layer_to_connector
(
layer_name
,
kv_cache
)
return
output
...
...
@@ -457,8 +461,10 @@ def unified_attention_with_output(
attn_metadata
,
output
=
output
,
output_scale
=
output_scale
)
maybe_save_kv_layer_to_connector
(
layer_name
,
kv_cache
)
if
envs
.
VLLM_ENABLE_TBO
:
tbo_maybe_save_kv_layer_to_connector
(
layer_name
,
kv_cache
)
else
:
maybe_save_kv_layer_to_connector
(
layer_name
,
kv_cache
)
def
unified_attention_with_output_fake
(
...
...
vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py
View file @
f8985b96
...
...
@@ -7,6 +7,7 @@ from typing import TYPE_CHECKING, Any, Optional
import
regex
as
re
import
torch
from
vllm
import
envs
from
vllm.config
import
VllmConfig
from
vllm.distributed.kv_transfer.kv_connector.v1.base
import
(
KVConnectorBase_V1
,
KVConnectorMetadata
,
KVConnectorRole
)
...
...
@@ -267,6 +268,8 @@ class P2pNcclConnector(KVConnectorBase_V1):
Assume the shape of the layer is (2, num_pages, page_size, xxx)
if MLA is not used, and (num_pages, page_size, xxx) otherwise.
"""
if
envs
.
VLLM_ENABLE_TBO
:
slot_mapping
=
slot_mapping
.
pin_memory
().
to
(
device
=
layer
.
device
,
non_blocking
=
True
)
if
isinstance
(
attn_metadata
,
MLACommonMetadata
):
num_pages
,
page_size
=
layer
.
shape
[
0
],
layer
.
shape
[
1
]
return
layer
.
reshape
(
num_pages
*
page_size
,
-
1
)[
slot_mapping
,
...
...
vllm/two_batch_overlap/v1/two_batch_overlap_v1.py
View file @
f8985b96
...
...
@@ -162,6 +162,14 @@ def init_two_batch_overlap():
tbo_obj_v1
=
TwoBatchOverlap
()
tbo_obj_v1
.
init_tbo_thread
()
def
tbo_maybe_save_kv_layer_to_connector
(
layer_name
,
kv_cache
):
from
vllm.attention.layer
import
maybe_save_kv_layer_to_connector
if
envs
.
VLLM_ENABLE_TBO
and
tbo_obj_v1
!=
None
and
tbo_obj_v1
.
tbo_running
:
tid
=
threading
.
get_ident
()
if
tid
==
tbo_obj_v1
.
left_tid
:
return
maybe_save_kv_layer_to_connector
(
layer_name
,
kv_cache
)
def
tbo_all_reduce_v1
(
obj
):
if
envs
.
VLLM_ENABLE_TBO
and
tbo_obj_v1
!=
None
and
tbo_obj_v1
.
tbo_running
:
tid
=
threading
.
get_ident
()
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment