Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
109c414a
Commit
109c414a
authored
Sep 18, 2025
by
zhuwenwen
Browse files
fix the performance issue of tbo pd separation
parent
e37d6cc3
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
20 additions
and
2 deletions
+20
-2
vllm/attention/layer.py
vllm/attention/layer.py
+9
-2
vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py
...ted/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py
+3
-0
vllm/two_batch_overlap/v1/two_batch_overlap_v1.py
vllm/two_batch_overlap/v1/two_batch_overlap_v1.py
+8
-0
No files found.
vllm/attention/layer.py
View file @
109c414a
...
@@ -7,6 +7,7 @@ import torch
...
@@ -7,6 +7,7 @@ import torch
import
torch.nn
as
nn
import
torch.nn
as
nn
import
torch.nn.functional
as
F
import
torch.nn.functional
as
F
from
vllm.two_batch_overlap.v1.two_batch_overlap_v1
import
tbo_maybe_save_kv_layer_to_connector
import
vllm.envs
as
envs
import
vllm.envs
as
envs
from
vllm.attention
import
AttentionType
from
vllm.attention
import
AttentionType
from
vllm.attention.backends.abstract
import
AttentionBackend
from
vllm.attention.backends.abstract
import
AttentionBackend
...
@@ -480,7 +481,10 @@ def unified_attention(
...
@@ -480,7 +481,10 @@ def unified_attention(
output
=
self
.
impl
.
forward
(
self
,
query
,
key
,
value
,
kv_cache
,
output
=
self
.
impl
.
forward
(
self
,
query
,
key
,
value
,
kv_cache
,
attn_metadata
)
attn_metadata
)
maybe_save_kv_layer_to_connector
(
layer_name
,
kv_cache
)
if
envs
.
VLLM_ENABLE_TBO
:
tbo_maybe_save_kv_layer_to_connector
(
layer_name
,
kv_cache
)
else
:
maybe_save_kv_layer_to_connector
(
layer_name
,
kv_cache
)
return
output
return
output
...
@@ -528,7 +532,10 @@ def unified_attention_with_output(
...
@@ -528,7 +532,10 @@ def unified_attention_with_output(
output_scale
=
output_scale
,
output_scale
=
output_scale
,
output_block_scale
=
output_block_scale
)
output_block_scale
=
output_block_scale
)
maybe_save_kv_layer_to_connector
(
layer_name
,
kv_cache
)
if
envs
.
VLLM_ENABLE_TBO
:
tbo_maybe_save_kv_layer_to_connector
(
layer_name
,
kv_cache
)
else
:
maybe_save_kv_layer_to_connector
(
layer_name
,
kv_cache
)
def
unified_attention_with_output_fake
(
def
unified_attention_with_output_fake
(
...
...
vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py
View file @
109c414a
...
@@ -7,6 +7,7 @@ from typing import TYPE_CHECKING, Any, Optional
...
@@ -7,6 +7,7 @@ from typing import TYPE_CHECKING, Any, Optional
import
regex
as
re
import
regex
as
re
import
torch
import
torch
from
vllm
import
envs
from
vllm.config
import
VllmConfig
from
vllm.config
import
VllmConfig
from
vllm.distributed.kv_transfer.kv_connector.v1.base
import
(
from
vllm.distributed.kv_transfer.kv_connector.v1.base
import
(
KVConnectorBase_V1
,
KVConnectorMetadata
,
KVConnectorRole
)
KVConnectorBase_V1
,
KVConnectorMetadata
,
KVConnectorRole
)
...
@@ -262,6 +263,8 @@ class P2pNcclConnector(KVConnectorBase_V1):
...
@@ -262,6 +263,8 @@ class P2pNcclConnector(KVConnectorBase_V1):
torch.Tensor: A tensor containing the extracted KV slices.
torch.Tensor: A tensor containing the extracted KV slices.
Returns None if the layout is unsupported.
Returns None if the layout is unsupported.
"""
"""
if
envs
.
VLLM_ENABLE_TBO
:
slot_mapping
=
slot_mapping
.
pin_memory
().
to
(
device
=
layer
.
device
,
non_blocking
=
True
)
if
(
isinstance
(
attn_metadata
,
MLACommonMetadata
)
if
(
isinstance
(
attn_metadata
,
MLACommonMetadata
)
or
layer
.
shape
[
1
]
==
2
):
# MLA or FlashInfer
or
layer
.
shape
[
1
]
==
2
):
# MLA or FlashInfer
return
layer
[
block_ids
,
...]
return
layer
[
block_ids
,
...]
...
...
vllm/two_batch_overlap/v1/two_batch_overlap_v1.py
View file @
109c414a
...
@@ -162,6 +162,14 @@ def init_two_batch_overlap():
...
@@ -162,6 +162,14 @@ def init_two_batch_overlap():
tbo_obj_v1
=
TwoBatchOverlap
()
tbo_obj_v1
=
TwoBatchOverlap
()
tbo_obj_v1
.
init_tbo_thread
()
tbo_obj_v1
.
init_tbo_thread
()
def tbo_maybe_save_kv_layer_to_connector(layer_name, kv_cache):
    """Save a layer's KV cache to the connector, except on the TBO left thread.

    While two-batch overlap (TBO) is running, the thread whose identifier
    equals ``tbo_obj_v1.left_tid`` returns without saving. In every other
    case the call is forwarded unchanged to
    ``vllm.attention.layer.maybe_save_kv_layer_to_connector``.

    Args:
        layer_name: Passed through to ``maybe_save_kv_layer_to_connector``.
        kv_cache: Passed through to ``maybe_save_kv_layer_to_connector``.

    Returns:
        None.
    """
    # Function-scope import: vllm.attention.layer imports this module at its
    # top level, so importing it at module scope here would be circular.
    from vllm.attention.layer import maybe_save_kv_layer_to_connector

    # Identity comparison with None (``is not``) instead of ``!=`` so the
    # check cannot be affected by a custom ``__eq__``/``__ne__``.
    tbo_active = (envs.VLLM_ENABLE_TBO and tbo_obj_v1 is not None
                  and tbo_obj_v1.tbo_running)
    if tbo_active and threading.get_ident() == tbo_obj_v1.left_tid:
        # The left TBO thread skips the save.
        return

    # NOTE(review): the fall-through save is assumed to sit at function
    # level (outside the TBO check) -- confirm against the repository.
    maybe_save_kv_layer_to_connector(layer_name, kv_cache)
def
tbo_all_reduce_v1
(
obj
):
def
tbo_all_reduce_v1
(
obj
):
if
envs
.
VLLM_ENABLE_TBO
and
tbo_obj_v1
!=
None
and
tbo_obj_v1
.
tbo_running
:
if
envs
.
VLLM_ENABLE_TBO
and
tbo_obj_v1
!=
None
and
tbo_obj_v1
.
tbo_running
:
tid
=
threading
.
get_ident
()
tid
=
threading
.
get_ident
()
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment