Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
424cccfe
Commit
424cccfe
authored
Sep 04, 2025
by
lizhigong
Browse files
fix performance issues caused by enabling TBO
parent
f6f8db81
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
10 additions
and
9 deletions
+10
-9
vllm/two_batch_overlap/v1/model_input_split_v1.py
vllm/two_batch_overlap/v1/model_input_split_v1.py
+6
-6
vllm/v1/worker/gpu_model_runner.py
vllm/v1/worker/gpu_model_runner.py
+2
-1
vllm/zero_overhead/v1/gpu_model_runner.py
vllm/zero_overhead/v1/gpu_model_runner.py
+2
-2
No files found.
vllm/two_batch_overlap/v1/model_input_split_v1.py
View file @
424cccfe
...
@@ -270,18 +270,16 @@ def tbo_split_and_execute_model(
...
@@ -270,18 +270,16 @@ def tbo_split_and_execute_model(
inputs_embeds
,
inputs_embeds
,
scheduler_output
:
"SchedulerOutput"
,
scheduler_output
:
"SchedulerOutput"
,
intermediate_tensors
:
Optional
[
IntermediateTensors
]
=
None
,
intermediate_tensors
:
Optional
[
IntermediateTensors
]
=
None
,
skip_cuda_graphs
:
bool
=
True
,
)
->
Union
[
ModelRunnerOutput
,
IntermediateTensors
]:
)
->
Union
[
ModelRunnerOutput
,
IntermediateTensors
]:
use_tbo
=
False
use_tbo
=
False
if
isinstance
(
runner
.
attn_metadata_builders
[
0
],
MLACommonMetadataBuilder
)
and
\
if
isinstance
(
runner
.
attn_metadata_builders
[
0
],
MLACommonMetadataBuilder
)
and
\
runner
.
attn_metadata_builders
[
0
].
_num_decodes
>
0
:
#is mla decode
runner
.
attn_metadata_builders
[
0
].
_num_decodes
>
0
:
#is mla decode
use_tbo
=
False
use_tbo
=
False
else
:
else
:
if
len
(
scheduler_output
.
num_scheduled_tokens
)
>
1
:
if
len
(
scheduler_output
.
num_scheduled_tokens
)
>
1
and
num_input_tokens
>
envs
.
VLLM_TBO_MIN_TOKENS
:
split_scheduler_output
(
runner
,
scheduler_output
)
split_scheduler_output
(
runner
,
scheduler_output
)
if
input_split
.
scheduler_output_left
.
total_num_scheduled_tokens
>=
envs
.
VLLM_TBO_MIN_TOKENS
and
\
use_tbo
=
True
input_split
.
scheduler_output_right
.
total_num_scheduled_tokens
>=
envs
.
VLLM_TBO_MIN_TOKENS
:
use_tbo
=
True
if
use_tbo
:
if
use_tbo
:
num_input_tokens_left
=
input_split
.
scheduler_output_left
.
total_num_scheduled_tokens
num_input_tokens_left
=
input_split
.
scheduler_output_left
.
total_num_scheduled_tokens
num_input_tokens_right
=
num_input_tokens
-
num_input_tokens_left
num_input_tokens_right
=
num_input_tokens
-
num_input_tokens_left
...
@@ -304,11 +302,12 @@ def tbo_split_and_execute_model(
...
@@ -304,11 +302,12 @@ def tbo_split_and_execute_model(
else
:
else
:
# Run the decoder.
# Run the decoder.
# Use persistent buffers for CUDA graphs.
# Use persistent buffers for CUDA graphs.
envs
.
VLLM_ENABLE_TBO
=
False
with
set_forward_context
(
attn_metadata
,
with
set_forward_context
(
attn_metadata
,
runner
.
vllm_config
,
runner
.
vllm_config
,
num_tokens
=
num_input_tokens
,
num_tokens
=
num_input_tokens
,
num_tokens_across_dp
=
num_tokens_across_dp
,
num_tokens_across_dp
=
num_tokens_across_dp
,
skip_cuda_graphs
=
True
):
skip_cuda_graphs
=
skip_cuda_graphs
):
runner
.
maybe_setup_kv_connector
(
scheduler_output
)
runner
.
maybe_setup_kv_connector
(
scheduler_output
)
model_output
=
runner
.
model
(
model_output
=
runner
.
model
(
...
@@ -321,4 +320,5 @@ def tbo_split_and_execute_model(
...
@@ -321,4 +320,5 @@ def tbo_split_and_execute_model(
runner
.
maybe_wait_for_kv_save
()
runner
.
maybe_wait_for_kv_save
()
finished_sending
,
finished_recving
=
(
finished_sending
,
finished_recving
=
(
runner
.
get_finished_kv_transfers
(
scheduler_output
))
runner
.
get_finished_kv_transfers
(
scheduler_output
))
envs
.
VLLM_ENABLE_TBO
=
True
return
model_output
,
finished_sending
,
finished_recving
return
model_output
,
finished_sending
,
finished_recving
\ No newline at end of file
vllm/v1/worker/gpu_model_runner.py
View file @
424cccfe
...
@@ -1377,7 +1377,8 @@ class GPUModelRunner(LoRAModelRunnerMixin):
...
@@ -1377,7 +1377,8 @@ class GPUModelRunner(LoRAModelRunnerMixin):
model_output
,
finished_sending
,
finished_recving
=
\
model_output
,
finished_sending
,
finished_recving
=
\
tbo_split_and_execute_model
(
self
,
attn_metadata
,
num_input_tokens
,
tbo_split_and_execute_model
(
self
,
attn_metadata
,
num_input_tokens
,
num_tokens_across_dp
,
input_ids
,
positions
,
num_tokens_across_dp
,
input_ids
,
positions
,
inputs_embeds
,
scheduler_output
,
intermediate_tensors
)
inputs_embeds
,
scheduler_output
,
intermediate_tensors
,
skip_cuda_graphs
)
else
:
else
:
# Run the model.
# Run the model.
# Use persistent buffers for CUDA graphs.
# Use persistent buffers for CUDA graphs.
...
...
vllm/zero_overhead/v1/gpu_model_runner.py
View file @
424cccfe
...
@@ -472,12 +472,12 @@ class V1ZeroModelRunner(GPUModelRunner):
...
@@ -472,12 +472,12 @@ class V1ZeroModelRunner(GPUModelRunner):
# If attention doesn't support CUDA Graphs for this batch, but we
# If attention doesn't support CUDA Graphs for this batch, but we
# compiled with full CUDA graphs, we have to skip them entirely.
# compiled with full CUDA graphs, we have to skip them entirely.
skip_cuda_graphs
=
self
.
full_cuda_graph
and
not
attention_cuda_graphs
skip_cuda_graphs
=
self
.
full_cuda_graph
and
not
attention_cuda_graphs
if
envs
.
VLLM_ENABLE_TBO
and
(
not
self
.
use_cuda_graph
or
skip_cuda_graphs
):
if
envs
.
VLLM_ENABLE_TBO
and
(
not
self
.
use_cuda_graph
or
skip_cuda_graphs
):
model_output
,
finished_sending
,
finished_recving
=
\
model_output
,
finished_sending
,
finished_recving
=
\
tbo_split_and_execute_model
(
self
,
attn_metadata
,
num_input_tokens
,
tbo_split_and_execute_model
(
self
,
attn_metadata
,
num_input_tokens
,
num_tokens_across_dp
,
input_ids
,
positions
,
num_tokens_across_dp
,
input_ids
,
positions
,
inputs_embeds
,
scheduler_output
,
intermediate_tensors
)
inputs_embeds
,
scheduler_output
,
intermediate_tensors
,
skip_cuda_graphs
)
else
:
else
:
# Run the model.
# Run the model.
# Use persistent buffers for CUDA graphs.
# Use persistent buffers for CUDA graphs.
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment