Fix performance issues caused by enabling TBO (two-batch overlap)

5692ab61 · zhuwenwen · 533c4cbf · 5692ab61 · 5692ab61 · 5692ab61
Commit 5692ab61 authored Sep 06, 2025 by zhuwenwen
3 changed files
--- a/vllm/two_batch_overlap/v1/model_input_split_v1.py
+++ b/vllm/two_batch_overlap/v1/model_input_split_v1.py
@@ -269,17 +269,16 @@ def tbo_split_and_execute_model(
    inputs_embeds,
    scheduler_output: "SchedulerOutput",
    intermediate_tensors: Optional[IntermediateTensors] = None,
+    skip_cuda_graphs: bool = True,
 ) -> Union[ModelRunnerOutput, IntermediateTensors]:
    use_tbo = False
    if isinstance(runner.attn_metadata_builders[0], MLACommonMetadataBuilder) and \
        runner.attn_metadata_builders[0]._num_decodes > 0: #is mla decode
        use_tbo = False
    else:
-        if len(scheduler_output.num_scheduled_tokens) > 1:
+        if len(scheduler_output.num_scheduled_tokens) > 1 and num_input_tokens > envs.VLLM_TBO_MIN_TOKENS:
            split_scheduler_output(runner, scheduler_output)
-            if input_split.scheduler_output_left.total_num_scheduled_tokens >= envs.VLLM_TBO_MIN_TOKENS and \
-                input_split.scheduler_output_right.total_num_scheduled_tokens >= envs.VLLM_TBO_MIN_TOKENS:
-                use_tbo = True
+            use_tbo = True

    if use_tbo:
        num_input_tokens_left = input_split.scheduler_output_left.total_num_scheduled_tokens
@@ -303,11 +302,12 @@ def tbo_split_and_execute_model(
    else:
        # Run the decoder.
        # Use persistent buffers for CUDA graphs.
+        envs.VLLM_ENABLE_TBO = False
        with set_forward_context(attn_metadata,
                                runner.vllm_config,
                                num_tokens=num_input_tokens,
                                num_tokens_across_dp=num_tokens_across_dp,
-                                skip_cuda_graphs=True):
+                                skip_cuda_graphs=skip_cuda_graphs):
            runner.maybe_setup_kv_connector(scheduler_output)

            model_output = runner.model(
@@ -320,4 +320,5 @@ def tbo_split_and_execute_model(
            runner.maybe_wait_for_kv_save()
            finished_sending, finished_recving = (
                runner.get_finished_kv_transfers(scheduler_output))
+        envs.VLLM_ENABLE_TBO = True
    return model_output, finished_sending, finished_recving
\ No newline at end of file
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -1616,7 +1616,8 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
            model_output, finished_sending, finished_recving = \
                 tbo_split_and_execute_model(self, attn_metadata, num_input_tokens,
                                             num_tokens_across_dp, input_ids, positions,
-                                             inputs_embeds, scheduler_output, intermediate_tensors)
+                                             inputs_embeds, scheduler_output, intermediate_tensors,
+                                             skip_cuda_graphs)
        else:
            # Run the model.
            # Use persistent buffers for CUDA graphs.

--- a/vllm/zero_overhead/v1/gpu_model_runner.py
+++ b/vllm/zero_overhead/v1/gpu_model_runner.py
@@ -476,7 +476,8 @@ class V1ZeroModelRunner(GPUModelRunner):
            model_output, finished_sending, finished_recving = \
                 tbo_split_and_execute_model(self, attn_metadata, num_input_tokens,
                                             num_tokens_across_dp, input_ids, positions,
-                                             inputs_embeds, scheduler_output, intermediate_tensors)
+                                             inputs_embeds, scheduler_output, intermediate_tensors, 
+                                             skip_cuda_graphs)
        else:
            # Run the model.
            # Use persistent buffers for CUDA graphs.