When TBO (two-batch overlap) is enabled, run DeepSeek prefill in eager mode

3b7124f5 · zhuwenwen · aaef2077 · 3b7124f5 · 3b7124f5 · 3b7124f5
Commit 3b7124f5 authored Aug 27, 2025 by zhuwenwen
4 changed files
--- a/vllm/compilation/decorators.py
+++ b/vllm/compilation/decorators.py
@@ -9,9 +9,10 @@ import torch
 import torch.nn as nn
 from torch._dynamo.symbolic_convert import InliningInstructionTranslator

+from vllm import envs
 from vllm.compilation.counter import compilation_counter
 from vllm.compilation.wrapper import TorchCompileWrapperWithCustomDispatcher
-from vllm.forward_context import get_profilling
+from vllm.forward_context import get_forward_context, get_profilling
 from vllm.config import CompilationLevel, VllmConfig
 from vllm.logger import init_logger
 from vllm.sequence import IntermediateTensors
@@ -203,6 +204,10 @@ def _support_torch_compile(
        # torch.compiler.is_compiling() means we are inside the compilation
        # e.g. TPU has the compilation logic in model runner, so we don't
        # need to compile the model inside.
+        skip_cuda_graphs = get_forward_context().skip_cuda_graphs
+        if envs.VLLM_ENABLE_TBO and skip_cuda_graphs:
+            return self.forward(*args, **kwargs)
+
        if self.do_not_compile or torch.compiler.is_compiling() or get_profilling():
            return self.forward(*args, **kwargs)


--- a/vllm/two_batch_overlap/v1/two_batch_overlap_v1.py
+++ b/vllm/two_batch_overlap/v1/two_batch_overlap_v1.py
@@ -72,7 +72,6 @@ class TwoBatchOverlap():
            init_tbo_forward_context(False, self.right_tid)
        with torch.cuda.stream(tbo_step_stream):
            queue.get()
-            profile.ProfRangePush('start')
            self.tbo_thread_synchronize(tid)
            if is_left_thread:
                attn_metadata = self.attn_metadata_left
@@ -104,22 +103,17 @@ class TwoBatchOverlap():
                self.states_left_queue.put(model_output)
            else:
                self.states_right_queue.put(model_output)
-            profile.ProfRangePop()

    def tbo_thread_synchronize(self, tid):
        if tid == self.left_tid:
            if not self.left_first:
                self.sem_right.release()
            self.left_first = False
-            profile.ProfRangePop()
            self.sem_left.acquire()
-            profile.ProfRangePush('left')
            return self.event_left_c2t, self.event_left_t2c
        else:
            self.sem_left.release()
-            profile.ProfRangePop()
            self.sem_right.acquire()
-            profile.ProfRangePush('right')
            return self.event_right_c2t, self.event_right_t2c

    def set_model_input(self,

--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -1612,7 +1612,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
        cudagraph_runtime_mode, batch_descriptor = \
            self.cudagraph_dispatcher.dispatch(batch_descriptor)

-        if envs.VLLM_ENABLE_TBO and not self.use_cuda_graph:
+        if envs.VLLM_ENABLE_TBO and (not self.use_cuda_graph or skip_cuda_graphs):
            model_output, finished_sending, finished_recving = \
                 tbo_split_and_execute_model(self, attn_metadata, num_input_tokens,
                                             num_tokens_across_dp, input_ids, positions,

--- a/vllm/zero_overhead/v1/gpu_model_runner.py
+++ b/vllm/zero_overhead/v1/gpu_model_runner.py
@@ -472,7 +472,7 @@ class V1ZeroModelRunner(GPUModelRunner):
        # compiled with full CUDA graphs, we have to skip them entirely.
        skip_cuda_graphs = self.full_cuda_graph and not attention_cuda_graphs

-        if envs.VLLM_ENABLE_TBO and not self.use_cuda_graph:
+        if envs.VLLM_ENABLE_TBO and (not self.use_cuda_graph or skip_cuda_graphs):
            model_output, finished_sending, finished_recving = \
                 tbo_split_and_execute_model(self, attn_metadata, num_input_tokens,
                                             num_tokens_across_dp, input_ids, positions,