Commit 645fcfd9 authored by zhuwenwen
Browse files

Merge branch 'v0.9.2-dev-lzg' into 'v0.9.2-dev'

When TBO is enabled, DeepSeek prefill uses eager mode

See merge request dcutoolkit/deeplearing/vllm!189
parents 20316346 8702264b
...@@ -9,9 +9,10 @@ import torch ...@@ -9,9 +9,10 @@ import torch
import torch.nn as nn import torch.nn as nn
from torch._dynamo.symbolic_convert import InliningInstructionTranslator from torch._dynamo.symbolic_convert import InliningInstructionTranslator
from vllm import envs
from vllm.compilation.counter import compilation_counter from vllm.compilation.counter import compilation_counter
from vllm.compilation.wrapper import TorchCompileWrapperWithCustomDispatcher from vllm.compilation.wrapper import TorchCompileWrapperWithCustomDispatcher
from vllm.forward_context import get_profilling from vllm.forward_context import get_forward_context, get_profilling
from vllm.config import CompilationLevel, VllmConfig from vllm.config import CompilationLevel, VllmConfig
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.sequence import IntermediateTensors from vllm.sequence import IntermediateTensors
...@@ -170,6 +171,10 @@ def _support_torch_compile( ...@@ -170,6 +171,10 @@ def _support_torch_compile(
# torch.compiler.is_compiling() means we are inside the compilation # torch.compiler.is_compiling() means we are inside the compilation
# e.g. TPU has the compilation logic in model runner, so we don't # e.g. TPU has the compilation logic in model runner, so we don't
# need to compile the model inside. # need to compile the model inside.
skip_cuda_graphs = get_forward_context().skip_cuda_graphs
if envs.VLLM_ENABLE_TBO and skip_cuda_graphs:
return self.forward(*args, **kwargs)
if self.do_not_compile or torch.compiler.is_compiling() or get_profilling(): if self.do_not_compile or torch.compiler.is_compiling() or get_profilling():
return self.forward(*args, **kwargs) return self.forward(*args, **kwargs)
......
...@@ -72,7 +72,6 @@ class TwoBatchOverlap(): ...@@ -72,7 +72,6 @@ class TwoBatchOverlap():
init_tbo_forward_context(False, self.right_tid) init_tbo_forward_context(False, self.right_tid)
with torch.cuda.stream(tbo_step_stream): with torch.cuda.stream(tbo_step_stream):
queue.get() queue.get()
profile.ProfRangePush('start')
self.tbo_thread_synchronize(tid) self.tbo_thread_synchronize(tid)
if is_left_thread: if is_left_thread:
attn_metadata = self.attn_metadata_left attn_metadata = self.attn_metadata_left
...@@ -104,22 +103,17 @@ class TwoBatchOverlap(): ...@@ -104,22 +103,17 @@ class TwoBatchOverlap():
self.states_left_queue.put(model_output) self.states_left_queue.put(model_output)
else: else:
self.states_right_queue.put(model_output) self.states_right_queue.put(model_output)
profile.ProfRangePop()
def tbo_thread_synchronize(self, tid): def tbo_thread_synchronize(self, tid):
if tid == self.left_tid: if tid == self.left_tid:
if not self.left_first: if not self.left_first:
self.sem_right.release() self.sem_right.release()
self.left_first = False self.left_first = False
profile.ProfRangePop()
self.sem_left.acquire() self.sem_left.acquire()
profile.ProfRangePush('left')
return self.event_left_c2t, self.event_left_t2c return self.event_left_c2t, self.event_left_t2c
else: else:
self.sem_left.release() self.sem_left.release()
profile.ProfRangePop()
self.sem_right.acquire() self.sem_right.acquire()
profile.ProfRangePush('right')
return self.event_right_c2t, self.event_right_t2c return self.event_right_c2t, self.event_right_t2c
def set_model_input(self, def set_model_input(self,
......
...@@ -1373,7 +1373,7 @@ class GPUModelRunner(LoRAModelRunnerMixin): ...@@ -1373,7 +1373,7 @@ class GPUModelRunner(LoRAModelRunnerMixin):
# compiled with full CUDA graphs, we have to skip them entirely. # compiled with full CUDA graphs, we have to skip them entirely.
skip_cuda_graphs = self.full_cuda_graph and not attention_cuda_graphs skip_cuda_graphs = self.full_cuda_graph and not attention_cuda_graphs
if envs.VLLM_ENABLE_TBO and not self.use_cuda_graph: if envs.VLLM_ENABLE_TBO and (not self.use_cuda_graph or skip_cuda_graphs):
model_output, finished_sending, finished_recving = \ model_output, finished_sending, finished_recving = \
tbo_split_and_execute_model(self, attn_metadata, num_input_tokens, tbo_split_and_execute_model(self, attn_metadata, num_input_tokens,
num_tokens_across_dp, input_ids, positions, num_tokens_across_dp, input_ids, positions,
......
...@@ -473,7 +473,7 @@ class V1ZeroModelRunner(GPUModelRunner): ...@@ -473,7 +473,7 @@ class V1ZeroModelRunner(GPUModelRunner):
# compiled with full CUDA graphs, we have to skip them entirely. # compiled with full CUDA graphs, we have to skip them entirely.
skip_cuda_graphs = self.full_cuda_graph and not attention_cuda_graphs skip_cuda_graphs = self.full_cuda_graph and not attention_cuda_graphs
if envs.VLLM_ENABLE_TBO and not self.use_cuda_graph: if envs.VLLM_ENABLE_TBO and (not self.use_cuda_graph or skip_cuda_graphs):
model_output, finished_sending, finished_recving = \ model_output, finished_sending, finished_recving = \
tbo_split_and_execute_model(self, attn_metadata, num_input_tokens, tbo_split_and_execute_model(self, attn_metadata, num_input_tokens,
num_tokens_across_dp, input_ids, positions, num_tokens_across_dp, input_ids, positions,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment