Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
3b7124f5
Commit
3b7124f5
authored
Aug 27, 2025
by
zhuwenwen
Browse files
When using TBO, DeepSeek prefill uses eager mode
parent
aaef2077
Changes
4
Show whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
8 additions
and
9 deletions
+8
-9
vllm/compilation/decorators.py
vllm/compilation/decorators.py
+6
-1
vllm/two_batch_overlap/v1/two_batch_overlap_v1.py
vllm/two_batch_overlap/v1/two_batch_overlap_v1.py
+0
-6
vllm/v1/worker/gpu_model_runner.py
vllm/v1/worker/gpu_model_runner.py
+1
-1
vllm/zero_overhead/v1/gpu_model_runner.py
vllm/zero_overhead/v1/gpu_model_runner.py
+1
-1
No files found.
vllm/compilation/decorators.py
View file @
3b7124f5
...
...
@@ -9,9 +9,10 @@ import torch
import
torch.nn
as
nn
from
torch._dynamo.symbolic_convert
import
InliningInstructionTranslator
from
vllm
import
envs
from
vllm.compilation.counter
import
compilation_counter
from
vllm.compilation.wrapper
import
TorchCompileWrapperWithCustomDispatcher
from
vllm.forward_context
import
get_profilling
from
vllm.forward_context
import
get_forward_context
,
get_profilling
from
vllm.config
import
CompilationLevel
,
VllmConfig
from
vllm.logger
import
init_logger
from
vllm.sequence
import
IntermediateTensors
...
...
@@ -203,6 +204,10 @@ def _support_torch_compile(
# torch.compiler.is_compiling() means we are inside the compilation
# e.g. TPU has the compilation logic in model runner, so we don't
# need to compile the model inside.
skip_cuda_graphs
=
get_forward_context
().
skip_cuda_graphs
if
envs
.
VLLM_ENABLE_TBO
and
skip_cuda_graphs
:
return
self
.
forward
(
*
args
,
**
kwargs
)
if
self
.
do_not_compile
or
torch
.
compiler
.
is_compiling
()
or
get_profilling
():
return
self
.
forward
(
*
args
,
**
kwargs
)
...
...
vllm/two_batch_overlap/v1/two_batch_overlap_v1.py
View file @
3b7124f5
...
...
@@ -72,7 +72,6 @@ class TwoBatchOverlap():
init_tbo_forward_context
(
False
,
self
.
right_tid
)
with
torch
.
cuda
.
stream
(
tbo_step_stream
):
queue
.
get
()
profile
.
ProfRangePush
(
'start'
)
self
.
tbo_thread_synchronize
(
tid
)
if
is_left_thread
:
attn_metadata
=
self
.
attn_metadata_left
...
...
@@ -104,22 +103,17 @@ class TwoBatchOverlap():
self
.
states_left_queue
.
put
(
model_output
)
else
:
self
.
states_right_queue
.
put
(
model_output
)
profile
.
ProfRangePop
()
def
tbo_thread_synchronize
(
self
,
tid
):
if
tid
==
self
.
left_tid
:
if
not
self
.
left_first
:
self
.
sem_right
.
release
()
self
.
left_first
=
False
profile
.
ProfRangePop
()
self
.
sem_left
.
acquire
()
profile
.
ProfRangePush
(
'left'
)
return
self
.
event_left_c2t
,
self
.
event_left_t2c
else
:
self
.
sem_left
.
release
()
profile
.
ProfRangePop
()
self
.
sem_right
.
acquire
()
profile
.
ProfRangePush
(
'right'
)
return
self
.
event_right_c2t
,
self
.
event_right_t2c
def
set_model_input
(
self
,
...
...
vllm/v1/worker/gpu_model_runner.py
View file @
3b7124f5
...
...
@@ -1612,7 +1612,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
cudagraph_runtime_mode
,
batch_descriptor
=
\
self
.
cudagraph_dispatcher
.
dispatch
(
batch_descriptor
)
if
envs
.
VLLM_ENABLE_TBO
and
not
self
.
use_cuda_graph
:
if
envs
.
VLLM_ENABLE_TBO
and
(
not
self
.
use_cuda_graph
or
skip_cuda_graphs
)
:
model_output
,
finished_sending
,
finished_recving
=
\
tbo_split_and_execute_model
(
self
,
attn_metadata
,
num_input_tokens
,
num_tokens_across_dp
,
input_ids
,
positions
,
...
...
vllm/zero_overhead/v1/gpu_model_runner.py
View file @
3b7124f5
...
...
@@ -472,7 +472,7 @@ class V1ZeroModelRunner(GPUModelRunner):
# compiled with full CUDA graphs, we have to skip them entirely.
skip_cuda_graphs
=
self
.
full_cuda_graph
and
not
attention_cuda_graphs
if
envs
.
VLLM_ENABLE_TBO
and
not
self
.
use_cuda_graph
:
if
envs
.
VLLM_ENABLE_TBO
and
(
not
self
.
use_cuda_graph
or
skip_cuda_graphs
)
:
model_output
,
finished_sending
,
finished_recving
=
\
tbo_split_and_execute_model
(
self
,
attn_metadata
,
num_input_tokens
,
num_tokens_across_dp
,
input_ids
,
positions
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment