Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
2dde535d
Unverified
Commit
2dde535d
authored
Mar 07, 2026
by
Richard Zou
Committed by
GitHub
Mar 07, 2026
Browse files
[compile] Split compile/warmup monitoring (#36098)
parent
379689d5
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
125 additions
and
50 deletions
+125
-50
vllm/compilation/caching.py
vllm/compilation/caching.py
+25
-1
vllm/compilation/decorators.py
vllm/compilation/decorators.py
+41
-27
vllm/compilation/monitor.py
vllm/compilation/monitor.py
+59
-22
No files found.
vllm/compilation/caching.py
View file @
2dde535d
...
...
@@ -189,13 +189,13 @@ class VllmSerializableFunction(SerializableCallable): # type: ignore[misc]
self
.
shape_env
=
None
self
.
vllm_backend
=
vllm_backend
self
.
sym_tensor_indices
=
sym_tensor_indices
self
.
_fake_mode
:
Any
|
None
=
None
import
torch._functorch.config
as
functorch_config
self
.
aot_autograd_config
=
(
aot_autograd_config
or
functorch_config
.
save_config_portable
()
)
sym_input
=
next
(
(
i
for
i
in
self
.
example_inputs
if
isinstance
(
i
,
torch
.
SymInt
)),
None
)
...
...
@@ -217,6 +217,7 @@ class VllmSerializableFunction(SerializableCallable): # type: ignore[misc]
state
.
pop
(
"optimized_call"
)
state
.
pop
(
"shape_env"
)
state
.
pop
(
"vllm_backend"
,
None
)
state
.
pop
(
"_fake_mode"
,
None
)
for
node
in
state
[
"graph_module"
].
graph
.
nodes
:
node
.
meta
.
pop
(
"source_fn_stack"
,
None
)
node
.
meta
.
pop
(
"nn_module_stack"
,
None
)
...
...
@@ -351,8 +352,31 @@ class VllmSerializableFunction(SerializableCallable): # type: ignore[misc]
return
fn
.
optimized_call
(
*
example_inputs
)
fn
=
cls
(
**
state
,
optimized_call
=
optimized_call
)
fn
.
_fake_mode
=
fake_mode
return
fn
def
finalize_loading
(
self
,
vllm_config
:
VllmConfig
)
->
None
:
"""Eagerly initialize the compiled backend and perform all loading.
Must be called after _verify_source_unchanged has populated
compilation_config.traced_files, which is needed for cache dir
computation.
"""
if
self
.
_fake_mode
is
None
:
return
# Already finalized, or mega path (no _fake_mode set)
from
torch._guards
import
TracingContext
,
tracing
from
vllm.compilation.backends
import
VllmBackend
vllm_backend
=
VllmBackend
(
vllm_config
,
self
.
prefix
,
self
.
is_encoder
)
with
tracing
(
TracingContext
(
self
.
_fake_mode
)):
result
=
vllm_backend
(
self
.
graph_module
,
list
(
self
.
example_inputs
))
self
.
optimized_call
=
result
.
optimized_call
self
.
vllm_backend
=
vllm_backend
self
.
_fake_mode
=
None
@
property
def
co_name
(
self
)
->
Literal
[
"VllmSerializableFunction"
]:
"""
...
...
vllm/compilation/decorators.py
View file @
2dde535d
...
...
@@ -30,7 +30,7 @@ from vllm.sequence import IntermediateTensors
from
vllm.utils.import_utils
import
resolve_obj_by_qualname
from
vllm.utils.torch_utils
import
is_torch_equal_or_newer
from
.monitor
import
start_
monitor
ing
_torch_compile
from
.monitor
import
monitor
_profiling_run
,
monitor
_torch_compile
if
TYPE_CHECKING
:
# Only added on nightly/2.10 so wrap
...
...
@@ -434,17 +434,24 @@ def _support_torch_compile(
cache_dir
=
os
.
path
.
join
(
cache_dir
,
f
"rank_
{
rank
}
_
{
dp_rank
}
"
)
aot_compilation_path
=
os
.
path
.
join
(
cache_dir
,
"model"
)
try
:
with
(
set_current_vllm_config
(
self
.
vllm_config
),
open
(
aot_compilation_path
,
"rb"
)
as
f
,
):
start_monitoring_torch_compile
(
self
.
vllm_config
)
loaded_fn
=
torch
.
compiler
.
load_compiled_function
(
f
,
f_globals
=
self
.
forward
.
__globals__
)
_verify_source_unchanged
(
loaded_fn
.
source_info
(),
self
.
vllm_config
)
if
not
self
.
compilation_config
.
dynamic_shapes_config
.
evaluate_guards
:
loaded_fn
.
disable_guard_check
()
with
monitor_torch_compile
(
self
.
vllm_config
):
with
(
set_current_vllm_config
(
self
.
vllm_config
),
open
(
aot_compilation_path
,
"rb"
)
as
f
,
):
loaded_fn
=
torch
.
compiler
.
load_compiled_function
(
f
,
f_globals
=
self
.
forward
.
__globals__
)
_verify_source_unchanged
(
loaded_fn
.
source_info
(),
self
.
vllm_config
)
ds_config
=
self
.
compilation_config
.
dynamic_shapes_config
if
not
ds_config
.
evaluate_guards
:
loaded_fn
.
disable_guard_check
()
# Eagerly load compiled artifacts now that traced_files
# is populated by _verify_source_unchanged.
with
maybe_use_cudagraph_partition_wrapper
(
self
.
vllm_config
):
loaded_fn
.
_artifacts
.
compiled_fn
.
finalize_loading
(
self
.
vllm_config
)
self
.
aot_compiled_fn
=
loaded_fn
self
.
was_aot_compile_fn_loaded_from_disk
=
True
except
Exception
as
e
:
...
...
@@ -465,12 +472,11 @@ def _support_torch_compile(
logger
.
info
(
"Directly load AOT compilation from path %s"
,
aot_compilation_path
)
# Apply partition wrapper context for proper CUDA graph capture
from
.
monitor
import
end_monitoring_torch_compile
with
maybe_use_cudagraph_partition_wrapper
(
self
.
vllm_config
):
with
(
monitor
_profiling_run
(),
maybe_use_cudagraph_partition_wrapper
(
self
.
vllm_config
),
):
output
=
self
.
aot_compiled_fn
(
self
,
*
args
,
**
kwargs
)
end_monitoring_torch_compile
(
self
.
vllm_config
)
return
output
if
self
.
compiled
:
...
...
@@ -489,8 +495,6 @@ def _support_torch_compile(
**
kwargs
,
)
# here, it is the starting point of the `torch.compile` process
start_monitoring_torch_compile
(
self
.
vllm_config
)
original_code_object
=
self
.
original_code_object
()
logger
.
debug
(
"Start compiling function %s"
,
original_code_object
)
...
...
@@ -559,16 +563,26 @@ def _support_torch_compile(
# store the path for saving after warmup
self
.
_aot_compilation_path
=
aot_compilation_path
self
.
_aot_cache_dir
=
cache_dir
self
.
aot_compiled_fn
=
self
.
aot_compile
(
*
args
,
**
kwargs
)
# All compilation is done at this point, save the AOT artifact.
self
.
save_aot_compiled_function
()
output
=
self
.
aot_compiled_fn
(
self
,
*
args
,
**
kwargs
)
else
:
output
=
TorchCompileWithNoGuardsWrapper
.
__call__
(
self
,
*
args
,
**
kwargs
)
# type: ignore[arg-type]
with
monitor_torch_compile
(
self
.
vllm_config
):
self
.
aot_compiled_fn
=
self
.
aot_compile
(
*
args
,
**
kwargs
)
# All compilation is done at this point, save the
# AOT artifact.
self
.
save_aot_compiled_function
()
from
.monitor
import
end_monitoring_torch_compile
with
monitor_profiling_run
():
output
=
self
.
aot_compiled_fn
(
self
,
*
args
,
**
kwargs
)
else
:
with
monitor_torch_compile
(
self
.
vllm_config
,
"torch.compile and initial profiling/warmup "
"run together took %.2f s in total"
,
):
output
=
TorchCompileWithNoGuardsWrapper
.
__call__
(
self
,
# type: ignore[arg-type]
*
args
,
**
kwargs
,
)
end_monitoring_torch_compile
(
self
.
vllm_config
)
self
.
compiled
=
True
return
output
...
...
vllm/compilation/monitor.py
View file @
2dde535d
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
contextlib
import
time
from
collections.abc
import
Generator
from
vllm.config
import
CompilationConfig
,
CompilationMode
,
VllmConfig
from
vllm.config
import
CompilationMode
,
VllmConfig
from
vllm.logger
import
init_logger
logger
=
init_logger
(
__name__
)
context_manager
=
None
# Shared global so backends.py can read the start time for Dynamo timing.
torch_compile_start_time
:
float
=
0.0
def
start_monitoring_torch_compile
(
vllm_config
:
VllmConfig
)
->
None
:
@
contextlib
.
contextmanager
def
monitor_torch_compile
(
vllm_config
:
VllmConfig
,
message
:
str
=
"torch.compile took %.2f s in total"
,
)
->
Generator
[
None
,
None
,
None
]:
"""Context manager that times torch.compile and manages depyf debugging.
On normal exit: logs the compile time and exits depyf.
On exception: cleans up depyf without logging (compilation failed).
"""
global
torch_compile_start_time
torch_compile_start_time
=
time
.
perf_counter
()
compilation_config
:
CompilationConfig
=
vllm_config
.
compilation_config
compilation_config
=
vllm_config
.
compilation_config
depyf_cm
=
None
path
=
vllm_config
.
compile_debug_dump_path
()
if
compilation_config
.
mode
==
CompilationMode
.
VLLM_COMPILE
and
path
:
import
depyf
path
.
mkdir
(
parents
=
True
,
exist_ok
=
True
)
logger
.
debug
(
"Dumping depyf output to %s"
,
path
)
global
context_manager
context_manager
=
depyf
.
prepare_debug
(
path
.
as_posix
())
context_manager
.
__enter__
()
def
end_monitoring_torch_compile
(
vllm_config
:
VllmConfig
)
->
None
:
compilation_config
:
CompilationConfig
=
vllm_config
.
compilation_config
total_compile_time
:
float
=
time
.
perf_counter
()
-
torch_compile_start_time
if
compilation_config
.
mode
==
CompilationMode
.
VLLM_COMPILE
:
logger
.
info_once
(
"torch.compile and initial profiling run took %.2f s in total"
,
total_compile_time
,
scope
=
"local"
,
)
global
context_manager
if
context_manager
is
not
None
:
context_manager
.
__exit__
(
None
,
None
,
None
)
context_manager
=
None
depyf_cm
=
depyf
.
prepare_debug
(
path
.
as_posix
())
depyf_cm
.
__enter__
()
try
:
yield
except
Exception
:
raise
else
:
total_compile_time
=
time
.
perf_counter
()
-
torch_compile_start_time
if
compilation_config
.
mode
==
CompilationMode
.
VLLM_COMPILE
:
logger
.
info_once
(
message
,
total_compile_time
,
scope
=
"local"
)
finally
:
if
depyf_cm
is
not
None
:
try
:
depyf_cm
.
__exit__
(
None
,
None
,
None
)
except
Exception
:
logger
.
warning
(
"Exception during depyf cleanup."
,
exc_info
=
True
)
@
contextlib
.
contextmanager
def
monitor_profiling_run
()
->
Generator
[
None
,
None
,
None
]:
"""Context manager that times the initial profiling run.
Asserts that no backend compilation occurs during the profiling run
(all compilation should have completed before this point).
"""
from
vllm.compilation.counter
import
compilation_counter
backend_compilations_before
=
compilation_counter
.
num_backend_compilations
start
=
time
.
perf_counter
()
yield
elapsed
=
time
.
perf_counter
()
-
start
assert
(
compilation_counter
.
num_backend_compilations
==
backend_compilations_before
),
(
"backend compilation occurred during the initial profiling run; "
"all compilation should be complete before the profiling run starts."
)
logger
.
info_once
(
"Initial profiling/warmup run took %.2f s"
,
elapsed
,
scope
=
"local"
,
)
cudagraph_capturing_enabled
:
bool
=
True
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment