Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
81fe69ca
Unverified
Commit
81fe69ca
authored
Feb 07, 2026
by
Richard Zou
Committed by
GitHub
Feb 07, 2026
Browse files
[torch.compile] Stop compiling identical artifacts (#34003)
Signed-off-by:
Richard Zou
<
zou3519@gmail.com
>
parent
dd6a6e11
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
64 additions
and
11 deletions
+64
-11
tests/compile/test_cold_start.py
tests/compile/test_cold_start.py
+4
-4
vllm/compilation/backends.py
vllm/compilation/backends.py
+60
-7
No files found.
tests/compile/test_cold_start.py
View file @
81fe69ca
...
@@ -37,12 +37,12 @@ def test_moe_compilation_cold_start(monkeypatch, use_fresh_inductor_cache):
...
@@ -37,12 +37,12 @@ def test_moe_compilation_cold_start(monkeypatch, use_fresh_inductor_cache):
# The forward pass consists of 32 transformer layers.
# The forward pass consists of 32 transformer layers.
# Then, we split on the attention operation. This results in
# Then, we split on the attention operation. This results in
# 33 subgraphs (not including the attention operation).
# 33 subgraphs (not including the attention operation).
#
The 33 subgraphs then get standalone_compile'd
.
#
We then generate compiled artifacts for the unique subgraphs
.
#
#
# There are actually only 3 unique subgraphs for this model
# There are actually only 3 unique subgraphs for this model
# (all of its transformer layers are the same modulo weights);
# (all of its transformer layers are the same modulo weights);
# this is true for most vLLM models.
# this is true for most vLLM models.
# So we test that during cold start,
th
e a
ot_autograd cache
# So we test that during cold start,
w
e a
re only compiling
#
misses for 3 subgraphs and hits for the rest
.
#
for 3 unique subgraphs
.
assert
counters
[
"aot_autograd"
][
"autograd_cache_miss"
]
==
3
assert
counters
[
"aot_autograd"
][
"autograd_cache_miss"
]
==
3
assert
counters
[
"aot_autograd"
][
"autograd_cache_hit"
]
==
3
0
assert
counters
[
"aot_autograd"
][
"autograd_cache_hit"
]
==
0
vllm/compilation/backends.py
View file @
81fe69ca
...
@@ -134,6 +134,7 @@ class CompilerManager:
...
@@ -134,6 +134,7 @@ class CompilerManager:
self
.
is_cache_updated
=
False
self
.
is_cache_updated
=
False
self
.
compilation_config
=
compilation_config
self
.
compilation_config
=
compilation_config
self
.
compiler
=
make_compiler
(
compilation_config
)
self
.
compiler
=
make_compiler
(
compilation_config
)
self
.
loaded_artifacts
:
dict
[
str
,
Any
]
=
{}
def
compute_hash
(
self
,
vllm_config
:
VllmConfig
)
->
str
:
def
compute_hash
(
self
,
vllm_config
:
VllmConfig
)
->
str
:
return
self
.
compiler
.
compute_hash
(
vllm_config
)
return
self
.
compiler
.
compute_hash
(
vllm_config
)
...
@@ -282,13 +283,61 @@ class CompilerManager:
...
@@ -282,13 +283,61 @@ class CompilerManager:
maybe_key
+=
f
"
{
compile_range
.
start
}
_
{
compile_range
.
end
}
"
maybe_key
+=
f
"
{
compile_range
.
start
}
_
{
compile_range
.
end
}
"
maybe_key
+=
f
"_subgraph_
{
graph_index
}
"
maybe_key
+=
f
"_subgraph_
{
graph_index
}
"
with
self
.
compile_context
(
compile_range
):
with
self
.
compile_context
(
compile_range
):
compiled_graph
,
handle
=
self
.
compiler
.
compile
(
# There is a compilation time optimization here.
graph
,
#
example_inputs
,
# If the (input metadata, graph, compiler config) are the same, then
additional_inductor_config
,
# we want to avoid compiling the same artifact again. If we didn't
compile_range
,
# do this optimization, the backend compilation (InductorAdaptor or
maybe_key
,
# InductorStandaloneAdaptor)
)
# is able to cache hit and produce an artifact faster if it was
# already created, but it is still a duplicate artifact that
# requires unnecessary things e.g. disk IO.
#
# The optimization is: If the backend compilation cache hits,
# then do an early return from the backend compilation and look up
# which of the previous in-memory artifacts we created to reuse.
#
# We implemented this by monkey-patching torch (torch does not
# easily expose the cache_key function), but in the future torch
# should expose the cache_key function that we can just call
# directly before invoking backend compilation.
cache_key
=
None
orig
=
torch
.
_functorch
.
_aot_autograd
.
autograd_cache
.
autograd_cache_key
def
autograd_cache_key
(
*
args
,
**
kwargs
):
result
=
orig
(
*
args
,
**
kwargs
)
if
result
is
None
:
return
None
nonlocal
cache_key
cache_key
=
result
[
0
]
if
cache_key
in
self
.
loaded_artifacts
:
raise
StopCompiling
()
return
result
from
unittest.mock
import
patch
with
(
# Graphs that are isomorphic (different node names but same
# structure) should be treated as the same.
torch
.
_functorch
.
config
.
patch
(
autograd_cache_normalize_inputs
=
True
),
patch
(
"torch._functorch._aot_autograd.autograd_cache.autograd_cache_key"
,
autograd_cache_key
,
),
):
try
:
compiled_graph
,
handle
=
self
.
compiler
.
compile
(
graph
,
example_inputs
,
additional_inductor_config
,
compile_range
,
maybe_key
,
)
except
StopCompiling
:
assert
cache_key
is
not
None
return
self
.
loaded_artifacts
[
cache_key
]
if
cache_key
is
not
None
and
compiled_graph
is
not
None
:
self
.
loaded_artifacts
[
cache_key
]
=
compiled_graph
assert
compiled_graph
is
not
None
,
"Failed to compile the graph"
assert
compiled_graph
is
not
None
,
"Failed to compile the graph"
...
@@ -326,6 +375,10 @@ class CompilerManager:
...
@@ -326,6 +375,10 @@ class CompilerManager:
return
compiled_graph
return
compiled_graph
class
StopCompiling
(
BaseException
):
pass
@
dataclasses
.
dataclass
@
dataclasses
.
dataclass
class
SplitItem
:
class
SplitItem
:
submod_name
:
str
submod_name
:
str
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment