Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
4df841fe
Unverified
Commit
4df841fe
authored
Feb 08, 2026
by
Richard Zou
Committed by
GitHub
Feb 08, 2026
Browse files
[torch.compile] Add an option to force-enable the MOE cold start optimization (#33735)
Signed-off-by:
Richard Zou
<
zou3519@gmail.com
>
parent
a263aa61
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
18 additions
and
12 deletions
+18
-12
vllm/config/compilation.py
vllm/config/compilation.py
+9
-3
vllm/config/vllm.py
vllm/config/vllm.py
+8
-0
vllm/forward_context.py
vllm/forward_context.py
+1
-9
No files found.
vllm/config/compilation.py
View file @
4df841fe
...
...
@@ -593,7 +593,7 @@ class CompilationConfig:
local_cache_dir
:
str
=
field
(
default
=
None
,
init
=
False
)
# type: ignore
"""local cache dir for each rank"""
fast_moe_cold_start
=
True
fast_moe_cold_start
:
bool
|
None
=
None
"""Optimization for fast MOE cold start.
This is a bit of a hack that assumes that:
...
...
@@ -604,8 +604,14 @@ class CompilationConfig:
When the above two conditions hold, this option greatly decreases cold start
time for MOE models.
If the above two conditions don't hold, then this option will lead to silent
incorrectness. The only condition in which this doesn't hold is speculative
The options are:
- True: optimization is always on
- False: optimization is always off
- None: optimization is usually on, but is turned off for speculative decoding
If conditions 1&2 don't hold then this option will lead to silent
incorrectness.
The only condition in which this doesn't hold is speculative
decoding, where there is a draft model that may have MOEs in it.
NB: We're working on a longer-term solution that doesn't need these assumptions.
...
...
vllm/config/vllm.py
View file @
4df841fe
...
...
@@ -806,6 +806,14 @@ class VllmConfig:
else
:
self
.
compilation_config
.
custom_ops
.
append
(
"+rms_norm"
)
if
self
.
compilation_config
.
fast_moe_cold_start
is
None
:
# Resolve the default behavior, being as safe as possible.
# This option is unsafe if any speculative-decoding draft model has an MOE,
# so we conservatively turn it off whenever speculative decoding is configured.
self
.
compilation_config
.
fast_moe_cold_start
=
(
self
.
speculative_config
is
None
)
if
current_platform
.
support_static_graph_mode
():
# if cudagraph_mode has full cudagraphs, we need to check support
if
model_config
:
=
self
.
model_config
:
...
...
vllm/forward_context.py
View file @
4df841fe
...
...
@@ -287,15 +287,7 @@ def create_forward_context(
skip_compiled
:
bool
=
False
,
):
if
vllm_config
.
compilation_config
.
fast_moe_cold_start
:
if
vllm_config
.
speculative_config
is
None
:
all_moe_layers
=
vllm_config
.
compilation_config
.
static_all_moe_layers
else
:
logger
.
warning_once
(
"vllm_config.compilation_config.fast_moe_cold_start is not "
"compatible with speculative decoding so we are ignoring "
"fast_moe_cold_start."
)
all_moe_layers
=
None
all_moe_layers
=
vllm_config
.
compilation_config
.
static_all_moe_layers
else
:
all_moe_layers
=
None
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment