[Bugfix] Include inductor and functorch configs in compilation cache key (#40627)

Signed-off-by: Richard Zou <zou3519@gmail.com>

[Bugfix] Include inductor and functorch configs in compilation cache key (#40627)
Signed-off-by: Richard Zou <zou3519@gmail.com>
424033f4 · Richard Zou · GitHub · da1e7311 · 424033f4 · 424033f4
Unverified commit 424033f4 · authored Apr 23, 2026 by Richard Zou · committed by GitHub Apr 23, 2026
3 changed files
--- a/tests/compile/h100/test_startup.py
+++ b/tests/compile/h100/test_startup.py
@@ -56,6 +56,7 @@ def _cold_start(vllm_runner):
 def test_moe_startup(monkeypatch, vllm_runner, fresh_vllm_cache, mega_aot_artifact):
    monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
    monkeypatch.setenv("VLLM_USE_MEGA_AOT_ARTIFACT", mega_aot_artifact)
+    monkeypatch.setenv("VLLM_DEEP_GEMM_WARMUP", "skip")

    # Cold start in a forked child (must fork before CUDA init).
    # This model has 32 identical transformer layers which produce
@@ -235,6 +236,7 @@ def _cold_start_model(vllm_runner, spec: ModelStartupSpec):
 @fork_new_process_for_each_test
 def test_model_startup(monkeypatch, vllm_runner, fresh_vllm_cache, spec):
    monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
+    monkeypatch.setenv("VLLM_DEEP_GEMM_WARMUP", "skip")

    # Cold start in a forked child (must fork before CUDA init).
    ctx = mp.get_context("fork")

--- a/tests/compile/test_config.py
+++ b/tests/compile/test_config.py
@@ -617,6 +617,24 @@ def test_inductor_asserts_enabled_in_debug(monkeypatch):
        assert config.inductor_compile_config.get("scalar_asserts") is True


+def test_get_inductor_factors_includes_configs():
+    """Changing inductor or functorch config must change the cache key factors."""
+    from torch._functorch import config as functorch_config
+    from torch._inductor import config as inductor_config
+
+    from vllm.compilation.compiler_interface import get_inductor_factors
+
+    baseline = get_inductor_factors()
+
+    with inductor_config.patch("max_autotune", not inductor_config.max_autotune):
+        patched = get_inductor_factors()
+    assert baseline != patched, "inductor config change was not reflected"
+
+    with functorch_config.patch("donated_buffer", not functorch_config.donated_buffer):
+        patched = get_inductor_factors()
+    assert baseline != patched, "functorch config change was not reflected"
+
+
 def test_inductor_asserts_user_override(monkeypatch):
    """Test that explicit inductor_compile_config overrides the
    debug-logging default."""

--- a/vllm/compilation/compiler_interface.py
+++ b/vllm/compilation/compiler_interface.py
@@ -152,6 +152,17 @@ class AlwaysHitShapeEnv:
        return ""


+def _get_vllm_functorch_config() -> dict[str, Any]:
+    """Return the functorch config overrides that vLLM applies at compile time.
+
+    Used by both set_functorch_config() and get_inductor_factors() to ensure
+    the compile-time config and cache key are always consistent."""
+    cfg: dict[str, Any] = {}
+    if not envs.VLLM_USE_MEGA_AOT_ARTIFACT:
+        cfg["bundled_autograd_cache"] = False
+    return cfg
+
+
 def get_inductor_factors() -> list[Any]:
    factors: list[Any] = []
    # summarize system state
@@ -165,6 +176,13 @@ def get_inductor_factors() -> list[Any]:

    torch_factors = torch_key()
    factors.append(torch_factors)
+
+    from torch._functorch import config as functorch_config
+    from torch._inductor import config as inductor_config
+
+    factors.append(inductor_config.save_config_portable())
+    with functorch_config.patch(_get_vllm_functorch_config()):
+        factors.append(functorch_config.save_config_portable())
    return factors


@@ -739,8 +757,8 @@ def set_inductor_config(config: dict[str, Any], compile_range: Range) -> None:


 def set_functorch_config() -> None:
-    if not envs.VLLM_USE_MEGA_AOT_ARTIFACT:
-        torch._functorch.config.bundled_autograd_cache = False
+    for k, v in _get_vllm_functorch_config().items():
+        setattr(torch._functorch.config, k, v)


 class EagerAdaptor(CompilerInterface):