[torch.compile] Improve cold and warm start compile tests (#35709)

Signed-off-by: Richard Zou <zou3519@gmail.com>

[torch.compile] Improve cold and warm start compile tests (#35709)
Signed-off-by: Richard Zou <zou3519@gmail.com>
d1a6e96d · Richard Zou · GitHub · 2a9e3347 · 2a9e3347 · d1a6e96d
Unverified Commit d1a6e96d authored Mar 02, 2026 by Richard Zou Committed by GitHub Mar 02, 2026
5 changed files
--- a/tests/compile/test_cold_start.py
+++ b/tests/compile/test_cold_start.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from torch._dynamo.utils import counters
-from vllm import LLM
-from vllm.config import CompilationConfig, CompilationMode, CUDAGraphMode
-def test_moe_compilation_cold_start(monkeypatch, use_fresh_inductor_cache):
-    # Run in same process so we can access PyTorch's internal counters
-    monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
-    # I'm not sure if this is going to affect the numbers
-    monkeypatch.setenv("VLLM_USE_AOT_COMPILE", "0")
-    # Force cold compilation
-    monkeypatch.setenv("VLLM_DISABLE_COMPILE_CACHE", "1")
-    compilation_config = CompilationConfig(
-        mode=CompilationMode.VLLM_COMPILE,
-        cudagraph_mode=CUDAGraphMode.NONE,  # make the model loading faster
-    )
-    counters.clear()
-    _ = LLM(
-        model="microsoft/Phi-tiny-MoE-instruct",
-        max_model_len=256,
-        load_format="dummy",  # make the model loading faster
-        compilation_config=compilation_config,
-        num_gpu_blocks_override=8,  # make the model loading faster
-    )
-    # vLLM-compile cold start is special. By default, we do
-    # one full dynamo capture of the entire forward pass.
-    # The forward pass consists of 32 transformer layers.
-    # Then, we split on the attention operation. This results in
-    # 33 subgraphs (not including the attention operation).
-    # We then generate compiled artifacts for the unique subgraphs.
-    #
-    # There are actually only 3 unique subgraphs for this model
-    # (all of its transformer layers are the same modulo weights);
-    # this is true for most vLLM models.
-    # So we test that during cold start, we are only compling
-    # for 3 unique subgraphs.
-    assert counters["aot_autograd"]["autograd_cache_miss"] == 3
-    assert counters["aot_autograd"]["autograd_cache_hit"] == 0
--- a/tests/compile/test_startup.py
+++ b/tests/compile/test_startup.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Cold start and warm start tests for vLLM-compile.
+Cold start runs in a forked child (must fork before CUDA init) which
+populates on-disk caches and asserts cold-start counters.  Warm start
+then runs in the parent with clean in-memory state but populated caches.
+"""
+import multiprocessing as mp
+from torch._dynamo.utils import counters
+from vllm.compilation.counter import compilation_counter
+from vllm.config import CompilationConfig, CompilationMode, CUDAGraphMode
+MODEL = "microsoft/Phi-tiny-MoE-instruct"
+def _run_vllm(vllm_runner):
+    with vllm_runner(
+        MODEL,
+        trust_remote_code=False,
+        max_model_len=256,
+        max_num_batched_tokens=1024,
+        load_format="dummy",
+        compilation_config=CompilationConfig(
+            mode=CompilationMode.VLLM_COMPILE,
+            cudagraph_mode=CUDAGraphMode.NONE,
+        ),
+        num_gpu_blocks_override=8,
+    ):
+        pass
+def _cold_start(vllm_runner):
+    counters.clear()
+    with compilation_counter.expect(
+        num_compiled_artifacts_saved=3,
+        num_compiled_artifacts_loaded=0,
+    ):
+        _run_vllm(vllm_runner)
+    assert counters["aot_autograd"]["total"] == 33
+    assert counters["aot_autograd"]["autograd_cache_miss"] == 3
+    assert counters["aot_autograd"]["autograd_cache_hit"] == 0
+def test_moe_startup(monkeypatch, vllm_runner, fresh_vllm_cache):
+    monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
+    # Cold start in a forked child (must fork before CUDA init).
+    # This model has 32 identical transformer layers which produce
+    # 33 subgraphs after splitting on attention — only 3 are unique.
+    ctx = mp.get_context("fork")
+    p = ctx.Process(target=_cold_start, args=(vllm_runner,))
+    p.start()
+    p.join()
+    assert p.exitcode == 0, "Cold-start child failed"
+    # Warm start — compiled artifacts loaded from disk cache.
+    counters.clear()
+    with compilation_counter.expect(
+        num_compiled_artifacts_loaded=3,
+        # TODO: warm start should not save any artifacts
+        # https://github.com/vllm-project/vllm/issues/35708
+        num_compiled_artifacts_saved=1,
+    ):
+        _run_vllm(vllm_runner)
+    assert counters["aot_autograd"]["total"] == 30
+    assert counters["aot_autograd"]["autograd_cache_miss"] == 0
+    assert counters["aot_autograd"]["autograd_cache_hit"] == 1
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -1548,6 +1548,14 @@ def use_fresh_inductor_cache():
        yield
+@pytest.fixture
+def fresh_vllm_cache(monkeypatch, use_fresh_inductor_cache):
+    """Temporary VLLM_CACHE_ROOT combined with a fresh inductor cache."""
+    with tempfile.TemporaryDirectory() as tmp_dir:
+        monkeypatch.setenv("VLLM_CACHE_ROOT", tmp_dir)
+        yield tmp_dir
 @pytest.fixture(scope="function")
 def enable_pickle(monkeypatch):
    """`LLM.apply_model` requires pickling a function."""

--- a/vllm/compilation/compiler_interface.py
+++ b/vllm/compilation/compiler_interface.py
@@ -368,6 +368,7 @@ class InductorStandaloneAdaptor(CompilerInterface):
        inductor_compiled_graph = torch._inductor.CompiledArtifact.load(
            path=path, format=self.save_format
        )
+        compilation_counter.num_compiled_artifacts_loaded += 1
        from torch._inductor.compile_fx import graph_returns_tuple
        returns_tuple = graph_returns_tuple(graph)

--- a/vllm/compilation/counter.py
+++ b/vllm/compilation/counter.py
@@ -29,6 +29,8 @@ class CompilationCounter:
    num_cache_entries_updated: int = 0
    # The number of standalone_compile compiled artifacts saved
    num_compiled_artifacts_saved: int = 0
+    # The number of standalone_compile compiled artifacts loaded from cache
+    num_compiled_artifacts_loaded: int = 0
    # Number of times a model was loaded with CompilationMode.STOCK_TORCH_COMPILE
    stock_torch_compile_count: int = 0