[compile] Add some more startup tests for top models (#38046)

Signed-off-by: Richard Zou <zou3519@gmail.com>

[compile] Add some more startup tests for top models (#38046)
Signed-off-by: Richard Zou <zou3519@gmail.com>
6e37c46b · Richard Zou · GitHub · 1bf2ddd0 · 6e37c46b · 6e37c46b
Unverified Commit 6e37c46b authored Mar 25, 2026 by Richard Zou Committed by GitHub Mar 25, 2026
3 changed files
--- a/.buildkite/test_areas/pytorch.yaml
+++ b/.buildkite/test_areas/pytorch.yaml
@@ -17,6 +17,16 @@ steps:
  # (using -0 for proper path handling)
  - "find compile/ -maxdepth 1 -name 'test_*.py' -print0 | xargs -0 -n1 -I{} pytest -s -v '{}'"

+- label: PyTorch Compilation Unit Tests (H100)
+  timeout_in_minutes: 30
+  device: h100
+  num_devices: 1
+  source_file_dependencies:
+    - vllm/
+    - tests/compile/h100/
+  commands:
+  - "find compile/h100/ -name 'test_*.py' -print0 | xargs -0 -n1 -I{} pytest -s -v '{}'"
+
 - label: PyTorch Compilation Passes Unit Tests
  timeout_in_minutes: 20
  source_file_dependencies:

--- a/tests/compile/h100/__init__.py
+++ b/tests/compile/h100/__init__.py
--- a/tests/compile/test_startup.py
+++ b/tests/compile/h100/test_startup.py
@@ -8,16 +8,17 @@ then runs in the parent with clean in-memory state but populated caches.
 """

 import multiprocessing as mp
+from typing import NamedTuple

 import pytest
 from torch._dynamo.utils import counters

 import vllm.envs as envs
 from vllm.compilation.counter import compilation_counter
-from vllm.config import CompilationConfig, CompilationMode, CUDAGraphMode
+from vllm.config import CompilationConfig, CompilationMode, CUDAGraphMode, PassConfig
 from vllm.utils.torch_utils import is_torch_equal_or_newer

-from ..utils import fork_new_process_for_each_test
+from ...utils import fork_new_process_for_each_test

 MODEL = "microsoft/Phi-tiny-MoE-instruct"

@@ -85,3 +86,196 @@ def test_moe_startup(monkeypatch, vllm_runner, fresh_vllm_cache, mega_aot_artifa
    assert (
        counters["aot_autograd"]["autograd_cache_hit"] == 0
    )  # No miss at aot_autograd level causing disk I/O.
+
+
+# ---------------------------------------------------------------------------
+# Parametrized model startup tests
+# ---------------------------------------------------------------------------
+
+
+class ModelStartupSpec(NamedTuple):
+    """Startup-test case: a model plus its expected compiled-artifact counts."""
+
+    # Model name handed to ``vllm_runner``.
+    model: str
+    # Forwarded as the ``hf_overrides`` argument of ``vllm_runner``.
+    hf_overrides: dict
+    # Expected ``num_compiled_artifacts_saved`` delta on the cold start.
+    cold_artifacts_saved: int
+    # Expected ``num_compiled_artifacts_saved`` delta on the warm start.
+    warm_artifacts_saved: int
+    # Expected ``num_compiled_artifacts_loaded`` delta on the warm start.
+    warm_artifacts_loaded: int
+
+
+# Config overrides shared by several specs below, either directly or nested
+# under "text_config".
+_SMALL_MOE_OVERRIDES = {
+    "num_hidden_layers": 8,
+    "hidden_size": 256,
+    "intermediate_size": 512,
+    "num_attention_heads": 8,
+    "num_key_value_heads": 1,
+    "n_routed_experts": 8,
+}
+
+MODEL_SPECS = [
+    pytest.param(
+        ModelStartupSpec(
+            model="openai/gpt-oss-120b",
+            hf_overrides={
+                "num_hidden_layers": 8,
+                "hidden_size": 256,
+                "intermediate_size": 512,
+                "num_attention_heads": 8,
+                "num_key_value_heads": 1,
+                "num_local_experts": 8,
+            },
+            cold_artifacts_saved=3,
+            warm_artifacts_saved=0,
+            warm_artifacts_loaded=3,
+        ),
+        id="gpt_oss_120b",
+    ),
+    # NOTE: DeepSeek-V3.2 requires sparse MLA (index_topk) which needs
+    # Hopper+ GPUs. This test must run on H100 (see pytorch.yaml).
+    pytest.param(
+        ModelStartupSpec(
+            model="deepseek-ai/DeepSeek-V3.2",
+            hf_overrides=_SMALL_MOE_OVERRIDES,
+            cold_artifacts_saved=4,
+            # TODO: https://github.com/vllm-project/vllm/issues/38051
+            # We shouldn't be saving any artifacts on warm start.
+            warm_artifacts_saved=4,
+            warm_artifacts_loaded=0,
+        ),
+        id="deepseek_v3.2",
+    ),
+    pytest.param(
+        ModelStartupSpec(
+            model="moonshotai/Kimi-K2.5",
+            hf_overrides={"text_config": _SMALL_MOE_OVERRIDES},
+            cold_artifacts_saved=4,
+            # TODO: https://github.com/vllm-project/vllm/issues/38051
+            # We shouldn't be saving any artifacts on warm start.
+            warm_artifacts_saved=4,
+            warm_artifacts_loaded=0,
+        ),
+        id="kimi_k2.5",
+    ),
+    pytest.param(
+        ModelStartupSpec(
+            model="zai-org/GLM-4.5",
+            hf_overrides=_SMALL_MOE_OVERRIDES,
+            cold_artifacts_saved=4,
+            warm_artifacts_saved=0,
+            warm_artifacts_loaded=4,
+        ),
+        id="glm_4.5",
+    ),
+    pytest.param(
+        ModelStartupSpec(
+            model="MiniMaxAI/MiniMax-M2.5",
+            hf_overrides=_SMALL_MOE_OVERRIDES,
+            cold_artifacts_saved=3,
+            warm_artifacts_saved=0,
+            warm_artifacts_loaded=3,
+        ),
+        id="minimax_m2.5",
+    ),
+]
+
+
+def _run_model(vllm_runner, spec: ModelStartupSpec) -> None:
+    """Start ``spec.model`` under ``vllm_runner`` and exit without generating.
+
+    Uses ``load_format="dummy"``, ``CompilationMode.VLLM_COMPILE`` and
+    ``CUDAGraphMode.NONE``; the ``with`` body is empty on purpose because only
+    startup is exercised.
+    """
+    with vllm_runner(
+        spec.model,
+        trust_remote_code=True,
+        max_model_len=256,
+        max_num_batched_tokens=1024,
+        block_size=64,
+        load_format="dummy",
+        hf_overrides=spec.hf_overrides,
+        compilation_config=CompilationConfig(
+            mode=CompilationMode.VLLM_COMPILE,
+            cudagraph_mode=CUDAGraphMode.NONE,
+            pass_config=PassConfig(fuse_allreduce_rms=False),
+        ),
+        num_gpu_blocks_override=8,
+    ):
+        pass
+
+
+def _check_model_run(vllm_runner, spec: ModelStartupSpec, is_cold_start: bool) -> None:
+    """Run a model and check how many compiled artifacts were saved/loaded.
+
+    The counts are deltas of ``compilation_counter`` taken around the run.
+    A cold start must save ``spec.cold_artifacts_saved`` artifacts and load
+    none; a warm start must match ``spec.warm_artifacts_saved`` and
+    ``spec.warm_artifacts_loaded``.
+
+    Raises:
+        AssertionError: If either delta differs from the expected count.
+    """
+    old = compilation_counter.clone()
+    _run_model(vllm_runner, spec)
+    saved = (
+        compilation_counter.num_compiled_artifacts_saved
+        - old.num_compiled_artifacts_saved
+    )
+    loaded = (
+        compilation_counter.num_compiled_artifacts_loaded
+        - old.num_compiled_artifacts_loaded
+    )
+
+    start_type = "COLD" if is_cold_start else "WARM"
+    # Print actual values for debugging — intentional, helps diagnose
+    # failures and calibrate expected counts when adding new models.
+    print(f"\n=== {start_type} START for {spec.model} ===")
+    print(f"  num_compiled_artifacts_saved={saved}")
+    print(f"  num_compiled_artifacts_loaded={loaded}")
+
+    if is_cold_start:
+        expected_saved = spec.cold_artifacts_saved
+        expected_loaded = 0
+    else:
+        expected_saved = spec.warm_artifacts_saved
+        expected_loaded = spec.warm_artifacts_loaded
+
+    assert saved == expected_saved, (
+        f"{start_type.lower()}_artifacts_saved for {spec.model}: "
+        f"got {saved}, expected {expected_saved}"
+    )
+    assert loaded == expected_loaded, (
+        f"{start_type.lower()}_artifacts_loaded for {spec.model}: "
+        f"got {loaded}, expected {expected_loaded}"
+    )
+
+
+def _cold_start_model(vllm_runner, spec: ModelStartupSpec) -> None:
+    """Cold-start check; used as the ``target`` of the forked child process."""
+    _check_model_run(vllm_runner, spec, is_cold_start=True)
+
+
+@pytest.mark.parametrize("spec", MODEL_SPECS)
+@fork_new_process_for_each_test
+def test_model_startup(monkeypatch, vllm_runner, fresh_vllm_cache, spec):
+    """Check artifact counts for a cold start followed by a warm start."""
+    monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
+
+    # Cold start in a forked child (must fork before CUDA init).
+    ctx = mp.get_context("fork")
+    p = ctx.Process(target=_cold_start_model, args=(vllm_runner, spec))
+    p.start()
+    p.join()
+    # Report the exit code: a negative value means the child was killed by a
+    # signal, which reads very differently from a failed assertion.
+    assert p.exitcode == 0, f"Cold-start child failed (exit code {p.exitcode})"
+
+    # Warm start — compiled artifacts loaded from disk cache.
+    _check_model_run(vllm_runner, spec, is_cold_start=False)