Unverified commit 6e37c46b, authored by Richard Zou and committed by GitHub
Browse files

[compile] Add some more startup tests for top models (#38046)


Signed-off-by: Richard Zou <zou3519@gmail.com>
parent 1bf2ddd0
......@@ -17,6 +17,16 @@ steps:
# (using -0 for proper path handling)
- "find compile/ -maxdepth 1 -name 'test_*.py' -print0 | xargs -0 -n1 -I{} pytest -s -v '{}'"
# CI step for the compile tests that live under tests/compile/h100/.
# NOTE(review): presumably these are the tests that need Hopper-class
# hardware and therefore cannot share the default compile step -- confirm.
- label: PyTorch Compilation Unit Tests (H100)
timeout_in_minutes: 30
# Request a single H100 device for this step.
device: h100
num_devices: 1
# NOTE(review): presumably the step is triggered only when files under
# these paths change -- confirm against the pipeline generator.
source_file_dependencies:
- vllm/
- tests/compile/h100/
commands:
# Run every test_*.py found under compile/h100/ (no -maxdepth, so the search
# is recursive), one pytest invocation per file; -print0 / -0 keep unusual
# paths intact.
- "find compile/h100/ -name 'test_*.py' -print0 | xargs -0 -n1 -I{} pytest -s -v '{}'"
- label: PyTorch Compilation Passes Unit Tests
timeout_in_minutes: 20
source_file_dependencies:
......
......@@ -8,16 +8,17 @@ then runs in the parent with clean in-memory state but populated caches.
"""
import multiprocessing as mp
from typing import NamedTuple
import pytest
from torch._dynamo.utils import counters
import vllm.envs as envs
from vllm.compilation.counter import compilation_counter
from vllm.config import CompilationConfig, CompilationMode, CUDAGraphMode
from vllm.config import CompilationConfig, CompilationMode, CUDAGraphMode, PassConfig
from vllm.utils.torch_utils import is_torch_equal_or_newer
from ..utils import fork_new_process_for_each_test
from ...utils import fork_new_process_for_each_test
MODEL = "microsoft/Phi-tiny-MoE-instruct"
......@@ -85,3 +86,164 @@ def test_moe_startup(monkeypatch, vllm_runner, fresh_vllm_cache, mega_aot_artifa
assert (
counters["aot_autograd"]["autograd_cache_hit"] == 0
) # No miss at aot_autograd level causing disk I/O.
# ---------------------------------------------------------------------------
# Parametrized model startup tests
# ---------------------------------------------------------------------------
class ModelStartupSpec(NamedTuple):
    """Expected compile-cache behaviour for one model's cold and warm startup.

    The ``*_artifacts_*`` fields are compared against the change in
    ``compilation_counter`` observed across a single model startup.
    """

    # Model identifier passed as the first argument to ``vllm_runner``.
    model: str
    # Config overrides forwarded to ``vllm_runner`` as ``hf_overrides``.
    hf_overrides: dict
    # Number of compiled artifacts expected to be saved on a cold start.
    cold_artifacts_saved: int
    # Number of compiled artifacts expected to be saved on a warm start.
    warm_artifacts_saved: int
    # Number of compiled artifacts expected to be loaded on a warm start
    # (a cold start is always checked against zero loads).
    warm_artifacts_loaded: int
_SMALL_MOE_OVERRIDES = {
"num_hidden_layers": 8,
"hidden_size": 256,
"intermediate_size": 512,
"num_attention_heads": 8,
"num_key_value_heads": 1,
"n_routed_experts": 8,
}
# Startup expectations keyed by pytest id. Insertion order is the order in
# which the parametrized cases are generated.
_STARTUP_SPECS_BY_ID = {
    "gpt_oss_120b": ModelStartupSpec(
        model="openai/gpt-oss-120b",
        hf_overrides={
            "num_hidden_layers": 8,
            "hidden_size": 256,
            "intermediate_size": 512,
            "num_attention_heads": 8,
            "num_key_value_heads": 1,
            "num_local_experts": 8,
        },
        cold_artifacts_saved=3,
        warm_artifacts_saved=0,
        warm_artifacts_loaded=3,
    ),
    # NOTE: DeepSeek-V3.2 requires sparse MLA (index_topk) which needs
    # Hopper+ GPUs. This test must run on H100 (see pytorch.yaml).
    "deepseek_v3.2": ModelStartupSpec(
        model="deepseek-ai/DeepSeek-V3.2",
        hf_overrides=_SMALL_MOE_OVERRIDES,
        cold_artifacts_saved=4,
        # TODO: https://github.com/vllm-project/vllm/issues/38051
        # We shouldn't be saving any artifacts on warm start.
        warm_artifacts_saved=4,
        warm_artifacts_loaded=0,
    ),
    "kimi_k2.5": ModelStartupSpec(
        model="moonshotai/Kimi-K2.5",
        # The shared overrides are nested under "text_config" for this model.
        hf_overrides={"text_config": _SMALL_MOE_OVERRIDES},
        cold_artifacts_saved=4,
        # TODO: https://github.com/vllm-project/vllm/issues/38051
        # We shouldn't be saving any artifacts on warm start.
        warm_artifacts_saved=4,
        warm_artifacts_loaded=0,
    ),
    "glm_4.5": ModelStartupSpec(
        model="zai-org/GLM-4.5",
        hf_overrides=_SMALL_MOE_OVERRIDES,
        cold_artifacts_saved=4,
        warm_artifacts_saved=0,
        warm_artifacts_loaded=4,
    ),
    "minimax_m2.5": ModelStartupSpec(
        model="MiniMaxAI/MiniMax-M2.5",
        hf_overrides=_SMALL_MOE_OVERRIDES,
        cold_artifacts_saved=3,
        warm_artifacts_saved=0,
        warm_artifacts_loaded=3,
    ),
}

# One ``pytest.param`` per model, carrying the spec as its only value and the
# mapping key as its test id.
MODEL_SPECS = [
    pytest.param(startup_spec, id=case_id)
    for case_id, startup_spec in _STARTUP_SPECS_BY_ID.items()
]
def _run_model(vllm_runner, spec: ModelStartupSpec):
    """Bring up ``spec.model`` through ``vllm_runner`` and tear it down again.

    Nothing is generated: the runner context is entered and left at once, so
    the only work done is whatever happens during startup and shutdown.

    Args:
        vllm_runner: Callable returning a context manager for a running model.
        spec: Supplies the model name and the ``hf_overrides`` to apply.
    """
    # vLLM compile mode with CUDA graphs turned off and the allreduce+RMS
    # fusion pass disabled.
    compile_cfg = CompilationConfig(
        mode=CompilationMode.VLLM_COMPILE,
        cudagraph_mode=CUDAGraphMode.NONE,
        pass_config=PassConfig(fuse_allreduce_rms=False),
    )
    runner_kwargs = dict(
        trust_remote_code=True,
        max_model_len=256,
        max_num_batched_tokens=1024,
        block_size=64,
        # NOTE(review): "dummy" presumably avoids loading real checkpoint
        # weights -- confirm.
        load_format="dummy",
        hf_overrides=spec.hf_overrides,
        compilation_config=compile_cfg,
        num_gpu_blocks_override=8,
    )
    with vllm_runner(spec.model, **runner_kwargs):
        # Startup alone is what the callers measure.
        pass
def _check_model_run(vllm_runner, spec: ModelStartupSpec, is_cold_start: bool):
    """Run a model once and check how many compiled artifacts were saved/loaded.

    Args:
        vllm_runner: Runner factory forwarded to ``_run_model``.
        spec: Model under test together with its expected artifact counts.
        is_cold_start: When true, expect ``spec.cold_artifacts_saved`` saves
            and zero loads; otherwise expect the warm-start fields of ``spec``.

    Raises:
        AssertionError: If either observed count differs from its expectation.
    """
    # Snapshot the counter first so only this run's increments are measured.
    before = compilation_counter.clone()
    _run_model(vllm_runner, spec)
    saved = (
        compilation_counter.num_compiled_artifacts_saved
        - before.num_compiled_artifacts_saved
    )
    loaded = (
        compilation_counter.num_compiled_artifacts_loaded
        - before.num_compiled_artifacts_loaded
    )

    start_type = "COLD" if is_cold_start else "WARM"

    # Print actual values for debugging — intentional, helps diagnose
    # failures and calibrate expected counts when adding new models.
    print(f"\n=== {start_type} START for {spec.model} ===")
    print(f"  num_compiled_artifacts_saved={saved}")
    print(f"  num_compiled_artifacts_loaded={loaded}")

    if is_cold_start:
        expected_saved = spec.cold_artifacts_saved
        expected_loaded = 0
    else:
        expected_saved = spec.warm_artifacts_saved
        expected_loaded = spec.warm_artifacts_loaded

    # Report expected next to observed so a failure is self-explanatory.
    prefix = start_type.lower()
    assert saved == expected_saved, (
        f"{prefix}_artifacts_saved: got {saved}, "
        f"expected {expected_saved} ({spec.model})"
    )
    assert loaded == expected_loaded, (
        f"{prefix}_artifacts_loaded: got {loaded}, "
        f"expected {expected_loaded} ({spec.model})"
    )
def _cold_start_model(vllm_runner, spec: ModelStartupSpec):
    """Process target: run ``spec`` once and check the cold-start counts."""
    _check_model_run(vllm_runner=vllm_runner, spec=spec, is_cold_start=True)
@pytest.mark.parametrize("spec", MODEL_SPECS)
@fork_new_process_for_each_test
def test_model_startup(monkeypatch, vllm_runner, fresh_vllm_cache, spec):
    """Check artifact save/load counts for a cold, then a warm, model startup.

    The cold start runs in a forked child process; the warm start then runs in
    this process and is checked against the warm-start fields of ``spec``.

    Args:
        monkeypatch: Used to set ``VLLM_ENABLE_V1_MULTIPROCESSING=0``.
        vllm_runner: Runner factory handed to both startups.
        fresh_vllm_cache: Requested for its side effect only.
            NOTE(review): presumably provides an empty compile cache -- confirm.
        spec: The ``ModelStartupSpec`` for the parametrized model.
    """
    # NOTE(review): presumably keeps the engine in-process so that
    # ``compilation_counter`` is updated in the process that reads it -- confirm.
    monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")

    # Cold start in a forked child (must fork before CUDA init).
    child = mp.get_context("fork").Process(
        target=_cold_start_model, args=(vllm_runner, spec)
    )
    child.start()
    child.join()
    # Include the exit code: a negative value means the child was killed by a
    # signal, which is a different failure from a failed assertion.
    assert child.exitcode == 0, (
        f"Cold-start child failed (exit code {child.exitcode})"
    )

    # Warm start: checked against the spec's warm-start expectations (normally
    # artifacts are loaded from the cache the child populated; see the TODO
    # entries in the model specs for the known exceptions).
    _check_model_run(vllm_runner, spec, is_cold_start=False)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment