Unverified commit f1816fb1, authored by Kevin H. Luu, committed by GitHub
Browse files

[CI] Split V1 e2e + engine (1 GPU) into separate jobs (#36945)


Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
parent 0005d2a3
......@@ -369,7 +369,7 @@ steps:
- vllm/
- tests/v1
commands:
- pytest -v -s v1/e2e/test_spec_decode.py -k "tensor_parallelism"
- pytest -v -s v1/e2e/spec_decode/test_spec_decode.py -k "tensor_parallelism"
- label: V1 Test e2e (4 GPUs) # 65min
timeout_in_minutes: 90
......@@ -380,7 +380,7 @@ steps:
- vllm/
- tests/v1
commands:
- pytest -v -s v1/e2e/test_spec_decode.py -k "eagle_correctness_heavy"
- pytest -v -s v1/e2e/spec_decode/test_spec_decode.py -k "eagle_correctness_heavy"
- label: V1 Test entrypoints # 35min
timeout_in_minutes: 50
......@@ -1744,7 +1744,7 @@ steps:
- tests/v1
commands:
# Only run tests that need exactly 2 GPUs
- pytest -v -s v1/e2e/test_spec_decode.py -k "tensor_parallelism"
- pytest -v -s v1/e2e/spec_decode/test_spec_decode.py -k "tensor_parallelism"
- label: V1 Test e2e (4 GPUs) # 65min
timeout_in_minutes: 90
......@@ -1759,7 +1759,7 @@ steps:
- tests/v1
commands:
# Only run tests that need 4 GPUs
- pytest -v -s v1/e2e/test_spec_decode.py -k "eagle_correctness_heavy"
- pytest -v -s v1/e2e/spec_decode/test_spec_decode.py -k "eagle_correctness_heavy"
- label: V1 Test entrypoints # 35min
timeout_in_minutes: 50
......@@ -3494,7 +3494,7 @@ steps:
- tests/v1
commands:
# Only run tests that need exactly 2 GPUs
- pytest -v -s v1/e2e/test_spec_decode.py -k "tensor_parallelism"
- pytest -v -s v1/e2e/spec_decode/test_spec_decode.py -k "tensor_parallelism"
- label: V1 Test e2e (4 GPUs) # 65min
timeout_in_minutes: 90
......@@ -3509,7 +3509,7 @@ steps:
- tests/v1
commands:
# Only run tests that need 4 GPUs
- pytest -v -s v1/e2e/test_spec_decode.py -k "eagle_correctness_heavy"
- pytest -v -s v1/e2e/spec_decode/test_spec_decode.py -k "eagle_correctness_heavy"
- label: V1 Test entrypoints # 35min
timeout_in_minutes: 50
......
......@@ -14,28 +14,30 @@ steps:
commands:
- pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py
- label: V1 e2e + engine (1 GPU)
timeout_in_minutes: 45
- label: Engine (1 GPU)
timeout_in_minutes: 30
source_file_dependencies:
- vllm/
- tests/v1
- vllm/v1/engine/
- tests/v1/engine/
commands:
# TODO: accuracy does not match on H100, regardless of whether
# VLLM_USE_FLASHINFER_SAMPLER is set.
- pytest -v -s v1/e2e
# Run this test standalone for now;
# need to untangle the (implicit) use of spawn/fork across the tests.
- pytest -v -s v1/engine/test_preprocess_error_handling.py
# Run the rest of v1/engine tests
- pytest -v -s v1/engine --ignore v1/engine/test_preprocess_error_handling.py
mirror:
amd:
device: mi325_1
depends_on:
- image-build-amd
- label: e2e Scheduling (1 GPU)
timeout_in_minutes: 30
source_file_dependencies:
- vllm/v1/
- tests/v1/e2e/general/
commands:
- pytest -v -s v1/e2e/general/test_async_scheduling.py
- label: e2e Core (1 GPU)
timeout_in_minutes: 30
source_file_dependencies:
- vllm/v1/
- tests/v1/e2e/general/
commands:
- pytest -v -s v1/e2e
- pytest -v -s v1/engine
- pytest -v -s v1/e2e/general --ignore v1/e2e/general/test_async_scheduling.py
- label: V1 e2e (2 GPUs)
timeout_in_minutes: 60 # TODO: Fix timeout after we have more confidence in the test stability
......@@ -46,7 +48,7 @@ steps:
- tests/v1/e2e
commands:
# Only run tests that need exactly 2 GPUs
- pytest -v -s v1/e2e/test_spec_decode.py -k "tensor_parallelism"
- pytest -v -s v1/e2e/spec_decode/test_spec_decode.py -k "tensor_parallelism"
mirror:
amd:
device: mi325_2
......@@ -62,7 +64,7 @@ steps:
- tests/v1/e2e
commands:
# Only run tests that need 4 GPUs
- pytest -v -s v1/e2e/test_spec_decode.py -k "eagle_correctness_heavy"
- pytest -v -s v1/e2e/spec_decode/test_spec_decode.py -k "eagle_correctness_heavy"
mirror:
amd:
device: mi325_4
......
......@@ -18,9 +18,9 @@ steps:
- pytest -v -s v1/engine/test_llm_engine.py -k "not test_engine_metrics"
# This requires eager until we sort out CG correctness issues.
# TODO: remove ENFORCE_EAGER here after https://github.com/vllm-project/vllm/pull/32936 is merged.
- ENFORCE_EAGER=1 pytest -v -s v1/e2e/test_async_scheduling.py -k "not ngram"
- pytest -v -s v1/e2e/test_context_length.py
- pytest -v -s v1/e2e/test_min_tokens.py
- ENFORCE_EAGER=1 pytest -v -s v1/e2e/general/test_async_scheduling.py -k "not ngram"
- pytest -v -s v1/e2e/general/test_context_length.py
- pytest -v -s v1/e2e/general/test_min_tokens.py
# Temporary hack filter to exclude ngram spec decoding based tests.
- pytest -v -s v1/entrypoints/llm/test_struct_output_generate.py -k "xgrammar and not speculative_config6 and not speculative_config7 and not speculative_config8 and not speculative_config0"
......@@ -102,9 +102,9 @@ steps:
- vllm/v1/worker/gpu/
- vllm/v1/worker/gpu_worker.py
- tests/v1/spec_decode/test_max_len.py
- tests/v1/e2e/test_spec_decode.py
- tests/v1/e2e/spec_decode/test_spec_decode.py
commands:
- set -x
- export VLLM_USE_V2_MODEL_RUNNER=1
- pytest -v -s v1/spec_decode/test_max_len.py -k "eagle or mtp"
- pytest -v -s v1/e2e/test_spec_decode.py -k "eagle or mtp"
- pytest -v -s v1/e2e/spec_decode/test_spec_decode.py -k "eagle or mtp"
group: Spec Decode
depends_on:
- image-build
steps:
- label: Spec Decode Eagle
timeout_in_minutes: 30
source_file_dependencies:
- vllm/v1/spec_decode/
- vllm/v1/worker/gpu/spec_decode/
- tests/v1/e2e/spec_decode/
commands:
- pytest -v -s v1/e2e/spec_decode -k "eagle_correctness"
- label: Spec Decode Speculators + MTP
timeout_in_minutes: 30
source_file_dependencies:
- vllm/v1/spec_decode/
- vllm/v1/worker/gpu/spec_decode/
- vllm/transformers_utils/configs/speculators/
- tests/v1/e2e/spec_decode/
commands:
- pytest -v -s v1/e2e/spec_decode -k "speculators or mtp_correctness"
- label: Spec Decode Ngram + Suffix
timeout_in_minutes: 30
source_file_dependencies:
- vllm/v1/spec_decode/
- vllm/v1/worker/gpu/spec_decode/
- tests/v1/e2e/spec_decode/
commands:
- pytest -v -s v1/e2e/spec_decode -k "ngram or suffix"
- label: Spec Decode Draft Model
timeout_in_minutes: 30
source_file_dependencies:
- vllm/v1/spec_decode/
- vllm/v1/worker/gpu/spec_decode/
- tests/v1/e2e/spec_decode/
commands:
- pytest -v -s v1/e2e/spec_decode -k "draft_model or no_sync or batch_inference"
......@@ -14,8 +14,8 @@ from vllm.platforms import current_platform
from vllm.sampling_params import StructuredOutputsParams
from vllm.v1.metrics.reader import Metric
from ...conftest import VllmRunner
from ...models.utils import check_outputs_equal
from ....conftest import VllmRunner
from ....models.utils import check_outputs_equal
MODEL = "Qwen/Qwen3-0.6B"
MTP_MODEL = "meta-llama/Llama-3.2-1B-Instruct"
......
......@@ -5,7 +5,7 @@ import pytest
from vllm import LLM, SamplingParams
from ...utils import create_new_process_for_each_test
from ....utils import create_new_process_for_each_test
@create_new_process_for_each_test()
......
......@@ -7,7 +7,7 @@ import pytest
from vllm import LLM, SamplingParams
from vllm.platforms import current_platform
from ...utils import check_answers, prep_prompts
from ....utils import check_answers, prep_prompts
@dataclass
......
......@@ -9,7 +9,7 @@ from vllm import LLM, SamplingParams
from vllm.config import CompilationConfig, CompilationMode
from vllm.platforms import current_platform
from ...utils import check_answers, fork_new_process_for_each_test, prep_prompts
from ....utils import check_answers, fork_new_process_for_each_test, prep_prompts
# global seed
SEED = 42
......@@ -18,7 +18,7 @@ SEED = 42
@pytest.fixture
def test_prompts():
"""
Adapted from tests/v1/e2e/test_spec_decode.py
Adapted from tests/v1/e2e/spec_decode/test_spec_decode.py
"""
prompt_types = ["repeat", "sentence"]
# Setting higher num prompts increases the chance of numerics mismatch
......
......@@ -497,6 +497,6 @@ if __name__ == "__main__":
Usage:
cd vllm/
python -m pytest tests/v1/e2e/test_min_tokens.py -v
python -m pytest tests/v1/e2e/general/test_min_tokens.py -v
"""
pytest.main([__file__, "-v"])
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment