Unverified commit f1816fb1, authored by Kevin H. Luu, committed by GitHub
Browse files

[CI] Split V1 e2e + engine (1 GPU) into separate jobs (#36945)


Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
parent 0005d2a3
...@@ -369,7 +369,7 @@ steps: ...@@ -369,7 +369,7 @@ steps:
- vllm/ - vllm/
- tests/v1 - tests/v1
commands: commands:
- pytest -v -s v1/e2e/test_spec_decode.py -k "tensor_parallelism" - pytest -v -s v1/e2e/spec_decode/test_spec_decode.py -k "tensor_parallelism"
- label: V1 Test e2e (4 GPUs) # 65min - label: V1 Test e2e (4 GPUs) # 65min
timeout_in_minutes: 90 timeout_in_minutes: 90
...@@ -380,7 +380,7 @@ steps: ...@@ -380,7 +380,7 @@ steps:
- vllm/ - vllm/
- tests/v1 - tests/v1
commands: commands:
- pytest -v -s v1/e2e/test_spec_decode.py -k "eagle_correctness_heavy" - pytest -v -s v1/e2e/spec_decode/test_spec_decode.py -k "eagle_correctness_heavy"
- label: V1 Test entrypoints # 35min - label: V1 Test entrypoints # 35min
timeout_in_minutes: 50 timeout_in_minutes: 50
...@@ -1744,7 +1744,7 @@ steps: ...@@ -1744,7 +1744,7 @@ steps:
- tests/v1 - tests/v1
commands: commands:
# Only run tests that need exactly 2 GPUs # Only run tests that need exactly 2 GPUs
- pytest -v -s v1/e2e/test_spec_decode.py -k "tensor_parallelism" - pytest -v -s v1/e2e/spec_decode/test_spec_decode.py -k "tensor_parallelism"
- label: V1 Test e2e (4 GPUs) # 65min - label: V1 Test e2e (4 GPUs) # 65min
timeout_in_minutes: 90 timeout_in_minutes: 90
...@@ -1759,7 +1759,7 @@ steps: ...@@ -1759,7 +1759,7 @@ steps:
- tests/v1 - tests/v1
commands: commands:
# Only run tests that need 4 GPUs # Only run tests that need 4 GPUs
- pytest -v -s v1/e2e/test_spec_decode.py -k "eagle_correctness_heavy" - pytest -v -s v1/e2e/spec_decode/test_spec_decode.py -k "eagle_correctness_heavy"
- label: V1 Test entrypoints # 35min - label: V1 Test entrypoints # 35min
timeout_in_minutes: 50 timeout_in_minutes: 50
...@@ -3494,7 +3494,7 @@ steps: ...@@ -3494,7 +3494,7 @@ steps:
- tests/v1 - tests/v1
commands: commands:
# Only run tests that need exactly 2 GPUs # Only run tests that need exactly 2 GPUs
- pytest -v -s v1/e2e/test_spec_decode.py -k "tensor_parallelism" - pytest -v -s v1/e2e/spec_decode/test_spec_decode.py -k "tensor_parallelism"
- label: V1 Test e2e (4 GPUs) # 65min - label: V1 Test e2e (4 GPUs) # 65min
timeout_in_minutes: 90 timeout_in_minutes: 90
...@@ -3509,7 +3509,7 @@ steps: ...@@ -3509,7 +3509,7 @@ steps:
- tests/v1 - tests/v1
commands: commands:
# Only run tests that need 4 GPUs # Only run tests that need 4 GPUs
- pytest -v -s v1/e2e/test_spec_decode.py -k "eagle_correctness_heavy" - pytest -v -s v1/e2e/spec_decode/test_spec_decode.py -k "eagle_correctness_heavy"
- label: V1 Test entrypoints # 35min - label: V1 Test entrypoints # 35min
timeout_in_minutes: 50 timeout_in_minutes: 50
......
group: Engine group: Engine
depends_on: depends_on:
- image-build - image-build
steps: steps:
- label: Engine - label: Engine
...@@ -14,28 +14,30 @@ steps: ...@@ -14,28 +14,30 @@ steps:
commands: commands:
- pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py - pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py
- label: V1 e2e + engine (1 GPU) - label: Engine (1 GPU)
timeout_in_minutes: 45 timeout_in_minutes: 30
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/v1/engine/
- tests/v1 - tests/v1/engine/
commands: commands:
# TODO: accuracy does not match, whether setting
# VLLM_USE_FLASHINFER_SAMPLER or not on H100.
- pytest -v -s v1/e2e
# Run this test standalone for now;
# need to untangle use (implicit) use of spawn/fork across the tests.
- pytest -v -s v1/engine/test_preprocess_error_handling.py - pytest -v -s v1/engine/test_preprocess_error_handling.py
# Run the rest of v1/engine tests
- pytest -v -s v1/engine --ignore v1/engine/test_preprocess_error_handling.py - pytest -v -s v1/engine --ignore v1/engine/test_preprocess_error_handling.py
mirror:
amd: - label: e2e Scheduling (1 GPU)
device: mi325_1 timeout_in_minutes: 30
depends_on: source_file_dependencies:
- image-build-amd - vllm/v1/
commands: - tests/v1/e2e/general/
- pytest -v -s v1/e2e commands:
- pytest -v -s v1/engine - pytest -v -s v1/e2e/general/test_async_scheduling.py
- label: e2e Core (1 GPU)
timeout_in_minutes: 30
source_file_dependencies:
- vllm/v1/
- tests/v1/e2e/general/
commands:
- pytest -v -s v1/e2e/general --ignore v1/e2e/general/test_async_scheduling.py
- label: V1 e2e (2 GPUs) - label: V1 e2e (2 GPUs)
timeout_in_minutes: 60 # TODO: Fix timeout after we have more confidence in the test stability timeout_in_minutes: 60 # TODO: Fix timeout after we have more confidence in the test stability
...@@ -46,7 +48,7 @@ steps: ...@@ -46,7 +48,7 @@ steps:
- tests/v1/e2e - tests/v1/e2e
commands: commands:
# Only run tests that need exactly 2 GPUs # Only run tests that need exactly 2 GPUs
- pytest -v -s v1/e2e/test_spec_decode.py -k "tensor_parallelism" - pytest -v -s v1/e2e/spec_decode/test_spec_decode.py -k "tensor_parallelism"
mirror: mirror:
amd: amd:
device: mi325_2 device: mi325_2
...@@ -62,7 +64,7 @@ steps: ...@@ -62,7 +64,7 @@ steps:
- tests/v1/e2e - tests/v1/e2e
commands: commands:
# Only run tests that need 4 GPUs # Only run tests that need 4 GPUs
- pytest -v -s v1/e2e/test_spec_decode.py -k "eagle_correctness_heavy" - pytest -v -s v1/e2e/spec_decode/test_spec_decode.py -k "eagle_correctness_heavy"
mirror: mirror:
amd: amd:
device: mi325_4 device: mi325_4
......
...@@ -18,9 +18,9 @@ steps: ...@@ -18,9 +18,9 @@ steps:
- pytest -v -s v1/engine/test_llm_engine.py -k "not test_engine_metrics" - pytest -v -s v1/engine/test_llm_engine.py -k "not test_engine_metrics"
# This requires eager until we sort out CG correctness issues. # This requires eager until we sort out CG correctness issues.
# TODO: remove ENFORCE_EAGER here after https://github.com/vllm-project/vllm/pull/32936 is merged. # TODO: remove ENFORCE_EAGER here after https://github.com/vllm-project/vllm/pull/32936 is merged.
- ENFORCE_EAGER=1 pytest -v -s v1/e2e/test_async_scheduling.py -k "not ngram" - ENFORCE_EAGER=1 pytest -v -s v1/e2e/general/test_async_scheduling.py -k "not ngram"
- pytest -v -s v1/e2e/test_context_length.py - pytest -v -s v1/e2e/general/test_context_length.py
- pytest -v -s v1/e2e/test_min_tokens.py - pytest -v -s v1/e2e/general/test_min_tokens.py
# Temporary hack filter to exclude ngram spec decoding based tests. # Temporary hack filter to exclude ngram spec decoding based tests.
- pytest -v -s v1/entrypoints/llm/test_struct_output_generate.py -k "xgrammar and not speculative_config6 and not speculative_config7 and not speculative_config8 and not speculative_config0" - pytest -v -s v1/entrypoints/llm/test_struct_output_generate.py -k "xgrammar and not speculative_config6 and not speculative_config7 and not speculative_config8 and not speculative_config0"
...@@ -102,9 +102,9 @@ steps: ...@@ -102,9 +102,9 @@ steps:
- vllm/v1/worker/gpu/ - vllm/v1/worker/gpu/
- vllm/v1/worker/gpu_worker.py - vllm/v1/worker/gpu_worker.py
- tests/v1/spec_decode/test_max_len.py - tests/v1/spec_decode/test_max_len.py
- tests/v1/e2e/test_spec_decode.py - tests/v1/e2e/spec_decode/test_spec_decode.py
commands: commands:
- set -x - set -x
- export VLLM_USE_V2_MODEL_RUNNER=1 - export VLLM_USE_V2_MODEL_RUNNER=1
- pytest -v -s v1/spec_decode/test_max_len.py -k "eagle or mtp" - pytest -v -s v1/spec_decode/test_max_len.py -k "eagle or mtp"
- pytest -v -s v1/e2e/test_spec_decode.py -k "eagle or mtp" - pytest -v -s v1/e2e/spec_decode/test_spec_decode.py -k "eagle or mtp"
group: Spec Decode
depends_on:
- image-build
steps:
- label: Spec Decode Eagle
timeout_in_minutes: 30
source_file_dependencies:
- vllm/v1/spec_decode/
- vllm/v1/worker/gpu/spec_decode/
- tests/v1/e2e/spec_decode/
commands:
- pytest -v -s v1/e2e/spec_decode -k "eagle_correctness"
- label: Spec Decode Speculators + MTP
timeout_in_minutes: 30
source_file_dependencies:
- vllm/v1/spec_decode/
- vllm/v1/worker/gpu/spec_decode/
- vllm/transformers_utils/configs/speculators/
- tests/v1/e2e/spec_decode/
commands:
- pytest -v -s v1/e2e/spec_decode -k "speculators or mtp_correctness"
- label: Spec Decode Ngram + Suffix
timeout_in_minutes: 30
source_file_dependencies:
- vllm/v1/spec_decode/
- vllm/v1/worker/gpu/spec_decode/
- tests/v1/e2e/spec_decode/
commands:
- pytest -v -s v1/e2e/spec_decode -k "ngram or suffix"
- label: Spec Decode Draft Model
timeout_in_minutes: 30
source_file_dependencies:
- vllm/v1/spec_decode/
- vllm/v1/worker/gpu/spec_decode/
- tests/v1/e2e/spec_decode/
commands:
- pytest -v -s v1/e2e/spec_decode -k "draft_model or no_sync or batch_inference"
...@@ -14,8 +14,8 @@ from vllm.platforms import current_platform ...@@ -14,8 +14,8 @@ from vllm.platforms import current_platform
from vllm.sampling_params import StructuredOutputsParams from vllm.sampling_params import StructuredOutputsParams
from vllm.v1.metrics.reader import Metric from vllm.v1.metrics.reader import Metric
from ...conftest import VllmRunner from ....conftest import VllmRunner
from ...models.utils import check_outputs_equal from ....models.utils import check_outputs_equal
MODEL = "Qwen/Qwen3-0.6B" MODEL = "Qwen/Qwen3-0.6B"
MTP_MODEL = "meta-llama/Llama-3.2-1B-Instruct" MTP_MODEL = "meta-llama/Llama-3.2-1B-Instruct"
......
...@@ -5,7 +5,7 @@ import pytest ...@@ -5,7 +5,7 @@ import pytest
from vllm import LLM, SamplingParams from vllm import LLM, SamplingParams
from ...utils import create_new_process_for_each_test from ....utils import create_new_process_for_each_test
@create_new_process_for_each_test() @create_new_process_for_each_test()
......
...@@ -7,7 +7,7 @@ import pytest ...@@ -7,7 +7,7 @@ import pytest
from vllm import LLM, SamplingParams from vllm import LLM, SamplingParams
from vllm.platforms import current_platform from vllm.platforms import current_platform
from ...utils import check_answers, prep_prompts from ....utils import check_answers, prep_prompts
@dataclass @dataclass
......
...@@ -9,7 +9,7 @@ from vllm import LLM, SamplingParams ...@@ -9,7 +9,7 @@ from vllm import LLM, SamplingParams
from vllm.config import CompilationConfig, CompilationMode from vllm.config import CompilationConfig, CompilationMode
from vllm.platforms import current_platform from vllm.platforms import current_platform
from ...utils import check_answers, fork_new_process_for_each_test, prep_prompts from ....utils import check_answers, fork_new_process_for_each_test, prep_prompts
# global seed # global seed
SEED = 42 SEED = 42
...@@ -18,7 +18,7 @@ SEED = 42 ...@@ -18,7 +18,7 @@ SEED = 42
@pytest.fixture @pytest.fixture
def test_prompts(): def test_prompts():
""" """
Adapted from tests/v1/e2e/test_spec_decode.py Adapted from tests/v1/e2e/spec_decode/test_spec_decode.py
""" """
prompt_types = ["repeat", "sentence"] prompt_types = ["repeat", "sentence"]
# Setting higher num prompts increases the chance of numerics mismatch # Setting higher num prompts increases the chance of numerics mismatch
......
...@@ -497,6 +497,6 @@ if __name__ == "__main__": ...@@ -497,6 +497,6 @@ if __name__ == "__main__":
Usage: Usage:
cd vllm/ cd vllm/
python -m pytest tests/v1/e2e/test_min_tokens.py -v python -m pytest tests/v1/e2e/general/test_min_tokens.py -v
""" """
pytest.main([__file__, "-v"]) pytest.main([__file__, "-v"])
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment