[CI] Split V1 e2e + engine (1 GPU) into separate jobs (#36945)

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>

Commit f1816fb1 (unverified) · authored by Kevin H. Luu on Mar 13, 2026 · committed by GitHub on Mar 13, 2026 · other hash shown on the page: 0005d2a3
18 changed files
--- a/.buildkite/test-amd.yaml
+++ b/.buildkite/test-amd.yaml
@@ -369,7 +369,7 @@ steps:
    - vllm/
    - tests/v1
  commands:
-    - pytest -v -s v1/e2e/test_spec_decode.py -k "tensor_parallelism"
+    - pytest -v -s v1/e2e/spec_decode/test_spec_decode.py -k "tensor_parallelism"

 - label: V1 Test e2e (4 GPUs) # 65min
  timeout_in_minutes: 90
@@ -380,7 +380,7 @@ steps:
    - vllm/
    - tests/v1
  commands:
-    - pytest -v -s v1/e2e/test_spec_decode.py -k "eagle_correctness_heavy"
+    - pytest -v -s v1/e2e/spec_decode/test_spec_decode.py -k "eagle_correctness_heavy"

 - label: V1 Test entrypoints # 35min
  timeout_in_minutes: 50
@@ -1744,7 +1744,7 @@ steps:
    - tests/v1
  commands:
    # Only run tests that need exactly 2 GPUs
-    - pytest -v -s v1/e2e/test_spec_decode.py -k "tensor_parallelism"
+    - pytest -v -s v1/e2e/spec_decode/test_spec_decode.py -k "tensor_parallelism"

 - label: V1 Test e2e (4 GPUs) # 65min
  timeout_in_minutes: 90
@@ -1759,7 +1759,7 @@ steps:
    - tests/v1
  commands:
    # Only run tests that need 4 GPUs
-    - pytest -v -s v1/e2e/test_spec_decode.py -k "eagle_correctness_heavy"
+    - pytest -v -s v1/e2e/spec_decode/test_spec_decode.py -k "eagle_correctness_heavy"

 - label: V1 Test entrypoints # 35min
  timeout_in_minutes: 50
@@ -3494,7 +3494,7 @@ steps:
    - tests/v1
  commands:
    # Only run tests that need exactly 2 GPUs
-    - pytest -v -s v1/e2e/test_spec_decode.py -k "tensor_parallelism"
+    - pytest -v -s v1/e2e/spec_decode/test_spec_decode.py -k "tensor_parallelism"

 - label: V1 Test e2e (4 GPUs) # 65min
  timeout_in_minutes: 90
@@ -3509,7 +3509,7 @@ steps:
    - tests/v1
  commands:
    # Only run tests that need 4 GPUs
-    - pytest -v -s v1/e2e/test_spec_decode.py -k "eagle_correctness_heavy"
+    - pytest -v -s v1/e2e/spec_decode/test_spec_decode.py -k "eagle_correctness_heavy"

 - label: V1 Test entrypoints # 35min
  timeout_in_minutes: 50

--- a/.buildkite/test_areas/engine.yaml
+++ b/.buildkite/test_areas/engine.yaml
@@ -14,28 +14,30 @@ steps:
  commands:
  - pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py

- label: V1 e2e + engine (1 GPU)
-  timeout_in_minutes: 45
+- label: Engine (1 GPU)
+  timeout_in_minutes: 30
  source_file_dependencies:
-    - vllm/
-    - tests/v1
+    - vllm/v1/engine/
+    - tests/v1/engine/
  commands:
-    # TODO: accuracy does not match, whether setting
-    # VLLM_USE_FLASHINFER_SAMPLER or not on H100.
-    - pytest -v -s v1/e2e
-    # Run this test standalone for now;
-    # need to untangle use (implicit) use of spawn/fork across the tests.
    - pytest -v -s v1/engine/test_preprocess_error_handling.py
-    # Run the rest of v1/engine tests
    - pytest -v -s v1/engine --ignore v1/engine/test_preprocess_error_handling.py
-  mirror:
-    amd:
-      device: mi325_1
-      depends_on:
-      - image-build-amd
+
+- label: e2e Scheduling (1 GPU)
+  timeout_in_minutes: 30
+  source_file_dependencies:
+    - vllm/v1/
+    - tests/v1/e2e/general/
+  commands:
+    - pytest -v -s v1/e2e/general/test_async_scheduling.py
+
+- label: e2e Core (1 GPU)
+  timeout_in_minutes: 30
+  source_file_dependencies:
+    - vllm/v1/
+    - tests/v1/e2e/general/
  commands:
-      - pytest -v -s v1/e2e
-      - pytest -v -s v1/engine
+    - pytest -v -s v1/e2e/general --ignore v1/e2e/general/test_async_scheduling.py

 - label: V1 e2e (2 GPUs)
  timeout_in_minutes: 60 # TODO: Fix timeout after we have more confidence in the test stability
@@ -46,7 +48,7 @@ steps:
    - tests/v1/e2e
  commands:
    # Only run tests that need exactly 2 GPUs
-    - pytest -v -s v1/e2e/test_spec_decode.py -k "tensor_parallelism"
+    - pytest -v -s v1/e2e/spec_decode/test_spec_decode.py -k "tensor_parallelism"
  mirror:
    amd:
      device: mi325_2
@@ -62,7 +64,7 @@ steps:
    - tests/v1/e2e
  commands:
    # Only run tests that need 4 GPUs
-    - pytest -v -s v1/e2e/test_spec_decode.py -k "eagle_correctness_heavy"
+    - pytest -v -s v1/e2e/spec_decode/test_spec_decode.py -k "eagle_correctness_heavy"
  mirror:
    amd:
      device: mi325_4

--- a/.buildkite/test_areas/model_runner_v2.yaml
+++ b/.buildkite/test_areas/model_runner_v2.yaml
@@ -18,9 +18,9 @@ steps:
  - pytest -v -s v1/engine/test_llm_engine.py -k "not test_engine_metrics"
  # This requires eager until we sort out CG correctness issues.
  # TODO: remove ENFORCE_EAGER here after https://github.com/vllm-project/vllm/pull/32936 is merged.
-  - ENFORCE_EAGER=1 pytest -v -s v1/e2e/test_async_scheduling.py -k "not ngram"
-  - pytest -v -s v1/e2e/test_context_length.py
-  - pytest -v -s v1/e2e/test_min_tokens.py
+  - ENFORCE_EAGER=1 pytest -v -s v1/e2e/general/test_async_scheduling.py -k "not ngram"
+  - pytest -v -s v1/e2e/general/test_context_length.py
+  - pytest -v -s v1/e2e/general/test_min_tokens.py
  # Temporary hack filter to exclude ngram spec decoding based tests.
  - pytest -v -s v1/entrypoints/llm/test_struct_output_generate.py -k "xgrammar and not speculative_config6 and not speculative_config7 and not speculative_config8 and not speculative_config0"

@@ -102,9 +102,9 @@ steps:
  - vllm/v1/worker/gpu/
  - vllm/v1/worker/gpu_worker.py
  - tests/v1/spec_decode/test_max_len.py
-  - tests/v1/e2e/test_spec_decode.py
+  - tests/v1/e2e/spec_decode/test_spec_decode.py
  commands:
  - set -x
  - export VLLM_USE_V2_MODEL_RUNNER=1
  - pytest -v -s v1/spec_decode/test_max_len.py -k "eagle or mtp"
-  - pytest -v -s v1/e2e/test_spec_decode.py -k "eagle or mtp"
+  - pytest -v -s v1/e2e/spec_decode/test_spec_decode.py -k "eagle or mtp"
--- a/.buildkite/test_areas/spec_decode.yaml
+++ b/.buildkite/test_areas/spec_decode.yaml
+group: Spec Decode
+depends_on:
+  - image-build
+steps:
+- label: Spec Decode Eagle
+  timeout_in_minutes: 30
+  source_file_dependencies:
+    - vllm/v1/spec_decode/
+    - vllm/v1/worker/gpu/spec_decode/
+    - tests/v1/e2e/spec_decode/
+  commands:
+    - pytest -v -s v1/e2e/spec_decode -k "eagle_correctness"
+
+- label: Spec Decode Speculators + MTP
+  timeout_in_minutes: 30
+  source_file_dependencies:
+    - vllm/v1/spec_decode/
+    - vllm/v1/worker/gpu/spec_decode/
+    - vllm/transformers_utils/configs/speculators/
+    - tests/v1/e2e/spec_decode/
+  commands:
+    - pytest -v -s v1/e2e/spec_decode -k "speculators or mtp_correctness"
+
+- label: Spec Decode Ngram + Suffix
+  timeout_in_minutes: 30
+  source_file_dependencies:
+    - vllm/v1/spec_decode/
+    - vllm/v1/worker/gpu/spec_decode/
+    - tests/v1/e2e/spec_decode/
+  commands:
+    - pytest -v -s v1/e2e/spec_decode -k "ngram or suffix"
+
+- label: Spec Decode Draft Model
+  timeout_in_minutes: 30
+  source_file_dependencies:
+    - vllm/v1/spec_decode/
+    - vllm/v1/worker/gpu/spec_decode/
+    - tests/v1/e2e/spec_decode/
+  commands:
+    - pytest -v -s v1/e2e/spec_decode -k "draft_model or no_sync or batch_inference"
--- a/tests/v1/e2e/general/__init__.py
+++ b/tests/v1/e2e/general/__init__.py
--- a/tests/v1/e2e/test_async_scheduling.py
+++ b/tests/v1/e2e/general/test_async_scheduling.py
@@ -14,8 +14,8 @@ from vllm.platforms import current_platform
 from vllm.sampling_params import StructuredOutputsParams
 from vllm.v1.metrics.reader import Metric

-from ...conftest import VllmRunner
-from ...models.utils import check_outputs_equal
+from ....conftest import VllmRunner
+from ....models.utils import check_outputs_equal

 MODEL = "Qwen/Qwen3-0.6B"
 MTP_MODEL = "meta-llama/Llama-3.2-1B-Instruct"

--- a/tests/v1/e2e/test_cascade_attention.py
+++ b/tests/v1/e2e/test_cascade_attention.py
@@ -5,7 +5,7 @@ import pytest

 from vllm import LLM, SamplingParams

-from ...utils import create_new_process_for_each_test
+from ....utils import create_new_process_for_each_test


 @create_new_process_for_each_test()

--- a/tests/v1/e2e/test_context_length.py
+++ b/tests/v1/e2e/general/test_context_length.py
--- a/tests/v1/e2e/test_correctness_sliding_window.py
+++ b/tests/v1/e2e/test_correctness_sliding_window.py
@@ -7,7 +7,7 @@ import pytest
 from vllm import LLM, SamplingParams
 from vllm.platforms import current_platform

-from ...utils import check_answers, prep_prompts
+from ....utils import check_answers, prep_prompts


 @dataclass

--- a/tests/v1/e2e/test_kv_sharing_fast_prefill.py
+++ b/tests/v1/e2e/test_kv_sharing_fast_prefill.py
@@ -9,7 +9,7 @@ from vllm import LLM, SamplingParams
 from vllm.config import CompilationConfig, CompilationMode
 from vllm.platforms import current_platform

-from ...utils import check_answers, fork_new_process_for_each_test, prep_prompts
+from ....utils import check_answers, fork_new_process_for_each_test, prep_prompts

 # global seed
 SEED = 42
@@ -18,7 +18,7 @@ SEED = 42
 @pytest.fixture
 def test_prompts():
    """
-    Adapted from tests/v1/e2e/test_spec_decode.py
+    Adapted from tests/v1/e2e/spec_decode/test_spec_decode.py
    """
    prompt_types = ["repeat", "sentence"]
    # Setting higher num prompts increases the chance of numerics mismatch

--- a/tests/v1/e2e/test_mamba_prefix_cache.py
+++ b/tests/v1/e2e/test_mamba_prefix_cache.py
--- a/tests/v1/e2e/test_min_tokens.py
+++ b/tests/v1/e2e/general/test_min_tokens.py
@@ -497,6 +497,6 @@ if __name__ == "__main__":
    
    Usage:
        cd vllm/
-        python -m pytest tests/v1/e2e/test_min_tokens.py -v
+        python -m pytest tests/v1/e2e/general/test_min_tokens.py -v
    """
    pytest.main([__file__, "-v"])
--- a/tests/v1/e2e/test_pooling_chunked_prefill.py
+++ b/tests/v1/e2e/test_pooling_chunked_prefill.py
--- a/tests/v1/e2e/test_streaming_input.py
+++ b/tests/v1/e2e/test_streaming_input.py
--- a/tests/v1/e2e/spec_decode/__init__.py
+++ b/tests/v1/e2e/spec_decode/__init__.py
--- a/tests/v1/e2e/test_async_spec_decode.py
+++ b/tests/v1/e2e/test_async_spec_decode.py
--- a/tests/v1/e2e/test_lora_with_spec_decode.py
+++ b/tests/v1/e2e/test_lora_with_spec_decode.py
--- a/tests/v1/e2e/test_spec_decode.py
+++ b/tests/v1/e2e/spec_decode/test_spec_decode.py