Adding optional speculator tests for larger models (#32943)

Signed-off-by: shanjiaz <zsjwpianpian@gmail.com>

Adding optional speculator tests for larger models (#32943)
Signed-off-by: shanjiaz <zsjwpianpian@gmail.com>
5eeba80c · shanjiaz · GitHub · 08b1195e · 5eeba80c · 5eeba80c
Unverified commit 5eeba80c, authored Jan 29, 2026 by shanjiaz; committed by GitHub on Jan 29, 2026.
Hide whitespace changes
Inline Side-by-side

Showing with 45 additions and 4 deletions

.buildkite/test-pipeline.yaml .buildkite/test-pipeline.yaml +15 -1

tests/v1/spec_decode/test_acceptance_length.py tests/v1/spec_decode/test_acceptance_length.py +30 -3

No files found.
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -362,7 +362,7 @@ steps:
    - pytest -v -s v1/sample
    - pytest -v -s v1/logits_processors
    - pytest -v -s v1/worker
-    - pytest -v -s v1/spec_decode
+    - pytest -v -s -m 'not slow_test' v1/spec_decode
    - pytest -v -s -m 'not cpu_test' v1/kv_connector/unit
    - pytest -v -s -m 'not cpu_test' v1/metrics
    - pytest -v -s v1/test_oracle.py
@@ -1420,6 +1420,20 @@ steps:
  - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
  - pytest -v -s -x lora/test_mixtral.py
+- label: Acceptance Length Test (Large Models) # optional
+  timeout_in_minutes: 120
+  gpu: h100
+  optional: true
+  num_gpus: 1
+  working_dir: "/vllm-workspace/tests"
+  source_file_dependencies:
+  - vllm/v1/spec_decode/
+  - vllm/model_executor/models/mlp_speculator.py
+  - tests/v1/spec_decode/test_acceptance_length.py
+  commands:
+    - export VLLM_ALLOW_INSECURE_SERIALIZATION=1
+    - pytest -v -s v1/spec_decode/test_acceptance_length.py -m slow_test
 - label: LM Eval Large Models # optional
  gpu: a100
  optional: true

--- a/tests/v1/spec_decode/test_acceptance_length.py
+++ b/tests/v1/spec_decode/test_acceptance_length.py
@@ -35,6 +35,10 @@ class Eagle3ModelConfig:
    id: str = ""
    # Backends that are incompatible with this model (will be skipped)
    excluded_backends: set[AttentionBackendEnum] = field(default_factory=set)
+    # Pytest marks for this configuration (e.g., pytest.mark.optional)
+    marks: list = field(default_factory=list)
+    # Relative tolerance for per-position checks only (DEFAULT_RTOL if None)
+    rtol: float | None = None
 # Model configurations for EAGLE3 acceptance length tests.
@@ -65,6 +69,17 @@ EAGLE3_MODEL_CONFIGS = [
        # FLASHINFER does not support ("sink setting not supported")
        excluded_backends={AttentionBackendEnum.FLASHINFER},
    ),
+    Eagle3ModelConfig(
+        verifier="Qwen/Qwen3-VL-30B-A3B-Instruct-FP8",
+        drafter="nm-testing/Speculator-Qwen3-30B-MOE-VL-Eagle3",
+        expected_acceptance_length=1.35,
+        expected_acceptance_lengths_per_pos=[0.2900, 0.0620, 0.0115],
+        id="qwen3-30b-moe-vl-eagle3",
+        marks=[
+            pytest.mark.slow_test,
+        ],
+        rtol=0.15,  # Higher tolerance due to small absolute values at position 2
+    ),
 ]
 # Default test parameters
@@ -194,9 +209,16 @@ def extract_acceptance_metrics(metrics, num_spec_tokens: int) -> dict:
 @large_gpu_mark(min_gb=40)
+@pytest.mark.skipif(
+    not current_platform.is_cuda(),
+    reason="This test is only supported on CUDA platform.",
+)
 @pytest.mark.parametrize(
    "model_config",
-    [pytest.param(config, id=config.id) for config in EAGLE3_MODEL_CONFIGS],
+    [
+        pytest.param(config, id=config.id, marks=config.marks)
+        for config in EAGLE3_MODEL_CONFIGS
+    ],
 )
 @pytest.mark.parametrize("num_spec_tokens", [DEFAULT_NUM_SPEC_TOKENS])
 @pytest.mark.parametrize("tp_size", get_tp_size_params())
@@ -251,6 +273,7 @@ def test_eagle3_acceptance_length(
            rel_error = abs(actual_acceptance_length - expected) / expected
+            # Overall acceptance length always uses DEFAULT_RTOL
            assert rel_error <= DEFAULT_RTOL, (
                f"Acceptance length regression detected for {model_config.id}!\n"
                f"  Expected: {expected:.3f}\n"
@@ -261,18 +284,22 @@ def test_eagle3_acceptance_length(
            )
            if expected_per_pos and len(expected_per_pos) == len(actual_per_pos):
+                # Per-position checks use model-specific rtol if provided
+                rtol = (
+                    model_config.rtol if model_config.rtol is not None else DEFAULT_RTOL
+                )
                for pos, (actual, exp) in enumerate(
                    zip(actual_per_pos, expected_per_pos)
                ):
                    if exp > 0:
                        pos_rel_error = abs(actual - exp) / exp
-                        assert pos_rel_error <= DEFAULT_RTOL, (
+                        assert pos_rel_error <= rtol, (
                            f"Per-position acceptance length regression at pos {pos} "
                            f"for {model_config.id}!\n"
                            f"  Expected: {exp:.3f}\n"
                            f"  Actual:   {actual:.3f}\n"
                            f"  Relative error: {pos_rel_error:.2%} "
-                            f"(tolerance: {DEFAULT_RTOL:.2%})"
+                            f"(tolerance: {rtol:.2%})"
                        )
            print(