Unverified commit 5eeba80c, authored by shanjiaz and committed by GitHub
Browse files

Adding optional speculator tests for larger models (#32943)


Signed-off-by: shanjiaz <zsjwpianpian@gmail.com>
parent 08b1195e
...@@ -362,7 +362,7 @@ steps: ...@@ -362,7 +362,7 @@ steps:
- pytest -v -s v1/sample - pytest -v -s v1/sample
- pytest -v -s v1/logits_processors - pytest -v -s v1/logits_processors
- pytest -v -s v1/worker - pytest -v -s v1/worker
- pytest -v -s v1/spec_decode - pytest -v -s -m 'not slow_test' v1/spec_decode
- pytest -v -s -m 'not cpu_test' v1/kv_connector/unit - pytest -v -s -m 'not cpu_test' v1/kv_connector/unit
- pytest -v -s -m 'not cpu_test' v1/metrics - pytest -v -s -m 'not cpu_test' v1/metrics
- pytest -v -s v1/test_oracle.py - pytest -v -s v1/test_oracle.py
...@@ -1420,6 +1420,20 @@ steps: ...@@ -1420,6 +1420,20 @@ steps:
- TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)' - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
- pytest -v -s -x lora/test_mixtral.py - pytest -v -s -x lora/test_mixtral.py
- label: Acceptance Length Test (Large Models) # optional
timeout_in_minutes: 120
gpu: h100
optional: true
num_gpus: 1
working_dir: "/vllm-workspace/tests"
source_file_dependencies:
- vllm/v1/spec_decode/
- vllm/model_executor/models/mlp_speculator.py
- tests/v1/spec_decode/test_acceptance_length.py
commands:
- export VLLM_ALLOW_INSECURE_SERIALIZATION=1
- pytest -v -s v1/spec_decode/test_acceptance_length.py -m slow_test
- label: LM Eval Large Models # optional - label: LM Eval Large Models # optional
gpu: a100 gpu: a100
optional: true optional: true
......
...@@ -35,6 +35,10 @@ class Eagle3ModelConfig: ...@@ -35,6 +35,10 @@ class Eagle3ModelConfig:
id: str = "" id: str = ""
# Backends that are incompatible with this model (will be skipped) # Backends that are incompatible with this model (will be skipped)
excluded_backends: set[AttentionBackendEnum] = field(default_factory=set) excluded_backends: set[AttentionBackendEnum] = field(default_factory=set)
# Pytest marks applied to this configuration's test parameter (e.g., pytest.mark.slow_test)
marks: list = field(default_factory=list)
# Custom relative tolerance (defaults to DEFAULT_RTOL if None)
rtol: float | None = None
# Model configurations for EAGLE3 acceptance length tests. # Model configurations for EAGLE3 acceptance length tests.
...@@ -65,6 +69,17 @@ EAGLE3_MODEL_CONFIGS = [ ...@@ -65,6 +69,17 @@ EAGLE3_MODEL_CONFIGS = [
# FLASHINFER does not support ("sink setting not supported") # FLASHINFER does not support ("sink setting not supported")
excluded_backends={AttentionBackendEnum.FLASHINFER}, excluded_backends={AttentionBackendEnum.FLASHINFER},
), ),
Eagle3ModelConfig(
verifier="Qwen/Qwen3-VL-30B-A3B-Instruct-FP8",
drafter="nm-testing/Speculator-Qwen3-30B-MOE-VL-Eagle3",
expected_acceptance_length=1.35,
expected_acceptance_lengths_per_pos=[0.2900, 0.0620, 0.0115],
id="qwen3-30b-moe-vl-eagle3",
marks=[
pytest.mark.slow_test,
],
rtol=0.15, # Higher tolerance due to small absolute values at position 2
),
] ]
# Default test parameters # Default test parameters
...@@ -194,9 +209,16 @@ def extract_acceptance_metrics(metrics, num_spec_tokens: int) -> dict: ...@@ -194,9 +209,16 @@ def extract_acceptance_metrics(metrics, num_spec_tokens: int) -> dict:
@large_gpu_mark(min_gb=40) @large_gpu_mark(min_gb=40)
@pytest.mark.skipif(
not current_platform.is_cuda(),
reason="This test is only supported on CUDA platform.",
)
@pytest.mark.parametrize( @pytest.mark.parametrize(
"model_config", "model_config",
[pytest.param(config, id=config.id) for config in EAGLE3_MODEL_CONFIGS], [
pytest.param(config, id=config.id, marks=config.marks)
for config in EAGLE3_MODEL_CONFIGS
],
) )
@pytest.mark.parametrize("num_spec_tokens", [DEFAULT_NUM_SPEC_TOKENS]) @pytest.mark.parametrize("num_spec_tokens", [DEFAULT_NUM_SPEC_TOKENS])
@pytest.mark.parametrize("tp_size", get_tp_size_params()) @pytest.mark.parametrize("tp_size", get_tp_size_params())
...@@ -251,6 +273,7 @@ def test_eagle3_acceptance_length( ...@@ -251,6 +273,7 @@ def test_eagle3_acceptance_length(
rel_error = abs(actual_acceptance_length - expected) / expected rel_error = abs(actual_acceptance_length - expected) / expected
# Overall acceptance length always uses DEFAULT_RTOL
assert rel_error <= DEFAULT_RTOL, ( assert rel_error <= DEFAULT_RTOL, (
f"Acceptance length regression detected for {model_config.id}!\n" f"Acceptance length regression detected for {model_config.id}!\n"
f" Expected: {expected:.3f}\n" f" Expected: {expected:.3f}\n"
...@@ -261,18 +284,22 @@ def test_eagle3_acceptance_length( ...@@ -261,18 +284,22 @@ def test_eagle3_acceptance_length(
) )
if expected_per_pos and len(expected_per_pos) == len(actual_per_pos): if expected_per_pos and len(expected_per_pos) == len(actual_per_pos):
# Per-position checks use model-specific rtol if provided
rtol = (
model_config.rtol if model_config.rtol is not None else DEFAULT_RTOL
)
for pos, (actual, exp) in enumerate( for pos, (actual, exp) in enumerate(
zip(actual_per_pos, expected_per_pos) zip(actual_per_pos, expected_per_pos)
): ):
if exp > 0: if exp > 0:
pos_rel_error = abs(actual - exp) / exp pos_rel_error = abs(actual - exp) / exp
assert pos_rel_error <= DEFAULT_RTOL, ( assert pos_rel_error <= rtol, (
f"Per-position acceptance length regression at pos {pos} " f"Per-position acceptance length regression at pos {pos} "
f"for {model_config.id}!\n" f"for {model_config.id}!\n"
f" Expected: {exp:.3f}\n" f" Expected: {exp:.3f}\n"
f" Actual: {actual:.3f}\n" f" Actual: {actual:.3f}\n"
f" Relative error: {pos_rel_error:.2%} " f" Relative error: {pos_rel_error:.2%} "
f"(tolerance: {DEFAULT_RTOL:.2%})" f"(tolerance: {rtol:.2%})"
) )
print( print(
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment