Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
5eeba80c
Unverified
Commit
5eeba80c
authored
Jan 29, 2026
by
shanjiaz
Committed by
GitHub
Jan 29, 2026
Browse files
Adding optional speculator tests for larger models (#32943)
Signed-off-by:
shanjiaz
<
zsjwpianpian@gmail.com
>
parent
08b1195e
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
45 additions
and
4 deletions
+45
-4
.buildkite/test-pipeline.yaml
.buildkite/test-pipeline.yaml
+15
-1
tests/v1/spec_decode/test_acceptance_length.py
tests/v1/spec_decode/test_acceptance_length.py
+30
-3
No files found.
.buildkite/test-pipeline.yaml
View file @
5eeba80c
...
@@ -362,7 +362,7 @@ steps:
...
@@ -362,7 +362,7 @@ steps:
-
pytest -v -s v1/sample
-
pytest -v -s v1/sample
-
pytest -v -s v1/logits_processors
-
pytest -v -s v1/logits_processors
-
pytest -v -s v1/worker
-
pytest -v -s v1/worker
-
pytest -v -s v1/spec_decode
-
pytest -v -s
-m 'not slow_test'
v1/spec_decode
-
pytest -v -s -m 'not cpu_test' v1/kv_connector/unit
-
pytest -v -s -m 'not cpu_test' v1/kv_connector/unit
-
pytest -v -s -m 'not cpu_test' v1/metrics
-
pytest -v -s -m 'not cpu_test' v1/metrics
-
pytest -v -s v1/test_oracle.py
-
pytest -v -s v1/test_oracle.py
...
@@ -1420,6 +1420,20 @@ steps:
...
@@ -1420,6 +1420,20 @@ steps:
-
TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
-
TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
-
pytest -v -s -x lora/test_mixtral.py
-
pytest -v -s -x lora/test_mixtral.py
-
label
:
Acceptance Length Test (Large Models)
# optional
timeout_in_minutes
:
120
gpu
:
h100
optional
:
true
num_gpus
:
1
working_dir
:
"
/vllm-workspace/tests"
source_file_dependencies
:
-
vllm/v1/spec_decode/
-
vllm/model_executor/models/mlp_speculator.py
-
tests/v1/spec_decode/test_acceptance_length.py
commands
:
-
export VLLM_ALLOW_INSECURE_SERIALIZATION=1
-
pytest -v -s v1/spec_decode/test_acceptance_length.py -m slow_test
-
label
:
LM Eval Large Models
# optional
-
label
:
LM Eval Large Models
# optional
gpu
:
a100
gpu
:
a100
optional
:
true
optional
:
true
...
...
tests/v1/spec_decode/test_acceptance_length.py
View file @
5eeba80c
...
@@ -35,6 +35,10 @@ class Eagle3ModelConfig:
...
@@ -35,6 +35,10 @@ class Eagle3ModelConfig:
id
:
str
=
""
id
:
str
=
""
# Backends that are incompatible with this model (will be skipped)
# Backends that are incompatible with this model (will be skipped)
excluded_backends
:
set
[
AttentionBackendEnum
]
=
field
(
default_factory
=
set
)
excluded_backends
:
set
[
AttentionBackendEnum
]
=
field
(
default_factory
=
set
)
# Pytest marks for this configuration (e.g., pytest.mark.optional)
marks
:
list
=
field
(
default_factory
=
list
)
# Custom relative tolerance (defaults to DEFAULT_RTOL if None)
rtol
:
float
|
None
=
None
# Model configurations for EAGLE3 acceptance length tests.
# Model configurations for EAGLE3 acceptance length tests.
...
@@ -65,6 +69,17 @@ EAGLE3_MODEL_CONFIGS = [
...
@@ -65,6 +69,17 @@ EAGLE3_MODEL_CONFIGS = [
# FLASHINFER does not support ("sink setting not supported")
# FLASHINFER does not support ("sink setting not supported")
excluded_backends
=
{
AttentionBackendEnum
.
FLASHINFER
},
excluded_backends
=
{
AttentionBackendEnum
.
FLASHINFER
},
),
),
Eagle3ModelConfig
(
verifier
=
"Qwen/Qwen3-VL-30B-A3B-Instruct-FP8"
,
drafter
=
"nm-testing/Speculator-Qwen3-30B-MOE-VL-Eagle3"
,
expected_acceptance_length
=
1.35
,
expected_acceptance_lengths_per_pos
=
[
0.2900
,
0.0620
,
0.0115
],
id
=
"qwen3-30b-moe-vl-eagle3"
,
marks
=
[
pytest
.
mark
.
slow_test
,
],
rtol
=
0.15
,
# Higher tolerance due to small absolute values at position 2
),
]
]
# Default test parameters
# Default test parameters
...
@@ -194,9 +209,16 @@ def extract_acceptance_metrics(metrics, num_spec_tokens: int) -> dict:
...
@@ -194,9 +209,16 @@ def extract_acceptance_metrics(metrics, num_spec_tokens: int) -> dict:
@
large_gpu_mark
(
min_gb
=
40
)
@
large_gpu_mark
(
min_gb
=
40
)
@
pytest
.
mark
.
skipif
(
not
current_platform
.
is_cuda
(),
reason
=
"This test is only supported on CUDA platform."
,
)
@
pytest
.
mark
.
parametrize
(
@
pytest
.
mark
.
parametrize
(
"model_config"
,
"model_config"
,
[
pytest
.
param
(
config
,
id
=
config
.
id
)
for
config
in
EAGLE3_MODEL_CONFIGS
],
[
pytest
.
param
(
config
,
id
=
config
.
id
,
marks
=
config
.
marks
)
for
config
in
EAGLE3_MODEL_CONFIGS
],
)
)
@
pytest
.
mark
.
parametrize
(
"num_spec_tokens"
,
[
DEFAULT_NUM_SPEC_TOKENS
])
@
pytest
.
mark
.
parametrize
(
"num_spec_tokens"
,
[
DEFAULT_NUM_SPEC_TOKENS
])
@
pytest
.
mark
.
parametrize
(
"tp_size"
,
get_tp_size_params
())
@
pytest
.
mark
.
parametrize
(
"tp_size"
,
get_tp_size_params
())
...
@@ -251,6 +273,7 @@ def test_eagle3_acceptance_length(
...
@@ -251,6 +273,7 @@ def test_eagle3_acceptance_length(
rel_error
=
abs
(
actual_acceptance_length
-
expected
)
/
expected
rel_error
=
abs
(
actual_acceptance_length
-
expected
)
/
expected
# Overall acceptance length always uses DEFAULT_RTOL
assert
rel_error
<=
DEFAULT_RTOL
,
(
assert
rel_error
<=
DEFAULT_RTOL
,
(
f
"Acceptance length regression detected for
{
model_config
.
id
}
!
\n
"
f
"Acceptance length regression detected for
{
model_config
.
id
}
!
\n
"
f
" Expected:
{
expected
:.
3
f
}
\n
"
f
" Expected:
{
expected
:.
3
f
}
\n
"
...
@@ -261,18 +284,22 @@ def test_eagle3_acceptance_length(
...
@@ -261,18 +284,22 @@ def test_eagle3_acceptance_length(
)
)
if
expected_per_pos
and
len
(
expected_per_pos
)
==
len
(
actual_per_pos
):
if
expected_per_pos
and
len
(
expected_per_pos
)
==
len
(
actual_per_pos
):
# Per-position checks use model-specific rtol if provided
rtol
=
(
model_config
.
rtol
if
model_config
.
rtol
is
not
None
else
DEFAULT_RTOL
)
for
pos
,
(
actual
,
exp
)
in
enumerate
(
for
pos
,
(
actual
,
exp
)
in
enumerate
(
zip
(
actual_per_pos
,
expected_per_pos
)
zip
(
actual_per_pos
,
expected_per_pos
)
):
):
if
exp
>
0
:
if
exp
>
0
:
pos_rel_error
=
abs
(
actual
-
exp
)
/
exp
pos_rel_error
=
abs
(
actual
-
exp
)
/
exp
assert
pos_rel_error
<=
DEFAULT_RTOL
,
(
assert
pos_rel_error
<=
rtol
,
(
f
"Per-position acceptance length regression at pos
{
pos
}
"
f
"Per-position acceptance length regression at pos
{
pos
}
"
f
"for
{
model_config
.
id
}
!
\n
"
f
"for
{
model_config
.
id
}
!
\n
"
f
" Expected:
{
exp
:.
3
f
}
\n
"
f
" Expected:
{
exp
:.
3
f
}
\n
"
f
" Actual:
{
actual
:.
3
f
}
\n
"
f
" Actual:
{
actual
:.
3
f
}
\n
"
f
" Relative error:
{
pos_rel_error
:.
2
%
}
"
f
" Relative error:
{
pos_rel_error
:.
2
%
}
"
f
"(tolerance:
{
DEFAULT_RTOL
:.
2
%
}
)"
f
"(tolerance:
{
rtol
:.
2
%
}
)"
)
)
print
(
print
(
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment