Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
ee1531bc
Unverified
Commit
ee1531bc
authored
Jun 15, 2025
by
Lu Fang
Committed by
GitHub
Jun 14, 2025
Browse files
[Bugfix][2/n] Fix speculative decoding CI - Fix test_ngram_e2e_greedy_correctness (#19644)
parent
e13945f9
Changes
5
Show whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
50 additions
and
3 deletions
+50
-3
tests/spec_decode/e2e/test_integration.py
tests/spec_decode/e2e/test_integration.py
+10
-1
tests/spec_decode/e2e/test_logprobs.py
tests/spec_decode/e2e/test_logprobs.py
+17
-1
tests/spec_decode/e2e/test_mlp_correctness.py
tests/spec_decode/e2e/test_mlp_correctness.py
+3
-0
tests/spec_decode/e2e/test_ngram_correctness.py
tests/spec_decode/e2e/test_ngram_correctness.py
+18
-0
vllm/model_executor/models/eagle.py
vllm/model_executor/models/eagle.py
+2
-1
No files found.
tests/spec_decode/e2e/test_integration.py
View file @
ee1531bc
...
@@ -14,10 +14,13 @@ MAIN_MODEL = "JackFram/llama-68m"
...
@@ -14,10 +14,13 @@ MAIN_MODEL = "JackFram/llama-68m"
@
pytest
.
mark
.
parametrize
(
@
pytest
.
mark
.
parametrize
(
"common_llm_kwargs"
,
"common_llm_kwargs"
,
[{
[{
"model_name"
:
"JackFram/llama-68m"
,
# Verify equality when cuda graphs allowed.
# Verify equality when cuda graphs allowed.
"enforce_eager"
:
False
,
"enforce_eager"
:
False
,
"model_name"
:
"JackFram/llama-68m"
,
# The original model is float32, keep it for numerical stability.
"dtype"
:
"float32"
,
}])
}])
@
pytest
.
mark
.
parametrize
(
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
"per_test_common_llm_kwargs"
,
...
@@ -59,6 +62,9 @@ def test_spec_decode_cuda_graph(vllm_runner, common_llm_kwargs,
...
@@ -59,6 +62,9 @@ def test_spec_decode_cuda_graph(vllm_runner, common_llm_kwargs,
# Skip cuda graph recording for fast test.
# Skip cuda graph recording for fast test.
"enforce_eager"
:
True
,
"enforce_eager"
:
True
,
# The original model is float32, keep it for numerical stability.
"dtype"
:
"float32"
,
}])
}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[])
@
pytest
.
mark
.
parametrize
(
@
pytest
.
mark
.
parametrize
(
...
@@ -117,6 +123,9 @@ def test_speculative_model_quantization_config(vllm_runner, common_llm_kwargs,
...
@@ -117,6 +123,9 @@ def test_speculative_model_quantization_config(vllm_runner, common_llm_kwargs,
# Skip cuda graph recording for fast test.
# Skip cuda graph recording for fast test.
"enforce_eager"
:
True
,
"enforce_eager"
:
True
,
# The original model is float32, keep it for numerical stability.
"dtype"
:
"float32"
,
}])
}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{}])
...
...
tests/spec_decode/e2e/test_logprobs.py
View file @
ee1531bc
...
@@ -17,7 +17,10 @@ from .conftest import run_equality_correctness_test
...
@@ -17,7 +17,10 @@ from .conftest import run_equality_correctness_test
"model_name"
:
"JackFram/llama-160m"
,
"model_name"
:
"JackFram/llama-160m"
,
# Skip cuda graph recording for fast test.
# Skip cuda graph recording for fast test.
"enforce_eager"
:
True
"enforce_eager"
:
True
,
# The original model is float32, keep it for numerical stability.
"dtype"
:
"float32"
,
}])
}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{}])
...
@@ -75,6 +78,9 @@ def test_logprobs_equality(vllm_runner, common_llm_kwargs,
...
@@ -75,6 +78,9 @@ def test_logprobs_equality(vllm_runner, common_llm_kwargs,
# Skip cuda graph recording for fast test.
# Skip cuda graph recording for fast test.
"enforce_eager"
:
True
,
"enforce_eager"
:
True
,
# The original model is float32, keep it for numerical stability.
"dtype"
:
"float32"
,
}])
}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{}])
...
@@ -128,6 +134,9 @@ def test_logprobs_different_k(vllm_runner, common_llm_kwargs,
...
@@ -128,6 +134,9 @@ def test_logprobs_different_k(vllm_runner, common_llm_kwargs,
# Skip cuda graph recording for fast test.
# Skip cuda graph recording for fast test.
"enforce_eager"
:
True
,
"enforce_eager"
:
True
,
# The original model is float32, keep it for numerical stability.
"dtype"
:
"float32"
,
}])
}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{}])
...
@@ -182,6 +191,9 @@ def test_logprobs_when_skip_speculation(vllm_runner, common_llm_kwargs,
...
@@ -182,6 +191,9 @@ def test_logprobs_when_skip_speculation(vllm_runner, common_llm_kwargs,
# Skip cuda graph recording for fast test.
# Skip cuda graph recording for fast test.
"enforce_eager"
:
True
,
"enforce_eager"
:
True
,
# The original model is float32, keep it for numerical stability.
"dtype"
:
"float32"
,
}])
}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{}])
...
@@ -256,8 +268,12 @@ def test_logprobs_temp_1(vllm_runner, common_llm_kwargs,
...
@@ -256,8 +268,12 @@ def test_logprobs_temp_1(vllm_runner, common_llm_kwargs,
"common_llm_kwargs"
,
"common_llm_kwargs"
,
[{
[{
"model_name"
:
"JackFram/llama-160m"
,
"model_name"
:
"JackFram/llama-160m"
,
# Skip cuda graph recording for fast test.
# Skip cuda graph recording for fast test.
"enforce_eager"
:
True
,
"enforce_eager"
:
True
,
# The original model is float32, keep it for numerical stability.
"dtype"
:
"float32"
,
}])
}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{}])
...
...
tests/spec_decode/e2e/test_mlp_correctness.py
View file @
ee1531bc
...
@@ -494,6 +494,9 @@ def test_mlp_disable_queue(vllm_runner, common_llm_kwargs,
...
@@ -494,6 +494,9 @@ def test_mlp_disable_queue(vllm_runner, common_llm_kwargs,
# Skip cuda graph recording for fast test.
# Skip cuda graph recording for fast test.
"enforce_eager"
:
True
,
"enforce_eager"
:
True
,
# Precision
"dtype"
:
PRECISION
,
}])
}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{}])
...
...
tests/spec_decode/e2e/test_ngram_correctness.py
View file @
ee1531bc
...
@@ -40,6 +40,9 @@ from .conftest import run_equality_correctness_test
...
@@ -40,6 +40,9 @@ from .conftest import run_equality_correctness_test
# Print spec metrics.
# Print spec metrics.
"disable_log_stats"
:
False
,
"disable_log_stats"
:
False
,
# The original model is float32, keep it for numerical stability.
"dtype"
:
"float32"
,
}])
}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[
{
{
...
@@ -97,6 +100,9 @@ def test_ngram_e2e_greedy_correctness(vllm_runner, common_llm_kwargs,
...
@@ -97,6 +100,9 @@ def test_ngram_e2e_greedy_correctness(vllm_runner, common_llm_kwargs,
# Print spec metrics.
# Print spec metrics.
"disable_log_stats"
:
False
,
"disable_log_stats"
:
False
,
# The original model is float32, keep it for numerical stability.
"dtype"
:
"float32"
,
}])
}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[
{
{
...
@@ -160,6 +166,9 @@ def test_ngram_e2e_greedy_logprobs(vllm_runner, common_llm_kwargs,
...
@@ -160,6 +166,9 @@ def test_ngram_e2e_greedy_logprobs(vllm_runner, common_llm_kwargs,
# Skip cuda graph recording for fast test.
# Skip cuda graph recording for fast test.
"enforce_eager"
:
True
,
"enforce_eager"
:
True
,
# The original model is float32, keep it for numerical stability.
"dtype"
:
"float32"
,
}])
}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[
{
{
...
@@ -221,6 +230,9 @@ def test_ngram_e2e_greedy_correctness_with_preemption(
...
@@ -221,6 +230,9 @@ def test_ngram_e2e_greedy_correctness_with_preemption(
# Skip cuda graph recording for fast test.
# Skip cuda graph recording for fast test.
"enforce_eager"
:
True
,
"enforce_eager"
:
True
,
# The original model is float32, keep it for numerical stability.
"dtype"
:
"float32"
,
}])
}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{}])
...
@@ -281,6 +293,9 @@ def test_ngram_different_k(vllm_runner, common_llm_kwargs,
...
@@ -281,6 +293,9 @@ def test_ngram_different_k(vllm_runner, common_llm_kwargs,
# Skip cuda graph recording for fast test.
# Skip cuda graph recording for fast test.
"enforce_eager"
:
True
,
"enforce_eager"
:
True
,
# The original model is float32, keep it for numerical stability.
"dtype"
:
"float32"
,
}])
}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{}])
...
@@ -337,6 +352,9 @@ def test_ngram_disable_queue(vllm_runner, common_llm_kwargs,
...
@@ -337,6 +352,9 @@ def test_ngram_disable_queue(vllm_runner, common_llm_kwargs,
# Skip cuda graph recording for fast test.
# Skip cuda graph recording for fast test.
"enforce_eager"
:
True
,
"enforce_eager"
:
True
,
# The original model is float32, keep it for numerical stability.
"dtype"
:
"float32"
,
}])
}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{}])
...
...
vllm/model_executor/models/eagle.py
View file @
ee1531bc
...
@@ -74,6 +74,7 @@ class EAGLE(nn.Module):
...
@@ -74,6 +74,7 @@ class EAGLE(nn.Module):
def
__init__
(
self
,
*
,
vllm_config
:
VllmConfig
,
prefix
:
str
=
""
):
def
__init__
(
self
,
*
,
vllm_config
:
VllmConfig
,
prefix
:
str
=
""
):
super
().
__init__
()
super
().
__init__
()
config
=
vllm_config
.
model_config
.
hf_config
config
=
vllm_config
.
model_config
.
hf_config
self
.
dtype
=
vllm_config
.
model_config
.
dtype
self
.
config
=
config
self
.
config
=
config
architectures
=
getattr
(
self
.
config
.
model
,
"architectures"
,
[])
architectures
=
getattr
(
self
.
config
.
model
,
"architectures"
,
[])
...
@@ -250,7 +251,7 @@ class EAGLE(nn.Module):
...
@@ -250,7 +251,7 @@ class EAGLE(nn.Module):
lm_head_weight
=
torch
.
zeros
(
lm_head_weight
=
torch
.
zeros
(
self
.
lm_head
.
org_vocab_size
,
self
.
lm_head
.
org_vocab_size
,
self
.
lm_head
.
embedding_dim
,
self
.
lm_head
.
embedding_dim
,
dtype
=
self
.
config
.
torch_
dtype
,
dtype
=
self
.
dtype
,
)
)
weight_loader
=
getattr
(
self
.
lm_head
.
weight
,
"weight_loader"
,
weight_loader
=
getattr
(
self
.
lm_head
.
weight
,
"weight_loader"
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment