Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
b74d888c
Unverified
Commit
b74d888c
authored
May 01, 2025
by
Huy Do
Committed by
GitHub
May 01, 2025
Browse files
Fix more broken speculative decode tests (#17450)
Signed-off-by:
Huy Do
<
huydhn@gmail.com
>
parent
2007d4d5
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
9 additions
and
4 deletions
+9
-4
tests/spec_decode/e2e/test_medusa_correctness.py
tests/spec_decode/e2e/test_medusa_correctness.py
+1
-1
tests/spec_decode/e2e/test_mlp_correctness.py
tests/spec_decode/e2e/test_mlp_correctness.py
+2
-2
tests/spec_decode/e2e/test_ngram_correctness.py
tests/spec_decode/e2e/test_ngram_correctness.py
+1
-1
vllm/spec_decode/multi_step_worker.py
vllm/spec_decode/multi_step_worker.py
+5
-0
No files found.
tests/spec_decode/e2e/test_medusa_correctness.py
View file @
b74d888c
...
@@ -205,7 +205,7 @@ def test_medusa_e2e_greedy_correctness_cuda_graph(
...
@@ -205,7 +205,7 @@ def test_medusa_e2e_greedy_correctness_cuda_graph(
@
pytest
.
mark
.
parametrize
(
@
pytest
.
mark
.
parametrize
(
"common_llm_kwargs"
,
"common_llm_kwargs"
,
[{
[{
"block_size"
:
8
,
"block_size"
:
16
,
# 2 for small prompt, 256//8 for generated.
# 2 for small prompt, 256//8 for generated.
"num_gpu_blocks_override"
:
2
+
256
//
8
,
"num_gpu_blocks_override"
:
2
+
256
//
8
,
"max_model_len"
:
(
2
+
256
//
8
)
*
8
,
"max_model_len"
:
(
2
+
256
//
8
)
*
8
,
...
...
tests/spec_decode/e2e/test_mlp_correctness.py
View file @
b74d888c
...
@@ -267,7 +267,7 @@ def test_mlp_e2e_seeded_correctness(vllm_runner, common_llm_kwargs,
...
@@ -267,7 +267,7 @@ def test_mlp_e2e_seeded_correctness(vllm_runner, common_llm_kwargs,
@
pytest
.
mark
.
parametrize
(
@
pytest
.
mark
.
parametrize
(
"common_llm_kwargs"
,
"common_llm_kwargs"
,
[{
[{
"block_size"
:
8
,
"block_size"
:
16
,
# 2 for small prompt, 256//8 for generated.
# 2 for small prompt, 256//8 for generated.
"num_gpu_blocks_override"
:
2
+
256
//
8
,
"num_gpu_blocks_override"
:
2
+
256
//
8
,
"max_model_len"
:
(
2
+
256
//
8
)
*
8
,
"max_model_len"
:
(
2
+
256
//
8
)
*
8
,
...
@@ -321,7 +321,7 @@ def test_mlp_e2e_greedy_correctness_with_preemption(
...
@@ -321,7 +321,7 @@ def test_mlp_e2e_greedy_correctness_with_preemption(
@
pytest
.
mark
.
parametrize
(
@
pytest
.
mark
.
parametrize
(
"common_llm_kwargs"
,
"common_llm_kwargs"
,
[{
[{
"block_size"
:
8
,
"block_size"
:
16
,
# 2 for small prompt, 256//8 for generated.
# 2 for small prompt, 256//8 for generated.
"num_gpu_blocks_override"
:
2
+
256
//
8
,
"num_gpu_blocks_override"
:
2
+
256
//
8
,
"max_model_len"
:
(
2
+
256
//
8
)
*
8
,
"max_model_len"
:
(
2
+
256
//
8
)
*
8
,
...
...
tests/spec_decode/e2e/test_ngram_correctness.py
View file @
b74d888c
...
@@ -152,7 +152,7 @@ def test_ngram_e2e_greedy_logprobs(vllm_runner, common_llm_kwargs,
...
@@ -152,7 +152,7 @@ def test_ngram_e2e_greedy_logprobs(vllm_runner, common_llm_kwargs,
@
pytest
.
mark
.
parametrize
(
@
pytest
.
mark
.
parametrize
(
"common_llm_kwargs"
,
"common_llm_kwargs"
,
[{
[{
"block_size"
:
8
,
"block_size"
:
16
,
# 2 for small prompt, 256//8 for generated.
# 2 for small prompt, 256//8 for generated.
"num_gpu_blocks_override"
:
2
+
256
//
8
,
"num_gpu_blocks_override"
:
2
+
256
//
8
,
"max_model_len"
:
(
2
+
256
//
8
)
*
8
,
"max_model_len"
:
(
2
+
256
//
8
)
*
8
,
...
...
vllm/spec_decode/multi_step_worker.py
View file @
b74d888c
...
@@ -51,9 +51,14 @@ class MultiStepWorker(ProposerWorkerBase, DelegateWorkerBase):
...
@@ -51,9 +51,14 @@ class MultiStepWorker(ProposerWorkerBase, DelegateWorkerBase):
def
set_include_gpu_probs_tensor
(
self
)
->
None
:
def
set_include_gpu_probs_tensor
(
self
)
->
None
:
# Need include_gpu_probs_tensor for MultiStepWorker
# Need include_gpu_probs_tensor for MultiStepWorker
self
.
model_runner
.
sampler
.
include_gpu_probs_tensor
=
True
self
.
model_runner
.
sampler
.
include_gpu_probs_tensor
=
True
if
hasattr
(
self
.
model_runner
.
model
,
"sampler"
):
(
self
.
model_runner
.
model
.
sampler
.
include_gpu_probs_tensor
)
=
True
def
set_should_modify_greedy_probs_inplace
(
self
)
->
None
:
def
set_should_modify_greedy_probs_inplace
(
self
)
->
None
:
self
.
model_runner
.
sampler
.
should_modify_greedy_probs_inplace
=
True
self
.
model_runner
.
sampler
.
should_modify_greedy_probs_inplace
=
True
if
hasattr
(
self
.
model_runner
.
model
,
"sampler"
):
(
self
.
model_runner
.
model
.
sampler
.
should_modify_greedy_probs_inplace
)
=
True
@
torch
.
inference_mode
()
@
torch
.
inference_mode
()
def
sampler_output
(
def
sampler_output
(
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment