Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
5c77fabd
Commit
5c77fabd
authored
Jun 13, 2025
by
王敏
Browse files
[fix]修复并行解码integration、mtp相关单测问题
parent
acfa43b8
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
153 additions
and
175 deletions
+153
-175
tests/spec_decode/e2e/test_integration.py
tests/spec_decode/e2e/test_integration.py
+3
-0
tests/spec_decode/e2e/test_integration_dist_tp2.py
tests/spec_decode/e2e/test_integration_dist_tp2.py
+86
-109
tests/spec_decode/e2e/test_integration_dist_tp4.py
tests/spec_decode/e2e/test_integration_dist_tp4.py
+60
-64
tests/spec_decode/e2e/test_mtp_correctness.py
tests/spec_decode/e2e/test_mtp_correctness.py
+4
-2
No files found.
tests/spec_decode/e2e/test_integration.py
View file @
5c77fabd
...
...
@@ -9,6 +9,9 @@ import os
from
.conftest
import
run_equality_correctness_test
from
...utils
import
models_path_prefix
os
.
environ
[
"LLAMA_NN"
]
=
"0"
MAIN_MODEL
=
os
.
path
.
join
(
models_path_prefix
,
"JackFram/llama-68m"
)
...
...
tests/spec_decode/e2e/test_integration_dist_tp2.py
View file @
5c77fabd
...
...
@@ -13,9 +13,12 @@ import os
from
vllm.platforms
import
current_platform
from
.conftest
import
run_equality_correctness_test_tp
from
.conftest
import
run_equality_correctness_test
from
...utils
import
models_path_prefix
os
.
environ
[
"LLAMA_NN"
]
=
"0"
@
pytest
.
mark
.
skipif
(
torch
.
cuda
.
device_count
()
<
2
,
reason
=
"Need at least 2 GPUs to run the test."
)
@
pytest
.
mark
.
parametrize
(
...
...
@@ -75,53 +78,42 @@ def test_target_model_tp_gt_1(common_llm_kwargs, per_test_common_llm_kwargs,
reason
=
"Need at least 2 GPUs to run the test."
)
@
pytest
.
mark
.
parametrize
(
"common_llm_kwargs"
,
[
[
[
{
# Skip cuda graph recording for fast test.
"--enforce-eager"
,
"--tensor_parallel_size"
,
"2"
,
"enforce_eager"
:
True
,
# precision
"--dtype"
,
"bfloat16"
,
]])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[[]])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[[]])
@
pytest
.
mark
.
parametrize
(
"model, test_llm_kwargs"
,
[(
os
.
path
.
join
(
models_path_prefix
,
"JackFram/llama-68m"
),
[
"--speculative_config"
,
json
.
dumps
({
# Print spec metrics.
"tensor_parallel_size"
:
2
,
# Precision
"dtype"
:
"bfloat16"
,
# Main model
"model_name"
:
os
.
path
.
join
(
models_path_prefix
,
"JackFram/llama-68m"
),
}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"test_llm_kwargs"
,
[
{
"speculative_config"
:
{
"model"
:
os
.
path
.
join
(
models_path_prefix
,
"JackFram/llama-68m"
),
"num_speculative_tokens"
:
5
,
"draft_tensor_parallel_size"
:
1
,
}),
]),
(
os
.
path
.
join
(
models_path_prefix
,
"ibm-granite/granite-3b-code-instruct"
),
[
"--speculative_config"
,
json
.
dumps
({
"model"
:
os
.
path
.
join
(
models_path_prefix
,
"ibm-granite/granite-3b-code-instruct"
),
"num_speculative_tokens"
:
5
,
"draft_tensor_parallel_size"
:
1
,
}),
])])
},
}])
@
pytest
.
mark
.
parametrize
(
"batch_size"
,
[
2
])
@
pytest
.
mark
.
parametrize
(
"seed"
,
[
1
])
def
test_draft_model_tp_lt_target_model_tp2
(
model
,
common_llm_kwargs
,
def
test_draft_model_tp_lt_target_model_tp2
(
vllm_runner
,
common_llm_kwargs
,
per_test_common_llm_kwargs
,
baseline_llm_kwargs
,
test_llm_kwargs
,
batch_size
:
int
,
seed
:
int
):
"""Verify spec decode works well with smaller tp for draft models.
"""
run_equality_correctness_test_tp
(
model
,
common_llm_kwargs
,
per_test_common_llm_kwargs
,
baseline_llm_kwargs
,
test_llm_kwargs
,
batch_size
,
max_output_len
=
32
,
seed
=
seed
,
run_equality_correctness_test
(
vllm_runner
,
common_llm_kwargs
,
per_test_common_llm_kwargs
,
baseline_llm_kwargs
,
test_llm_kwargs
,
batch_size
,
max_output_len
=
32
,
seed
=
seed
,
temperature
=
0.0
)
...
...
@@ -129,44 +121,40 @@ def test_draft_model_tp_lt_target_model_tp2(model, common_llm_kwargs,
reason
=
"Need at least 2 GPUs to run the test."
)
@
pytest
.
mark
.
parametrize
(
"common_llm_kwargs"
,
[
[
[
{
# Skip cuda graph recording for fast test.
"--enforce-eager"
,
"--tensor_parallel_size"
,
"2"
,
"enforce_eager"
:
True
,
# precision
"--dtype"
,
"bfloat16"
,
]])
# Print spec metrics.
"tensor_parallel_size"
:
2
,
# Precision
"dtype"
:
"bfloat16"
,
# Main model
"model_name"
:
os
.
path
.
join
(
models_path_prefix
,
"JackFram/llama-68m"
),
}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[[
"--enable-chunked-prefill"
,
"False"
],
[
"--enable-chunked-prefill"
,
"True"
,
"--max-num-batched-tokens"
,
"4"
,
"--max-num-seqs"
,
"4"
]])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[[]])
@
pytest
.
mark
.
parametrize
(
"model, test_llm_kwargs"
,
[(
"JackFram/llama-68m"
,
[
"--speculative_config"
,
json
.
dumps
({
"model"
:
"JackFram/llama-68m"
,
"num_speculative_tokens"
:
3
,
}),
]),
(
"JackFram/llama-68m"
,
[
"--speculative_config"
,
json
.
dumps
({
"model"
:
"JackFram/llama-68m"
,
"num_speculative_tokens"
:
3
,
"draft_tensor_parallel_size"
:
1
,
}),
])])
[{
"enable_chunked_prefill"
:
False
,
"max_num_batched_tokens"
:
32
,
"max_model_len"
:
32
,
"max_num_seqs"
:
4
}])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"test_llm_kwargs"
,
[
{
"speculative_config"
:
{
"model"
:
os
.
path
.
join
(
models_path_prefix
,
"JackFram/llama-68m"
),
"num_speculative_tokens"
:
5
,
"draft_tensor_parallel_size"
:
1
,
},
}])
@
pytest
.
mark
.
parametrize
(
"logprobs"
,
[
None
])
@
pytest
.
mark
.
parametrize
(
"batch_size"
,
[
2
])
@
pytest
.
mark
.
parametrize
(
"seed"
,
[
1
])
def
test_spec_decode_chunked_prefill_tp2
(
model
,
common_llm_kwargs
,
def
test_spec_decode_chunked_prefill_tp2
(
vllm_runner
,
common_llm_kwargs
,
per_test_common_llm_kwargs
,
baseline_llm_kwargs
,
test_llm_kwargs
,
logprobs
:
Optional
[
int
],
...
...
@@ -174,69 +162,58 @@ def test_spec_decode_chunked_prefill_tp2(model, common_llm_kwargs,
"""Verify spec decode works well with same and different TP size for
the draft model with chunked prefill.
"""
run_equality_correctness_test_tp
(
model
,
common_llm_kwargs
,
per_test_common_llm_kwargs
,
baseline_llm_kwargs
,
test_llm_kwargs
,
batch_size
,
max_output_len
=
32
,
seed
=
seed
,
temperature
=
0.0
,
logprobs
=
logprobs
)
run_equality_correctness_test
(
vllm_runner
,
common_llm_kwargs
,
per_test_common_llm_kwargs
,
baseline_llm_kwargs
,
test_llm_kwargs
,
batch_size
,
max_output_len
=
32
,
seed
=
seed
,
temperature
=
0.0
)
@
pytest
.
mark
.
skipif
(
torch
.
cuda
.
device_count
()
<
2
,
reason
=
"Need at least 2 GPUs to run the test."
)
@
pytest
.
mark
.
parametrize
(
"common_llm_kwargs"
,
[
[
[
{
# Skip cuda graph recording for fast test.
"--enforce-eager"
,
"--tensor_parallel_size"
,
"2"
,
"enforce_eager"
:
True
,
# precision
"--dtype"
,
"bfloat16"
,
]])
# Print spec metrics.
"tensor_parallel_size"
:
2
,
# Precision
"dtype"
:
"bfloat16"
,
# Main model
"model_name"
:
os
.
path
.
join
(
models_path_prefix
,
"JackFram/llama-68m"
),
}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[[
"--enable-chunked-prefill"
,
"False"
],
[
"--enable-chunked-prefill"
,
"True"
,
"--max-num-batched-tokens"
,
"4"
,
"--max-num-seqs"
,
"4"
]])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[[]])
@
pytest
.
mark
.
parametrize
(
"model, test_llm_kwargs"
,
[(
"JackFram/llama-68m"
,
[
"--speculative_config"
,
json
.
dumps
({
"model"
:
"JackFram/llama-68m"
,
"num_speculative_tokens"
:
3
,
"disable_logprobs"
:
False
,
}),
]),
(
"JackFram/llama-68m"
,
[
"--speculative_config"
,
json
.
dumps
({
"model"
:
"JackFram/llama-68m"
,
"num_speculative_tokens"
:
3
,
"draft_tensor_parallel_size"
:
1
,
"disable_logprobs"
:
False
,
}),
])])
[{
"enable_chunked_prefill"
:
False
,
"max_num_batched_tokens"
:
32
,
"max_model_len"
:
32
,
"max_num_seqs"
:
4
}])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"test_llm_kwargs"
,
[
{
"speculative_config"
:
{
"model"
:
os
.
path
.
join
(
models_path_prefix
,
"JackFram/llama-68m"
),
"num_speculative_tokens"
:
5
,
"draft_tensor_parallel_size"
:
1
,
},
}])
@
pytest
.
mark
.
parametrize
(
"logprobs"
,
[
2
])
@
pytest
.
mark
.
parametrize
(
"batch_size"
,
[
2
])
@
pytest
.
mark
.
parametrize
(
"seed"
,
[
1
])
def
test_spec_decode_chunked_prefill_tp2_with_logprobs
(
model
,
common_llm_kwargs
,
per_test_common_llm_kwargs
,
vllm_runner
,
common_llm_kwargs
,
per_test_common_llm_kwargs
,
baseline_llm_kwargs
,
test_llm_kwargs
,
logprobs
:
Optional
[
int
],
batch_size
:
int
,
seed
:
int
):
"""Verify spec decode works well with same and different TP size for
the draft model with chunked prefill.
"""
run_equality_correctness_test
_tp
(
model
,
run_equality_correctness_test
(
vllm_runner
,
common_llm_kwargs
,
per_test_common_llm_kwargs
,
baseline_llm_kwargs
,
...
...
tests/spec_decode/e2e/test_integration_dist_tp4.py
View file @
5c77fabd
...
...
@@ -11,8 +11,11 @@ import torch
import
os
from
.conftest
import
run_equality_correctness_test_tp
from
.conftest
import
run_equality_correctness_test
from
...utils
import
models_path_prefix
os
.
environ
[
"LLAMA_NN"
]
=
"0"
MAIN_MODEL
=
os
.
path
.
join
(
models_path_prefix
,
"JackFram/llama-68m"
)
SPEC_MODEL
=
os
.
path
.
join
(
models_path_prefix
,
"JackFram/llama-68m"
)
...
...
@@ -21,46 +24,44 @@ SPEC_MODEL = os.path.join(models_path_prefix, "JackFram/llama-68m")
reason
=
"Need at least 4 GPUs to run the test."
)
@
pytest
.
mark
.
parametrize
(
"common_llm_kwargs"
,
[
[
[
{
# Skip cuda graph recording for fast test.
"--enforce_eager"
,
"--tensor-parallel-size"
,
"4"
,
]])
"enforce_eager"
:
True
,
# Print spec metrics.
"tensor_parallel_size"
:
4
,
# Precision
"dtype"
:
"bfloat16"
,
# Main model
"model_name"
:
MAIN_MODEL
,
}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[
[]
,
{}
,
])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[[]])
@
pytest
.
mark
.
parametrize
(
"test_llm_kwargs"
,
[
#TODO(wooyeon): add spec_draft_dp=2 case
[
"--speculative_config"
,
json
.
dumps
({
"model"
:
f
"
{
SPEC_MODEL
}
"
,
"num_speculative_tokens"
:
5
,
"draft_tensor_parallel_size"
:
1
,
}),
],
])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"test_llm_kwargs"
,
[
{
"speculative_config"
:
{
"model"
:
SPEC_MODEL
,
"num_speculative_tokens"
:
5
,
"draft_tensor_parallel_size"
:
1
,
},
}])
@
pytest
.
mark
.
parametrize
(
"batch_size"
,
[
2
])
@
pytest
.
mark
.
parametrize
(
"seed"
,
[
1
])
def
test_draft_model_tp_lt_target_model_tp4
(
common_llm_kwargs
,
def
test_draft_model_tp_lt_target_model_tp4
(
vllm_runner
,
common_llm_kwargs
,
per_test_common_llm_kwargs
,
baseline_llm_kwargs
,
test_llm_kwargs
,
batch_size
:
int
,
seed
:
int
):
"""Verify spec decode works well with smaller tp for draft models.
"""
run_equality_correctness_test_tp
(
MAIN_MODEL
,
common_llm_kwargs
,
per_test_common_llm_kwargs
,
baseline_llm_kwargs
,
test_llm_kwargs
,
batch_size
,
max_output_len
=
32
,
seed
=
seed
,
run_equality_correctness_test
(
vllm_runner
,
common_llm_kwargs
,
per_test_common_llm_kwargs
,
baseline_llm_kwargs
,
test_llm_kwargs
,
batch_size
,
max_output_len
=
32
,
seed
=
seed
,
temperature
=
0.0
)
...
...
@@ -68,30 +69,30 @@ def test_draft_model_tp_lt_target_model_tp4(common_llm_kwargs,
reason
=
"Need at least 4 GPUs to run the test."
)
@
pytest
.
mark
.
parametrize
(
"common_llm_kwargs"
,
[[
[{
# Skip cuda graph recording for fast test.
"--enforce-eager"
,
"--tensor-parallel-size"
,
"4"
,
]])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[[]])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[[]])
@
pytest
.
mark
.
parametrize
(
"test_llm_kwargs"
,
[
[
# Artificially limit the draft model max model len; this forces vLLM
# to skip speculation once the sequences grow beyond 32-k tokens.
"--speculative_config"
,
json
.
dumps
({
"model"
:
f
"
{
SPEC_MODEL
}
"
,
"num_speculative_tokens"
:
5
,
"max_model_len"
:
32
,
}),
],
])
@
pytest
.
mark
.
parametrize
(
"batch_size"
,
[
8
])
"enforce_eager"
:
True
,
# Print spec metrics.
"tensor_parallel_size"
:
4
,
# Precision
"dtype"
:
"bfloat16"
,
# Main model
"model_name"
:
MAIN_MODEL
,
}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"test_llm_kwargs"
,
[
{
"speculative_config"
:
{
"model"
:
SPEC_MODEL
,
"num_speculative_tokens"
:
5
,
"max_model_len"
:
32
,
},
}])
@
pytest
.
mark
.
parametrize
(
"batch_size"
,
[
2
])
@
pytest
.
mark
.
parametrize
(
"output_len"
,
[
...
...
@@ -101,7 +102,7 @@ def test_draft_model_tp_lt_target_model_tp4(common_llm_kwargs,
64
,
])
@
pytest
.
mark
.
parametrize
(
"seed"
,
[
1
])
def
test_skip_speculation
(
common_llm_kwargs
,
per_test_common_llm_kwargs
,
def
test_skip_speculation
(
vllm_runner
,
common_llm_kwargs
,
per_test_common_llm_kwargs
,
baseline_llm_kwargs
,
test_llm_kwargs
,
batch_size
:
int
,
output_len
:
int
,
seed
:
int
):
"""Verify job failure with RuntimeError when all sequences skip speculation.
...
...
@@ -111,14 +112,9 @@ def test_skip_speculation(common_llm_kwargs, per_test_common_llm_kwargs,
TODO: fix it to pass without raising Error. (#5814)
"""
with
pytest
.
raises
(
(
openai
.
APIConnectionError
,
openai
.
InternalServerError
)):
run_equality_correctness_test_tp
(
MAIN_MODEL
,
common_llm_kwargs
,
per_test_common_llm_kwargs
,
baseline_llm_kwargs
,
test_llm_kwargs
,
batch_size
,
output_len
,
seed
,
temperature
=
0.0
)
with
pytest
.
raises
(
RuntimeError
):
run_equality_correctness_test
(
vllm_runner
,
common_llm_kwargs
,
per_test_common_llm_kwargs
,
baseline_llm_kwargs
,
test_llm_kwargs
,
batch_size
,
max_output_len
=
output_len
,
seed
=
seed
,
temperature
=
0.0
)
\ No newline at end of file
tests/spec_decode/e2e/test_mtp_correctness.py
View file @
5c77fabd
...
...
@@ -26,8 +26,10 @@ import pytest
from
.conftest
import
run_equality_correctness_test
from
...utils
import
models_path_prefix
os
.
environ
[
"VLLM_MLA_DISABLE"
]
=
"1"
# main model
MAIN_MODEL
=
os
.
path
.
join
(
models_path_prefix
,
"luccafong/deepseek_mtp_main_random"
)
MAIN_MODEL
=
os
.
path
.
join
(
models_path_prefix
,
"luccafong/deepseek_mtp_main_random
_bf16
"
)
# max. number of speculative tokens: this corresponds to
# num_nextn_predict_layers in the config.json of the speculator model.
...
...
@@ -188,7 +190,7 @@ def test_mtp_e2e_greedy_correctness_cuda_graph(vllm_runner, common_llm_kwargs,
@
pytest
.
mark
.
parametrize
(
"common_llm_kwargs"
,
[{
"block_size"
:
8
,
"block_size"
:
16
,
# 2 for small prompt, 256//8 for generated.
"num_gpu_blocks_override"
:
2
+
256
//
8
,
"max_model_len"
:
(
2
+
256
//
8
)
*
8
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment