Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
5c77fabd
Commit
5c77fabd
authored
Jun 13, 2025
by
王敏
Browse files
[fix]修复并行解码integration、mtp相关单测问题
parent
acfa43b8
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
153 additions
and
175 deletions
+153
-175
tests/spec_decode/e2e/test_integration.py
tests/spec_decode/e2e/test_integration.py
+3
-0
tests/spec_decode/e2e/test_integration_dist_tp2.py
tests/spec_decode/e2e/test_integration_dist_tp2.py
+86
-109
tests/spec_decode/e2e/test_integration_dist_tp4.py
tests/spec_decode/e2e/test_integration_dist_tp4.py
+60
-64
tests/spec_decode/e2e/test_mtp_correctness.py
tests/spec_decode/e2e/test_mtp_correctness.py
+4
-2
No files found.
tests/spec_decode/e2e/test_integration.py
View file @
5c77fabd
...
@@ -9,6 +9,9 @@ import os
...
@@ -9,6 +9,9 @@ import os
from
.conftest
import
run_equality_correctness_test
from
.conftest
import
run_equality_correctness_test
from
...utils
import
models_path_prefix
from
...utils
import
models_path_prefix
os
.
environ
[
"LLAMA_NN"
]
=
"0"
MAIN_MODEL
=
os
.
path
.
join
(
models_path_prefix
,
"JackFram/llama-68m"
)
MAIN_MODEL
=
os
.
path
.
join
(
models_path_prefix
,
"JackFram/llama-68m"
)
...
...
tests/spec_decode/e2e/test_integration_dist_tp2.py
View file @
5c77fabd
...
@@ -13,9 +13,12 @@ import os
...
@@ -13,9 +13,12 @@ import os
from
vllm.platforms
import
current_platform
from
vllm.platforms
import
current_platform
from
.conftest
import
run_equality_correctness_test_tp
from
.conftest
import
run_equality_correctness_test_tp
from
.conftest
import
run_equality_correctness_test
from
...utils
import
models_path_prefix
from
...utils
import
models_path_prefix
os
.
environ
[
"LLAMA_NN"
]
=
"0"
@
pytest
.
mark
.
skipif
(
torch
.
cuda
.
device_count
()
<
2
,
@
pytest
.
mark
.
skipif
(
torch
.
cuda
.
device_count
()
<
2
,
reason
=
"Need at least 2 GPUs to run the test."
)
reason
=
"Need at least 2 GPUs to run the test."
)
@
pytest
.
mark
.
parametrize
(
@
pytest
.
mark
.
parametrize
(
...
@@ -75,53 +78,42 @@ def test_target_model_tp_gt_1(common_llm_kwargs, per_test_common_llm_kwargs,
...
@@ -75,53 +78,42 @@ def test_target_model_tp_gt_1(common_llm_kwargs, per_test_common_llm_kwargs,
reason
=
"Need at least 2 GPUs to run the test."
)
reason
=
"Need at least 2 GPUs to run the test."
)
@
pytest
.
mark
.
parametrize
(
@
pytest
.
mark
.
parametrize
(
"common_llm_kwargs"
,
"common_llm_kwargs"
,
[
[
[
{
# Skip cuda graph recording for fast test.
# Skip cuda graph recording for fast test.
"--enforce-eager"
,
"enforce_eager"
:
True
,
"--tensor_parallel_size"
,
"2"
,
# precision
# Print spec metrics.
"--dtype"
,
"tensor_parallel_size"
:
2
,
"bfloat16"
,
]])
# Precision
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[[]])
"dtype"
:
"bfloat16"
,
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[[]])
@
pytest
.
mark
.
parametrize
(
# Main model
"model, test_llm_kwargs"
,
"model_name"
:
os
.
path
.
join
(
models_path_prefix
,
"JackFram/llama-68m"
),
[(
os
.
path
.
join
(
models_path_prefix
,
"JackFram/llama-68m"
),
[
}])
"--speculative_config"
,
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
json
.
dumps
({
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"test_llm_kwargs"
,
[
{
"speculative_config"
:
{
"model"
:
os
.
path
.
join
(
models_path_prefix
,
"JackFram/llama-68m"
),
"model"
:
os
.
path
.
join
(
models_path_prefix
,
"JackFram/llama-68m"
),
"num_speculative_tokens"
:
5
,
"num_speculative_tokens"
:
5
,
"draft_tensor_parallel_size"
:
1
,
"draft_tensor_parallel_size"
:
1
,
}),
},
]),
}])
(
os
.
path
.
join
(
models_path_prefix
,
"ibm-granite/granite-3b-code-instruct"
),
[
"--speculative_config"
,
json
.
dumps
({
"model"
:
os
.
path
.
join
(
models_path_prefix
,
"ibm-granite/granite-3b-code-instruct"
),
"num_speculative_tokens"
:
5
,
"draft_tensor_parallel_size"
:
1
,
}),
])])
@
pytest
.
mark
.
parametrize
(
"batch_size"
,
[
2
])
@
pytest
.
mark
.
parametrize
(
"batch_size"
,
[
2
])
@
pytest
.
mark
.
parametrize
(
"seed"
,
[
1
])
@
pytest
.
mark
.
parametrize
(
"seed"
,
[
1
])
def
test_draft_model_tp_lt_target_model_tp2
(
model
,
common_llm_kwargs
,
def
test_draft_model_tp_lt_target_model_tp2
(
vllm_runner
,
common_llm_kwargs
,
per_test_common_llm_kwargs
,
per_test_common_llm_kwargs
,
baseline_llm_kwargs
,
baseline_llm_kwargs
,
test_llm_kwargs
,
batch_size
:
int
,
test_llm_kwargs
,
batch_size
:
int
,
seed
:
int
):
seed
:
int
):
"""Verify spec decode works well with smaller tp for draft models.
"""Verify spec decode works well with smaller tp for draft models.
"""
"""
run_equality_correctness_test_tp
(
model
,
run_equality_correctness_test
(
vllm_runner
,
common_llm_kwargs
,
common_llm_kwargs
,
per_test_common_llm_kwargs
,
per_test_common_llm_kwargs
,
baseline_llm_kwargs
,
test_llm_kwargs
,
baseline_llm_kwargs
,
batch_size
,
max_output_len
=
32
,
seed
=
seed
,
test_llm_kwargs
,
batch_size
,
max_output_len
=
32
,
seed
=
seed
,
temperature
=
0.0
)
temperature
=
0.0
)
...
@@ -129,44 +121,40 @@ def test_draft_model_tp_lt_target_model_tp2(model, common_llm_kwargs,
...
@@ -129,44 +121,40 @@ def test_draft_model_tp_lt_target_model_tp2(model, common_llm_kwargs,
reason
=
"Need at least 2 GPUs to run the test."
)
reason
=
"Need at least 2 GPUs to run the test."
)
@
pytest
.
mark
.
parametrize
(
@
pytest
.
mark
.
parametrize
(
"common_llm_kwargs"
,
"common_llm_kwargs"
,
[
[
[
{
# Skip cuda graph recording for fast test.
# Skip cuda graph recording for fast test.
"--enforce-eager"
,
"enforce_eager"
:
True
,
"--tensor_parallel_size"
,
"2"
,
# precision
# Print spec metrics.
"--dtype"
,
"tensor_parallel_size"
:
2
,
"bfloat16"
,
]])
# Precision
"dtype"
:
"bfloat16"
,
# Main model
"model_name"
:
os
.
path
.
join
(
models_path_prefix
,
"JackFram/llama-68m"
),
}])
@
pytest
.
mark
.
parametrize
(
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
"per_test_common_llm_kwargs"
,
[[
"--enable-chunked-prefill"
,
"False"
],
[{
[
"enable_chunked_prefill"
:
False
,
"--enable-chunked-prefill"
,
"True"
,
"--max-num-batched-tokens"
,
"4"
,
"max_num_batched_tokens"
:
32
,
"--max-num-seqs"
,
"4"
"max_model_len"
:
32
,
]])
"max_num_seqs"
:
4
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[[]])
}])
@
pytest
.
mark
.
parametrize
(
"model, test_llm_kwargs"
,
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{}])
[(
"JackFram/llama-68m"
,
[
@
pytest
.
mark
.
parametrize
(
"test_llm_kwargs"
,
[
"--speculative_config"
,
{
json
.
dumps
({
"speculative_config"
:
{
"model"
:
"JackFram/llama-68m"
,
"model"
:
os
.
path
.
join
(
models_path_prefix
,
"JackFram/llama-68m"
),
"num_speculative_tokens"
:
3
,
"num_speculative_tokens"
:
5
,
}),
"draft_tensor_parallel_size"
:
1
,
]),
},
(
"JackFram/llama-68m"
,
[
}])
"--speculative_config"
,
json
.
dumps
({
"model"
:
"JackFram/llama-68m"
,
"num_speculative_tokens"
:
3
,
"draft_tensor_parallel_size"
:
1
,
}),
])])
@
pytest
.
mark
.
parametrize
(
"logprobs"
,
[
None
])
@
pytest
.
mark
.
parametrize
(
"logprobs"
,
[
None
])
@
pytest
.
mark
.
parametrize
(
"batch_size"
,
[
2
])
@
pytest
.
mark
.
parametrize
(
"batch_size"
,
[
2
])
@
pytest
.
mark
.
parametrize
(
"seed"
,
[
1
])
@
pytest
.
mark
.
parametrize
(
"seed"
,
[
1
])
def
test_spec_decode_chunked_prefill_tp2
(
model
,
common_llm_kwargs
,
def
test_spec_decode_chunked_prefill_tp2
(
vllm_runner
,
common_llm_kwargs
,
per_test_common_llm_kwargs
,
per_test_common_llm_kwargs
,
baseline_llm_kwargs
,
test_llm_kwargs
,
baseline_llm_kwargs
,
test_llm_kwargs
,
logprobs
:
Optional
[
int
],
logprobs
:
Optional
[
int
],
...
@@ -174,69 +162,58 @@ def test_spec_decode_chunked_prefill_tp2(model, common_llm_kwargs,
...
@@ -174,69 +162,58 @@ def test_spec_decode_chunked_prefill_tp2(model, common_llm_kwargs,
"""Verify spec decode works well with same and different TP size for
"""Verify spec decode works well with same and different TP size for
the draft model with chunked prefill.
the draft model with chunked prefill.
"""
"""
run_equality_correctness_test_tp
(
model
,
run_equality_correctness_test
(
vllm_runner
,
common_llm_kwargs
,
common_llm_kwargs
,
per_test_common_llm_kwargs
,
per_test_common_llm_kwargs
,
baseline_llm_kwargs
,
test_llm_kwargs
,
baseline_llm_kwargs
,
batch_size
,
max_output_len
=
32
,
seed
=
seed
,
test_llm_kwargs
,
temperature
=
0.0
)
batch_size
,
max_output_len
=
32
,
seed
=
seed
,
temperature
=
0.0
,
logprobs
=
logprobs
)
@
pytest
.
mark
.
skipif
(
torch
.
cuda
.
device_count
()
<
2
,
@
pytest
.
mark
.
skipif
(
torch
.
cuda
.
device_count
()
<
2
,
reason
=
"Need at least 2 GPUs to run the test."
)
reason
=
"Need at least 2 GPUs to run the test."
)
@
pytest
.
mark
.
parametrize
(
@
pytest
.
mark
.
parametrize
(
"common_llm_kwargs"
,
"common_llm_kwargs"
,
[
[
[
{
# Skip cuda graph recording for fast test.
# Skip cuda graph recording for fast test.
"--enforce-eager"
,
"enforce_eager"
:
True
,
"--tensor_parallel_size"
,
"2"
,
# precision
# Print spec metrics.
"--dtype"
,
"tensor_parallel_size"
:
2
,
"bfloat16"
,
]])
# Precision
"dtype"
:
"bfloat16"
,
# Main model
"model_name"
:
os
.
path
.
join
(
models_path_prefix
,
"JackFram/llama-68m"
),
}])
@
pytest
.
mark
.
parametrize
(
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
"per_test_common_llm_kwargs"
,
[[
"--enable-chunked-prefill"
,
"False"
],
[{
[
"enable_chunked_prefill"
:
False
,
"--enable-chunked-prefill"
,
"True"
,
"--max-num-batched-tokens"
,
"4"
,
"max_num_batched_tokens"
:
32
,
"--max-num-seqs"
,
"4"
"max_model_len"
:
32
,
]])
"max_num_seqs"
:
4
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[[]])
}])
@
pytest
.
mark
.
parametrize
(
"model, test_llm_kwargs"
,
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{}])
[(
"JackFram/llama-68m"
,
[
@
pytest
.
mark
.
parametrize
(
"test_llm_kwargs"
,
[
"--speculative_config"
,
{
json
.
dumps
({
"speculative_config"
:
{
"model"
:
"JackFram/llama-68m"
,
"model"
:
os
.
path
.
join
(
models_path_prefix
,
"JackFram/llama-68m"
),
"num_speculative_tokens"
:
3
,
"num_speculative_tokens"
:
5
,
"disable_logprobs"
:
False
,
"draft_tensor_parallel_size"
:
1
,
}),
},
]),
}])
(
"JackFram/llama-68m"
,
[
"--speculative_config"
,
json
.
dumps
({
"model"
:
"JackFram/llama-68m"
,
"num_speculative_tokens"
:
3
,
"draft_tensor_parallel_size"
:
1
,
"disable_logprobs"
:
False
,
}),
])])
@
pytest
.
mark
.
parametrize
(
"logprobs"
,
[
2
])
@
pytest
.
mark
.
parametrize
(
"logprobs"
,
[
2
])
@
pytest
.
mark
.
parametrize
(
"batch_size"
,
[
2
])
@
pytest
.
mark
.
parametrize
(
"batch_size"
,
[
2
])
@
pytest
.
mark
.
parametrize
(
"seed"
,
[
1
])
@
pytest
.
mark
.
parametrize
(
"seed"
,
[
1
])
def
test_spec_decode_chunked_prefill_tp2_with_logprobs
(
def
test_spec_decode_chunked_prefill_tp2_with_logprobs
(
model
,
common_llm_kwargs
,
per_test_common_llm_kwargs
,
vllm_runner
,
common_llm_kwargs
,
per_test_common_llm_kwargs
,
baseline_llm_kwargs
,
test_llm_kwargs
,
logprobs
:
Optional
[
int
],
baseline_llm_kwargs
,
test_llm_kwargs
,
logprobs
:
Optional
[
int
],
batch_size
:
int
,
seed
:
int
):
batch_size
:
int
,
seed
:
int
):
"""Verify spec decode works well with same and different TP size for
"""Verify spec decode works well with same and different TP size for
the draft model with chunked prefill.
the draft model with chunked prefill.
"""
"""
run_equality_correctness_test
_tp
(
model
,
run_equality_correctness_test
(
vllm_runner
,
common_llm_kwargs
,
common_llm_kwargs
,
per_test_common_llm_kwargs
,
per_test_common_llm_kwargs
,
baseline_llm_kwargs
,
baseline_llm_kwargs
,
...
...
tests/spec_decode/e2e/test_integration_dist_tp4.py
View file @
5c77fabd
...
@@ -11,8 +11,11 @@ import torch
...
@@ -11,8 +11,11 @@ import torch
import
os
import
os
from
.conftest
import
run_equality_correctness_test_tp
from
.conftest
import
run_equality_correctness_test_tp
from
.conftest
import
run_equality_correctness_test
from
...utils
import
models_path_prefix
from
...utils
import
models_path_prefix
os
.
environ
[
"LLAMA_NN"
]
=
"0"
MAIN_MODEL
=
os
.
path
.
join
(
models_path_prefix
,
"JackFram/llama-68m"
)
MAIN_MODEL
=
os
.
path
.
join
(
models_path_prefix
,
"JackFram/llama-68m"
)
SPEC_MODEL
=
os
.
path
.
join
(
models_path_prefix
,
"JackFram/llama-68m"
)
SPEC_MODEL
=
os
.
path
.
join
(
models_path_prefix
,
"JackFram/llama-68m"
)
...
@@ -21,46 +24,44 @@ SPEC_MODEL = os.path.join(models_path_prefix, "JackFram/llama-68m")
...
@@ -21,46 +24,44 @@ SPEC_MODEL = os.path.join(models_path_prefix, "JackFram/llama-68m")
reason
=
"Need at least 4 GPUs to run the test."
)
reason
=
"Need at least 4 GPUs to run the test."
)
@
pytest
.
mark
.
parametrize
(
@
pytest
.
mark
.
parametrize
(
"common_llm_kwargs"
,
"common_llm_kwargs"
,
[
[
[
{
# Skip cuda graph recording for fast test.
# Skip cuda graph recording for fast test.
"--enforce_eager"
,
"enforce_eager"
:
True
,
"--tensor-parallel-size"
,
"4"
,
# Print spec metrics.
]])
"tensor_parallel_size"
:
4
,
# Precision
"dtype"
:
"bfloat16"
,
# Main model
"model_name"
:
MAIN_MODEL
,
}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[
[]
,
{}
,
])
])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[[]])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
@
pytest
.
mark
.
parametrize
(
"test_llm_kwargs"
,
[
"test_llm_kwargs"
,
{
[
"speculative_config"
:
{
#TODO(wooyeon): add spec_draft_dp=2 case
"model"
:
SPEC_MODEL
,
[
"num_speculative_tokens"
:
5
,
"--speculative_config"
,
"draft_tensor_parallel_size"
:
1
,
json
.
dumps
({
},
"model"
:
f
"
{
SPEC_MODEL
}
"
,
}])
"num_speculative_tokens"
:
5
,
"draft_tensor_parallel_size"
:
1
,
}),
],
])
@
pytest
.
mark
.
parametrize
(
"batch_size"
,
[
2
])
@
pytest
.
mark
.
parametrize
(
"batch_size"
,
[
2
])
@
pytest
.
mark
.
parametrize
(
"seed"
,
[
1
])
@
pytest
.
mark
.
parametrize
(
"seed"
,
[
1
])
def
test_draft_model_tp_lt_target_model_tp4
(
common_llm_kwargs
,
def
test_draft_model_tp_lt_target_model_tp4
(
vllm_runner
,
common_llm_kwargs
,
per_test_common_llm_kwargs
,
per_test_common_llm_kwargs
,
baseline_llm_kwargs
,
baseline_llm_kwargs
,
test_llm_kwargs
,
batch_size
:
int
,
test_llm_kwargs
,
batch_size
:
int
,
seed
:
int
):
seed
:
int
):
"""Verify spec decode works well with smaller tp for draft models.
"""Verify spec decode works well with smaller tp for draft models.
"""
"""
run_equality_correctness_test_tp
(
MAIN_MODEL
,
run_equality_correctness_test
(
vllm_runner
,
common_llm_kwargs
,
common_llm_kwargs
,
per_test_common_llm_kwargs
,
per_test_common_llm_kwargs
,
baseline_llm_kwargs
,
test_llm_kwargs
,
baseline_llm_kwargs
,
batch_size
,
max_output_len
=
32
,
seed
=
seed
,
test_llm_kwargs
,
batch_size
,
max_output_len
=
32
,
seed
=
seed
,
temperature
=
0.0
)
temperature
=
0.0
)
...
@@ -68,30 +69,30 @@ def test_draft_model_tp_lt_target_model_tp4(common_llm_kwargs,
...
@@ -68,30 +69,30 @@ def test_draft_model_tp_lt_target_model_tp4(common_llm_kwargs,
reason
=
"Need at least 4 GPUs to run the test."
)
reason
=
"Need at least 4 GPUs to run the test."
)
@
pytest
.
mark
.
parametrize
(
@
pytest
.
mark
.
parametrize
(
"common_llm_kwargs"
,
"common_llm_kwargs"
,
[[
[{
# Skip cuda graph recording for fast test.
# Skip cuda graph recording for fast test.
"--enforce-eager"
,
"enforce_eager"
:
True
,
"--tensor-parallel-size"
,
"4"
,
# Print spec metrics.
]])
"tensor_parallel_size"
:
4
,
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[[]])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[[]])
# Precision
@
pytest
.
mark
.
parametrize
(
"dtype"
:
"bfloat16"
,
"test_llm_kwargs"
,
[
# Main model
[
"model_name"
:
MAIN_MODEL
,
# Artificially limit the draft model max model len; this forces vLLM
}])
# to skip speculation once the sequences grow beyond 32-k tokens.
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
"--speculative_config"
,
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{}])
json
.
dumps
({
@
pytest
.
mark
.
parametrize
(
"test_llm_kwargs"
,
[
"model"
:
f
"
{
SPEC_MODEL
}
"
,
{
"num_speculative_tokens"
:
5
,
"speculative_config"
:
{
"max_model_len"
:
32
,
"model"
:
SPEC_MODEL
,
}),
"num_speculative_tokens"
:
5
,
],
"max_model_len"
:
32
,
])
},
@
pytest
.
mark
.
parametrize
(
"batch_size"
,
[
8
])
}])
@
pytest
.
mark
.
parametrize
(
"batch_size"
,
[
2
])
@
pytest
.
mark
.
parametrize
(
@
pytest
.
mark
.
parametrize
(
"output_len"
,
"output_len"
,
[
[
...
@@ -101,7 +102,7 @@ def test_draft_model_tp_lt_target_model_tp4(common_llm_kwargs,
...
@@ -101,7 +102,7 @@ def test_draft_model_tp_lt_target_model_tp4(common_llm_kwargs,
64
,
64
,
])
])
@
pytest
.
mark
.
parametrize
(
"seed"
,
[
1
])
@
pytest
.
mark
.
parametrize
(
"seed"
,
[
1
])
def
test_skip_speculation
(
common_llm_kwargs
,
per_test_common_llm_kwargs
,
def
test_skip_speculation
(
vllm_runner
,
common_llm_kwargs
,
per_test_common_llm_kwargs
,
baseline_llm_kwargs
,
test_llm_kwargs
,
baseline_llm_kwargs
,
test_llm_kwargs
,
batch_size
:
int
,
output_len
:
int
,
seed
:
int
):
batch_size
:
int
,
output_len
:
int
,
seed
:
int
):
"""Verify job failure with RuntimeError when all sequences skip speculation.
"""Verify job failure with RuntimeError when all sequences skip speculation.
...
@@ -111,14 +112,9 @@ def test_skip_speculation(common_llm_kwargs, per_test_common_llm_kwargs,
...
@@ -111,14 +112,9 @@ def test_skip_speculation(common_llm_kwargs, per_test_common_llm_kwargs,
TODO: fix it to pass without raising Error. (#5814)
TODO: fix it to pass without raising Error. (#5814)
"""
"""
with
pytest
.
raises
(
with
pytest
.
raises
(
RuntimeError
):
(
openai
.
APIConnectionError
,
openai
.
InternalServerError
)):
run_equality_correctness_test
(
vllm_runner
,
common_llm_kwargs
,
run_equality_correctness_test_tp
(
MAIN_MODEL
,
per_test_common_llm_kwargs
,
common_llm_kwargs
,
baseline_llm_kwargs
,
test_llm_kwargs
,
per_test_common_llm_kwargs
,
batch_size
,
max_output_len
=
output_len
,
seed
=
seed
,
baseline_llm_kwargs
,
temperature
=
0.0
)
test_llm_kwargs
,
\ No newline at end of file
batch_size
,
output_len
,
seed
,
temperature
=
0.0
)
tests/spec_decode/e2e/test_mtp_correctness.py
View file @
5c77fabd
...
@@ -26,8 +26,10 @@ import pytest
...
@@ -26,8 +26,10 @@ import pytest
from
.conftest
import
run_equality_correctness_test
from
.conftest
import
run_equality_correctness_test
from
...utils
import
models_path_prefix
from
...utils
import
models_path_prefix
os
.
environ
[
"VLLM_MLA_DISABLE"
]
=
"1"
# main model
# main model
MAIN_MODEL
=
os
.
path
.
join
(
models_path_prefix
,
"luccafong/deepseek_mtp_main_random"
)
MAIN_MODEL
=
os
.
path
.
join
(
models_path_prefix
,
"luccafong/deepseek_mtp_main_random
_bf16
"
)
# max. number of speculative tokens: this corresponds to
# max. number of speculative tokens: this corresponds to
# num_nextn_predict_layers in the config.json of the speculator model.
# num_nextn_predict_layers in the config.json of the speculator model.
...
@@ -188,7 +190,7 @@ def test_mtp_e2e_greedy_correctness_cuda_graph(vllm_runner, common_llm_kwargs,
...
@@ -188,7 +190,7 @@ def test_mtp_e2e_greedy_correctness_cuda_graph(vllm_runner, common_llm_kwargs,
@
pytest
.
mark
.
parametrize
(
@
pytest
.
mark
.
parametrize
(
"common_llm_kwargs"
,
"common_llm_kwargs"
,
[{
[{
"block_size"
:
8
,
"block_size"
:
16
,
# 2 for small prompt, 256//8 for generated.
# 2 for small prompt, 256//8 for generated.
"num_gpu_blocks_override"
:
2
+
256
//
8
,
"num_gpu_blocks_override"
:
2
+
256
//
8
,
"max_model_len"
:
(
2
+
256
//
8
)
*
8
,
"max_model_len"
:
(
2
+
256
//
8
)
*
8
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment