Commit 5c77fabd authored by 王敏
Browse files

[fix]修复并行解码integration、mtp相关单测问题

parent acfa43b8
......@@ -9,6 +9,9 @@ import os
from .conftest import run_equality_correctness_test
from ...utils import models_path_prefix
os.environ["LLAMA_NN"] = "0"
MAIN_MODEL = os.path.join(models_path_prefix, "JackFram/llama-68m")
......
......@@ -13,9 +13,12 @@ import os
from vllm.platforms import current_platform
from .conftest import run_equality_correctness_test_tp
from .conftest import run_equality_correctness_test
from ...utils import models_path_prefix
os.environ["LLAMA_NN"] = "0"
@pytest.mark.skipif(torch.cuda.device_count() < 2,
reason="Need at least 2 GPUs to run the test.")
@pytest.mark.parametrize(
......@@ -75,53 +78,42 @@ def test_target_model_tp_gt_1(common_llm_kwargs, per_test_common_llm_kwargs,
reason="Need at least 2 GPUs to run the test.")
@pytest.mark.parametrize(
"common_llm_kwargs",
[[
[{
# Skip cuda graph recording for fast test.
"--enforce-eager",
"--tensor_parallel_size",
"2",
"enforce_eager": True,
# precision
"--dtype",
"bfloat16",
]])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [[]])
@pytest.mark.parametrize("baseline_llm_kwargs", [[]])
@pytest.mark.parametrize(
"model, test_llm_kwargs",
[(os.path.join(models_path_prefix, "JackFram/llama-68m"), [
"--speculative_config",
json.dumps({
# Tensor parallel size of the target model.
"tensor_parallel_size": 2,
# Precision
"dtype": "bfloat16",
# Main model
"model_name": os.path.join(models_path_prefix, "JackFram/llama-68m"),
}])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [
{
"speculative_config": {
"model": os.path.join(models_path_prefix, "JackFram/llama-68m"),
"num_speculative_tokens": 5,
"draft_tensor_parallel_size": 1,
}),
]),
(os.path.join(models_path_prefix, "ibm-granite/granite-3b-code-instruct"), [
"--speculative_config",
json.dumps({
"model": os.path.join(models_path_prefix, "ibm-granite/granite-3b-code-instruct"),
"num_speculative_tokens": 5,
"draft_tensor_parallel_size": 1,
}),
])])
},
}])
@pytest.mark.parametrize("batch_size", [2])
@pytest.mark.parametrize("seed", [1])
def test_draft_model_tp_lt_target_model_tp2(model, common_llm_kwargs,
def test_draft_model_tp_lt_target_model_tp2(vllm_runner, common_llm_kwargs,
per_test_common_llm_kwargs,
baseline_llm_kwargs,
test_llm_kwargs, batch_size: int,
seed: int):
"""Verify spec decode works well with smaller tp for draft models.
"""
run_equality_correctness_test_tp(model,
common_llm_kwargs,
run_equality_correctness_test(vllm_runner, common_llm_kwargs,
per_test_common_llm_kwargs,
baseline_llm_kwargs,
test_llm_kwargs,
batch_size,
max_output_len=32,
seed=seed,
baseline_llm_kwargs, test_llm_kwargs,
batch_size, max_output_len=32, seed=seed,
temperature=0.0)
......@@ -129,44 +121,40 @@ def test_draft_model_tp_lt_target_model_tp2(model, common_llm_kwargs,
reason="Need at least 2 GPUs to run the test.")
@pytest.mark.parametrize(
"common_llm_kwargs",
[[
[{
# Skip cuda graph recording for fast test.
"--enforce-eager",
"--tensor_parallel_size",
"2",
"enforce_eager": True,
# precision
"--dtype",
"bfloat16",
]])
# Tensor parallel size of the target model.
"tensor_parallel_size": 2,
# Precision
"dtype": "bfloat16",
# Main model
"model_name": os.path.join(models_path_prefix, "JackFram/llama-68m"),
}])
@pytest.mark.parametrize(
"per_test_common_llm_kwargs",
[["--enable-chunked-prefill", "False"],
[
"--enable-chunked-prefill", "True", "--max-num-batched-tokens", "4",
"--max-num-seqs", "4"
]])
@pytest.mark.parametrize("baseline_llm_kwargs", [[]])
@pytest.mark.parametrize("model, test_llm_kwargs",
[("JackFram/llama-68m", [
"--speculative_config",
json.dumps({
"model": "JackFram/llama-68m",
"num_speculative_tokens": 3,
}),
]),
("JackFram/llama-68m", [
"--speculative_config",
json.dumps({
"model": "JackFram/llama-68m",
"num_speculative_tokens": 3,
[{
"enable_chunked_prefill": False,
"max_num_batched_tokens": 32,
"max_model_len": 32,
"max_num_seqs": 4
}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [
{
"speculative_config": {
"model": os.path.join(models_path_prefix, "JackFram/llama-68m"),
"num_speculative_tokens": 5,
"draft_tensor_parallel_size": 1,
}),
])])
},
}])
@pytest.mark.parametrize("logprobs", [None])
@pytest.mark.parametrize("batch_size", [2])
@pytest.mark.parametrize("seed", [1])
def test_spec_decode_chunked_prefill_tp2(model, common_llm_kwargs,
def test_spec_decode_chunked_prefill_tp2(vllm_runner, common_llm_kwargs,
per_test_common_llm_kwargs,
baseline_llm_kwargs, test_llm_kwargs,
logprobs: Optional[int],
......@@ -174,69 +162,58 @@ def test_spec_decode_chunked_prefill_tp2(model, common_llm_kwargs,
"""Verify spec decode works well with same and different TP size for
the draft model with chunked prefill.
"""
run_equality_correctness_test_tp(model,
common_llm_kwargs,
run_equality_correctness_test(vllm_runner, common_llm_kwargs,
per_test_common_llm_kwargs,
baseline_llm_kwargs,
test_llm_kwargs,
batch_size,
max_output_len=32,
seed=seed,
temperature=0.0,
logprobs=logprobs)
baseline_llm_kwargs, test_llm_kwargs,
batch_size, max_output_len=32, seed=seed,
temperature=0.0)
@pytest.mark.skipif(torch.cuda.device_count() < 2,
reason="Need at least 2 GPUs to run the test.")
@pytest.mark.parametrize(
"common_llm_kwargs",
[[
[{
# Skip cuda graph recording for fast test.
"--enforce-eager",
"--tensor_parallel_size",
"2",
"enforce_eager": True,
# precision
"--dtype",
"bfloat16",
]])
# Tensor parallel size of the target model.
"tensor_parallel_size": 2,
# Precision
"dtype": "bfloat16",
# Main model
"model_name": os.path.join(models_path_prefix, "JackFram/llama-68m"),
}])
@pytest.mark.parametrize(
"per_test_common_llm_kwargs",
[["--enable-chunked-prefill", "False"],
[
"--enable-chunked-prefill", "True", "--max-num-batched-tokens", "4",
"--max-num-seqs", "4"
]])
@pytest.mark.parametrize("baseline_llm_kwargs", [[]])
@pytest.mark.parametrize("model, test_llm_kwargs",
[("JackFram/llama-68m", [
"--speculative_config",
json.dumps({
"model": "JackFram/llama-68m",
"num_speculative_tokens": 3,
"disable_logprobs": False,
}),
]),
("JackFram/llama-68m", [
"--speculative_config",
json.dumps({
"model": "JackFram/llama-68m",
"num_speculative_tokens": 3,
[{
"enable_chunked_prefill": False,
"max_num_batched_tokens": 32,
"max_model_len": 32,
"max_num_seqs": 4
}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [
{
"speculative_config": {
"model": os.path.join(models_path_prefix, "JackFram/llama-68m"),
"num_speculative_tokens": 5,
"draft_tensor_parallel_size": 1,
"disable_logprobs": False,
}),
])])
},
}])
@pytest.mark.parametrize("logprobs", [2])
@pytest.mark.parametrize("batch_size", [2])
@pytest.mark.parametrize("seed", [1])
def test_spec_decode_chunked_prefill_tp2_with_logprobs(
model, common_llm_kwargs, per_test_common_llm_kwargs,
vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
baseline_llm_kwargs, test_llm_kwargs, logprobs: Optional[int],
batch_size: int, seed: int):
"""Verify spec decode works well with same and different TP size for
the draft model with chunked prefill.
"""
run_equality_correctness_test_tp(model,
run_equality_correctness_test(vllm_runner,
common_llm_kwargs,
per_test_common_llm_kwargs,
baseline_llm_kwargs,
......
......@@ -11,8 +11,11 @@ import torch
import os
from .conftest import run_equality_correctness_test_tp
from .conftest import run_equality_correctness_test
from ...utils import models_path_prefix
os.environ["LLAMA_NN"] = "0"
MAIN_MODEL = os.path.join(models_path_prefix, "JackFram/llama-68m")
SPEC_MODEL = os.path.join(models_path_prefix, "JackFram/llama-68m")
......@@ -21,46 +24,44 @@ SPEC_MODEL = os.path.join(models_path_prefix, "JackFram/llama-68m")
reason="Need at least 4 GPUs to run the test.")
@pytest.mark.parametrize(
"common_llm_kwargs",
[[
[{
# Skip cuda graph recording for fast test.
"--enforce_eager",
"--tensor-parallel-size",
"4",
]])
"enforce_eager": True,
# Tensor parallel size of the target model.
"tensor_parallel_size": 4,
# Precision
"dtype": "bfloat16",
# Main model
"model_name": MAIN_MODEL,
}])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [
[],
{},
])
@pytest.mark.parametrize("baseline_llm_kwargs", [[]])
@pytest.mark.parametrize(
"test_llm_kwargs",
[
#TODO(wooyeon): add spec_draft_dp=2 case
[
"--speculative_config",
json.dumps({
"model": f"{SPEC_MODEL}",
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [
{
"speculative_config": {
"model": SPEC_MODEL,
"num_speculative_tokens": 5,
"draft_tensor_parallel_size": 1,
}),
],
])
},
}])
@pytest.mark.parametrize("batch_size", [2])
@pytest.mark.parametrize("seed", [1])
def test_draft_model_tp_lt_target_model_tp4(common_llm_kwargs,
def test_draft_model_tp_lt_target_model_tp4(vllm_runner, common_llm_kwargs,
per_test_common_llm_kwargs,
baseline_llm_kwargs,
test_llm_kwargs, batch_size: int,
seed: int):
"""Verify spec decode works well with smaller tp for draft models.
"""
run_equality_correctness_test_tp(MAIN_MODEL,
common_llm_kwargs,
run_equality_correctness_test(vllm_runner, common_llm_kwargs,
per_test_common_llm_kwargs,
baseline_llm_kwargs,
test_llm_kwargs,
batch_size,
max_output_len=32,
seed=seed,
baseline_llm_kwargs, test_llm_kwargs,
batch_size, max_output_len=32, seed=seed,
temperature=0.0)
......@@ -68,30 +69,30 @@ def test_draft_model_tp_lt_target_model_tp4(common_llm_kwargs,
reason="Need at least 4 GPUs to run the test.")
@pytest.mark.parametrize(
"common_llm_kwargs",
[[
[{
# Skip cuda graph recording for fast test.
"--enforce-eager",
"--tensor-parallel-size",
"4",
]])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [[]])
@pytest.mark.parametrize("baseline_llm_kwargs", [[]])
@pytest.mark.parametrize(
"test_llm_kwargs",
[
[
# Artificially limit the draft model max model len; this forces vLLM
# to skip speculation once the sequences grow beyond 32-k tokens.
"--speculative_config",
json.dumps({
"model": f"{SPEC_MODEL}",
"enforce_eager": True,
# Tensor parallel size of the target model.
"tensor_parallel_size": 4,
# Precision
"dtype": "bfloat16",
# Main model
"model_name": MAIN_MODEL,
}])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [
{
"speculative_config": {
"model": SPEC_MODEL,
"num_speculative_tokens": 5,
"max_model_len": 32,
}),
],
])
@pytest.mark.parametrize("batch_size", [8])
},
}])
@pytest.mark.parametrize("batch_size", [2])
@pytest.mark.parametrize(
"output_len",
[
......@@ -101,7 +102,7 @@ def test_draft_model_tp_lt_target_model_tp4(common_llm_kwargs,
64,
])
@pytest.mark.parametrize("seed", [1])
def test_skip_speculation(common_llm_kwargs, per_test_common_llm_kwargs,
def test_skip_speculation(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
baseline_llm_kwargs, test_llm_kwargs,
batch_size: int, output_len: int, seed: int):
"""Verify job failure with RuntimeError when all sequences skip speculation.
......@@ -111,14 +112,9 @@ def test_skip_speculation(common_llm_kwargs, per_test_common_llm_kwargs,
TODO: fix it to pass without raising Error. (#5814)
"""
with pytest.raises(
(openai.APIConnectionError, openai.InternalServerError)):
run_equality_correctness_test_tp(MAIN_MODEL,
common_llm_kwargs,
with pytest.raises(RuntimeError):
run_equality_correctness_test(vllm_runner, common_llm_kwargs,
per_test_common_llm_kwargs,
baseline_llm_kwargs,
test_llm_kwargs,
batch_size,
output_len,
seed,
baseline_llm_kwargs, test_llm_kwargs,
batch_size, max_output_len=output_len, seed=seed,
temperature=0.0)
\ No newline at end of file
......@@ -26,8 +26,10 @@ import pytest
from .conftest import run_equality_correctness_test
from ...utils import models_path_prefix
os.environ["VLLM_MLA_DISABLE"] = "1"
# main model
MAIN_MODEL = os.path.join(models_path_prefix, "luccafong/deepseek_mtp_main_random")
MAIN_MODEL = os.path.join(models_path_prefix, "luccafong/deepseek_mtp_main_random_bf16")
# max. number of speculative tokens: this corresponds to
# num_nextn_predict_layers in the config.json of the speculator model.
......@@ -188,7 +190,7 @@ def test_mtp_e2e_greedy_correctness_cuda_graph(vllm_runner, common_llm_kwargs,
@pytest.mark.parametrize(
"common_llm_kwargs",
[{
"block_size": 8,
"block_size": 16,
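# 2 for small prompt, 256//8 for generated.
# NOTE(review): the block counts below still divide by 8 while block_size is 16 — confirm this is intended.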
"num_gpu_blocks_override": 2 + 256 // 8,
"max_model_len": (2 + 256 // 8) * 8,
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment