Commit f54ad7b9 authored by zhuwenwen
Browse files

Merge branch 'v0.8.5.post1-opt1-wm' into 'v0.8.5.post1-opt1'

[fix]修复并行解码integration、mtp相关单测问题

See merge request dcutoolkit/deeplearing/vllm!139
parents acfa43b8 ccaeb6c0
...@@ -9,6 +9,9 @@ import os ...@@ -9,6 +9,9 @@ import os
from .conftest import run_equality_correctness_test from .conftest import run_equality_correctness_test
from ...utils import models_path_prefix from ...utils import models_path_prefix
os.environ["LLAMA_NN"] = "0"
MAIN_MODEL = os.path.join(models_path_prefix, "JackFram/llama-68m") MAIN_MODEL = os.path.join(models_path_prefix, "JackFram/llama-68m")
......
...@@ -13,9 +13,12 @@ import os ...@@ -13,9 +13,12 @@ import os
from vllm.platforms import current_platform from vllm.platforms import current_platform
from .conftest import run_equality_correctness_test_tp from .conftest import run_equality_correctness_test_tp
from .conftest import run_equality_correctness_test
from ...utils import models_path_prefix from ...utils import models_path_prefix
os.environ["LLAMA_NN"] = "0"
@pytest.mark.skipif(torch.cuda.device_count() < 2, @pytest.mark.skipif(torch.cuda.device_count() < 2,
reason="Need at least 2 GPUs to run the test.") reason="Need at least 2 GPUs to run the test.")
@pytest.mark.parametrize( @pytest.mark.parametrize(
...@@ -75,53 +78,42 @@ def test_target_model_tp_gt_1(common_llm_kwargs, per_test_common_llm_kwargs, ...@@ -75,53 +78,42 @@ def test_target_model_tp_gt_1(common_llm_kwargs, per_test_common_llm_kwargs,
reason="Need at least 2 GPUs to run the test.") reason="Need at least 2 GPUs to run the test.")
@pytest.mark.parametrize( @pytest.mark.parametrize(
"common_llm_kwargs", "common_llm_kwargs",
[[ [{
# Skip cuda graph recording for fast test. # Skip cuda graph recording for fast test.
"--enforce-eager", "enforce_eager": True,
"--tensor_parallel_size",
"2",
# precision # Print spec metrics.
"--dtype", "tensor_parallel_size": 2,
"bfloat16",
]]) # Precision
@pytest.mark.parametrize("per_test_common_llm_kwargs", [[]]) "dtype": "bfloat16",
@pytest.mark.parametrize("baseline_llm_kwargs", [[]])
@pytest.mark.parametrize( # Main model
"model, test_llm_kwargs", "model_name": os.path.join(models_path_prefix, "JackFram/llama-68m"),
[(os.path.join(models_path_prefix, "JackFram/llama-68m"), [ }])
"--speculative_config", @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
json.dumps({ @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [
{
"speculative_config": {
"model": os.path.join(models_path_prefix, "JackFram/llama-68m"), "model": os.path.join(models_path_prefix, "JackFram/llama-68m"),
"num_speculative_tokens": 5, "num_speculative_tokens": 5,
"draft_tensor_parallel_size": 1, "draft_tensor_parallel_size": 1,
}), },
]), }])
(os.path.join(models_path_prefix, "ibm-granite/granite-3b-code-instruct"), [
"--speculative_config",
json.dumps({
"model": os.path.join(models_path_prefix, "ibm-granite/granite-3b-code-instruct"),
"num_speculative_tokens": 5,
"draft_tensor_parallel_size": 1,
}),
])])
@pytest.mark.parametrize("batch_size", [2]) @pytest.mark.parametrize("batch_size", [2])
@pytest.mark.parametrize("seed", [1]) @pytest.mark.parametrize("seed", [1])
def test_draft_model_tp_lt_target_model_tp2(model, common_llm_kwargs, def test_draft_model_tp_lt_target_model_tp2(vllm_runner, common_llm_kwargs,
per_test_common_llm_kwargs, per_test_common_llm_kwargs,
baseline_llm_kwargs, baseline_llm_kwargs,
test_llm_kwargs, batch_size: int, test_llm_kwargs, batch_size: int,
seed: int): seed: int):
"""Verify spec decode works well with smaller tp for draft models. """Verify spec decode works well with smaller tp for draft models.
""" """
run_equality_correctness_test_tp(model, run_equality_correctness_test(vllm_runner, common_llm_kwargs,
common_llm_kwargs, per_test_common_llm_kwargs,
per_test_common_llm_kwargs, baseline_llm_kwargs, test_llm_kwargs,
baseline_llm_kwargs, batch_size, max_output_len=32, seed=seed,
test_llm_kwargs,
batch_size,
max_output_len=32,
seed=seed,
temperature=0.0) temperature=0.0)
...@@ -129,44 +121,40 @@ def test_draft_model_tp_lt_target_model_tp2(model, common_llm_kwargs, ...@@ -129,44 +121,40 @@ def test_draft_model_tp_lt_target_model_tp2(model, common_llm_kwargs,
reason="Need at least 2 GPUs to run the test.") reason="Need at least 2 GPUs to run the test.")
@pytest.mark.parametrize( @pytest.mark.parametrize(
"common_llm_kwargs", "common_llm_kwargs",
[[ [{
# Skip cuda graph recording for fast test. # Skip cuda graph recording for fast test.
"--enforce-eager", "enforce_eager": True,
"--tensor_parallel_size",
"2",
# precision # Print spec metrics.
"--dtype", "tensor_parallel_size": 2,
"bfloat16",
]]) # Precision
"dtype": "bfloat16",
# Main model
"model_name": os.path.join(models_path_prefix, "JackFram/llama-68m"),
}])
@pytest.mark.parametrize( @pytest.mark.parametrize(
"per_test_common_llm_kwargs", "per_test_common_llm_kwargs",
[["--enable-chunked-prefill", "False"], [{
[ "enable_chunked_prefill": False,
"--enable-chunked-prefill", "True", "--max-num-batched-tokens", "4", "max_num_batched_tokens": 32,
"--max-num-seqs", "4" "max_model_len": 32,
]]) "max_num_seqs": 4
@pytest.mark.parametrize("baseline_llm_kwargs", [[]]) }])
@pytest.mark.parametrize("model, test_llm_kwargs", @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
[("JackFram/llama-68m", [ @pytest.mark.parametrize("test_llm_kwargs", [
"--speculative_config", {
json.dumps({ "speculative_config": {
"model": "JackFram/llama-68m", "model": os.path.join(models_path_prefix, "JackFram/llama-68m"),
"num_speculative_tokens": 3, "num_speculative_tokens": 5,
}), "draft_tensor_parallel_size": 1,
]), },
("JackFram/llama-68m", [ }])
"--speculative_config",
json.dumps({
"model": "JackFram/llama-68m",
"num_speculative_tokens": 3,
"draft_tensor_parallel_size": 1,
}),
])])
@pytest.mark.parametrize("logprobs", [None]) @pytest.mark.parametrize("logprobs", [None])
@pytest.mark.parametrize("batch_size", [2]) @pytest.mark.parametrize("batch_size", [2])
@pytest.mark.parametrize("seed", [1]) @pytest.mark.parametrize("seed", [1])
def test_spec_decode_chunked_prefill_tp2(model, common_llm_kwargs, def test_spec_decode_chunked_prefill_tp2(vllm_runner, common_llm_kwargs,
per_test_common_llm_kwargs, per_test_common_llm_kwargs,
baseline_llm_kwargs, test_llm_kwargs, baseline_llm_kwargs, test_llm_kwargs,
logprobs: Optional[int], logprobs: Optional[int],
...@@ -174,69 +162,58 @@ def test_spec_decode_chunked_prefill_tp2(model, common_llm_kwargs, ...@@ -174,69 +162,58 @@ def test_spec_decode_chunked_prefill_tp2(model, common_llm_kwargs,
"""Verify spec decode works well with same and different TP size for """Verify spec decode works well with same and different TP size for
the draft model with chunked prefill. the draft model with chunked prefill.
""" """
run_equality_correctness_test_tp(model, run_equality_correctness_test(vllm_runner, common_llm_kwargs,
common_llm_kwargs, per_test_common_llm_kwargs,
per_test_common_llm_kwargs, baseline_llm_kwargs, test_llm_kwargs,
baseline_llm_kwargs, batch_size, max_output_len=32, seed=seed,
test_llm_kwargs, temperature=0.0)
batch_size,
max_output_len=32,
seed=seed,
temperature=0.0,
logprobs=logprobs)
@pytest.mark.skipif(torch.cuda.device_count() < 2, @pytest.mark.skipif(torch.cuda.device_count() < 2,
reason="Need at least 2 GPUs to run the test.") reason="Need at least 2 GPUs to run the test.")
@pytest.mark.parametrize( @pytest.mark.parametrize(
"common_llm_kwargs", "common_llm_kwargs",
[[ [{
# Skip cuda graph recording for fast test. # Skip cuda graph recording for fast test.
"--enforce-eager", "enforce_eager": True,
"--tensor_parallel_size",
"2",
# precision # Print spec metrics.
"--dtype", "tensor_parallel_size": 2,
"bfloat16",
]]) # Precision
"dtype": "bfloat16",
# Main model
"model_name": os.path.join(models_path_prefix, "JackFram/llama-68m"),
}])
@pytest.mark.parametrize( @pytest.mark.parametrize(
"per_test_common_llm_kwargs", "per_test_common_llm_kwargs",
[["--enable-chunked-prefill", "False"], [{
[ "enable_chunked_prefill": False,
"--enable-chunked-prefill", "True", "--max-num-batched-tokens", "4", "max_num_batched_tokens": 32,
"--max-num-seqs", "4" "max_model_len": 32,
]]) "max_num_seqs": 4
@pytest.mark.parametrize("baseline_llm_kwargs", [[]]) }])
@pytest.mark.parametrize("model, test_llm_kwargs", @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
[("JackFram/llama-68m", [ @pytest.mark.parametrize("test_llm_kwargs", [
"--speculative_config", {
json.dumps({ "speculative_config": {
"model": "JackFram/llama-68m", "model": os.path.join(models_path_prefix, "JackFram/llama-68m"),
"num_speculative_tokens": 3, "num_speculative_tokens": 5,
"disable_logprobs": False, "draft_tensor_parallel_size": 1,
}), },
]), }])
("JackFram/llama-68m", [
"--speculative_config",
json.dumps({
"model": "JackFram/llama-68m",
"num_speculative_tokens": 3,
"draft_tensor_parallel_size": 1,
"disable_logprobs": False,
}),
])])
@pytest.mark.parametrize("logprobs", [2]) @pytest.mark.parametrize("logprobs", [2])
@pytest.mark.parametrize("batch_size", [2]) @pytest.mark.parametrize("batch_size", [2])
@pytest.mark.parametrize("seed", [1]) @pytest.mark.parametrize("seed", [1])
def test_spec_decode_chunked_prefill_tp2_with_logprobs( def test_spec_decode_chunked_prefill_tp2_with_logprobs(
model, common_llm_kwargs, per_test_common_llm_kwargs, vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
baseline_llm_kwargs, test_llm_kwargs, logprobs: Optional[int], baseline_llm_kwargs, test_llm_kwargs, logprobs: Optional[int],
batch_size: int, seed: int): batch_size: int, seed: int):
"""Verify spec decode works well with same and different TP size for """Verify spec decode works well with same and different TP size for
the draft model with chunked prefill. the draft model with chunked prefill.
""" """
run_equality_correctness_test_tp(model, run_equality_correctness_test(vllm_runner,
common_llm_kwargs, common_llm_kwargs,
per_test_common_llm_kwargs, per_test_common_llm_kwargs,
baseline_llm_kwargs, baseline_llm_kwargs,
......
...@@ -11,8 +11,11 @@ import torch ...@@ -11,8 +11,11 @@ import torch
import os import os
from .conftest import run_equality_correctness_test_tp from .conftest import run_equality_correctness_test_tp
from .conftest import run_equality_correctness_test
from ...utils import models_path_prefix from ...utils import models_path_prefix
os.environ["LLAMA_NN"] = "0"
MAIN_MODEL = os.path.join(models_path_prefix, "JackFram/llama-68m") MAIN_MODEL = os.path.join(models_path_prefix, "JackFram/llama-68m")
SPEC_MODEL = os.path.join(models_path_prefix, "JackFram/llama-68m") SPEC_MODEL = os.path.join(models_path_prefix, "JackFram/llama-68m")
...@@ -21,46 +24,44 @@ SPEC_MODEL = os.path.join(models_path_prefix, "JackFram/llama-68m") ...@@ -21,46 +24,44 @@ SPEC_MODEL = os.path.join(models_path_prefix, "JackFram/llama-68m")
reason="Need at least 4 GPUs to run the test.") reason="Need at least 4 GPUs to run the test.")
@pytest.mark.parametrize( @pytest.mark.parametrize(
"common_llm_kwargs", "common_llm_kwargs",
[[ [{
# Skip cuda graph recording for fast test. # Skip cuda graph recording for fast test.
"--enforce_eager", "enforce_eager": True,
"--tensor-parallel-size",
"4", # Print spec metrics.
]]) "tensor_parallel_size": 4,
# Precision
"dtype": "bfloat16",
# Main model
"model_name": MAIN_MODEL,
}])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [ @pytest.mark.parametrize("per_test_common_llm_kwargs", [
[], {},
]) ])
@pytest.mark.parametrize("baseline_llm_kwargs", [[]]) @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize( @pytest.mark.parametrize("test_llm_kwargs", [
"test_llm_kwargs", {
[ "speculative_config": {
#TODO(wooyeon): add spec_draft_dp=2 case "model": SPEC_MODEL,
[ "num_speculative_tokens": 5,
"--speculative_config", "draft_tensor_parallel_size": 1,
json.dumps({ },
"model": f"{SPEC_MODEL}", }])
"num_speculative_tokens": 5,
"draft_tensor_parallel_size": 1,
}),
],
])
@pytest.mark.parametrize("batch_size", [2]) @pytest.mark.parametrize("batch_size", [2])
@pytest.mark.parametrize("seed", [1]) @pytest.mark.parametrize("seed", [1])
def test_draft_model_tp_lt_target_model_tp4(common_llm_kwargs, def test_draft_model_tp_lt_target_model_tp4(vllm_runner, common_llm_kwargs,
per_test_common_llm_kwargs, per_test_common_llm_kwargs,
baseline_llm_kwargs, baseline_llm_kwargs,
test_llm_kwargs, batch_size: int, test_llm_kwargs, batch_size: int,
seed: int): seed: int):
"""Verify spec decode works well with smaller tp for draft models. """Verify spec decode works well with smaller tp for draft models.
""" """
run_equality_correctness_test_tp(MAIN_MODEL, run_equality_correctness_test(vllm_runner, common_llm_kwargs,
common_llm_kwargs, per_test_common_llm_kwargs,
per_test_common_llm_kwargs, baseline_llm_kwargs, test_llm_kwargs,
baseline_llm_kwargs, batch_size, max_output_len=32, seed=seed,
test_llm_kwargs,
batch_size,
max_output_len=32,
seed=seed,
temperature=0.0) temperature=0.0)
...@@ -68,30 +69,30 @@ def test_draft_model_tp_lt_target_model_tp4(common_llm_kwargs, ...@@ -68,30 +69,30 @@ def test_draft_model_tp_lt_target_model_tp4(common_llm_kwargs,
reason="Need at least 4 GPUs to run the test.") reason="Need at least 4 GPUs to run the test.")
@pytest.mark.parametrize( @pytest.mark.parametrize(
"common_llm_kwargs", "common_llm_kwargs",
[[ [{
# Skip cuda graph recording for fast test. # Skip cuda graph recording for fast test.
"--enforce-eager", "enforce_eager": True,
"--tensor-parallel-size",
"4", # Print spec metrics.
]]) "tensor_parallel_size": 4,
@pytest.mark.parametrize("per_test_common_llm_kwargs", [[]])
@pytest.mark.parametrize("baseline_llm_kwargs", [[]]) # Precision
@pytest.mark.parametrize( "dtype": "bfloat16",
"test_llm_kwargs",
[ # Main model
[ "model_name": MAIN_MODEL,
# Artificially limit the draft model max model len; this forces vLLM }])
# to skip speculation once the sequences grow beyond 32-k tokens. @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
"--speculative_config", @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
json.dumps({ @pytest.mark.parametrize("test_llm_kwargs", [
"model": f"{SPEC_MODEL}", {
"num_speculative_tokens": 5, "speculative_config": {
"max_model_len": 32, "model": SPEC_MODEL,
}), "num_speculative_tokens": 5,
], "max_model_len": 32,
]) },
@pytest.mark.parametrize("batch_size", [8]) }])
@pytest.mark.parametrize("batch_size", [2])
@pytest.mark.parametrize( @pytest.mark.parametrize(
"output_len", "output_len",
[ [
...@@ -101,7 +102,7 @@ def test_draft_model_tp_lt_target_model_tp4(common_llm_kwargs, ...@@ -101,7 +102,7 @@ def test_draft_model_tp_lt_target_model_tp4(common_llm_kwargs,
64, 64,
]) ])
@pytest.mark.parametrize("seed", [1]) @pytest.mark.parametrize("seed", [1])
def test_skip_speculation(common_llm_kwargs, per_test_common_llm_kwargs, def test_skip_speculation(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
baseline_llm_kwargs, test_llm_kwargs, baseline_llm_kwargs, test_llm_kwargs,
batch_size: int, output_len: int, seed: int): batch_size: int, output_len: int, seed: int):
"""Verify job failure with RuntimeError when all sequences skip speculation. """Verify job failure with RuntimeError when all sequences skip speculation.
...@@ -111,14 +112,9 @@ def test_skip_speculation(common_llm_kwargs, per_test_common_llm_kwargs, ...@@ -111,14 +112,9 @@ def test_skip_speculation(common_llm_kwargs, per_test_common_llm_kwargs,
TODO: fix it to pass without raising Error. (#5814) TODO: fix it to pass without raising Error. (#5814)
""" """
with pytest.raises( with pytest.raises(RuntimeError):
(openai.APIConnectionError, openai.InternalServerError)): run_equality_correctness_test(vllm_runner, common_llm_kwargs,
run_equality_correctness_test_tp(MAIN_MODEL, per_test_common_llm_kwargs,
common_llm_kwargs, baseline_llm_kwargs, test_llm_kwargs,
per_test_common_llm_kwargs, batch_size, max_output_len=output_len, seed=seed,
baseline_llm_kwargs, temperature=0.0)
test_llm_kwargs, \ No newline at end of file
batch_size,
output_len,
seed,
temperature=0.0)
...@@ -26,8 +26,10 @@ import pytest ...@@ -26,8 +26,10 @@ import pytest
from .conftest import run_equality_correctness_test from .conftest import run_equality_correctness_test
from ...utils import models_path_prefix from ...utils import models_path_prefix
os.environ["VLLM_MLA_DISABLE"] = "1"
# main model # main model
MAIN_MODEL = os.path.join(models_path_prefix, "luccafong/deepseek_mtp_main_random") MAIN_MODEL = os.path.join(models_path_prefix, "luccafong/deepseek_mtp_main_random_bf16")
# max. number of speculative tokens: this corresponds to # max. number of speculative tokens: this corresponds to
# num_nextn_predict_layers in the config.json of the speculator model. # num_nextn_predict_layers in the config.json of the speculator model.
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment