"tests/vscode:/vscode.git/clone" did not exist on "1a55cfafcbed71c68a6217f5e7b2929014e6df2d"
Commit 53076d70 authored by zhuwenwen
Browse files

Merge tag 'v0.8.2' into v0.8.2-ori

parents 322a0be6 9c5c81b0
...@@ -23,9 +23,11 @@ MAIN_MODEL = "JackFram/llama-68m" ...@@ -23,9 +23,11 @@ MAIN_MODEL = "JackFram/llama-68m"
[ [
{ {
# Identical models. # Identical models.
"speculative_model": "JackFram/llama-68m", "speculative_config": {
"model": "JackFram/llama-68m",
"num_speculative_tokens": 5, "num_speculative_tokens": 5,
}, },
},
]) ])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [{}]) @pytest.mark.parametrize("test_llm_kwargs", [{}])
...@@ -57,26 +59,33 @@ def test_spec_decode_cuda_graph(vllm_runner, common_llm_kwargs, ...@@ -57,26 +59,33 @@ def test_spec_decode_cuda_graph(vllm_runner, common_llm_kwargs,
# Skip cuda graph recording for fast test. # Skip cuda graph recording for fast test.
"enforce_eager": True, "enforce_eager": True,
}]) }])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [ @pytest.mark.parametrize("per_test_common_llm_kwargs", [])
{
"speculative_model": "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit",
"num_speculative_tokens": 5,
},
])
@pytest.mark.parametrize( @pytest.mark.parametrize(
"test_llm_kwargs", "test_llm_kwargs",
[ [
# Explicitly specify draft model quantization # Explicitly specify draft model quantization
{ {
"speculative_model_quantization": "gptq", "speculative_config": {
"model": "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit",
"num_speculative_tokens": 5,
"quantization": "gptq",
},
}, },
# Explicitly specify GPTQ-based draft model to use marlin quantization # Explicitly specify GPTQ-based draft model to use marlin quantization
{ {
"speculative_model_quantization": "marlin", "speculative_config": {
"model": "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit",
"num_speculative_tokens": 5,
"quantization": "marlin",
},
}, },
# Not explicitly specify draft model quantization # Not explicitly specify draft model quantization
{ {
"speculative_model_quantization": None, "speculative_config": {
"model": "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit",
"num_speculative_tokens": 5,
"quantization": None,
},
}, },
]) ])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
...@@ -107,15 +116,16 @@ def test_speculative_model_quantization_config(vllm_runner, common_llm_kwargs, ...@@ -107,15 +116,16 @@ def test_speculative_model_quantization_config(vllm_runner, common_llm_kwargs,
# Skip cuda graph recording for fast test. # Skip cuda graph recording for fast test.
"enforce_eager": True, "enforce_eager": True,
"speculative_model": "JackFram/llama-68m",
"num_speculative_tokens": 3,
}]) }])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", @pytest.mark.parametrize("test_llm_kwargs", [{
[{ "speculative_config": {
"speculative_disable_mqa_scorer": True, "model": "JackFram/llama-68m",
}]) "num_speculative_tokens": 3,
"disable_mqa_scorer": True,
},
}])
@pytest.mark.parametrize("batch_size", [1, 5]) @pytest.mark.parametrize("batch_size", [1, 5])
@pytest.mark.parametrize( @pytest.mark.parametrize(
"output_len", "output_len",
...@@ -127,7 +137,7 @@ def test_speculative_model_quantization_config(vllm_runner, common_llm_kwargs, ...@@ -127,7 +137,7 @@ def test_speculative_model_quantization_config(vllm_runner, common_llm_kwargs,
def test_mqa_scorer(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, def test_mqa_scorer(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
baseline_llm_kwargs, test_llm_kwargs, batch_size: int, baseline_llm_kwargs, test_llm_kwargs, batch_size: int,
output_len: int, seed: int): output_len: int, seed: int):
"""Verify that ngram speculative decoding generates the same output """Verify that speculative decoding generates the same output
with batch expansion scorer and mqa scorer. with batch expansion scorer and mqa scorer.
""" """
run_equality_correctness_test(vllm_runner, run_equality_correctness_test(vllm_runner,
......
...@@ -27,18 +27,19 @@ from .conftest import run_equality_correctness_test_tp ...@@ -27,18 +27,19 @@ from .conftest import run_equality_correctness_test_tp
@pytest.mark.parametrize("baseline_llm_kwargs", [[]]) @pytest.mark.parametrize("baseline_llm_kwargs", [[]])
@pytest.mark.parametrize("test_llm_kwargs", [ @pytest.mark.parametrize("test_llm_kwargs", [
[ [
"--speculative-model", "--speculative_config",
"JackFram/llama-68m", str({
"--num-speculative-tokens", "model": "JackFram/llama-68m",
"3", "num_speculative_tokens": 3,
}),
], ],
[ [
"--speculative-model", "--speculative_config",
"[ngram]", str({
"--num-speculative-tokens", "model": "ngram",
"5", "num_speculative_tokens": 5,
"--ngram-prompt-lookup-max", "prompt_lookup_max": 3,
"3", }),
], ],
]) ])
@pytest.mark.parametrize("batch_size", [2]) @pytest.mark.parametrize("batch_size", [2])
...@@ -83,22 +84,23 @@ def test_target_model_tp_gt_1(common_llm_kwargs, per_test_common_llm_kwargs, ...@@ -83,22 +84,23 @@ def test_target_model_tp_gt_1(common_llm_kwargs, per_test_common_llm_kwargs,
]]) ]])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [[]]) @pytest.mark.parametrize("per_test_common_llm_kwargs", [[]])
@pytest.mark.parametrize("baseline_llm_kwargs", [[]]) @pytest.mark.parametrize("baseline_llm_kwargs", [[]])
@pytest.mark.parametrize("model, test_llm_kwargs", @pytest.mark.parametrize(
"model, test_llm_kwargs",
[("JackFram/llama-68m", [ [("JackFram/llama-68m", [
"--speculative-model", "--speculative_config",
"JackFram/llama-68m", str({
"--num_speculative-tokens", "model": "JackFram/llama-68m",
"5", "num_speculative_tokens": 5,
"--speculative-draft-tensor-parallel-size", "draft_tensor_parallel_size": 1,
"1", }),
]), ]),
("ibm-granite/granite-3b-code-instruct", [ ("ibm-granite/granite-3b-code-instruct", [
"--speculative-model", "--speculative_config",
"ibm-granite/granite-3b-code-instruct", str({
"--num_speculative-tokens", "model": "ibm-granite/granite-3b-code-instruct",
"5", "num_speculative_tokens": 5,
"--speculative-draft-tensor-parallel-size", "draft_tensor_parallel_size": 1,
"1", }),
])]) ])])
@pytest.mark.parametrize("batch_size", [2]) @pytest.mark.parametrize("batch_size", [2])
@pytest.mark.parametrize("seed", [1]) @pytest.mark.parametrize("seed", [1])
...@@ -144,18 +146,19 @@ def test_draft_model_tp_lt_target_model_tp2(model, common_llm_kwargs, ...@@ -144,18 +146,19 @@ def test_draft_model_tp_lt_target_model_tp2(model, common_llm_kwargs,
@pytest.mark.parametrize("baseline_llm_kwargs", [[]]) @pytest.mark.parametrize("baseline_llm_kwargs", [[]])
@pytest.mark.parametrize("model, test_llm_kwargs", @pytest.mark.parametrize("model, test_llm_kwargs",
[("JackFram/llama-68m", [ [("JackFram/llama-68m", [
"--speculative-model", "--speculative_config",
"JackFram/llama-68m", str({
"--num_speculative-tokens", "model": "JackFram/llama-68m",
"3", "num_speculative_tokens": 3,
}),
]), ]),
("JackFram/llama-68m", [ ("JackFram/llama-68m", [
"--speculative-model", "--speculative_config",
"JackFram/llama-68m", str({
"--num_speculative-tokens", "model": "JackFram/llama-68m",
"3", "num_speculative_tokens": 3,
"--speculative-draft-tensor-parallel-size", "draft_tensor_parallel_size": 1,
"1", }),
])]) ])])
@pytest.mark.parametrize("logprobs", [None, 2]) @pytest.mark.parametrize("logprobs", [None, 2])
@pytest.mark.parametrize("batch_size", [2]) @pytest.mark.parametrize("batch_size", [2])
......
...@@ -24,12 +24,7 @@ SPEC_MODEL = "JackFram/llama-68m" ...@@ -24,12 +24,7 @@ SPEC_MODEL = "JackFram/llama-68m"
"4", "4",
]]) ]])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [ @pytest.mark.parametrize("per_test_common_llm_kwargs", [
[ [],
"--speculative-model",
f"{SPEC_MODEL}",
"--num-speculative-tokens",
"5",
],
]) ])
@pytest.mark.parametrize("baseline_llm_kwargs", [[]]) @pytest.mark.parametrize("baseline_llm_kwargs", [[]])
@pytest.mark.parametrize( @pytest.mark.parametrize(
...@@ -37,8 +32,12 @@ SPEC_MODEL = "JackFram/llama-68m" ...@@ -37,8 +32,12 @@ SPEC_MODEL = "JackFram/llama-68m"
[ [
#TODO(wooyeon): add spec_draft_dp=2 case #TODO(wooyeon): add spec_draft_dp=2 case
[ [
"--speculative-draft-tensor-parallel-size", "--speculative_config",
"1", str({
"model": f"{SPEC_MODEL}",
"num_speculative_tokens": 5,
"draft_tensor_parallel_size": 1,
}),
], ],
]) ])
@pytest.mark.parametrize("batch_size", [2]) @pytest.mark.parametrize("batch_size", [2])
...@@ -78,15 +77,14 @@ def test_draft_model_tp_lt_target_model_tp4(common_llm_kwargs, ...@@ -78,15 +77,14 @@ def test_draft_model_tp_lt_target_model_tp4(common_llm_kwargs,
"test_llm_kwargs", "test_llm_kwargs",
[ [
[ [
"--speculative-model",
f"{SPEC_MODEL}",
"--num-speculative-tokens",
"5",
# Artificially limit the draft model max model len; this forces vLLM # Artificially limit the draft model max model len; this forces vLLM
# to skip speculation once the sequences grow beyond 32-k tokens. # to skip speculation once the sequences grow beyond 32-k tokens.
"--speculative-max-model-len", "--speculative_config",
"32", str({
"model": f"{SPEC_MODEL}",
"num_speculative_tokens": 5,
"max_model_len": 32,
}),
], ],
]) ])
@pytest.mark.parametrize("batch_size", [8]) @pytest.mark.parametrize("batch_size", [8])
......
...@@ -20,16 +20,19 @@ from .conftest import run_equality_correctness_test ...@@ -20,16 +20,19 @@ from .conftest import run_equality_correctness_test
}]) }])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", @pytest.mark.parametrize("test_llm_kwargs", [{
[{ "speculative_config": {
"speculative_model": "JackFram/llama-68m", "model": "JackFram/llama-68m",
"num_speculative_tokens": 3, "num_speculative_tokens": 3,
"disable_logprobs_during_spec_decoding": False, "disable_logprobs": False,
}, { },
"speculative_model": "JackFram/llama-68m", }, {
"speculative_config": {
"model": "JackFram/llama-68m",
"num_speculative_tokens": 3, "num_speculative_tokens": 3,
"disable_logprobs_during_spec_decoding": True, "disable_logprobs": True,
}]) },
}])
@pytest.mark.parametrize("batch_size", [8]) @pytest.mark.parametrize("batch_size", [8])
@pytest.mark.parametrize( @pytest.mark.parametrize(
"output_len", "output_len",
...@@ -48,7 +51,8 @@ def test_logprobs_equality(vllm_runner, common_llm_kwargs, ...@@ -48,7 +51,8 @@ def test_logprobs_equality(vllm_runner, common_llm_kwargs,
as well as with and without chunked prefill. as well as with and without chunked prefill.
""" """
maybe_enable_chunked_prefill(prefill_chunk_size, common_llm_kwargs) maybe_enable_chunked_prefill(prefill_chunk_size, common_llm_kwargs)
run_equality_correctness_test(vllm_runner, run_equality_correctness_test(
vllm_runner,
common_llm_kwargs, common_llm_kwargs,
per_test_common_llm_kwargs, per_test_common_llm_kwargs,
baseline_llm_kwargs, baseline_llm_kwargs,
...@@ -59,8 +63,8 @@ def test_logprobs_equality(vllm_runner, common_llm_kwargs, ...@@ -59,8 +63,8 @@ def test_logprobs_equality(vllm_runner, common_llm_kwargs,
temperature=0.0, temperature=0.0,
logprobs=logprobs, logprobs=logprobs,
prompt_logprobs=logprobs, prompt_logprobs=logprobs,
disable_logprobs=test_llm_kwargs[ disable_logprobs=test_llm_kwargs["speculative_config"]
'disable_logprobs_during_spec_decoding']) ["disable_logprobs"])
@pytest.mark.parametrize( @pytest.mark.parametrize(
...@@ -73,16 +77,19 @@ def test_logprobs_equality(vllm_runner, common_llm_kwargs, ...@@ -73,16 +77,19 @@ def test_logprobs_equality(vllm_runner, common_llm_kwargs,
}]) }])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", @pytest.mark.parametrize("test_llm_kwargs", [{
[{ "speculative_config": {
"speculative_model": "JackFram/llama-160m", "model": "JackFram/llama-160m",
"num_speculative_tokens": 3, "num_speculative_tokens": 3,
"disable_logprobs_during_spec_decoding": False, "disable_logprobs": False,
}, { },
"speculative_model": "JackFram/llama-160m", }, {
"speculative_config": {
"model": "JackFram/llama-160m",
"num_speculative_tokens": 6, "num_speculative_tokens": 6,
"disable_logprobs_during_spec_decoding": False, "disable_logprobs": False,
}]) },
}])
@pytest.mark.parametrize("batch_size", [8]) @pytest.mark.parametrize("batch_size", [8])
@pytest.mark.parametrize( @pytest.mark.parametrize(
"output_len", "output_len",
...@@ -98,7 +105,8 @@ def test_logprobs_different_k(vllm_runner, common_llm_kwargs, ...@@ -98,7 +105,8 @@ def test_logprobs_different_k(vllm_runner, common_llm_kwargs,
output_len: int, seed: int, logprobs: int): output_len: int, seed: int, logprobs: int):
"""Veriy logprob greedy equality with different speculation lens. """Veriy logprob greedy equality with different speculation lens.
""" """
run_equality_correctness_test(vllm_runner, run_equality_correctness_test(
vllm_runner,
common_llm_kwargs, common_llm_kwargs,
per_test_common_llm_kwargs, per_test_common_llm_kwargs,
baseline_llm_kwargs, baseline_llm_kwargs,
...@@ -108,8 +116,8 @@ def test_logprobs_different_k(vllm_runner, common_llm_kwargs, ...@@ -108,8 +116,8 @@ def test_logprobs_different_k(vllm_runner, common_llm_kwargs,
seed, seed,
temperature=0.0, temperature=0.0,
logprobs=logprobs, logprobs=logprobs,
disable_logprobs=test_llm_kwargs[ disable_logprobs=test_llm_kwargs["speculative_config"]
'disable_logprobs_during_spec_decoding']) ["disable_logprobs"])
@pytest.mark.parametrize( @pytest.mark.parametrize(
...@@ -125,13 +133,15 @@ def test_logprobs_different_k(vllm_runner, common_llm_kwargs, ...@@ -125,13 +133,15 @@ def test_logprobs_different_k(vllm_runner, common_llm_kwargs,
@pytest.mark.parametrize( @pytest.mark.parametrize(
"test_llm_kwargs", "test_llm_kwargs",
[{ [{
"speculative_model": "JackFram/llama-160m", "speculative_config": {
"model": "JackFram/llama-160m",
"num_speculative_tokens": 3, "num_speculative_tokens": 3,
"disable_logprobs_during_spec_decoding": False, "disable_logprobs": False,
# Artificially limit the draft model max model len; this forces
# Artificially limit the draft model max model len; this forces vLLM # vLLM to skip speculation once the sequences grow beyond 32-k
# to skip speculation once the sequences grow beyond 32-k tokens. # tokens.
"speculative_max_model_len": 32, "max_model_len": 32,
},
}]) }])
@pytest.mark.parametrize("batch_size", [8]) @pytest.mark.parametrize("batch_size", [8])
@pytest.mark.parametrize( @pytest.mark.parametrize(
...@@ -149,7 +159,8 @@ def test_logprobs_when_skip_speculation(vllm_runner, common_llm_kwargs, ...@@ -149,7 +159,8 @@ def test_logprobs_when_skip_speculation(vllm_runner, common_llm_kwargs,
seed: int, logprobs: int): seed: int, logprobs: int):
"""Verify logprobs greedy equality when some sequences skip speculation. """Verify logprobs greedy equality when some sequences skip speculation.
""" """
run_equality_correctness_test(vllm_runner, run_equality_correctness_test(
vllm_runner,
common_llm_kwargs, common_llm_kwargs,
per_test_common_llm_kwargs, per_test_common_llm_kwargs,
baseline_llm_kwargs, baseline_llm_kwargs,
...@@ -159,8 +170,8 @@ def test_logprobs_when_skip_speculation(vllm_runner, common_llm_kwargs, ...@@ -159,8 +170,8 @@ def test_logprobs_when_skip_speculation(vllm_runner, common_llm_kwargs,
seed, seed,
temperature=0.0, temperature=0.0,
logprobs=logprobs, logprobs=logprobs,
disable_logprobs=test_llm_kwargs[ disable_logprobs=test_llm_kwargs["speculative_config"]
'disable_logprobs_during_spec_decoding']) ["disable_logprobs"])
@pytest.mark.parametrize( @pytest.mark.parametrize(
...@@ -173,12 +184,13 @@ def test_logprobs_when_skip_speculation(vllm_runner, common_llm_kwargs, ...@@ -173,12 +184,13 @@ def test_logprobs_when_skip_speculation(vllm_runner, common_llm_kwargs,
}]) }])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", @pytest.mark.parametrize("test_llm_kwargs", [{
[{ "speculative_config": {
"speculative_model": "JackFram/llama-160m", "model": "JackFram/llama-160m",
"num_speculative_tokens": 3, "num_speculative_tokens": 3,
"disable_logprobs_during_spec_decoding": False, "disable_logprobs": False,
}]) },
}])
@pytest.mark.parametrize("batch_size", [1]) @pytest.mark.parametrize("batch_size", [1])
@pytest.mark.parametrize( @pytest.mark.parametrize(
"output_len", "output_len",
...@@ -248,12 +260,13 @@ def test_logprobs_temp_1(vllm_runner, common_llm_kwargs, ...@@ -248,12 +260,13 @@ def test_logprobs_temp_1(vllm_runner, common_llm_kwargs,
}]) }])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", @pytest.mark.parametrize("test_llm_kwargs", [{
[{ "speculative_config": {
"speculative_model": "JackFram/llama-68m", "model": "JackFram/llama-68m",
"num_speculative_tokens": 3, "num_speculative_tokens": 3,
"disable_logprobs_during_spec_decoding": True, "disable_logprobs": True,
}]) },
}])
@pytest.mark.parametrize("seed", [1]) @pytest.mark.parametrize("seed", [1])
@pytest.mark.parametrize("batch_size", [4]) @pytest.mark.parametrize("batch_size", [4])
@pytest.mark.parametrize( @pytest.mark.parametrize(
...@@ -270,7 +283,8 @@ def test_logprobs_disabled(vllm_runner, common_llm_kwargs, ...@@ -270,7 +283,8 @@ def test_logprobs_disabled(vllm_runner, common_llm_kwargs,
"""Check the behavior when logprobs are disabled. """Check the behavior when logprobs are disabled.
Token choices should match with the base model. Token choices should match with the base model.
""" """
run_equality_correctness_test(vllm_runner, run_equality_correctness_test(
vllm_runner,
common_llm_kwargs, common_llm_kwargs,
per_test_common_llm_kwargs, per_test_common_llm_kwargs,
baseline_llm_kwargs, baseline_llm_kwargs,
...@@ -280,5 +294,5 @@ def test_logprobs_disabled(vllm_runner, common_llm_kwargs, ...@@ -280,5 +294,5 @@ def test_logprobs_disabled(vllm_runner, common_llm_kwargs,
seed, seed,
temperature=0.0, temperature=0.0,
logprobs=logprobs, logprobs=logprobs,
disable_logprobs=test_llm_kwargs[ disable_logprobs=test_llm_kwargs["speculative_config"]
'disable_logprobs_during_spec_decoding']) ["disable_logprobs"])
...@@ -60,9 +60,11 @@ PRECISION = "float32" ...@@ -60,9 +60,11 @@ PRECISION = "float32"
@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [ @pytest.mark.parametrize("test_llm_kwargs", [
{ {
"speculative_model": SPEC_MODEL, "speculative_config": {
"model": SPEC_MODEL,
"num_speculative_tokens": MAX_SPEC_TOKENS, "num_speculative_tokens": MAX_SPEC_TOKENS,
}, },
},
]) ])
@pytest.mark.parametrize("output_len", [ @pytest.mark.parametrize("output_len", [
128, 128,
...@@ -107,14 +109,18 @@ def test_medusa_e2e_greedy_correctness(vllm_runner, common_llm_kwargs, ...@@ -107,14 +109,18 @@ def test_medusa_e2e_greedy_correctness(vllm_runner, common_llm_kwargs,
@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [ @pytest.mark.parametrize("test_llm_kwargs", [
{ {
"speculative_model": SPEC_MODEL, "speculative_config": {
"model": SPEC_MODEL,
"num_speculative_tokens": MAX_SPEC_TOKENS, "num_speculative_tokens": MAX_SPEC_TOKENS,
"disable_logprobs_during_spec_decoding": False, "disable_logprobs": False,
},
}, },
{ {
"speculative_model": SPEC_MODEL, "speculative_config": {
"model": SPEC_MODEL,
"num_speculative_tokens": MAX_SPEC_TOKENS, "num_speculative_tokens": MAX_SPEC_TOKENS,
"disable_logprobs_during_spec_decoding": True, "disable_logprobs": True,
},
}, },
]) ])
@pytest.mark.parametrize("output_len", [ @pytest.mark.parametrize("output_len", [
...@@ -132,7 +138,8 @@ def test_medusa_e2e_greedy_logprobs(vllm_runner, common_llm_kwargs, ...@@ -132,7 +138,8 @@ def test_medusa_e2e_greedy_logprobs(vllm_runner, common_llm_kwargs,
prefill_chunk_size: int): prefill_chunk_size: int):
"""Verify greedy equality with different batch size.""" """Verify greedy equality with different batch size."""
maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs) maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs)
run_equality_correctness_test(vllm_runner, run_equality_correctness_test(
vllm_runner,
common_llm_kwargs, common_llm_kwargs,
per_test_common_llm_kwargs, per_test_common_llm_kwargs,
baseline_llm_kwargs, baseline_llm_kwargs,
...@@ -143,8 +150,8 @@ def test_medusa_e2e_greedy_logprobs(vllm_runner, common_llm_kwargs, ...@@ -143,8 +150,8 @@ def test_medusa_e2e_greedy_logprobs(vllm_runner, common_llm_kwargs,
temperature=0.0, temperature=0.0,
logprobs=logprobs, logprobs=logprobs,
prompt_logprobs=logprobs, prompt_logprobs=logprobs,
disable_logprobs=test_llm_kwargs[ disable_logprobs=test_llm_kwargs["speculative_config"]
'disable_logprobs_during_spec_decoding']) ["disable_logprobs"])
@pytest.mark.parametrize( @pytest.mark.parametrize(
...@@ -165,9 +172,11 @@ def test_medusa_e2e_greedy_logprobs(vllm_runner, common_llm_kwargs, ...@@ -165,9 +172,11 @@ def test_medusa_e2e_greedy_logprobs(vllm_runner, common_llm_kwargs,
@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [ @pytest.mark.parametrize("test_llm_kwargs", [
{ {
"speculative_model": SPEC_MODEL, "speculative_config": {
"model": SPEC_MODEL,
"num_speculative_tokens": MAX_SPEC_TOKENS, "num_speculative_tokens": MAX_SPEC_TOKENS,
}, },
},
]) ])
@pytest.mark.parametrize("output_len", [ @pytest.mark.parametrize("output_len", [
128, 128,
...@@ -214,9 +223,11 @@ def test_medusa_e2e_greedy_correctness_cuda_graph( ...@@ -214,9 +223,11 @@ def test_medusa_e2e_greedy_correctness_cuda_graph(
@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [ @pytest.mark.parametrize("test_llm_kwargs", [
{ {
"speculative_model": SPEC_MODEL, "speculative_config": {
"model": SPEC_MODEL,
"num_speculative_tokens": MAX_SPEC_TOKENS, "num_speculative_tokens": MAX_SPEC_TOKENS,
}, },
},
]) ])
@pytest.mark.parametrize( @pytest.mark.parametrize(
"output_len", "output_len",
...@@ -264,8 +275,10 @@ def test_medusa_e2e_greedy_correctness_with_preemption( ...@@ -264,8 +275,10 @@ def test_medusa_e2e_greedy_correctness_with_preemption(
"test_llm_kwargs", "test_llm_kwargs",
[ [
{ {
"speculative_model": SPEC_MODEL, "speculative_config": {
"model": SPEC_MODEL,
"num_speculative_tokens": k, "num_speculative_tokens": k,
},
} }
# Try a range of num. speculative tokens # Try a range of num. speculative tokens
for k in range(1, 1 + MAX_SPEC_TOKENS) for k in range(1, 1 + MAX_SPEC_TOKENS)
...@@ -312,12 +325,13 @@ def test_medusa_different_k(vllm_runner, common_llm_kwargs, ...@@ -312,12 +325,13 @@ def test_medusa_different_k(vllm_runner, common_llm_kwargs,
}]) }])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", @pytest.mark.parametrize("test_llm_kwargs", [{
[{ "speculative_config": {
"speculative_model": SPEC_MODEL, "model": SPEC_MODEL,
"num_speculative_tokens": MAX_SPEC_TOKENS, "num_speculative_tokens": MAX_SPEC_TOKENS,
"speculative_disable_by_batch_size": 4 "disable_by_batch_size": 4,
}]) },
}])
@pytest.mark.parametrize("batch_size", [1, 5]) @pytest.mark.parametrize("batch_size", [1, 5])
@pytest.mark.parametrize( @pytest.mark.parametrize(
"output_len", "output_len",
...@@ -359,16 +373,17 @@ def test_medusa_disable_queue(vllm_runner, common_llm_kwargs, ...@@ -359,16 +373,17 @@ def test_medusa_disable_queue(vllm_runner, common_llm_kwargs,
# Main model # Main model
"model_name": MAIN_MODEL, "model_name": MAIN_MODEL,
"speculative_model": SPEC_MODEL,
"num_speculative_tokens": MAX_SPEC_TOKENS,
"speculative_disable_by_batch_size": 4
}]) }])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", @pytest.mark.parametrize("test_llm_kwargs", [{
[{ "speculative_config": {
"speculative_disable_mqa_scorer": True, "model": SPEC_MODEL,
}]) "num_speculative_tokens": MAX_SPEC_TOKENS,
"disable_by_batch_size": 4,
"disable_mqa_scorer": True,
},
}])
@pytest.mark.parametrize("batch_size", [1, 5]) @pytest.mark.parametrize("batch_size", [1, 5])
@pytest.mark.parametrize( @pytest.mark.parametrize(
"output_len", "output_len",
......
...@@ -62,7 +62,9 @@ PRECISION = "float32" ...@@ -62,7 +62,9 @@ PRECISION = "float32"
@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [ @pytest.mark.parametrize("test_llm_kwargs", [
{ {
"speculative_model": SPEC_MODEL, "speculative_config": {
"model": SPEC_MODEL,
},
}, },
]) ])
@pytest.mark.parametrize("output_len", [ @pytest.mark.parametrize("output_len", [
...@@ -108,12 +110,16 @@ def test_mlp_e2e_greedy_correctness(vllm_runner, common_llm_kwargs, ...@@ -108,12 +110,16 @@ def test_mlp_e2e_greedy_correctness(vllm_runner, common_llm_kwargs,
@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [ @pytest.mark.parametrize("test_llm_kwargs", [
{ {
"speculative_model": SPEC_MODEL, "speculative_config": {
"disable_logprobs_during_spec_decoding": False, "model": SPEC_MODEL,
"disable_logprobs": False,
},
}, },
{ {
"speculative_model": SPEC_MODEL, "speculative_config": {
"disable_logprobs_during_spec_decoding": True, "model": SPEC_MODEL,
"disable_logprobs": True,
},
}, },
]) ])
@pytest.mark.parametrize("output_len", [8]) @pytest.mark.parametrize("output_len", [8])
...@@ -133,7 +139,8 @@ def test_mlp_e2e_greedy_logprobs(vllm_runner, common_llm_kwargs, ...@@ -133,7 +139,8 @@ def test_mlp_e2e_greedy_logprobs(vllm_runner, common_llm_kwargs,
# up sampling different tokens at the tail (ie top tokens don't change). # up sampling different tokens at the tail (ie top tokens don't change).
# TL;DR: sd+cp == org+cp but sd+cp != org..is this expected? # TL;DR: sd+cp == org+cp but sd+cp != org..is this expected?
maybe_enable_chunked_prefill(prefill_chunk_size, baseline_llm_kwargs) maybe_enable_chunked_prefill(prefill_chunk_size, baseline_llm_kwargs)
run_equality_correctness_test(vllm_runner, run_equality_correctness_test(
vllm_runner,
common_llm_kwargs, common_llm_kwargs,
per_test_common_llm_kwargs, per_test_common_llm_kwargs,
baseline_llm_kwargs, baseline_llm_kwargs,
...@@ -144,8 +151,8 @@ def test_mlp_e2e_greedy_logprobs(vllm_runner, common_llm_kwargs, ...@@ -144,8 +151,8 @@ def test_mlp_e2e_greedy_logprobs(vllm_runner, common_llm_kwargs,
temperature=0.0, temperature=0.0,
logprobs=logprobs, logprobs=logprobs,
prompt_logprobs=logprobs, prompt_logprobs=logprobs,
disable_logprobs=test_llm_kwargs[ disable_logprobs=test_llm_kwargs["speculative_config"]
'disable_logprobs_during_spec_decoding']) ["disable_logprobs"])
@pytest.mark.parametrize( @pytest.mark.parametrize(
...@@ -167,7 +174,9 @@ def test_mlp_e2e_greedy_logprobs(vllm_runner, common_llm_kwargs, ...@@ -167,7 +174,9 @@ def test_mlp_e2e_greedy_logprobs(vllm_runner, common_llm_kwargs,
@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [ @pytest.mark.parametrize("test_llm_kwargs", [
{ {
"speculative_model": SPEC_MODEL, "speculative_config": {
"model": SPEC_MODEL,
},
}, },
]) ])
@pytest.mark.parametrize("output_len", [2048]) @pytest.mark.parametrize("output_len", [2048])
...@@ -209,8 +218,10 @@ def test_mlp_e2e_acceptance_rate(vllm_runner, common_llm_kwargs, ...@@ -209,8 +218,10 @@ def test_mlp_e2e_acceptance_rate(vllm_runner, common_llm_kwargs,
# Main model # Main model
"model_name": MAIN_MODEL, "model_name": MAIN_MODEL,
# Speculative model # Speculative config
"speculative_model": SPEC_MODEL, "speculative_config": {
"model": SPEC_MODEL,
},
}]) }])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{"seed": 1}]) @pytest.mark.parametrize("baseline_llm_kwargs", [{"seed": 1}])
...@@ -274,7 +285,9 @@ def test_mlp_e2e_seeded_correctness(vllm_runner, common_llm_kwargs, ...@@ -274,7 +285,9 @@ def test_mlp_e2e_seeded_correctness(vllm_runner, common_llm_kwargs,
@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [ @pytest.mark.parametrize("test_llm_kwargs", [
{ {
"speculative_model": SPEC_MODEL, "speculative_config": {
"model": SPEC_MODEL,
},
}, },
]) ])
@pytest.mark.parametrize( @pytest.mark.parametrize(
...@@ -326,7 +339,9 @@ def test_mlp_e2e_greedy_correctness_with_preemption( ...@@ -326,7 +339,9 @@ def test_mlp_e2e_greedy_correctness_with_preemption(
@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [ @pytest.mark.parametrize("test_llm_kwargs", [
{ {
"speculative_model": SPEC_MODEL, "speculative_config": {
"model": SPEC_MODEL,
},
}, },
]) ])
@pytest.mark.parametrize( @pytest.mark.parametrize(
...@@ -382,8 +397,10 @@ def test_mlp_e2e_greedy_correctness_with_padding( ...@@ -382,8 +397,10 @@ def test_mlp_e2e_greedy_correctness_with_padding(
"test_llm_kwargs", "test_llm_kwargs",
[ [
{ {
"speculative_model": SPEC_MODEL, "speculative_config": {
"model": SPEC_MODEL,
"num_speculative_tokens": k, "num_speculative_tokens": k,
},
} }
# Try a range of num. speculative tokens # Try a range of num. speculative tokens
for k in range(1, 1 + MAX_SPEC_TOKENS) for k in range(1, 1 + MAX_SPEC_TOKENS)
...@@ -430,11 +447,12 @@ def test_mlp_different_k(vllm_runner, common_llm_kwargs, ...@@ -430,11 +447,12 @@ def test_mlp_different_k(vllm_runner, common_llm_kwargs,
}]) }])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", @pytest.mark.parametrize("test_llm_kwargs", [{
[{ "speculative_config": {
"speculative_model": SPEC_MODEL, "model": SPEC_MODEL,
"speculative_disable_by_batch_size": 4 "disable_by_batch_size": 4,
}]) },
}])
@pytest.mark.parametrize("batch_size", [1, 5]) @pytest.mark.parametrize("batch_size", [1, 5])
@pytest.mark.parametrize( @pytest.mark.parametrize(
"output_len", "output_len",
...@@ -475,14 +493,15 @@ def test_mlp_disable_queue(vllm_runner, common_llm_kwargs, ...@@ -475,14 +493,15 @@ def test_mlp_disable_queue(vllm_runner, common_llm_kwargs,
# Skip cuda graph recording for fast test. # Skip cuda graph recording for fast test.
"enforce_eager": True, "enforce_eager": True,
"speculative_model": SPEC_MODEL,
}]) }])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", @pytest.mark.parametrize("test_llm_kwargs", [{
[{ "speculative_config": {
"speculative_disable_mqa_scorer": True, "model": SPEC_MODEL,
}]) "disable_mqa_scorer": True,
},
}])
@pytest.mark.parametrize("batch_size", [1, 5]) @pytest.mark.parametrize("batch_size", [1, 5])
@pytest.mark.parametrize( @pytest.mark.parametrize(
"output_len", "output_len",
......
...@@ -57,8 +57,10 @@ PRECISION = "bfloat16" ...@@ -57,8 +57,10 @@ PRECISION = "bfloat16"
@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [ @pytest.mark.parametrize("test_llm_kwargs", [
{ {
"speculative_config": {
"num_speculative_tokens": MAX_SPEC_TOKENS, "num_speculative_tokens": MAX_SPEC_TOKENS,
}, },
},
]) ])
@pytest.mark.parametrize("output_len", [ @pytest.mark.parametrize("output_len", [
128, 128,
...@@ -99,12 +101,16 @@ def test_mtp_e2e_greedy_correctness(vllm_runner, common_llm_kwargs, ...@@ -99,12 +101,16 @@ def test_mtp_e2e_greedy_correctness(vllm_runner, common_llm_kwargs,
@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [ @pytest.mark.parametrize("test_llm_kwargs", [
{ {
"speculative_config": {
"num_speculative_tokens": MAX_SPEC_TOKENS, "num_speculative_tokens": MAX_SPEC_TOKENS,
"disable_logprobs_during_spec_decoding": False, "disable_logprobs": False,
},
}, },
{ {
"speculative_config": {
"num_speculative_tokens": MAX_SPEC_TOKENS, "num_speculative_tokens": MAX_SPEC_TOKENS,
"disable_logprobs_during_spec_decoding": True, "disable_logprobs": True,
},
}, },
]) ])
@pytest.mark.parametrize("output_len", [ @pytest.mark.parametrize("output_len", [
...@@ -119,7 +125,8 @@ def test_mtp_e2e_greedy_logprobs(vllm_runner, common_llm_kwargs, ...@@ -119,7 +125,8 @@ def test_mtp_e2e_greedy_logprobs(vllm_runner, common_llm_kwargs,
batch_size: int, output_len: int, seed: int, batch_size: int, output_len: int, seed: int,
logprobs: int): logprobs: int):
run_equality_correctness_test(vllm_runner, run_equality_correctness_test(
vllm_runner,
common_llm_kwargs, common_llm_kwargs,
per_test_common_llm_kwargs, per_test_common_llm_kwargs,
baseline_llm_kwargs, baseline_llm_kwargs,
...@@ -129,8 +136,8 @@ def test_mtp_e2e_greedy_logprobs(vllm_runner, common_llm_kwargs, ...@@ -129,8 +136,8 @@ def test_mtp_e2e_greedy_logprobs(vllm_runner, common_llm_kwargs,
seed, seed,
logprobs=logprobs, logprobs=logprobs,
prompt_logprobs=logprobs, prompt_logprobs=logprobs,
disable_logprobs=test_llm_kwargs[ disable_logprobs=test_llm_kwargs["speculative_config"]
'disable_logprobs_during_spec_decoding']) ["disable_logprobs"])
@pytest.mark.parametrize( @pytest.mark.parametrize(
...@@ -152,8 +159,10 @@ def test_mtp_e2e_greedy_logprobs(vllm_runner, common_llm_kwargs, ...@@ -152,8 +159,10 @@ def test_mtp_e2e_greedy_logprobs(vllm_runner, common_llm_kwargs,
@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [ @pytest.mark.parametrize("test_llm_kwargs", [
{ {
"speculative_config": {
"num_speculative_tokens": MAX_SPEC_TOKENS, "num_speculative_tokens": MAX_SPEC_TOKENS,
}, },
},
]) ])
@pytest.mark.parametrize("output_len", [ @pytest.mark.parametrize("output_len", [
128, 128,
...@@ -198,8 +207,10 @@ def test_mtp_e2e_greedy_correctness_cuda_graph(vllm_runner, common_llm_kwargs, ...@@ -198,8 +207,10 @@ def test_mtp_e2e_greedy_correctness_cuda_graph(vllm_runner, common_llm_kwargs,
@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [ @pytest.mark.parametrize("test_llm_kwargs", [
{ {
"speculative_config": {
"num_speculative_tokens": MAX_SPEC_TOKENS, "num_speculative_tokens": MAX_SPEC_TOKENS,
}, },
},
]) ])
@pytest.mark.parametrize( @pytest.mark.parametrize(
"output_len", "output_len",
...@@ -243,7 +254,9 @@ def test_mtp_e2e_greedy_correctness_with_preemption( ...@@ -243,7 +254,9 @@ def test_mtp_e2e_greedy_correctness_with_preemption(
"test_llm_kwargs", "test_llm_kwargs",
[ [
{ {
"speculative_config": {
"num_speculative_tokens": k, "num_speculative_tokens": k,
},
} }
# Try a range of num. speculative tokens # Try a range of num. speculative tokens
for k in range(1, 1 + MAX_SPEC_TOKENS) for k in range(1, 1 + MAX_SPEC_TOKENS)
...@@ -286,11 +299,12 @@ def test_mtp_different_k(vllm_runner, common_llm_kwargs, ...@@ -286,11 +299,12 @@ def test_mtp_different_k(vllm_runner, common_llm_kwargs,
}]) }])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", @pytest.mark.parametrize("test_llm_kwargs", [{
[{ "speculative_config": {
"num_speculative_tokens": MAX_SPEC_TOKENS, "num_speculative_tokens": MAX_SPEC_TOKENS,
"speculative_disable_by_batch_size": 4 "disable_by_batch_size": 4
}]) },
}])
@pytest.mark.parametrize("batch_size", [1, 5]) @pytest.mark.parametrize("batch_size", [1, 5])
@pytest.mark.parametrize( @pytest.mark.parametrize(
"output_len", "output_len",
......
...@@ -61,15 +61,19 @@ from .conftest import (get_output_from_llm_generator, ...@@ -61,15 +61,19 @@ from .conftest import (get_output_from_llm_generator,
"per_test_common_llm_kwargs", "per_test_common_llm_kwargs",
[ [
{ {
"speculative_model": "JackFram/llama-68m", "speculative_config": {
"model": "JackFram/llama-68m",
"num_speculative_tokens": 5, "num_speculative_tokens": 5,
},
"enable_chunked_prefill": False, "enable_chunked_prefill": False,
}, },
{ {
# Chunked prefill enabled with small value # Chunked prefill enabled with small value
# to make sure we get mixed batches. # to make sure we get mixed batches.
"speculative_model": "JackFram/llama-68m", "speculative_config": {
"model": "JackFram/llama-68m",
"num_speculative_tokens": 5, "num_speculative_tokens": 5,
},
"enable_chunked_prefill": True, "enable_chunked_prefill": True,
"max_num_batched_tokens": 4, "max_num_batched_tokens": 4,
"max_num_seqs": 4 "max_num_seqs": 4
...@@ -148,20 +152,23 @@ def test_spec_decode_e2e_with_detokenization(test_llm_generator, ...@@ -148,20 +152,23 @@ def test_spec_decode_e2e_with_detokenization(test_llm_generator,
}, },
]) ])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", @pytest.mark.parametrize("test_llm_kwargs", [{
[{ "speculative_config": {
"speculative_model": "JackFram/llama-68m", "model": "JackFram/llama-68m",
"num_speculative_tokens": 5, "num_speculative_tokens": 5,
"disable_logprobs": False,
},
"enable_chunked_prefill": False, "enable_chunked_prefill": False,
"disable_logprobs_during_spec_decoding": False }, {
}, { "speculative_config": {
"speculative_model": "JackFram/llama-68m", "model": "JackFram/llama-68m",
"num_speculative_tokens": 3, "num_speculative_tokens": 3,
"disable_logprobs": False,
},
"enable_chunked_prefill": True, "enable_chunked_prefill": True,
"max_num_batched_tokens": 4, "max_num_batched_tokens": 4,
"max_num_seqs": 4, "max_num_seqs": 4,
"disable_logprobs_during_spec_decoding": False }])
}])
@pytest.mark.parametrize( @pytest.mark.parametrize(
"output_len", "output_len",
[ [
...@@ -184,7 +191,7 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_bs1( ...@@ -184,7 +191,7 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_bs1(
whether all speculative tokens are accepted. whether all speculative tokens are accepted.
""" """
ensure_all_accepted = per_test_common_llm_kwargs.get( ensure_all_accepted = per_test_common_llm_kwargs.get(
"model_name") == test_llm_kwargs.get("speculative_model") "model_name") == test_llm_kwargs.get("speculative_config")["model"]
run_equality_correctness_test(vllm_runner, run_equality_correctness_test(vllm_runner,
common_llm_kwargs, common_llm_kwargs,
per_test_common_llm_kwargs, per_test_common_llm_kwargs,
...@@ -224,13 +231,17 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_bs1( ...@@ -224,13 +231,17 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_bs1(
@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [ @pytest.mark.parametrize("test_llm_kwargs", [
{ {
"speculative_model": "JackFram/llama-68m", "speculative_config": {
"model": "JackFram/llama-68m",
"num_speculative_tokens": 5, "num_speculative_tokens": 5,
},
"enable_chunked_prefill": False, "enable_chunked_prefill": False,
}, },
{ {
"speculative_model": "JackFram/llama-68m", "speculative_config": {
"model": "JackFram/llama-68m",
"num_speculative_tokens": 5, "num_speculative_tokens": 5,
},
"enable_chunked_prefill": True, "enable_chunked_prefill": True,
"max_num_batched_tokens": 4, "max_num_batched_tokens": 4,
"max_num_seqs": 4 "max_num_seqs": 4
...@@ -283,13 +294,17 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_large_bs( ...@@ -283,13 +294,17 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_large_bs(
@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [ @pytest.mark.parametrize("test_llm_kwargs", [
{ {
"speculative_model": "JackFram/llama-68m", "speculative_config": {
"model": "JackFram/llama-68m",
"num_speculative_tokens": 5, "num_speculative_tokens": 5,
},
"enable_chunked_prefill": False, "enable_chunked_prefill": False,
}, },
{ {
"speculative_model": "JackFram/llama-68m", "speculative_config": {
"model": "JackFram/llama-68m",
"num_speculative_tokens": 5, "num_speculative_tokens": 5,
},
"enable_chunked_prefill": True, "enable_chunked_prefill": True,
"max_num_batched_tokens": 4, "max_num_batched_tokens": 4,
"max_num_seqs": 4 "max_num_seqs": 4
...@@ -336,13 +351,17 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_large_bs_diff_output_len( ...@@ -336,13 +351,17 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_large_bs_diff_output_len(
@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [ @pytest.mark.parametrize("test_llm_kwargs", [
{ {
"speculative_model": "JackFram/llama-68m", "speculative_config": {
"model": "JackFram/llama-68m",
"num_speculative_tokens": 5, "num_speculative_tokens": 5,
},
"enable_chunked_prefill": False, "enable_chunked_prefill": False,
}, },
{ {
"speculative_model": "JackFram/llama-68m", "speculative_config": {
"model": "JackFram/llama-68m",
"num_speculative_tokens": 5, "num_speculative_tokens": 5,
},
"enable_chunked_prefill": True, "enable_chunked_prefill": True,
"max_num_batched_tokens": 4, "max_num_batched_tokens": 4,
"max_num_seqs": 4 "max_num_seqs": 4
...@@ -391,13 +410,17 @@ def test_spec_decode_e2e_greedy_correctness_real_model_bs1( ...@@ -391,13 +410,17 @@ def test_spec_decode_e2e_greedy_correctness_real_model_bs1(
@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [ @pytest.mark.parametrize("test_llm_kwargs", [
{ {
"speculative_model": "JackFram/llama-68m", "speculative_config": {
"model": "JackFram/llama-68m",
"num_speculative_tokens": 5, "num_speculative_tokens": 5,
},
"enable_chunked_prefill": False, "enable_chunked_prefill": False,
}, },
{ {
"speculative_model": "JackFram/llama-68m", "speculative_config": {
"model": "JackFram/llama-68m",
"num_speculative_tokens": 5, "num_speculative_tokens": 5,
},
"enable_chunked_prefill": True, "enable_chunked_prefill": True,
"max_num_batched_tokens": 4, "max_num_batched_tokens": 4,
"max_num_seqs": 4 "max_num_seqs": 4
...@@ -449,13 +472,17 @@ def test_spec_decode_e2e_greedy_correctness_real_model_large_bs( ...@@ -449,13 +472,17 @@ def test_spec_decode_e2e_greedy_correctness_real_model_large_bs(
@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [ @pytest.mark.parametrize("test_llm_kwargs", [
{ {
"speculative_model": "JackFram/llama-68m", "speculative_config": {
"model": "JackFram/llama-68m",
"num_speculative_tokens": 5, "num_speculative_tokens": 5,
},
"enable_chunked_prefill": False, "enable_chunked_prefill": False,
}, },
{ {
"speculative_model": "JackFram/llama-68m", "speculative_config": {
"model": "JackFram/llama-68m",
"num_speculative_tokens": 5, "num_speculative_tokens": 5,
},
"enable_chunked_prefill": True, "enable_chunked_prefill": True,
"max_num_batched_tokens": 4, "max_num_batched_tokens": 4,
"max_num_seqs": 4 "max_num_seqs": 4
...@@ -514,13 +541,17 @@ def test_spec_decode_e2e_greedy_correctness_with_preemption( ...@@ -514,13 +541,17 @@ def test_spec_decode_e2e_greedy_correctness_with_preemption(
@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [ @pytest.mark.parametrize("test_llm_kwargs", [
{ {
"speculative_model": "JackFram/llama-68m", "speculative_config": {
"model": "JackFram/llama-68m",
"num_speculative_tokens": 5, "num_speculative_tokens": 5,
},
"enable_chunked_prefill": False, "enable_chunked_prefill": False,
}, },
{ {
"speculative_model": "JackFram/llama-68m", "speculative_config": {
"model": "JackFram/llama-68m",
"num_speculative_tokens": 5, "num_speculative_tokens": 5,
},
"enable_chunked_prefill": True, "enable_chunked_prefill": True,
"max_num_batched_tokens": 4, "max_num_batched_tokens": 4,
"max_num_seqs": 4 "max_num_seqs": 4
...@@ -567,21 +598,25 @@ def test_spec_decode_different_block_size(vllm_runner, common_llm_kwargs, ...@@ -567,21 +598,25 @@ def test_spec_decode_different_block_size(vllm_runner, common_llm_kwargs,
"test_llm_kwargs", "test_llm_kwargs",
[ [
{ {
"speculative_model": "JackFram/llama-68m",
"num_speculative_tokens": 5,
# Artificially limit the draft model max model len; this forces vLLM # Artificially limit the draft model max model len; this forces vLLM
# to skip speculation once the sequences grow beyond 32-k tokens. # to skip speculation once the sequences grow beyond 32-k tokens.
"speculative_max_model_len": 32, "speculative_config": {
"model": "JackFram/llama-68m",
"num_speculative_tokens": 5,
"max_model_len": 32,
},
"enable_chunked_prefill": False, "enable_chunked_prefill": False,
}, },
{ {
"speculative_model": "JackFram/llama-68m", "speculative_config": {
"model": "JackFram/llama-68m",
"num_speculative_tokens": 5, "num_speculative_tokens": 5,
"max_model_len": 32,
},
"enable_chunked_prefill": True, "enable_chunked_prefill": True,
"max_num_batched_tokens": 4, "max_num_batched_tokens": 4,
"max_num_seqs": 4, "max_num_seqs": 4,
"speculative_max_model_len": 32,
}, },
]) ])
@pytest.mark.parametrize("batch_size", [8]) @pytest.mark.parametrize("batch_size", [8])
...@@ -627,15 +662,19 @@ def test_skip_speculation(vllm_runner, common_llm_kwargs, ...@@ -627,15 +662,19 @@ def test_skip_speculation(vllm_runner, common_llm_kwargs,
@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [ @pytest.mark.parametrize("test_llm_kwargs", [
{ {
"speculative_model": "JackFram/llama-68m", "speculative_config": {
"model": "JackFram/llama-68m",
"num_speculative_tokens": 5, "num_speculative_tokens": 5,
"speculative_disable_by_batch_size": 2, "disable_by_batch_size": 2,
},
"enable_chunked_prefill": False, "enable_chunked_prefill": False,
}, },
{ {
"speculative_model": "JackFram/llama-68m", "speculative_config": {
"model": "JackFram/llama-68m",
"num_speculative_tokens": 5, "num_speculative_tokens": 5,
"speculative_disable_by_batch_size": 2, "disable_by_batch_size": 2,
},
"enable_chunked_prefill": True, "enable_chunked_prefill": True,
"max_num_batched_tokens": 4, "max_num_batched_tokens": 4,
"max_num_seqs": 4, "max_num_seqs": 4,
...@@ -676,15 +715,19 @@ def test_disable_speculation(vllm_runner, common_llm_kwargs, ...@@ -676,15 +715,19 @@ def test_disable_speculation(vllm_runner, common_llm_kwargs,
"test_llm_kwargs", "test_llm_kwargs",
[ [
{ {
"speculative_model": "JackFram/llama-68m", "speculative_config": {
"model": "JackFram/llama-68m",
"num_speculative_tokens": k, "num_speculative_tokens": k,
},
"enable_chunked_prefill": False, "enable_chunked_prefill": False,
} }
# Try a range of common k, as well as large speculation. # Try a range of common k, as well as large speculation.
for k in [1, 2, 3, 4, 5, 6, 7, 8, 9, 63] for k in [1, 2, 3, 4, 5, 6, 7, 8, 9, 63]
] + [{ ] + [{
"speculative_model": "JackFram/llama-68m", "speculative_config": {
"model": "JackFram/llama-68m",
"num_speculative_tokens": k, "num_speculative_tokens": k,
},
"enable_chunked_prefill": True, "enable_chunked_prefill": True,
"max_num_batched_tokens": 4, "max_num_batched_tokens": 4,
"max_num_seqs": 4, "max_num_seqs": 4,
...@@ -729,17 +772,21 @@ def test_many_k(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, ...@@ -729,17 +772,21 @@ def test_many_k(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
"test_llm_kwargs", "test_llm_kwargs",
[ [
{ {
"speculative_model": "JackFram/llama-68m", "speculative_config": {
"model": "JackFram/llama-68m",
"num_speculative_tokens": k, "num_speculative_tokens": k,
"spec_decoding_acceptance_method": "typical_acceptance_sampler", "acceptance_method": "typical_acceptance_sampler",
},
"enable_chunked_prefill": False "enable_chunked_prefill": False
} }
# Try a range of common k. # Try a range of common k.
for k in [1, 2, 3] for k in [1, 2, 3]
] + [{ ] + [{
"speculative_model": "JackFram/llama-68m", "speculative_config": {
"model": "JackFram/llama-68m",
"num_speculative_tokens": k, "num_speculative_tokens": k,
"spec_decoding_acceptance_method": "typical_acceptance_sampler", "acceptance_method": "typical_acceptance_sampler",
},
"enable_chunked_prefill": True, "enable_chunked_prefill": True,
"max_num_batched_tokens": 4, "max_num_batched_tokens": 4,
"max_num_seqs": 4 "max_num_seqs": 4
......
...@@ -19,11 +19,11 @@ SPEC_MODEL = "JackFram/llama-160m" ...@@ -19,11 +19,11 @@ SPEC_MODEL = "JackFram/llama-160m"
# Skip cuda graph recording for fast test. # Skip cuda graph recording for fast test.
"enforce_eager": True, "enforce_eager": True,
# speculative model # speculative config
"speculative_model": "JackFram/llama-160m", "speculative_config": {
"model": "JackFram/llama-160m",
# num speculative tokens
"num_speculative_tokens": 3, "num_speculative_tokens": 3,
},
}]) }])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{"seed": 1}]) @pytest.mark.parametrize("baseline_llm_kwargs", [{"seed": 1}])
......
...@@ -41,10 +41,10 @@ async def test_tokenizer_group(tokenizer_group_type): ...@@ -41,10 +41,10 @@ async def test_tokenizer_group(tokenizer_group_type):
max_input_length=None, max_input_length=None,
) )
assert reference_tokenizer.encode("prompt") == tokenizer_group.encode( assert reference_tokenizer.encode("prompt") == tokenizer_group.encode(
request_id="request_id", prompt="prompt", lora_request=None) prompt="prompt", lora_request=None)
assert reference_tokenizer.encode( assert reference_tokenizer.encode(
"prompt") == await tokenizer_group.encode_async( "prompt") == await tokenizer_group.encode_async(prompt="prompt",
request_id="request_id", prompt="prompt", lora_request=None) lora_request=None)
assert isinstance(tokenizer_group.get_lora_tokenizer(None), assert isinstance(tokenizer_group.get_lora_tokenizer(None),
PreTrainedTokenizerBase) PreTrainedTokenizerBase)
assert tokenizer_group.get_lora_tokenizer( assert tokenizer_group.get_lora_tokenizer(
...@@ -69,8 +69,7 @@ async def test_tokenizer_group_pool(tokenizer_group_type): ...@@ -69,8 +69,7 @@ async def test_tokenizer_group_pool(tokenizer_group_type):
# and check that all requests are processed correctly. # and check that all requests are processed correctly.
num_requests = tokenizer_group_pool.pool_size * 5 num_requests = tokenizer_group_pool.pool_size * 5
requests = [ requests = [
tokenizer_group_pool.encode_async(request_id=str(i), tokenizer_group_pool.encode_async(prompt=f"prompt {i}",
prompt=f"prompt {i}",
lora_request=None) lora_request=None)
for i in range(num_requests) for i in range(num_requests)
] ]
...@@ -161,12 +160,8 @@ async def test_tokenizer_group_ray_pool_fault_tolerance(tokenizer_group_type): ...@@ -161,12 +160,8 @@ async def test_tokenizer_group_ray_pool_fault_tolerance(tokenizer_group_type):
fail_at[0] = 1000 fail_at[0] = 1000
# We should recover successfully. # We should recover successfully.
await tokenizer_group_pool.encode_async(request_id="1", await tokenizer_group_pool.encode_async(prompt="prompt", lora_request=None)
prompt="prompt", await tokenizer_group_pool.encode_async(prompt="prompt", lora_request=None)
lora_request=None)
await tokenizer_group_pool.encode_async(request_id="1",
prompt="prompt",
lora_request=None)
# Check that we have a new actor # Check that we have a new actor
assert len(tokenizer_group_pool.tokenizer_actors) == len(tokenizer_actors) assert len(tokenizer_group_pool.tokenizer_actors) == len(tokenizer_actors)
...@@ -184,8 +179,7 @@ async def test_tokenizer_group_ray_pool_fault_tolerance(tokenizer_group_type): ...@@ -184,8 +179,7 @@ async def test_tokenizer_group_ray_pool_fault_tolerance(tokenizer_group_type):
# We should fail after re-initialization. # We should fail after re-initialization.
with pytest.raises(RuntimeError): with pytest.raises(RuntimeError):
await tokenizer_group_pool.encode_async(request_id="1", await tokenizer_group_pool.encode_async(prompt="prompt",
prompt="prompt",
lora_request=None) lora_request=None)
# check_health should raise the same thing # check_health should raise the same thing
...@@ -206,11 +200,8 @@ async def test_tokenizer_group_ray_pool_fault_tolerance(tokenizer_group_type): ...@@ -206,11 +200,8 @@ async def test_tokenizer_group_ray_pool_fault_tolerance(tokenizer_group_type):
# Prompt too long error # Prompt too long error
with pytest.raises(ValueError): with pytest.raises(ValueError):
await tokenizer_group_pool.encode_async(request_id="1", await tokenizer_group_pool.encode_async(prompt="prompt" * 100,
prompt="prompt" * 100,
lora_request=None)
await tokenizer_group_pool.encode_async(request_id="1",
prompt="prompt",
lora_request=None) lora_request=None)
await tokenizer_group_pool.encode_async(prompt="prompt", lora_request=None)
# Actors should stay the same. # Actors should stay the same.
assert tokenizer_group_pool.tokenizer_actors == tokenizer_actors assert tokenizer_group_pool.tokenizer_actors == tokenizer_actors
...@@ -786,7 +786,7 @@ def large_gpu_mark(min_gb: int) -> pytest.MarkDecorator: ...@@ -786,7 +786,7 @@ def large_gpu_mark(min_gb: int) -> pytest.MarkDecorator:
without enough resources, or called when filtering tests to run directly. without enough resources, or called when filtering tests to run directly.
""" """
try: try:
if current_platform.is_cpu() or current_platform.is_openvino(): if current_platform.is_cpu():
memory_gb = 0 memory_gb = 0
else: else:
memory_gb = current_platform.get_device_total_memory() / GB_bytes memory_gb = current_platform.get_device_total_memory() / GB_bytes
......
This diff is collapsed.
...@@ -6,7 +6,8 @@ import pytest ...@@ -6,7 +6,8 @@ import pytest
from vllm.config import CacheConfig, ModelConfig, SchedulerConfig, VllmConfig from vllm.config import CacheConfig, ModelConfig, SchedulerConfig, VllmConfig
from vllm.multimodal.inputs import MultiModalKwargs, PlaceholderRange from vllm.multimodal.inputs import MultiModalKwargs, PlaceholderRange
from vllm.sampling_params import SamplingParams from vllm.sampling_params import SamplingParams
from vllm.v1.core.scheduler import Scheduler, SchedulerOutput from vllm.v1.core.sched.output import SchedulerOutput
from vllm.v1.core.sched.scheduler import Scheduler
from vllm.v1.outputs import ModelRunnerOutput from vllm.v1.outputs import ModelRunnerOutput
from vllm.v1.request import Request, RequestStatus from vllm.v1.request import Request, RequestStatus
from vllm.v1.structured_output import StructuredOutputManager from vllm.v1.structured_output import StructuredOutputManager
......
...@@ -70,12 +70,16 @@ def test_ngram_correctness( ...@@ -70,12 +70,16 @@ def test_ngram_correctness(
ref_outputs = ref_llm.chat(test_prompts, sampling_config) ref_outputs = ref_llm.chat(test_prompts, sampling_config)
del ref_llm del ref_llm
spec_llm = LLM(model=model_name, spec_llm = LLM(
speculative_model='[ngram]', model=model_name,
ngram_prompt_lookup_max=5, speculative_config={
ngram_prompt_lookup_min=3, "method": "ngram",
num_speculative_tokens=3, "prompt_lookup_max": 5,
max_model_len=1024) "prompt_lookup_min": 3,
"num_speculative_tokens": 3,
},
max_model_len=1024,
)
spec_outputs = spec_llm.chat(test_prompts, sampling_config) spec_outputs = spec_llm.chat(test_prompts, sampling_config)
matches = 0 matches = 0
misses = 0 misses = 0
......
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment