Unverified commit 88fcf00d, authored by Huy Do and committed by GitHub
Browse files

Fix some speculative decode tests with tl.dot (#17371)


Signed-off-by: Huy Do <huydhn@gmail.com>
parent d1f569b1
...@@ -456,7 +456,7 @@ def test_spec_decode_e2e_greedy_correctness_real_model_large_bs( ...@@ -456,7 +456,7 @@ def test_spec_decode_e2e_greedy_correctness_real_model_large_bs(
@pytest.mark.parametrize( @pytest.mark.parametrize(
"common_llm_kwargs", "common_llm_kwargs",
[{ [{
"block_size": 8, "block_size": 16,
# 2 for small prompt, 256//8 for generated. # 2 for small prompt, 256//8 for generated.
"num_gpu_blocks_override": 2 + 256 // 8, "num_gpu_blocks_override": 2 + 256 // 8,
"max_model_len": (2 + 256 // 8) * 8, "max_model_len": (2 + 256 // 8) * 8,
...@@ -526,11 +526,8 @@ def test_spec_decode_e2e_greedy_correctness_with_preemption( ...@@ -526,11 +526,8 @@ def test_spec_decode_e2e_greedy_correctness_with_preemption(
@pytest.mark.parametrize( @pytest.mark.parametrize(
"per_test_common_llm_kwargs", "per_test_common_llm_kwargs",
[ [
# As of this writing, vLLM only compiles with these 3 block sizes by # https://github.com/triton-lang/triton/issues/2266 tl.dot
# default. # doesn't support embedding < 16
{
"block_size": 8,
},
{ {
"block_size": 16, "block_size": 16,
}, },
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment