Fix some speculative decode tests with tl.dot (#17371)

Signed-off-by: Huy Do <huydhn@gmail.com>

Fix some speculative decode tests with tl.dot (#17371)
Signed-off-by: Huy Do <huydhn@gmail.com>
88fcf00d · Huy Do · GitHub · d1f569b1 · 88fcf00d
Unverified commit 88fcf00d, authored Apr 29, 2025 by Huy Do; committed by GitHub on Apr 29, 2025
Hide whitespace changes
Inline Side-by-side

Showing with 3 additions and 6 deletions

tests/spec_decode/e2e/test_multistep_correctness.py tests/spec_decode/e2e/test_multistep_correctness.py +3 -6

No files found.
--- a/tests/spec_decode/e2e/test_multistep_correctness.py
+++ b/tests/spec_decode/e2e/test_multistep_correctness.py
@@ -456,7 +456,7 @@ def test_spec_decode_e2e_greedy_correctness_real_model_large_bs(
 @pytest.mark.parametrize(
    "common_llm_kwargs",
    [{
-        "block_size": 8,
+        "block_size": 16,
        # 2 for small prompt, 256//8 for generated.
        "num_gpu_blocks_override": 2 + 256 // 8,
        "max_model_len": (2 + 256 // 8) * 8,
@@ -526,11 +526,8 @@ def test_spec_decode_e2e_greedy_correctness_with_preemption(
 @pytest.mark.parametrize(
    "per_test_common_llm_kwargs",
    [
-        # As of this writing, vLLM only compiles with these 3 block sizes by
-        # default.
-        {
-            "block_size": 8,
-        },
+        # block_size 8 is not tested here: tl.dot doesn't support sizes < 16,
+        # see https://github.com/triton-lang/triton/issues/2266
        {
            "block_size": 16,
        },