[Core] Fix edge case in chunked prefill + block manager v2 (#7380)

baa24025 · Cade Daniel · GitHub · 999ef0b9 · baa24025 · baa24025
Unverified commit baa24025, authored Aug 09, 2024 by Cade Daniel; committed by GitHub on Aug 09, 2024
Show whitespace changes
Inline Side-by-side

Showing with 21 additions and 3 deletions

tests/core/block/e2e/test_correctness.py +15 -3

vllm/core/block/block_table.py +6 -0

No files found.
--- a/tests/core/block/e2e/test_correctness.py
+++ b/tests/core/block/e2e/test_correctness.py
@@ -261,11 +261,22 @@ def test_lookahead_greedy_equality_with_preemption(baseline_llm_generator,
            # skip cuda graph creation for fast test.
            "enforce_eager": True,
            "enable_chunked_prefill": True,
-            "max_num_batched_tokens": 2,
-            "max_num_seqs": 2,
        },
    ])
-@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
+@pytest.mark.parametrize("per_test_common_llm_kwargs",
+                         [{
+                             "block_size": 8,
+                             "max_num_batched_tokens": 2,
+                             "max_num_seqs": 2,
+                         }, {
+                             "block_size": 8,
+                             "max_num_batched_tokens": 3,
+                             "max_num_seqs": 2,
+                         }, {
+                             "block_size": 8,
+                             "max_num_batched_tokens": 256,
+                             "max_num_seqs": 10,
+                         }])
 @pytest.mark.parametrize("baseline_llm_kwargs", [
    {
        "use_v2_block_manager": False,
@@ -294,6 +305,7 @@ def test_chunked_prefill_block_manager_v2(baseline_llm_generator,
    prompts = [
        "Hello, my name is",
        "The president of the United States is",
+        ("1 + " * 50) + " 1 = ",  # Longer prompt.
        "The capital of France is",
        "The future of AI is",
    ]

--- a/vllm/core/block/block_table.py
+++ b/vllm/core/block/block_table.py
@@ -356,7 +356,13 @@ class BlockTable:
        appended to blocks. The first such "token block" may have less token ids
        than the block size, since the last allocated block may be partially
        full.
+
+        If no token ids are provided, then no chunks are returned.
        """
+
+        if not token_ids:
+            return []
+
        first_chunk_size = self._block_size - (self._num_full_slots %
                                               self._block_size)
        token_blocks = [token_ids[:first_chunk_size]]