[Bugfix] Block manager v2 with preemption and lookahead slots (#8824)

5bf8789b · sroy745 · GitHub · d1537039 · 5bf8789b · 5bf8789b
Unverified Commit 5bf8789b authored Sep 28, 2024 by sroy745 Committed by GitHub Sep 29, 2024
9 changed files
--- a/tests/basic_correctness/test_preemption.py
+++ b/tests/basic_correctness/test_preemption.py
@@ -23,8 +23,10 @@ MODELS = [
 @pytest.fixture(scope="module", autouse=True)
 def check_settings():
    assert ENABLE_ARTIFICIAL_PREEMPT is True, (
-        "Use an env var VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1. "
+        "Use an env var VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1, "
-        "`VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest "
+        "VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1. "
+        "`VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 "
+        "VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 pytest "
        "tests/basic_correctness/test_preemption.py`")
@@ -199,6 +201,7 @@ def test_swap(
 @pytest.mark.parametrize("dtype", ["float"])
 @pytest.mark.parametrize("max_tokens", [96])
 @pytest.mark.parametrize("beam_width", [4])
+@pytest.mark.parametrize("use_v2_block_manager", [True, False])
 def test_swap_infeasible(
    vllm_runner,
    example_prompts,
@@ -207,6 +210,7 @@ def test_swap_infeasible(
    max_tokens: int,
    beam_width: int,
    worker_use_ray: bool,
+    use_v2_block_manager: bool,
 ) -> None:
    """Verify infeasible swap request will be ignored."""
    BLOCK_SIZE = 16
@@ -223,6 +227,7 @@ def test_swap_infeasible(
            num_gpu_blocks_override=prefill_blocks + decode_blocks,
            max_model_len=(prefill_blocks + decode_blocks) * BLOCK_SIZE,
            worker_use_ray=worker_use_ray,
+            use_v2_block_manager=use_v2_block_manager,
    ) as vllm_model:
        sampling_params = SamplingParams(n=beam_width,
                                         use_beam_search=True,

--- a/tests/core/block/test_block_manager_v2.py
+++ b/tests/core/block/test_block_manager_v2.py
@@ -373,6 +373,52 @@ def test_can_swap(block_size, num_gpu_blocks, num_lookahead_slots,
            seq_group, num_lookahead_slots) == AllocStatus.NEVER
+@pytest.mark.parametrize("num_lookahead_slots", [0, 2, 10])
+@pytest.mark.parametrize("enable_caching", [False, True])
+def test_swap_in_infeasible(num_lookahead_slots, enable_caching):
+    """Verifies that swapping fails if there is not enough free blocks
+    to account for unseen tokens and lookahead_slots.
+    """
+    block_size = 8
+    num_cpu_blocks = 1
+    num_gpu_blocks = 1
+    block_manager = BlockSpaceManagerV2(block_size,
+                                        num_cpu_blocks,
+                                        num_gpu_blocks,
+                                        watermark=0,
+                                        enable_caching=enable_caching)
+    prompt_length = block_size - 3
+    assert prompt_length > 0
+    prompt, seq_group = create_dummy_prompt("1", prompt_length=prompt_length)
+    prompt.status = SequenceStatus.WAITING
+    block_manager.allocate(seq_group)
+    # Emulate a forward pass by appending a single token.
+    # The block manager then knows how many unprocessed
+    # tokens will be written in the next forward pass.
+    token_id = 0
+    prompt.status = SequenceStatus.RUNNING
+    prompt.append_token_id(token_id, {token_id: Logprob(0.0)})
+    # Swap seq group from GPU -> CPU.
+    assert block_manager.can_swap_out(seq_group)
+    block_manager.swap_out(seq_group)
+    prompt.status = SequenceStatus.SWAPPED
+    # Swap seq group from CPU -> GPU.
+    # The number of unseen tokens is 1. If the number of existing
+    # tokens plus the unseen ones and number of lookahead slots exceeds
+    # the total number of available GPU blocks then the swap
+    # should fail.
+    num_unseen_tokens = 1
+    if (num_lookahead_slots + num_unseen_tokens +
+            prompt_length) <= (block_size * num_gpu_blocks):
+        assert block_manager.can_swap_in(seq_group,
+                                         num_lookahead_slots) == AllocStatus.OK
+    else:
+        assert block_manager.can_swap_in(
+            seq_group, num_lookahead_slots) == AllocStatus.NEVER
 # TODO(cade/kaiyang): add comprehensive tests for swapping at allocator level.
@@ -400,7 +446,6 @@ def test_sliding_window(block_size, prompt_len, num_slots_to_append,
        if max_n is None:
            max_n = min_n
        used = num_gpu_blocks - block_manager.get_num_free_gpu_blocks()
-        #print("check", min_n, used, max_n)
        assert min_n <= used
        assert used <= max_n

--- a/tests/core/block/test_naive_block.py
+++ b/tests/core/block/test_naive_block.py
@@ -104,9 +104,9 @@ class TestNaiveBlockAllocator:
    @staticmethod
    @pytest.mark.parametrize("num_blocks", [4])
    @pytest.mark.parametrize("block_size", [8])
-    def test_naive_block_get_num_blocks_touched(num_blocks, block_size):
+    def test_naive_block_get_num_full_blocks_touched(num_blocks, block_size):
        """ Verify the allocator can correctly return the number of
-        blocks touched, with different lookahead slots.
+        full blocks touched.
        """
        allocator_src = NaiveBlockAllocator(create_block=NaiveBlock,
                                            num_blocks=num_blocks,
@@ -124,7 +124,7 @@ class TestNaiveBlockAllocator:
        src_blocks = [allocate_block() for _ in range(num_blocks - 1)]
        # All blocks are cached
-        assert allocator_dst.get_num_blocks_touched(
+        assert allocator_dst.get_num_full_blocks_touched(
            src_blocks) == num_blocks - 1
        # Insert one non-full block in the src
@@ -136,9 +136,10 @@ class TestNaiveBlockAllocator:
        src_blocks.append(allocate_non_full_block())
        src_blocks[-1].append_token_ids([0])
-        assert allocator_dst.get_num_blocks_touched(
+        assert allocator_dst.get_num_full_blocks_touched(
-            src_blocks, num_lookahead_slots=1) == num_blocks
+            src_blocks) == num_blocks - 1
-        assert allocator_dst.get_num_blocks_touched(
+        # Fill up the last source block and then invoke
-            src_blocks, num_lookahead_slots=block_size - 1) == num_blocks
+        # get_num_blocks_touched
-        assert allocator_dst.get_num_blocks_touched(
+        src_blocks[-1].append_token_ids([0] * (block_size - 1))
-            src_blocks, num_lookahead_slots=block_size) == (num_blocks + 1)
+        assert allocator_dst.get_num_full_blocks_touched(
+            src_blocks) == num_blocks
--- a/tests/core/block/test_prefix_caching_block.py
+++ b/tests/core/block/test_prefix_caching_block.py
@@ -318,11 +318,10 @@ class TestPrefixCachingBlockAllocator:
    @staticmethod
    @pytest.mark.parametrize("num_blocks", [4])
    @pytest.mark.parametrize("block_size", [8])
-    def test_prefix_caching_block_get_num_blocks_touched(
+    def test_prefix_caching_block_get_num_full_blocks_touched(
            num_blocks, block_size):
        """ Verify the allocator can correctly return the number of
-        blocks touched, when there are cached prefixes and different
+        blocks touched, when there are cached prefixes.
-        lookahead slots.
        """
        allocator_src = PrefixCachingBlockAllocator(num_blocks=num_blocks,
                                                    block_size=block_size)
@@ -346,28 +345,30 @@ class TestPrefixCachingBlockAllocator:
                token_ids=token_ids,
                allocator=allocator_src,
            )
        # All blocks are cached
-        assert allocator_dst.get_num_blocks_touched(blocks_to_swap_in) == 0
+        assert allocator_dst.get_num_full_blocks_touched(
+            blocks_to_swap_in) == 0
        # Free the first block in the dst
        allocator_dst.free(cached_blocks[0])
        # Now the first block becomes dangling, the swapped blocks need
        # to reclaim the first block in the dst
-        assert allocator_dst.get_num_blocks_touched(blocks_to_swap_in) == 1
+        assert allocator_dst.get_num_full_blocks_touched(
+            blocks_to_swap_in) == 1
        # Insert one non-full block in the src
        non_full_block = allocator_src.allocate_mutable_block(
            blocks_to_swap_in[-1])
        non_full_block.append_token_ids([0])
        blocks_to_swap_in.append(non_full_block)
-        assert allocator_dst.get_num_blocks_touched(blocks_to_swap_in,
+        assert allocator_dst.get_num_full_blocks_touched(
-                                                    num_lookahead_slots=1) == 2
+            blocks_to_swap_in) == 1
-        assert allocator_dst.get_num_blocks_touched(
+        # Fill up the last mutable block and invoke get_num_blocks_touched.
-            blocks_to_swap_in, num_lookahead_slots=block_size - 1) == 2
+        # Note: The last block is not cached so it will be touched.
-        assert allocator_dst.get_num_blocks_touched(
+        non_full_block.append_token_ids([0] * (block_size - 1))
-            blocks_to_swap_in, num_lookahead_slots=block_size) == 3
+        assert allocator_dst.get_num_full_blocks_touched(
+            blocks_to_swap_in) == 2
    @staticmethod
    @pytest.mark.parametrize("num_blocks", [1024])

--- a/vllm/core/block/cpu_gpu_block_allocator.py
+++ b/vllm/core/block/cpu_gpu_block_allocator.py
@@ -259,25 +259,22 @@ class CpuGpuBlockAllocator(DeviceAwareBlockAllocator):
                current_swap_mapping[src_block_id] = dst_block_id
        return current_swap_mapping
-    def get_num_blocks_touched(self,
+    def get_num_full_blocks_touched(self, blocks: List[Block],
-                               blocks: List[Block],
+                                    device: Device) -> int:
-                               device: Device,
+        """Returns the number of full blocks that will be touched by
-                               num_lookahead_slots: int = 0) -> int:
-        """Returns the number of blocks that will be touched by
        swapping in/out the given blocks on to the 'device'.
        Args:
            blocks: List of blocks to be swapped.
            device (Device): Device to swap the 'blocks' on.
-            num_lookahead_slots (int): Number of lookahead slots used in 
-                speculative decoding, default to 0.
        Returns:
-            int: the number of blocks that will be touched by
+            int: the number of full blocks that will be touched by
                swapping in/out the given blocks on to the 'device'.
+                Non full blocks are ignored when deciding the number
+                of blocks to touch.
        """
-        return self._allocators[device].get_num_blocks_touched(
+        return self._allocators[device].get_num_full_blocks_touched(blocks)
-            blocks, num_lookahead_slots)
    def clear_copy_on_writes(self) -> List[Tuple[int, int]]:
        """Clears the copy-on-write (CoW) state and returns the mapping of

--- a/vllm/core/block/interfaces.py
+++ b/vllm/core/block/interfaces.py
@@ -181,9 +181,7 @@ class BlockAllocator(ABC):
        pass
    @abstractmethod
-    def get_num_blocks_touched(self,
+    def get_num_full_blocks_touched(self, blocks: List[Block]) -> int:
-                               blocks: List[Block],
-                               num_lookahead_slots: int = 0) -> int:
        pass
    @abstractmethod
@@ -260,10 +258,8 @@ class DeviceAwareBlockAllocator(ABC):
        pass
    @abstractmethod
-    def get_num_blocks_touched(self,
+    def get_num_full_blocks_touched(self, blocks: List[Block],
-                               blocks: List[Block],
+                                    device: Device) -> int:
-                               device: Device,
-                               num_lookahead_slots: int = 0) -> int:
        pass
    @abstractmethod

--- a/vllm/core/block/naive_block.py
+++ b/vllm/core/block/naive_block.py
@@ -4,7 +4,6 @@ from typing import Deque, FrozenSet, Iterable, List, Optional, Tuple
 from vllm.core.block.common import (BlockPool, CopyOnWriteTracker, RefCounter,
                                    get_all_blocks_recursively)
 from vllm.core.block.interfaces import Block, BlockAllocator, BlockId, Device
-from vllm.utils import cdiv
 Refcount = int
@@ -282,40 +281,26 @@ class NaiveBlockAllocator(BlockAllocator):
    def promote_to_immutable_block(self, block: Block) -> BlockId:
        raise NotImplementedError("There is no promotion for naive blocks")
-    def get_num_blocks_touched(self,
+    def get_num_full_blocks_touched(self, blocks: List[Block]) -> int:
-                               blocks: List[Block],
+        """Returns the number of full blocks that will be touched by
-                               num_lookahead_slots: int = 0) -> int:
+        swapping in/out.
-        """Determine the number of blocks that will be touched by
-        swapping in/out the given blocks from certain sequence
-        group with the provided num_lookahead_slots.
        Args:
-            blocks (List[Block]): The potential blocks to swap.
+            blocks: List of blocks to be swapped.
-            num_lookahead_slots (int): number of lookahead slots (0 for swap 
-                out).
        Returns:
-            int: the number of blocks that will be touched by
+            int: the number of full blocks that will be touched by
-                swapping in/out the given blocks and num_lookahead_slots.
+                swapping in/out the given blocks. Non full blocks are ignored
+                when deciding the number of blocks to touch.
        """
        # NOTE: for naive block, we use set to eliminate common blocks among
        # seqs, also we compare the empty slots in the mutable blocks with
        # lookahead slots to get the number of unique new block that are
        # needed.
        old_block_set = set()
-        new_block_count = 0
-        # TODO(cade): make sure the logic is correct and clean it up.
        for block in blocks:
-            if not block.is_full and num_lookahead_slots != 0:
+            if block.is_full:
-                new_block_count += 1
+                old_block_set.add(block)
-                if num_lookahead_slots > block.num_empty_slots:
+        return len(old_block_set)
-                    new_block_count += cdiv(
-                        num_lookahead_slots - block.num_empty_slots,
-                        self._block_size)
-            else:
-                old_block_set.add(block.block_id)
-        num_touched_blocks = new_block_count + len(old_block_set)
-        return num_touched_blocks
    def swap_out(self, blocks: List[Block]) -> None:
        for block in blocks:

--- a/vllm/core/block/prefix_caching_block.py
+++ b/vllm/core/block/prefix_caching_block.py
@@ -8,7 +8,6 @@ from vllm.core.block.interfaces import Block, BlockAllocator, BlockId, Device
 from vllm.core.block.naive_block import (BlockPool, NaiveBlock,
                                         NaiveBlockAllocator)
 from vllm.core.evictor_v2 import EvictionPolicy, Evictor, make_evictor
-from vllm.utils import cdiv
 PrefixHash = int
@@ -576,36 +575,26 @@ class PrefixCachingBlockAllocator(BlockAllocator):
            if ids
        ])
-    def get_num_blocks_touched(self,
+    def get_num_full_blocks_touched(self, blocks: List[Block]) -> int:
-                               blocks: List[Block],
+        """Returns the number of full blocks that will be touched by
-                               num_lookahead_slots: int = 0) -> int:
+        swapping in/out.
-        """Determine the number of blocks that will be touched by
-        swapping in/out the given blocks from certain sequence
-        group with the provided num_lookahead_slots.
        Args:
-            blocks (List[Block]): The potential blocks to swap.
+            blocks: List of blocks to be swapped.
-            num_lookahead_slots (int): number of lookahead slots (0 for 
-                swap out).
        Returns:
-            int: the number of blocks that will be touched by
+            int: the number of full blocks that will be touched by
-                swapping in/out the given blocks and num_lookahead_slots.
+                swapping in/out the given blocks. Non full blocks are ignored
+                when deciding the number of blocks to touch.
        """
-        num_touched_blocks = 0
+        num_touched_blocks: int = 0
        for block in blocks:
-            if not block.is_full:
+            # If the block has a match in the cache and the cached
-                num_touched_blocks += 1
+            # block is not referenced, then we still count it as a
-                if num_lookahead_slots > block.num_empty_slots:
+            # touched block
-                    num_touched_blocks += cdiv(
+            if block.is_full and (not self.is_block_cached(block) or \
-                        num_lookahead_slots - block.num_empty_slots,
-                        self._block_size)
-            else:
-                # If the block has a match in the cache and the cached block
-                # is not referenced, then we still count it as a touched block
-                if not self.is_block_cached(block) or \
                (block.content_hash is not None and \
-                     self._cached_blocks[block.content_hash] in self.evictor):
+                self._cached_blocks[block.content_hash] in \
+                        self.evictor)):
                num_touched_blocks += 1
        return num_touched_blocks

--- a/vllm/core/block_manager_v2.py
+++ b/vllm/core/block_manager_v2.py
 """A block manager that manages token blocks."""
-from itertools import chain
 from typing import Dict, List, Optional
 from typing import Sequence as GenericSequence
 from typing import Tuple
@@ -470,12 +469,31 @@ class BlockSpaceManagerV2(BlockSpaceManager):
            AllocStatus: The AllocStatus for swapping in/out the given 
                sequence_group on to the 'device'.
        """
-        blocks = self._get_blocks_for_swap(seq_group, status)
+        # First determine the number of blocks that will be touched by this
-        num_blocks_touched = self.block_allocator.get_num_blocks_touched(
+        # swap. Then verify if there are available blocks in the device
-            blocks, device, num_lookahead_slots)
+        # to perform the swap.
+        num_blocks_touched = 0
+        blocks: List[Block] = []
+        for seq in seq_group.get_seqs(status=status):
+            block_table = self.block_tables[seq.seq_id]
+            if block_table.blocks is not None:
+                # Compute the number blocks to touch for the tokens to be
+                # appended. This does NOT include the full blocks that need
+                # to be touched for the swap.
+                num_blocks_touched += \
+                    block_table.get_num_blocks_touched_by_append_slots(
+                        block_table.get_unseen_token_ids(seq.get_token_ids()),
+                        num_lookahead_slots=num_lookahead_slots)
+                blocks.extend(block_table.blocks)
+        # Compute the number of full blocks to touch and add it to the
+        # existing count of blocks to touch.
+        num_blocks_touched += self.block_allocator.get_num_full_blocks_touched(
+            blocks, device=device)
        watermark_blocks = 0
        if device == Device.GPU:
            watermark_blocks = self.watermark_blocks
        if self.block_allocator.get_num_total_blocks(
                device) < num_blocks_touched:
            return AllocStatus.NEVER
@@ -484,23 +502,3 @@ class BlockSpaceManagerV2(BlockSpaceManager):
            return AllocStatus.OK
        else:
            return AllocStatus.LATER
-    def _get_blocks_for_swap(self, seq_group: SequenceGroup,
-                             status: SequenceStatus) -> List[Block]:
-        """Returns the list of blocks those are touched by the seq_group
-        Args:
-            sequence_group (SequenceGroup): The sequence group to swap in.
-            status (SequenceStatus): The status of sequence which is needed
-                for action. RUNNING for swap out and SWAPPED for swap in
-        Returns:
-            The list of blocks those are touched by the seq_group.
-        """
-        blocks: Dict[int, List[Block]] = {}
-        for seq in seq_group.get_seqs(status=status):
-            block_table = self.block_tables[seq.seq_id]
-            if block_table.blocks is not None:
-                blocks[seq.seq_id] = block_table.blocks
-        combined_blocks = list(chain(*blocks.values()))
-        return combined_blocks