[Testing] Fix core tests (#3224)

a33ce60c · Cade Daniel · GitHub · 24aecf42 · a33ce60c · a33ce60c
Unverified commit a33ce60c, authored Mar 06, 2024 by Cade Daniel; committed by GitHub on Mar 06, 2024
4 changed files
--- a/tests/core/test_block_manager.py
+++ b/tests/core/test_block_manager.py
@@ -6,7 +6,7 @@ from vllm import SamplingParams
 from vllm.block import PhysicalTokenBlock
 from vllm.core.block_manager import BlockAllocator, BlockSpaceManager, AllocStatus
 from vllm.utils import Device
-from vllm.sequence import Sequence, SequenceGroup, SequenceStatus
+from vllm.sequence import Sequence, SequenceGroup, SequenceStatus, Logprob
 from .utils import create_dummy_prompt
@@ -22,7 +22,8 @@ def test_block_allocator_allocate():
    for _ in range(num_cpu_blocks):
        block = cpu_allocator.allocate()
        num_free -= 1
-        assert block not in cpu_allocator.free_blocks
+        assert block.block_hash not in cpu_allocator.evictor
        assert cpu_allocator.get_num_free_blocks() == num_free
    with pytest.raises(ValueError):
@@ -39,7 +40,7 @@ def test_block_allocator_free():
    for _ in range(num_cpu_blocks):
        block = cpu_allocator.allocate()
        blocks.append(block)
-        assert block not in cpu_allocator.free_blocks
+        assert block.block_hash not in cpu_allocator.evictor
    # Free all allocated cpu blocks.
    num_free = 0
@@ -47,7 +48,7 @@ def test_block_allocator_free():
    for block in blocks:
        cpu_allocator.free(block)
        num_free += 1
-        assert block in cpu_allocator.free_blocks
+        assert block.block_hash in cpu_allocator.evictor
        assert cpu_allocator.get_num_free_blocks() == num_free
        with pytest.raises(ValueError):
@@ -106,7 +107,7 @@ def test_append_slot_single_seq():
    # Add block_size number of new tokens and append slot.
    for i in range(block_size):
        token_id = i + 5
-        prompt.append_token_id(token_id, {token_id: 0.0})
+        prompt.append_token_id(token_id, {token_id: Logprob(0.0)})
    assert block_manager.can_append_slot(seq_group)
    before_blocks = block_manager.get_num_free_gpu_blocks()
@@ -119,25 +120,37 @@ def test_append_slot_cow():
    block_size = 4
    num_cpu_blocks = 4
    num_gpu_blocks = 4
-    block_manager = BlockSpaceManager(block_size,
-                                      num_cpu_blocks,
-                                      num_gpu_blocks,
+    block_manager = BlockSpaceManager(block_size=block_size,
+                                      num_cpu_blocks=num_cpu_blocks,
+                                      num_gpu_blocks=num_gpu_blocks,
                                      watermark=0)
-    # Allocate prompt to gpu block.
-    prompt = Sequence(1, "one two three", [1, 2, 3], block_size)
-    child = prompt.fork(2)
-    token_id = 4
-    child.append_token_id(token_id, {token_id: 0.0})
+    # Allocate prompt to gpu block. There is one slot left in the block.
+    prompt = Sequence(seq_id=1,
+                      prompt="one two three",
+                      prompt_token_ids=[1, 2, 3],
+                      block_size=block_size)
+    # Fork the sequence, such that a COW will be required when we append a new
+    # token id.
+    child = prompt.fork(new_seq_id=2)
+    # Allocate space for the sequence group.
    seq_group = SequenceGroup("1", [prompt, child], SamplingParams(),
                              time.time(), time.perf_counter)
    block_manager.allocate(seq_group)
-    # Append slot for child token.
-    # Last block being modified is shared. Copy on write occurs.
+    # Fork and append a new token id. We expect a COW to be scheduled.
+    token_id = 4
+    child.append_token_id(token_id, {token_id: Logprob(0.0)})
+    block_manager.fork(prompt, child)
    assert block_manager.can_append_slot(seq_group)
    before_blocks = block_manager.get_num_free_gpu_blocks()
-    src_block, dst_block = block_manager.append_slot(child)
+    maybe_src_dst_block = block_manager.append_slot(child)
+    assert maybe_src_dst_block is not None
+    src_block, dst_block = maybe_src_dst_block
    assert src_block != dst_block
    after_blocks = block_manager.get_num_free_gpu_blocks()
@@ -165,7 +178,7 @@ def test_fork():
        prompt) == block_manager.get_block_table(child)
    token_id = 4
    # Append token to child. Block is shared so copy on write occurs.
-    child.append_token_id(token_id, {token_id: 0.0})
+    child.append_token_id(token_id, {token_id: Logprob(0.0)})
    block_manager.append_slot(child)
    assert block_manager.get_block_table(
        prompt) != block_manager.get_block_table(child)
@@ -189,7 +202,7 @@ def test_swap():
    # tokens will be written in the next forward pass.
    token_id = 0
    prompt.status = SequenceStatus.RUNNING
-    prompt.append_token_id(token_id, {token_id: 0.0})
+    prompt.append_token_id(token_id, {token_id: Logprob(0.0)})
    # Swap seq group from GPU -> CPU.
    gpu_blocks = block_manager.get_block_table(prompt)

--- a/tests/core/test_scheduler.py
+++ b/tests/core/test_scheduler.py
@@ -3,7 +3,7 @@ import pytest  # noqa
 from vllm.config import CacheConfig, SchedulerConfig
 from vllm.core.scheduler import Scheduler
-from vllm.sequence import SequenceGroup
+from vllm.sequence import SequenceGroup, Logprob
 from .utils import create_dummy_prompt
@@ -108,8 +108,8 @@ def test_scheduler_schedule_preempt_abort():
    # Append "generated" tokens, allowing the sequence to mark prompt tokens as
    # processed.
    token_id = 0
-    seq_a.append_token_id(token_id, {token_id: 0.0})
+    seq_a.append_token_id(token_id, {token_id: Logprob(0.0)})
-    seq_b.append_token_id(token_id, {token_id: 0.0})
+    seq_b.append_token_id(token_id, {token_id: Logprob(0.0)})
    # Schedule seq groups generation and preempt seq group b.
    seq_group_meta, out = scheduler.schedule()

--- a/tests/core/utils.py
+++ b/tests/core/utils.py
@@ -18,7 +18,7 @@ def create_dummy_prompt(
    prompt_str = " ".join([str(t) for t in prompt_tokens])
    prompt = Sequence(int(request_id), prompt_str, prompt_tokens, block_size)
    seq_group = SequenceGroup(request_id, [prompt], SamplingParams(),
-                              time.time(), None, None)
+                              time.time(), None)
    return prompt, seq_group

--- a/vllm/sequence.py
+++ b/vllm/sequence.py
@@ -142,7 +142,7 @@ class Sequence:
        prompt: str,
        prompt_token_ids: List[int],
        block_size: int,
-        eos_token_id: int,
+        eos_token_id: Optional[int] = None,
        lora_request: Optional[LoRARequest] = None,
    ) -> None:
        self.seq_id = seq_id