Unverified commit a33ce60c, authored by Cade Daniel and committed by GitHub
Browse files

[Testing] Fix core tests (#3224)

parent 24aecf42
...@@ -6,7 +6,7 @@ from vllm import SamplingParams ...@@ -6,7 +6,7 @@ from vllm import SamplingParams
from vllm.block import PhysicalTokenBlock from vllm.block import PhysicalTokenBlock
from vllm.core.block_manager import BlockAllocator, BlockSpaceManager, AllocStatus from vllm.core.block_manager import BlockAllocator, BlockSpaceManager, AllocStatus
from vllm.utils import Device from vllm.utils import Device
from vllm.sequence import Sequence, SequenceGroup, SequenceStatus from vllm.sequence import Sequence, SequenceGroup, SequenceStatus, Logprob
from .utils import create_dummy_prompt from .utils import create_dummy_prompt
...@@ -22,7 +22,8 @@ def test_block_allocator_allocate(): ...@@ -22,7 +22,8 @@ def test_block_allocator_allocate():
for _ in range(num_cpu_blocks): for _ in range(num_cpu_blocks):
block = cpu_allocator.allocate() block = cpu_allocator.allocate()
num_free -= 1 num_free -= 1
assert block not in cpu_allocator.free_blocks
assert block.block_hash not in cpu_allocator.evictor
assert cpu_allocator.get_num_free_blocks() == num_free assert cpu_allocator.get_num_free_blocks() == num_free
with pytest.raises(ValueError): with pytest.raises(ValueError):
...@@ -39,7 +40,7 @@ def test_block_allocator_free(): ...@@ -39,7 +40,7 @@ def test_block_allocator_free():
for _ in range(num_cpu_blocks): for _ in range(num_cpu_blocks):
block = cpu_allocator.allocate() block = cpu_allocator.allocate()
blocks.append(block) blocks.append(block)
assert block not in cpu_allocator.free_blocks assert block.block_hash not in cpu_allocator.evictor
# Free all allocated cpu blocks. # Free all allocated cpu blocks.
num_free = 0 num_free = 0
...@@ -47,7 +48,7 @@ def test_block_allocator_free(): ...@@ -47,7 +48,7 @@ def test_block_allocator_free():
for block in blocks: for block in blocks:
cpu_allocator.free(block) cpu_allocator.free(block)
num_free += 1 num_free += 1
assert block in cpu_allocator.free_blocks assert block.block_hash in cpu_allocator.evictor
assert cpu_allocator.get_num_free_blocks() == num_free assert cpu_allocator.get_num_free_blocks() == num_free
with pytest.raises(ValueError): with pytest.raises(ValueError):
...@@ -106,7 +107,7 @@ def test_append_slot_single_seq(): ...@@ -106,7 +107,7 @@ def test_append_slot_single_seq():
# Add block_size number of new tokens and append slot. # Add block_size number of new tokens and append slot.
for i in range(block_size): for i in range(block_size):
token_id = i + 5 token_id = i + 5
prompt.append_token_id(token_id, {token_id: 0.0}) prompt.append_token_id(token_id, {token_id: Logprob(0.0)})
assert block_manager.can_append_slot(seq_group) assert block_manager.can_append_slot(seq_group)
before_blocks = block_manager.get_num_free_gpu_blocks() before_blocks = block_manager.get_num_free_gpu_blocks()
...@@ -119,25 +120,37 @@ def test_append_slot_cow(): ...@@ -119,25 +120,37 @@ def test_append_slot_cow():
block_size = 4 block_size = 4
num_cpu_blocks = 4 num_cpu_blocks = 4
num_gpu_blocks = 4 num_gpu_blocks = 4
block_manager = BlockSpaceManager(block_size, block_manager = BlockSpaceManager(block_size=block_size,
num_cpu_blocks, num_cpu_blocks=num_cpu_blocks,
num_gpu_blocks, num_gpu_blocks=num_gpu_blocks,
watermark=0) watermark=0)
# Allocate prompt to gpu block. # Allocate prompt to gpu block. There is one slot left in the block.
prompt = Sequence(1, "one two three", [1, 2, 3], block_size) prompt = Sequence(seq_id=1,
child = prompt.fork(2) prompt="one two three",
token_id = 4 prompt_token_ids=[1, 2, 3],
child.append_token_id(token_id, {token_id: 0.0}) block_size=block_size)
# Fork the sequence, such that a COW will be required when we append a new
# token id.
child = prompt.fork(new_seq_id=2)
# Allocate space for the sequence group.
seq_group = SequenceGroup("1", [prompt, child], SamplingParams(), seq_group = SequenceGroup("1", [prompt, child], SamplingParams(),
time.time(), time.perf_counter) time.time(), time.perf_counter)
block_manager.allocate(seq_group) block_manager.allocate(seq_group)
# Append slot for child token. # Fork and append a new token id. We expect a COW to be scheduled.
# Last block being modified is shared. Copy on write occurs. token_id = 4
child.append_token_id(token_id, {token_id: Logprob(0.0)})
block_manager.fork(prompt, child)
assert block_manager.can_append_slot(seq_group) assert block_manager.can_append_slot(seq_group)
before_blocks = block_manager.get_num_free_gpu_blocks() before_blocks = block_manager.get_num_free_gpu_blocks()
src_block, dst_block = block_manager.append_slot(child)
maybe_src_dst_block = block_manager.append_slot(child)
assert maybe_src_dst_block is not None
src_block, dst_block = maybe_src_dst_block
assert src_block != dst_block assert src_block != dst_block
after_blocks = block_manager.get_num_free_gpu_blocks() after_blocks = block_manager.get_num_free_gpu_blocks()
...@@ -165,7 +178,7 @@ def test_fork(): ...@@ -165,7 +178,7 @@ def test_fork():
prompt) == block_manager.get_block_table(child) prompt) == block_manager.get_block_table(child)
token_id = 4 token_id = 4
# Append token to child. Block is shared so copy on write occurs. # Append token to child. Block is shared so copy on write occurs.
child.append_token_id(token_id, {token_id: 0.0}) child.append_token_id(token_id, {token_id: Logprob(0.0)})
block_manager.append_slot(child) block_manager.append_slot(child)
assert block_manager.get_block_table( assert block_manager.get_block_table(
prompt) != block_manager.get_block_table(child) prompt) != block_manager.get_block_table(child)
...@@ -189,7 +202,7 @@ def test_swap(): ...@@ -189,7 +202,7 @@ def test_swap():
# tokens will be written in the next forward pass. # tokens will be written in the next forward pass.
token_id = 0 token_id = 0
prompt.status = SequenceStatus.RUNNING prompt.status = SequenceStatus.RUNNING
prompt.append_token_id(token_id, {token_id: 0.0}) prompt.append_token_id(token_id, {token_id: Logprob(0.0)})
# Swap seq group from GPU -> CPU. # Swap seq group from GPU -> CPU.
gpu_blocks = block_manager.get_block_table(prompt) gpu_blocks = block_manager.get_block_table(prompt)
......
...@@ -3,7 +3,7 @@ import pytest # noqa ...@@ -3,7 +3,7 @@ import pytest # noqa
from vllm.config import CacheConfig, SchedulerConfig from vllm.config import CacheConfig, SchedulerConfig
from vllm.core.scheduler import Scheduler from vllm.core.scheduler import Scheduler
from vllm.sequence import SequenceGroup from vllm.sequence import SequenceGroup, Logprob
from .utils import create_dummy_prompt from .utils import create_dummy_prompt
...@@ -108,8 +108,8 @@ def test_scheduler_schedule_preempt_abort(): ...@@ -108,8 +108,8 @@ def test_scheduler_schedule_preempt_abort():
# Append "generated" tokens, allowing the sequence to mark prompt tokens as # Append "generated" tokens, allowing the sequence to mark prompt tokens as
# processed. # processed.
token_id = 0 token_id = 0
seq_a.append_token_id(token_id, {token_id: 0.0}) seq_a.append_token_id(token_id, {token_id: Logprob(0.0)})
seq_b.append_token_id(token_id, {token_id: 0.0}) seq_b.append_token_id(token_id, {token_id: Logprob(0.0)})
# Schedule seq groups generation and preempt seq group b. # Schedule seq groups generation and preempt seq group b.
seq_group_meta, out = scheduler.schedule() seq_group_meta, out = scheduler.schedule()
......
...@@ -18,7 +18,7 @@ def create_dummy_prompt( ...@@ -18,7 +18,7 @@ def create_dummy_prompt(
prompt_str = " ".join([str(t) for t in prompt_tokens]) prompt_str = " ".join([str(t) for t in prompt_tokens])
prompt = Sequence(int(request_id), prompt_str, prompt_tokens, block_size) prompt = Sequence(int(request_id), prompt_str, prompt_tokens, block_size)
seq_group = SequenceGroup(request_id, [prompt], SamplingParams(), seq_group = SequenceGroup(request_id, [prompt], SamplingParams(),
time.time(), None, None) time.time(), None)
return prompt, seq_group return prompt, seq_group
......
...@@ -142,7 +142,7 @@ class Sequence: ...@@ -142,7 +142,7 @@ class Sequence:
prompt: str, prompt: str,
prompt_token_ids: List[int], prompt_token_ids: List[int],
block_size: int, block_size: int,
eos_token_id: int, eos_token_id: Optional[int] = None,
lora_request: Optional[LoRARequest] = None, lora_request: Optional[LoRARequest] = None,
) -> None: ) -> None:
self.seq_id = seq_id self.seq_id = seq_id
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment