"""Compare the behavior with and without prefix caching.

Run `pytest tests/prefix_caching/test_prefix_caching.py`.
"""
import pytest

from vllm.core.block_manager import BlockAllocator
from vllm.utils import Device


@pytest.mark.parametrize("block_size", [16])
@pytest.mark.parametrize("num_blocks", [16])
def test_block_allocator(
    block_size: int,
    num_blocks: int,
) -> None:
    """Check that a caching BlockAllocator hands out one block per hash.

    Allocating the same hash twice must return the same block with a
    ref_count of 2, freeing must decrement that ref_count, and allocating
    the hash again after the block was fully freed must still return the
    same block.

    Args:
        block_size: Block size passed to the BlockAllocator.
        num_blocks: Number of blocks passed to the BlockAllocator.
    """
    block_hash = 1
    block_allocator = BlockAllocator(Device.CPU,
                                     block_size,
                                     num_blocks,
                                     enable_caching=True)

    # Allocate two PhysicalTokenBlocks with the same hash and check
    # that they are the same PhysicalTokenBlock
    first_block = block_allocator.allocate(block_hash, 0)
    second_block = block_allocator.allocate(block_hash, 0)
    assert first_block == second_block
    assert second_block.ref_count == 2

    # Free the first_block and confirm that the ref_count is correctly
    # decremented on the second block
    block_allocator.free(first_block)
    assert second_block.ref_count == 1

    # Free the second block
    block_allocator.free(second_block)

    # Reallocate the first block and confirm that, even after the block
    # had its ref_count go to 0, we still get the same block back
    first_block = block_allocator.allocate(block_hash, 0)
    assert first_block == second_block
    assert first_block.block_hash == block_hash


@pytest.mark.parametrize("num_blocks", [16])
def test_eviction(num_blocks: int) -> None:
    """Check which block a full caching BlockAllocator reuses for a new hash.

    Every block is allocated (using its index as the hash) and then freed
    in order. A new hash must then reuse the first block freed, a
    previously seen hash must get its original block back, and a second
    new hash must not reuse the block that was just reclaimed.

    Args:
        num_blocks: Number of blocks passed to the BlockAllocator.
    """
    block_size = 16
    block_allocator = BlockAllocator(Device.CPU,
                                     block_size,
                                     num_blocks,
                                     enable_caching=True)

    # Use each index i as the block_hash, so hashes 0..num_blocks-1 are taken
    blocks = [block_allocator.allocate(i, 0) for i in range(num_blocks)]

    # Free all blocks, in allocation order
    for block in blocks:
        block_allocator.free(block)

    # Allocate a new block and confirm that it's the first block freed.
    # I.E The Least Recently Used block.
    # num_blocks is the smallest hash not used above, so it is guaranteed
    # to be new for any num_blocks (not only when it equals block_size).
    new_block_hash = num_blocks
    new_block = block_allocator.allocate(new_block_hash, 0)
    assert new_block == blocks[0]
    assert new_block.block_hash == new_block_hash

    # Reallocate the second in blocks to remove it from the free list
    realloc_block_hash = 1
    realloc_block = block_allocator.allocate(realloc_block_hash, 0)
    assert realloc_block == blocks[realloc_block_hash]
    assert realloc_block.block_hash == realloc_block_hash

    # Allocate a new block and confirm that it's not the realloc_block,
    # since the realloc_block shouldn't be in the free list
    new_block_hash = num_blocks + 1
    new_block = block_allocator.allocate(new_block_hash, 0)
    assert realloc_block != new_block
    assert new_block.block_hash == new_block_hash
    # blocks[0] was reused and blocks[1] reclaimed above, so the test
    # expects the third block (block_number 2) to be handed out next
    assert new_block.block_number == 2