# interfaces.py
1
2
# SPDX-License-Identifier: Apache-2.0

3
4
import enum
from abc import ABC, abstractmethod
5
from typing import List
6
from typing import Sequence as GenericSequence
7
from typing import Tuple
8
9

from vllm.sequence import Sequence, SequenceGroup
10
from vllm.utils import Device
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32


class AllocStatus(enum.Enum):
    """Outcome reported by BlockSpaceManager.can_allocate.

    Members:
        OK: The seq_group can be allocated now.
        LATER: The seq_group cannot be allocated at the moment, although
            the capacity of the allocator is larger than what the
            seq_group requires.
        NEVER: The seq_group can never be allocated because it is too
            large to be allocated in GPU.
    """
    OK = enum.auto()
    LATER = enum.auto()
    NEVER = enum.auto()


class BlockSpaceManager(ABC):
    """Abstract interface that every block space manager must implement.

    Concrete implementations are looked up by name through
    ``get_block_space_manager_class``. Every other method is abstract and
    has no behavior here; subclasses must provide all of them before they
    can be instantiated.
    """

    @staticmethod
    def get_block_space_manager_class(version: str):
        """Return the block space manager class registered under ``version``.

        Args:
            version: Implementation name, matched case-insensitively.
                ``"selfattn"`` selects ``SelfAttnBlockSpaceManager`` and
                ``"placeholder"`` selects ``PlaceholderBlockSpaceManager``.

        Returns:
            The selected class itself (not an instance).

        Raises:
            ValueError: If ``version`` names no known implementation.
        """
        version = version.lower()

        # The implementation modules are imported only when selected.
        # NOTE(review): presumably deferred to avoid an import cycle with
        # this module -- confirm before hoisting to the top of the file.
        if version == "selfattn":
            from vllm.core.block_manager import SelfAttnBlockSpaceManager
            return SelfAttnBlockSpaceManager

        if version == "placeholder":
            from vllm.core.placeholder_block_space_manager import (
                PlaceholderBlockSpaceManager)
            return PlaceholderBlockSpaceManager

        raise ValueError(f"Unknown version {version=}")

    @abstractmethod
    def can_allocate(self,
                     seq_group: SequenceGroup,
                     num_lookahead_slots: int = 0) -> AllocStatus:
        """Report, as an ``AllocStatus``, whether ``seq_group`` can be
        allocated.

        ``num_lookahead_slots`` defaults to 0.
        """
        pass

    @abstractmethod
    def allocate(self, seq_group: SequenceGroup) -> None:
        """Allocate space for ``seq_group``; returns nothing."""
        pass

    @abstractmethod
    def can_append_slots(self, seq_group: SequenceGroup,
                         num_lookahead_slots: int) -> bool:
        """Return whether slots can be appended for ``seq_group``."""
        pass

    @abstractmethod
    def append_slots(
        self,
        seq: Sequence,
        num_lookahead_slots: int,
    ) -> List[Tuple[int, int]]:
        """Append slots for ``seq`` and return a list of integer pairs.

        NOTE(review): the meaning of each pair is not defined in this
        interface -- confirm against the concrete implementations.
        """
        pass

    @abstractmethod
    def fork(self, parent_seq: Sequence, child_seq: Sequence) -> None:
        """Fork ``child_seq`` from ``parent_seq``; returns nothing."""
        pass

    @abstractmethod
    def can_swap_in(self, seq_group: SequenceGroup,
                    num_lookahead_slots: int) -> AllocStatus:
        """Report, as an ``AllocStatus``, whether ``seq_group`` can be
        swapped in."""
        pass

    @abstractmethod
    def swap_in(self, seq_group: SequenceGroup) -> List[Tuple[int, int]]:
        """Swap ``seq_group`` in and return a list of integer pairs.

        NOTE(review): presumably (source, destination) block pairs --
        confirm against the concrete implementations.
        """
        pass

    @abstractmethod
    def can_swap_out(self, seq_group: SequenceGroup) -> bool:
        """Return whether ``seq_group`` can be swapped out."""
        pass

    @abstractmethod
    def swap_out(self, seq_group: SequenceGroup) -> List[Tuple[int, int]]:
        """Swap ``seq_group`` out and return a list of integer pairs.

        NOTE(review): presumably (source, destination) block pairs --
        confirm against the concrete implementations.
        """
        pass

    @abstractmethod
    def free(self, seq: Sequence) -> None:
        """Free what is held for ``seq``; returns nothing."""
        pass

    @abstractmethod
    def get_block_table(self, seq: Sequence) -> List[int]:
        """Return the block table of ``seq`` as a list of integers."""
        pass

    @abstractmethod
    def get_num_free_gpu_blocks(self) -> int:
        """Return the number of free GPU blocks."""
        pass

    @abstractmethod
    def get_num_free_cpu_blocks(self) -> int:
        """Return the number of free CPU blocks."""
        pass

    @abstractmethod
    def access_all_blocks_in_seq(
        self,
        seq: Sequence,
        access_time: float,
    ) -> None:
        """Record an access to all blocks of ``seq`` at ``access_time``.

        NOTE(review): the unit and clock of ``access_time`` are not
        defined in this interface -- confirm against callers.
        """
        pass

    @abstractmethod
    def get_common_computed_block_ids(
            self, seqs: List[Sequence]) -> GenericSequence[int]:
        """Return the computed block ids common to ``seqs``."""
        pass

    @abstractmethod
    def mark_blocks_as_computed(self, seq_group: SequenceGroup,
                                token_chunk_size: int):
        """Mark blocks of ``seq_group`` as computed.

        NOTE(review): how ``token_chunk_size`` bounds the marked blocks is
        left to implementations -- confirm there.
        """
        pass

    @abstractmethod
    def get_prefix_cache_hit_rate(self, device: Device) -> float:
        """Prefix cache hit rate. -1 means not supported or disabled."""
        pass

    @abstractmethod
    def reset_prefix_cache(self) -> bool:
        """Reset prefix cache for all devices."""
        pass

    @abstractmethod
    def get_num_cached_tokens(self, seq: Sequence) -> int:
        """Return the number of cached tokens for ``seq``."""
        pass