# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

4
5
import enum
from abc import ABC, abstractmethod
6
from typing import List, Optional
7
from typing import Sequence as GenericSequence
8
from typing import Tuple
9
10

from vllm.sequence import Sequence, SequenceGroup
11
from vllm.utils import Device
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33


class AllocStatus(enum.Enum):
    """Outcome reported by ``BlockSpaceManager.can_allocate``.

    * ``OK``: the seq_group can be allocated right now.
    * ``LATER``: the seq_group cannot be allocated at the moment, although
      the allocator's capacity is larger than what the seq_group requires.
    * ``NEVER``: the seq_group can never be allocated, because it is too
      large to be allocated on the GPU.
    """
    OK = enum.auto()
    LATER = enum.auto()
    NEVER = enum.auto()


class BlockSpaceManager(ABC):
    """Abstract interface that every block space manager implements.

    Concrete implementations are selected by name through
    ``get_block_space_manager_class``. Every other method is declared with
    ``@abstractmethod`` and has no behavior here, so a subclass must override
    all of them before it can be instantiated.

    NOTE(review): apart from the two docstrings that already existed
    (``get_prefix_cache_hit_rate`` and ``reset_prefix_cache``), the one-line
    summaries on the abstract methods are derived from their names and
    signatures only. The authoritative contract lives in the concrete
    implementations; confirm against them before relying on details.
    """

    @staticmethod
    def get_block_space_manager_class(version: str):
        """Return the block space manager class selected by ``version``.

        The comparison is case-insensitive. Each implementation is imported
        inside its own branch, so only the selected module gets imported.

        Args:
            version: ``"selfattn"`` or ``"placeholder"`` (any casing).

        Returns:
            ``SelfAttnBlockSpaceManager`` for ``"selfattn"`` or
            ``PlaceholderBlockSpaceManager`` for ``"placeholder"``.

        Raises:
            ValueError: If ``version`` matches neither known name.
        """
        version = version.lower()

        if version == "selfattn":
            from vllm.core.block_manager import SelfAttnBlockSpaceManager
            return SelfAttnBlockSpaceManager

        if version == "placeholder":
            from vllm.core.placeholder_block_space_manager import (
                PlaceholderBlockSpaceManager)
            return PlaceholderBlockSpaceManager

        raise ValueError(f"Unknown version {version=}")

    @abstractmethod
    def can_allocate(self,
                     seq_group: SequenceGroup,
                     num_lookahead_slots: int = 0) -> AllocStatus:
        """Report whether ``seq_group`` can be allocated.

        The returned ``AllocStatus`` distinguishes "now", "later" and
        "never". How ``num_lookahead_slots`` affects the answer is defined
        by the implementation.
        """
        pass

    @abstractmethod
    def allocate(self, seq_group: SequenceGroup) -> None:
        """Allocate space for ``seq_group`` (implementation-defined)."""
        pass

    @abstractmethod
    def can_append_slots(self, seq_group: SequenceGroup,
                         num_lookahead_slots: int) -> bool:
        """Report whether slots can be appended for ``seq_group``."""
        pass

    @abstractmethod
    def append_slots(
        self,
        seq: Sequence,
        num_lookahead_slots: int,
    ) -> List[Tuple[int, int]]:
        """Append slots for ``seq`` and return a list of integer pairs.

        NOTE(review): the meaning of each ``(int, int)`` pair is not visible
        in this interface; confirm against the implementations.
        """
        pass

    @abstractmethod
    def fork(self, parent_seq: Sequence, child_seq: Sequence) -> None:
        """Fork ``child_seq`` from ``parent_seq`` (implementation-defined)."""
        pass

    @abstractmethod
    def can_swap_in(self, seq_group: SequenceGroup,
                    num_lookahead_slots: int) -> AllocStatus:
        """Report, as an ``AllocStatus``, whether ``seq_group`` can be
        swapped in."""
        pass

    @abstractmethod
    def swap_in(self, seq_group: SequenceGroup) -> List[Tuple[int, int]]:
        """Swap ``seq_group`` in and return a list of integer pairs.

        NOTE(review): presumably (source, destination) block mappings;
        confirm against the implementations.
        """
        pass

    @abstractmethod
    def can_swap_out(self, seq_group: SequenceGroup) -> bool:
        """Report whether ``seq_group`` can be swapped out."""
        pass

    @abstractmethod
    def swap_out(self, seq_group: SequenceGroup) -> List[Tuple[int, int]]:
        """Swap ``seq_group`` out and return a list of integer pairs.

        NOTE(review): presumably (source, destination) block mappings;
        confirm against the implementations.
        """
        pass

    @abstractmethod
    def free(self, seq: Sequence) -> None:
        """Free what is held for ``seq`` (implementation-defined)."""
        pass

    @abstractmethod
    def get_block_table(self, seq: Sequence) -> List[int]:
        """Return the block table of ``seq`` as a list of integers."""
        pass

    @abstractmethod
    def get_num_free_gpu_blocks(self) -> int:
        """Return the number of free GPU blocks."""
        pass

    @abstractmethod
    def get_num_free_cpu_blocks(self) -> int:
        """Return the number of free CPU blocks."""
        pass

    @abstractmethod
    def access_all_blocks_in_seq(
        self,
        seq: Sequence,
        access_time: float,
    ) -> None:
        """Record an access to all blocks of ``seq`` at ``access_time``.

        NOTE(review): the unit and clock of ``access_time`` are not visible
        here; confirm against the callers.
        """
        pass

    @abstractmethod
    def get_common_computed_block_ids(
            self, seqs: List[Sequence]) -> GenericSequence[int]:
        """Return the computed block ids common to ``seqs``."""
        pass

    @abstractmethod
    def mark_blocks_as_computed(self, seq_group: SequenceGroup,
                                token_chunk_size: int):
        """Mark blocks of ``seq_group`` as computed.

        How ``token_chunk_size`` is used is implementation-defined.
        """
        pass

    @abstractmethod
    def get_prefix_cache_hit_rate(self, device: Device) -> float:
        """Prefix cache hit rate. -1 means not supported or disabled."""
        pass

    @abstractmethod
    def reset_prefix_cache(self, device: Optional[Device] = None) -> bool:
        """Reset prefix cache for specified or all devices."""
        pass

    @abstractmethod
    def get_num_cached_tokens(self, seq: Sequence) -> int:
        """Return the number of cached tokens for ``seq``."""
        pass

    @abstractmethod
    def remove_seq_from_computed_blocks_tracker(self, seq: Sequence) -> None:
        """Remove ``seq`` from the computed-blocks tracker."""
        pass