"vscode:/vscode.git/clone" did not exist on "d53988f6195e4e1afbe17f64d2966209e48d7152"
scheduler.py 7.73 KB
Newer Older
Woosuk Kwon's avatar
Woosuk Kwon committed
1
2
3
4
5
6
7
8
9
10
from typing import Dict, List, Tuple

from cacheflow.master.block_manager import BlockSpaceManager
from cacheflow.sequence import Sequence
from cacheflow.sequence import SequenceGroup
from cacheflow.sequence import SequenceStatus


class Scheduler:

Woosuk Kwon's avatar
Woosuk Kwon committed
11
    def __init__(
Woosuk Kwon's avatar
Woosuk Kwon committed
12
        self,
Woosuk Kwon's avatar
Woosuk Kwon committed
13
        controllers: List,
Woosuk Kwon's avatar
Woosuk Kwon committed
14
15
16
17
        block_size: int,
        num_gpu_blocks: int,
        num_cpu_blocks: int,
    ) -> None:
Woosuk Kwon's avatar
Woosuk Kwon committed
18
19
20
21
22
23
        self.controllers = controllers
        self.block_size = block_size
        self.num_gpu_blocks = num_gpu_blocks
        self.num_cpu_blocks = num_cpu_blocks

        # Create the block space manager.
Woosuk Kwon's avatar
Woosuk Kwon committed
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
        self.block_manager = BlockSpaceManager(
            block_size=block_size,
            num_gpu_blocks=num_gpu_blocks,
            num_cpu_blocks=num_cpu_blocks,
        )

        # Serving sequence groups (FIFO).
        self.serving: List[SequenceGroup] = []
        # Mapping: group_id -> num_steps.
        self.num_steps: Dict[int, int] = {}
        # Mapping: group_id -> max_num_steps.
        self.max_num_steps: Dict[int, int] = {}
        # Mapping: group_id -> stop_token_ids.
        self.stop_token_ids: Dict[int, List[int]] = {}

        # Swapped sequence groups (LIFO).
        self.swapped: List[SequenceGroup] = []
        # Pending sequence groups (FIFO).
Woosuk Kwon's avatar
Woosuk Kwon committed
42
43
44
        self.pending: List[SequenceGroup] = []

        # Blocks that need to be swaped or copied before model execution.
Woosuk Kwon's avatar
Woosuk Kwon committed
45
46
47
        self.blocks_to_swap_in: Dict[int, int] = {}
        self.blocks_to_swap_out: Dict[int, int] = {}
        self.blocks_to_copy: Dict[int, int] = {}
Woosuk Kwon's avatar
Woosuk Kwon committed
48
49
50
51
52
53
54
55
56
57

    def _free_seq(self, seq: Sequence) -> None:
        seq.status = SequenceStatus.FINISHED
        self.block_manager.free(seq)

    def _allocate(self, seq_group: SequenceGroup) -> None:
        self.block_manager.allocate(seq_group)
        for seq in seq_group.seqs:
            seq.status = SequenceStatus.RUNNING
        self.serving.append(seq_group)
Woosuk Kwon's avatar
Woosuk Kwon committed
58
59
        # FIXME
        self.num_steps[seq_group.group_id] = 0
Woosuk Kwon's avatar
Woosuk Kwon committed
60
61
62
63
64
65
66
67

    def _append(self, seq_group: SequenceGroup) -> None:
        for seq in seq_group.seqs:
            if seq.status == SequenceStatus.FINISHED:
                continue
            ret = self.block_manager.append(seq)
            if ret is not None:
                src_block, dst_block = ret
Woosuk Kwon's avatar
Woosuk Kwon committed
68
                self.blocks_to_copy[src_block] = dst_block
Woosuk Kwon's avatar
Woosuk Kwon committed
69
70

    def _swap_in(self, seq_group: SequenceGroup) -> None:
Woosuk Kwon's avatar
Woosuk Kwon committed
71
72
        mapping = self.block_manager.swap_in(seq_group)
        self.blocks_to_swap_in.update(mapping)
Woosuk Kwon's avatar
Woosuk Kwon committed
73
74
75
76
77
78
79
        for seq in seq_group.seqs:
            if seq.status == SequenceStatus.SWAPPED:
                seq.status = SequenceStatus.RUNNING
        self.serving.append(seq_group)

    def _swap_out(self, seq_group: SequenceGroup) -> None:
        assert self.block_manager.can_swap_out(seq_group)
Woosuk Kwon's avatar
Woosuk Kwon committed
80
81
        mapping = self.block_manager.swap_out(seq_group)
        self.blocks_to_swap_out.update(mapping)
Woosuk Kwon's avatar
Woosuk Kwon committed
82
83
84
85
86
        for seq in seq_group.seqs:
            if seq.status == SequenceStatus.RUNNING:
                seq.status = SequenceStatus.SWAPPED
        self.swapped.append(seq_group)

Woosuk Kwon's avatar
Woosuk Kwon committed
87
    def prepare(self) -> None:
Woosuk Kwon's avatar
Woosuk Kwon committed
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
        # 1. Prepare new slots for the running sequences.
        # NOTE: Here we implicitly assume FCFS scheduling.
        # That is, the most recently added sequence group is the first
        # to be swapped out.
        victim_idx = len(self.serving) - 1
        for i, seq_group in enumerate(self.serving):
            if i > victim_idx:
                # The i-th sequence group has already been swapped out.
                break
            # OOM. Swap out the victim sequence groups.
            while not self.block_manager.can_append(seq_group):
                victim_seq_group = self.serving[victim_idx]
                self._swap_out(victim_seq_group)
                victim_idx -= 1
                if i > victim_idx:
                    # No other sequence groups can be swapped out.
                    break
            else:
                self._append(seq_group)
        self.serving = self.serving[:victim_idx + 1]

        # 2. Swap in the swapped sequences if possible.
        # NOTE: Here we implicitly assume FCFS scheduling.
        # The swapped sequences are in LIFO order.
        for i, seq_group in enumerate(reversed(self.swapped)):
            if self.block_manager.can_swap_in(seq_group):
                self._swap_in(seq_group)
Woosuk Kwon's avatar
Woosuk Kwon committed
115
                self._append(seq_group)
Woosuk Kwon's avatar
Woosuk Kwon committed
116
117
118
119
120
121
122
123
124
125
126
127
            else:
                # OOM. Stop swapping.
                self.swapped = self.swapped[:len(self.swapped) - i]
                break
        else:
            # All swapped sequences are swapped in.
            self.swapped.clear()

        # 3. Join new sequences if possible.
        # NOTE: Here we implicitly assume FCFS scheduling.
        # TODO(woosuk): Add a heuristic to control the maximum batch size.
        if not self.swapped:
Woosuk Kwon's avatar
Woosuk Kwon committed
128
            # FIXME: Acquire a lock.
Woosuk Kwon's avatar
Woosuk Kwon committed
129
            for i, seq_group in enumerate(self.pending):
Woosuk Kwon's avatar
Woosuk Kwon committed
130
131
132
133
                if self.block_manager.can_allocate(seq_group):
                    self._allocate(seq_group)
                else:
                    # FIXME: Consider the race condition.
Woosuk Kwon's avatar
Woosuk Kwon committed
134
                    self.pending = self.pending[i:]
Woosuk Kwon's avatar
Woosuk Kwon committed
135
                    break
Woosuk Kwon's avatar
Woosuk Kwon committed
136
137
            else:
                self.pending.clear()
Woosuk Kwon's avatar
Woosuk Kwon committed
138

Woosuk Kwon's avatar
Woosuk Kwon committed
139
140
    def step(self) -> None:
        # Ensure that either swap-in or swap-out is performed.
Woosuk Kwon's avatar
Woosuk Kwon committed
141
142
        if self.blocks_to_swap_in:
            assert not self.blocks_to_swap_out
Woosuk Kwon's avatar
Woosuk Kwon committed
143
144
145
146
147
148
149

        # Execute the first stage of the pipeline.
        self.controllers[0].execute_stage(
            self.blocks_to_swap_in.copy(),
            self.blocks_to_swap_out.copy(),
            self.blocks_to_copy.copy(),
        )
Woosuk Kwon's avatar
Woosuk Kwon committed
150

Woosuk Kwon's avatar
Woosuk Kwon committed
151
152
153
154
155
        # Clear for the next step.
        self.blocks_to_swap_in.clear()
        self.blocks_to_swap_out.clear()
        self.blocks_to_copy.clear()

Woosuk Kwon's avatar
Woosuk Kwon committed
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
    def post_step(
        self,
        next_tokens: Dict[int, Tuple[int, int]],
    ) -> None:
        # Update the running sequences and free blocks.
        for seq_group in self.serving:
            group_id = seq_group.group_id
            self.num_steps[group_id] += 1
            stop_token_ids = self.stop_token_ids[group_id]

            for seq in seq_group.seqs:
                if seq.status == SequenceStatus.FINISHED:
                    continue

                parent_seq_id, next_token = next_tokens[seq.seq_id]
                if seq.seq_id != parent_seq_id:
                    # The sequence is a fork of the parent sequence (beam search).
                    # Free the current sequence.
                    self.block_manager.free(seq)
                    # Fork the parent sequence.
                    parent_seq = seq_group.find(parent_seq_id)
                    seq.logical_token_blocks = parent_seq.logical_token_blocks.copy()
                    self.block_manager.fork(parent_seq, seq)

                # Append a new token to the sequence.
                seq.append(next_token)

                # Check if the sequence has generated a stop token.
                if next_token in stop_token_ids:
                    self._free_seq(seq)
                    continue

                # Check if the sequence has reached the maximum number of steps.
                if self.num_steps[group_id] == self.max_num_steps[group_id]:
                    self._free_seq(seq)
                    continue

        # Update the serving states.
        serving: List[SequenceGroup] = []
        for seq_group in self.serving:
Woosuk Kwon's avatar
Woosuk Kwon committed
196
            if all(seq.status == SequenceStatus.FINISHED for seq in seq_group.seqs):
Woosuk Kwon's avatar
Woosuk Kwon committed
197
198
199
200
201
202
203
                del self.num_steps[seq_group.group_id]
                del self.max_num_steps[seq_group.group_id]
                del self.stop_token_ids[seq_group.group_id]
                # TODO: Return the seq_group to the client.
            else:
                serving.append(seq_group)
        self.serving = serving