sequence.py 15.2 KB
Newer Older
1
"""Sequence and its related classes."""
2
import copy
Woosuk Kwon's avatar
Woosuk Kwon committed
3
import enum
Zhuohan Li's avatar
Zhuohan Li committed
4
from typing import Dict, List, Optional, Union
Woosuk Kwon's avatar
Woosuk Kwon committed
5

Woosuk Kwon's avatar
Woosuk Kwon committed
6
from vllm.block import LogicalTokenBlock
7
from vllm.prefix import Prefix
Woosuk Kwon's avatar
Woosuk Kwon committed
8
from vllm.sampling_params import SamplingParams
9
from vllm.lora.request import LoRARequest
Woosuk Kwon's avatar
Woosuk Kwon committed
10

11
12
13
# Per-position mapping from token id to log probability. Entries in the
# prompt logprobs may be None (not every prompt position carries a logprob).
PromptLogprobs = List[Optional[Dict[int, float]]]
# One mapping (token id -> log probability) per sampled output token.
SampleLogprobs = List[Dict[int, float]]

Woosuk Kwon's avatar
Woosuk Kwon committed
14
15

class SequenceStatus(enum.Enum):
    """Status of a sequence."""
    WAITING = enum.auto()
    RUNNING = enum.auto()
    SWAPPED = enum.auto()
    FINISHED_STOPPED = enum.auto()
    FINISHED_LENGTH_CAPPED = enum.auto()
    FINISHED_ABORTED = enum.auto()
    FINISHED_IGNORED = enum.auto()

    @staticmethod
    def is_finished(status: "SequenceStatus") -> bool:
        """Return True if `status` is one of the terminal FINISHED_* states."""
        return status in (
            SequenceStatus.FINISHED_STOPPED,
            SequenceStatus.FINISHED_LENGTH_CAPPED,
            SequenceStatus.FINISHED_ABORTED,
            SequenceStatus.FINISHED_IGNORED,
        )

    @staticmethod
    def get_finished_reason(status: "SequenceStatus") -> Union[str, None]:
        """Map a terminal status to its OpenAI-style finish reason.

        Returns None for non-terminal statuses. Ignored sequences are the
        sequences whose prompt lengths are longer than the model's length
        cap, so their reason is reported as "length" as in the OpenAI API.
        """
        finish_reasons = {
            SequenceStatus.FINISHED_STOPPED: "stop",
            SequenceStatus.FINISHED_LENGTH_CAPPED: "length",
            SequenceStatus.FINISHED_ABORTED: "abort",
            SequenceStatus.FINISHED_IGNORED: "length",
        }
        return finish_reasons.get(status)
Woosuk Kwon's avatar
Woosuk Kwon committed
50

51

52
class SequenceData:
    """Data associated with a sequence.

    Args:
        prompt_token_ids: The token IDs of the prompt.

    Attributes:
        prompt_token_ids: The token IDs of the prompt.
        output_token_ids: The token IDs of the output.
        cumulative_logprob: The cumulative log probability of the output.
    """

    def __init__(
        self,
        prompt_token_ids: List[int],
    ) -> None:
        self.prompt_token_ids = prompt_token_ids
        self.output_token_ids: List[int] = []
        self.cumulative_logprob = 0.0

    def append_token_id(self, token_id: int, logprob: float) -> None:
        """Record one generated token and fold its logprob into the total."""
        self.output_token_ids.append(token_id)
        self.cumulative_logprob += logprob

    def get_len(self) -> int:
        """Total number of tokens (prompt + output)."""
        return len(self.prompt_token_ids) + len(self.output_token_ids)

    def get_prompt_len(self) -> int:
        return len(self.prompt_token_ids)

    def get_output_len(self) -> int:
        return len(self.output_token_ids)

    def get_token_ids(self) -> List[int]:
        """Prompt tokens followed by output tokens, as a single new list."""
        return [*self.prompt_token_ids, *self.output_token_ids]

    def get_last_token_id(self) -> int:
        # Until any output exists, the last token is the prompt's final one.
        return (self.output_token_ids[-1]
                if self.output_token_ids else self.prompt_token_ids[-1])

    def __repr__(self) -> str:
        return (f"SequenceData("
                f"prompt_token_ids={self.prompt_token_ids}, "
                f"output_token_ids={self.output_token_ids}, "
                f"cumulative_logprob={self.cumulative_logprob})")
98
99


Woosuk Kwon's avatar
Woosuk Kwon committed
100
class Sequence:
    """Stores the data, status, and block information of a sequence.

    Args:
        seq_id: The ID of the sequence.
        prompt: The prompt of the sequence.
        prompt_token_ids: The token IDs of the prompt.
        block_size: The block size of the sequence. Should be the same as the
            block size used by the block manager and cache engine.
        lora_request: LoRA request.
    """

    def __init__(
        self,
        seq_id: int,
        prompt: str,
        prompt_token_ids: List[int],
        block_size: int,
        lora_request: Optional[LoRARequest] = None,
    ) -> None:
        self.seq_id = seq_id
        self.prompt = prompt
        self.block_size = block_size
        self.lora_request = lora_request

        self.data = SequenceData(prompt_token_ids)
        self.output_logprobs: SampleLogprobs = []
        self.output_text = ""

        self.logical_token_blocks: List[LogicalTokenBlock] = []
        # Lay the prompt token ids out into logical blocks up front.
        self._append_tokens_to_blocks(prompt_token_ids)
        self.status = SequenceStatus.WAITING

        # Used for incremental detokenization
        self.prefix_offset = 0
        self.read_offset = 0
        # Input + output tokens
        self.tokens: Optional[List[str]] = None

    @property
    def lora_int_id(self) -> int:
        # A missing LoRA request maps to the base model (id 0).
        return 0 if not self.lora_request else self.lora_request.lora_int_id

    def _append_logical_block(self) -> None:
        """Grow the logical block list by one empty, consecutively
        numbered block."""
        self.logical_token_blocks.append(
            LogicalTokenBlock(
                block_number=len(self.logical_token_blocks),
                block_size=self.block_size,
            ))

    def _append_tokens_to_blocks(self, token_ids: List[int]) -> None:
        """Write `token_ids` into the logical blocks, appending new blocks
        whenever the tail block runs out of empty slots."""
        start = 0
        total = len(token_ids)
        while start < total:
            # Make sure the tail block exists and has free space.
            if (not self.logical_token_blocks
                    or self.logical_token_blocks[-1].is_full()):
                self._append_logical_block()
            tail_block = self.logical_token_blocks[-1]
            free_slots = tail_block.get_num_empty_slots()
            tail_block.append_tokens(token_ids[start:start + free_slots])
            start += free_slots

    def append_token_id(
        self,
        token_id: int,
        logprobs: Dict[int, float],
    ) -> None:
        """Append one generated token along with its logprob mapping."""
        assert token_id in logprobs
        self._append_tokens_to_blocks([token_id])
        self.output_logprobs.append(logprobs)
        self.data.append_token_id(token_id, logprobs[token_id])

    def get_len(self) -> int:
        return self.data.get_len()

    def get_prompt_len(self) -> int:
        return self.data.get_prompt_len()

    def get_output_len(self) -> int:
        return self.data.get_output_len()

    def get_token_ids(self) -> List[int]:
        return self.data.get_token_ids()

    def get_last_token_id(self) -> int:
        return self.data.get_last_token_id()

    def get_output_token_ids(self) -> List[int]:
        return self.data.output_token_ids

    def get_cumulative_logprob(self) -> float:
        return self.data.cumulative_logprob

    def get_beam_search_score(self,
                              length_penalty: float = 1.0,
                              seq_len: Optional[int] = None,
                              eos_token_id: Optional[int] = None) -> float:
        """Calculate the beam search score with length penalty.

        Adapted from

        https://github.com/huggingface/transformers/blob/ccb92be23def445f2afdea94c31286f84b89eb5b/src/transformers/generation/beam_search.py#L938
        """
        if seq_len is None:
            seq_len = self.get_len()
            # NOTE: HF implementation does not count the EOS token
            # towards the length, we align with that here for testing.
            if (eos_token_id is not None
                    and self.get_last_token_id() == eos_token_id):
                seq_len -= 1
        return self.get_cumulative_logprob() / (seq_len**length_penalty)

    def is_finished(self) -> bool:
        return SequenceStatus.is_finished(self.status)

    def fork(self, new_seq_id: int) -> "Sequence":
        """Deep-copy this sequence under a new sequence id."""
        clone = copy.deepcopy(self)
        clone.seq_id = new_seq_id
        return clone

    def __repr__(self) -> str:
        return (f"Sequence(seq_id={self.seq_id}, "
                f"status={self.status.name}, "
                f"num_blocks={len(self.logical_token_blocks)})")
Woosuk Kwon's avatar
Woosuk Kwon committed
229

Woosuk Kwon's avatar
Woosuk Kwon committed
230
231

class SequenceGroup:
    """A group of sequences that are generated from the same prompt.

    Args:
        request_id: The ID of the request.
        seqs: The list of sequences.
        sampling_params: The sampling parameters used to generate the outputs.
        arrival_time: The arrival time of the request.
        lora_request: LoRA request.
        prefix: The prefix of the prompt of the sequence group.
    """

    def __init__(
        self,
        request_id: str,
        seqs: List[Sequence],
        sampling_params: SamplingParams,
        arrival_time: float,
        lora_request: Optional[LoRARequest] = None,
        prefix: Optional[Prefix] = None,
    ) -> None:
        self.request_id = request_id
        self.seqs_dict = {seq.seq_id: seq for seq in seqs}
        self.sampling_params = sampling_params
        self.arrival_time = arrival_time
        self.last_token_time = arrival_time
        self.lora_request = lora_request
        self.prefix: Optional[Prefix] = prefix
        self.prompt_logprobs: Optional[PromptLogprobs] = None

    @property
    def prompt(self) -> str:
        # Every sequence in the group shares the same prompt, so any
        # member can supply it.
        return next(iter(self.seqs_dict.values())).prompt

    @property
    def prompt_token_ids(self) -> List[int]:
        # Every sequence in the group shares the same prompt token ids.
        return next(iter(self.seqs_dict.values())).data.prompt_token_ids

    @property
    def lora_int_id(self) -> int:
        # A missing LoRA request maps to the base model (id 0).
        return 0 if not self.lora_request else self.lora_request.lora_int_id

    def get_last_latency(self, now: float) -> float:
        """Gets last token latency for Request level timings."""
        elapsed = now - self.last_token_time
        self.last_token_time = now
        return elapsed

    def get_max_num_running_seqs(self) -> int:
        """The maximum number of sequences running in parallel in the remaining
        lifetime of the request."""
        best_of = self.sampling_params.best_of
        if self.sampling_params.use_beam_search:
            # Beam search always keeps `best_of` beam candidates alive.
            return best_of
        if best_of > self.num_seqs():
            # Prompt stage: the group holds a single sequence so far, but
            # `best_of` sequences will be running during generation.
            return best_of
        # Generation stage: only the not-yet-finished sequences still run.
        return self.num_unfinished_seqs()

    def get_seqs(
        self,
        status: Optional[SequenceStatus] = None,
    ) -> List[Sequence]:
        """Return the group's sequences, optionally filtered by status."""
        all_seqs = self.seqs_dict.values()
        if status is None:
            return list(all_seqs)
        return [seq for seq in all_seqs if seq.status == status]

    def get_unfinished_seqs(self) -> List[Sequence]:
        return [
            seq for seq in self.seqs_dict.values() if not seq.is_finished()
        ]

    def get_finished_seqs(self) -> List[Sequence]:
        return [seq for seq in self.seqs_dict.values() if seq.is_finished()]

    def num_seqs(self, status: Optional[SequenceStatus] = None) -> int:
        return len(self.get_seqs(status))

    def num_unfinished_seqs(self) -> int:
        return len(self.get_unfinished_seqs())

    def num_finished_seqs(self) -> int:
        return len(self.get_finished_seqs())

    def find(self, seq_id: int) -> Sequence:
        """Look up a sequence by id; raise ValueError if absent."""
        try:
            return self.seqs_dict[seq_id]
        except KeyError:
            raise ValueError(f"Sequence {seq_id} not found.") from None

    def add(self, seq: Sequence) -> None:
        """Add a sequence; raise ValueError on a duplicate id."""
        if seq.seq_id in self.seqs_dict:
            raise ValueError(f"Sequence {seq.seq_id} already exists.")
        self.seqs_dict[seq.seq_id] = seq

    def remove(self, seq_id: int) -> None:
        """Remove a sequence by id; raise ValueError if absent."""
        if self.seqs_dict.pop(seq_id, None) is None:
            raise ValueError(f"Sequence {seq_id} not found.")

    def is_finished(self) -> bool:
        return all(seq.is_finished() for seq in self.seqs_dict.values())

    def __repr__(self) -> str:
        return (f"SequenceGroup(request_id={self.request_id}, "
                f"sampling_params={self.sampling_params}, "
                f"num_seqs={len(self.seqs_dict)})")
350
351


352
class SequenceGroupMetadata:
    """Metadata for a sequence group. Used to create `InputMetadata`.

    Args:
        request_id: The ID of the request.
        is_prompt: Whether the request is at prompt stage.
        seq_data: The sequence data. (Seq id -> sequence data)
        sampling_params: The sampling parameters used to generate the outputs.
        block_tables: The block tables. (Seq id -> list of physical block
            numbers)
        lora_request: LoRA request.
        prefix: The prefix of the prompt of the sequence group.
    """

    def __init__(
        self,
        request_id: str,
        is_prompt: bool,
        seq_data: Dict[int, SequenceData],
        sampling_params: SamplingParams,
        block_tables: Dict[int, List[int]],
        lora_request: Optional[LoRARequest] = None,
        prefix: Optional[Prefix] = None,
    ) -> None:
        self.request_id = request_id
        self.is_prompt = is_prompt
        self.seq_data = seq_data
        self.sampling_params = sampling_params
        self.block_tables = block_tables
        self.lora_request = lora_request
        self.prefix = prefix

    @property
    def lora_int_id(self) -> int:
        # A missing LoRA request maps to the base model (id 0).
        return 0 if not self.lora_request else self.lora_request.lora_int_id

388

Zhuohan Li's avatar
Zhuohan Li committed
389
class SequenceOutput:
    """The model output associated with a sequence.

    Args:
        parent_seq_id: The ID of the parent sequence (for forking in beam
            search).
        output_token: The output token ID.
        logprobs: The logprobs of the output token.
            (Token id -> logP(x_i+1 | x_0, ..., x_i))
    """

    def __init__(
        self,
        parent_seq_id: int,
        output_token: int,
        logprobs: Dict[int, float],
    ) -> None:
        self.parent_seq_id = parent_seq_id
        self.output_token = output_token
        self.logprobs = logprobs

    def __repr__(self) -> str:
        return (f"SequenceOutput(parent_seq_id={self.parent_seq_id}, "
                f"output_token={self.output_token}, "
                f"logprobs={self.logprobs})")

    def __eq__(self, other: object) -> bool:
        # Follow the equality protocol: return NotImplemented (instead of
        # raising) for unrelated types so Python can fall back to the other
        # operand's __eq__; comparisons like `x == None` or membership tests
        # over mixed lists then work rather than crashing.
        if not isinstance(other, SequenceOutput):
            return NotImplemented
        return (self.parent_seq_id == other.parent_seq_id
                and self.output_token == other.output_token
                and self.logprobs == other.logprobs)
421
422


Zhuohan Li's avatar
Zhuohan Li committed
423
424
class SequenceGroupOutput:
    """The model output associated with a sequence group.

    Args:
        samples: One SequenceOutput per candidate for the group's next token.
        prompt_logprobs: Logprobs for the prompt tokens, if available.
    """

    def __init__(
        self,
        samples: List[SequenceOutput],
        prompt_logprobs: Optional[PromptLogprobs],
    ) -> None:
        self.samples = samples
        self.prompt_logprobs = prompt_logprobs

    def __repr__(self) -> str:
        return (f"SequenceGroupOutput(samples={self.samples}, "
                f"prompt_logprobs={self.prompt_logprobs})")

    def __eq__(self, other: object) -> bool:
        # Follow the equality protocol: return NotImplemented (instead of
        # raising) for unrelated types so comparisons against non-
        # SequenceGroupOutput values degrade to False instead of crashing.
        if not isinstance(other, SequenceGroupOutput):
            return NotImplemented
        return (self.samples == other.samples
                and self.prompt_logprobs == other.prompt_logprobs)

444

Zhuohan Li's avatar
Zhuohan Li committed
445
# For each sequence group, we generate a list of SequenceOutput object,
# each of which contains one possible candidate for the next token.
SamplerOutput = List[SequenceGroupOutput]