__init__.py 4.32 KB
Newer Older
1
2
# SPDX-License-Identifier: Apache-2.0

3
import enum
4
import time
5
from typing import Any, Optional, Union
6
7
8

import msgspec

9
10
11
12
from vllm.lora.request import LoRARequest
from vllm.multimodal import MultiModalKwargs
from vllm.multimodal.inputs import PlaceholderRange
from vllm.sampling_params import SamplingParams
13
from vllm.v1.metrics.stats import SchedulerStats
14
from vllm.v1.outputs import LogprobsLists, LogprobsTensors
15

16
17
18
# These are possible values of RequestOutput.finish_reason,
# so form part of the external API.
# FinishReason.__str__ indexes this tuple by member value, so the order
# here must stay aligned with the FinishReason values (STOP=0, LENGTH=1,
# ABORT=2).
FINISH_REASON_STRINGS = ("stop", "length", "abort")
19

20
21

class FinishReason(enum.IntEnum):
    """
    Reason a request finished - stop, length, or abort.

    Int rather than Str for more compact serialization.

    stop - a stop string was emitted
    length - max_tokens was consumed, or max_model_len was reached
    abort - aborted for another reason

    ``str(reason)`` returns the matching entry of FINISH_REASON_STRINGS,
    so member values must stay aligned with that tuple's indices.
    """
    STOP = 0
    LENGTH = 1
    ABORT = 2

    def __str__(self) -> str:
        # Member values double as indices into FINISH_REASON_STRINGS.
        return FINISH_REASON_STRINGS[self.value]
38
39


40
41
42
43
44
class EngineCoreRequest(
        msgspec.Struct,
        array_like=True,  # type: ignore[call-arg]
        omit_defaults=True,  # type: ignore[call-arg]
        gc=False):  # type: ignore[call-arg]
    """Fields describing one request, identified by ``request_id``.

    ``array_like=True`` makes msgspec encode instances positionally, so the
    declaration order of the fields below is part of the serialized format;
    do not reorder them.
    """

    # NOTE: prompt and prompt_token_ids should be DecoderOnlyInput,
    # but this object is currently not playing well with msgspec
    # due to circular imports and typing we have in data.py

    request_id: str
    # NOTE(ywang96): original text prompt is needed when a request is added to
    # Detokenizer, but set to None when it is added to EngineCoreClient.
    prompt: Optional[str]
    prompt_token_ids: list[int]
    mm_inputs: Optional[list[MultiModalKwargs]]
    mm_hashes: Optional[list[str]]
    mm_placeholders: Optional[list[PlaceholderRange]]
    sampling_params: SamplingParams
    eos_token_id: Optional[int]
    arrival_time: float
    lora_request: Optional[LoRARequest]
62
63


64
65
66
67
class EngineCoreEventType(enum.IntEnum):
    """The type of engine core request event."""
    QUEUED = 1
    SCHEDULED = 2
    PREEMPTED = 3
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88


class EngineCoreEvent(msgspec.Struct):
    """A timestamped engine core event associated with a request.

    The timestamp is a monotonic timestamp, used by the engine frontend to
    calculate intervals between engine core events. These timestamps should
    not be compared with timestamps from other processes.
    """
    type: EngineCoreEventType
    timestamp: float

    @classmethod
    def new_event(cls,
                  event_type: EngineCoreEventType,
                  timestamp: Optional[float] = None) -> "EngineCoreEvent":
        """Build an event of ``event_type``.

        When ``timestamp`` is omitted (``None``), the current
        ``time.monotonic()`` reading is used instead.
        """
        if timestamp is None:
            timestamp = time.monotonic()
        return cls(event_type, timestamp)


89
90
91
92
93
class EngineCoreOutput(
        msgspec.Struct,
        array_like=True,  # type: ignore[call-arg]
        omit_defaults=True,  # type: ignore[call-arg]
        gc=False):  # type: ignore[call-arg]
    """Output record for one request, identified by ``request_id``.

    ``array_like=True`` makes msgspec encode instances positionally, so the
    declaration order of the fields below is part of the serialized format.
    """

    request_id: str
    new_token_ids: list[int]

    new_logprobs: Optional[LogprobsLists] = None
    new_prompt_logprobs_tensors: Optional[LogprobsTensors] = None

    # None until the request has finished; see the `finished` property.
    finish_reason: Optional[FinishReason] = None
    stop_reason: Union[int, str, None] = None
    events: Optional[list[EngineCoreEvent]] = None

    @property
    def finished(self) -> bool:
        """Whether a finish reason has been set on this output."""
        return self.finish_reason is not None

109

110
111
112
113
114
115
116
117
118
119
120
121
class UtilityOutput(
        msgspec.Struct,
        array_like=True,  # type: ignore[call-arg]
        gc=False,  # type: ignore[call-arg]
):
    """Outcome of a utility call, keyed by ``call_id``.

    Exactly one of the two optional fields is expected to carry
    information: ``failure_message`` on failure, ``result`` otherwise.
    """

    call_id: int

    # Non-None implies the call failed, result should be None.
    failure_message: Optional[str] = None
    result: Any = None


122
123
124
125
126
class EngineCoreOutputs(
        msgspec.Struct,
        array_like=True,  # type: ignore[call-arg]
        omit_defaults=True,  # type: ignore[call-arg]
        gc=False):  # type: ignore[call-arg]
    """Container for a list of per-request outputs plus optional extras.

    ``array_like=True`` makes msgspec encode instances positionally, so the
    declaration order of the fields below is part of the serialized format.
    """

    # NOTE(Nick): We could consider ways to make this more compact,
    # e.g. columnwise layout

    # [num_reqs]
    # msgspec copies empty-collection defaults per instance, so this `[]`
    # is not shared between instances.
    outputs: list[EngineCoreOutput] = []
    scheduler_stats: Optional[SchedulerStats] = None
    timestamp: float = 0.0

    utility_output: Optional[UtilityOutput] = None

    def __post_init__(self):
        # 0.0 (the default) acts as the "unset" sentinel: stamp the object
        # with the monotonic clock at construction time.
        if self.timestamp == 0.0:
            self.timestamp = time.monotonic()
141
142
143
144
145
146
147
148
149


class EngineCoreRequestType(enum.Enum):
    """
    Request types defined as hex byte strings, so they can be sent over
    sockets without a separate encoding step.
    """
    ADD = b'\x00'
    ABORT = b'\x01'
    UTILITY = b'\x02'