"vllm/entrypoints/serve/disagg/serving.py" did not exist on "6f1e7f7226447f606a0731376a2d0bd080aa2767"
__init__.py 3.02 KB
Newer Older
1
2
# SPDX-License-Identifier: Apache-2.0

3
import enum
4
from typing import List, Optional, Union
5
6
7

import msgspec

8
9
10
11
from vllm.lora.request import LoRARequest
from vllm.multimodal import MultiModalKwargs
from vllm.multimodal.inputs import PlaceholderRange
from vllm.sampling_params import SamplingParams
12
from vllm.v1.metrics.stats import SchedulerStats
13
from vllm.v1.outputs import LogprobsLists, LogprobsTensors
14

15
16
17
# These are possible values of RequestOutput.finish_reason,
# so form part of the external API.
FINISH_REASON_STRINGS = ("stop", "length", "abort")


class FinishReason(enum.IntEnum):
    """
    Reason a request finished - stop, length, or abort.

    Int rather than Str for more compact serialization.

    stop - a stop string was emitted
    length - max_tokens was consumed, or max_model_len was reached
    abort - aborted for another reason

    """
    # Each value doubles as the index of its name in FINISH_REASON_STRINGS,
    # so the members and the tuple must stay in the same order.
    STOP = 0
    LENGTH = 1
    ABORT = 2

    def __str__(self) -> str:
        """Return the external API string ("stop", "length" or "abort")."""
        return FINISH_REASON_STRINGS[self.value]
37
38


39
40
41
42
43
class EngineCoreRequest(
        msgspec.Struct,
        array_like=True,  # type: ignore[call-arg]
        omit_defaults=True,  # type: ignore[call-arg]
        gc=False):  # type: ignore[call-arg]
    """
    Per-request data, declared as a msgspec Struct.

    The struct options follow the msgspec documentation:
    array_like=True encodes instances as arrays instead of dicts, so the
    field order below is part of the serialized format and must not be
    reordered; gc=False disables garbage-collector tracking for instances.
    """

    # NOTE: prompt and prompt_token_ids should be DecoderOnlyInput,
    # but this object is currently not playing well with msgspec
    # due to circular imports and typing we have in data.py

    request_id: str
    # NOTE(ywang96): original text prompt is needed when a request is added to
    # Detokenizer, but set to None when it is added to EngineCoreClient.
    prompt: Optional[str]
    prompt_token_ids: List[int]
    mm_inputs: Optional[List[Optional[MultiModalKwargs]]]
    mm_hashes: Optional[List[str]]
    mm_placeholders: Optional[List[PlaceholderRange]]
    sampling_params: SamplingParams
    eos_token_id: Optional[int]
    arrival_time: float
    lora_request: Optional[LoRARequest]
61
62


63
64
65
66
67
class EngineCoreOutput(
        msgspec.Struct,
        array_like=True,  # type: ignore[call-arg]
        omit_defaults=True,  # type: ignore[call-arg]
        gc=False):  # type: ignore[call-arg]
    """
    Output for one request, declared as a msgspec Struct.

    The struct options follow the msgspec documentation:
    array_like=True encodes instances as arrays instead of dicts, so the
    field order below is part of the serialized format; omit_defaults=True
    leaves fields that still hold their default (None here) out of the
    encoded message; gc=False disables garbage-collector tracking.
    """

    request_id: str
    new_token_ids: List[int]

    new_logprobs: Optional[LogprobsLists] = None
    new_prompt_logprobs_tensors: Optional[LogprobsTensors] = None

    # finish_reason stays None until the request has finished; see `finished`.
    finish_reason: Optional[FinishReason] = None
    stop_reason: Union[int, str, None] = None

    @property
    def finished(self) -> bool:
        """Return True when a finish_reason has been set."""
        return self.finish_reason is not None

82

83
84
85
86
87
class EngineCoreOutputs(
        msgspec.Struct,
        array_like=True,  # type: ignore[call-arg]
        omit_defaults=True,  # type: ignore[call-arg]
        gc=False):  # type: ignore[call-arg]
    """
    A list of per-request EngineCoreOutput items plus scheduler stats.

    Declared as a msgspec Struct; with array_like=True instances are
    encoded as arrays, so the field order below is part of the serialized
    format.
    """

    # NOTE(Nick): We could consider ways to make this more compact,
    # e.g. columnwise layout

    # [num_reqs]
    outputs: List[EngineCoreOutput]
    scheduler_stats: SchedulerStats
95
96
97
98
99
100
101
102
103


class EngineCoreRequestType(enum.Enum):
    """
    Request types defined as single-byte strings, so that they can be sent
    over sockets without a separate encoding step.
    """
    ADD = b'\x00'
    ABORT = b'\x01'
    PROFILE = b'\x02'
    RESET_PREFIX_CACHE = b'\x03'