"examples/backends/vllm/launch/agg_omni_audio.sh" did not exist on "75bf1e09930681c76586f281ccff6159a0e50449"
outputs.py 4.43 KB
Newer Older
1
# SPDX-License-Identifier: Apache-2.0
2
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3

4
from abc import ABC, abstractmethod
5
from dataclasses import dataclass
6
from typing import NamedTuple, Optional
7
8
9
10

import torch


11
class LogprobsLists(NamedTuple):
    """Logprob results for a batch of requests, held as plain Python lists.

    The field order is part of the tuple's positional interface (callers
    construct this class positionally), so it must not be rearranged.
    """

    # [num_reqs, max_num_logprobs + 1]
    logprob_token_ids: list[list[int]]
    # [num_reqs, max_num_logprobs + 1]
    logprobs: list[list[float]]
    # [num_reqs]
    sampled_token_ranks: list[int]

    def slice(self, start: int, end: int) -> "LogprobsLists":
        """Return a new LogprobsLists restricted to rows ``start:end``.

        Each field is cut along its first (request) axis using ordinary
        Python list slicing, so out-of-range bounds are clamped instead of
        raising and the outer lists of the result are fresh copies.
        """
        return LogprobsLists(
            self.logprob_token_ids[start:end],
            self.logprobs[start:end],
            self.sampled_token_ranks[start:end],
        )


class LogprobsTensors(NamedTuple):
    """Logprob results for a batch of positions, held as torch tensors.

    The field order is part of the tuple's positional interface and must
    not be rearranged.
    """

    # [num_reqs, max_num_logprobs + 1]
    logprob_token_ids: torch.Tensor
    # [num_reqs, max_num_logprobs + 1]
    logprobs: torch.Tensor
    # [num_reqs]
    selected_token_ranks: torch.Tensor

    def tolists(self) -> "LogprobsLists":
        """Convert every tensor to nested Python lists via ``Tensor.tolist``.

        The result is built positionally, so ``selected_token_ranks`` here
        becomes ``sampled_token_ranks`` on the returned LogprobsLists.
        """
        return LogprobsLists(
            self.logprob_token_ids.tolist(),
            self.logprobs.tolist(),
            self.selected_token_ranks.tolist(),
        )

    @staticmethod
    def empty_cpu(num_positions: int,
                  num_tokens_per_position: int) -> "LogprobsTensors":
        """Create empty LogprobsTensors on CPU.

        Args:
            num_positions: Size of the first dimension of every tensor.
            num_tokens_per_position: Size of the second dimension of the
                two 2-D tensors.

        Returns:
            A LogprobsTensors whose tensors are allocated with
            ``torch.empty`` (contents uninitialized): int32 token ids,
            float32 logprobs and int32 ranks.
        """

        logprob_token_ids = torch.empty(
            (num_positions, num_tokens_per_position),
            dtype=torch.int32,
            device="cpu")
        # Same shape and device as the token ids, but float32 values.
        logprobs = torch.empty_like(logprob_token_ids, dtype=torch.float32)
        selected_token_ranks = torch.empty(num_positions,
                                           dtype=torch.int32,
                                           device="cpu")
        return LogprobsTensors(
            logprob_token_ids=logprob_token_ids,
            logprobs=logprobs,
            selected_token_ranks=selected_token_ranks,
        )

63
64
65
66

@dataclass
class SamplerOutput:
    """Container pairing sampled token ids with optional logprob tensors."""

    # [num_reqs, max_num_generated_tokens]
    # Different requests can have different number of generated tokens.
    # All requests are padded to max_num_generated_tokens.
    # PLACEHOLDER_TOKEN_ID (-1 by default) is used for padding.
    sampled_token_ids: torch.Tensor
    # Optional; presumably None when no logprobs were requested
    # (TODO confirm against the sampler).
    logprobs_tensors: Optional[LogprobsTensors]
73
74


75
76
77
78
79
80
81
@dataclass
class KVConnectorOutput:
    """Request-id sets reported by a KV connector.

    Both fields default to None, so the class can be built with no
    arguments.
    """

    # Each field holds a set of request ids ([req_ids]).
    # NOTE(review): presumably the requests whose KV transfer has finished
    # sending / receiving respectively -- confirm against the connector.
    finished_sending: Optional[set[str]] = None
    finished_recving: Optional[set[str]] = None


82
# ModelRunnerOutput is serialized and sent to the scheduler process.
83
# This is expensive for torch.Tensor so prefer to use list instead.
84
85
86
87
@dataclass
class ModelRunnerOutput:
    """Per-step output of the model runner.

    Instances are serialized and sent to the scheduler process, which is
    expensive for torch.Tensor, so list-based fields are preferred.
    """

    # [num_reqs]
    req_ids: list[str]
    # req_id -> index
    req_id_to_index: dict[str, int]

    # num_reqs x num_generated_tokens
    # num_generated_tokens is the number of tokens
    # generated in the current step. It can be different for
    # each request due to speculative/jump decoding.
    sampled_token_ids: list[list[int]]

    # Shapes of the LogprobsLists fields, in order:
    #   logprob_token_ids:   [num_reqs, max_num_logprobs + 1]
    #   logprobs:            [num_reqs, max_num_logprobs + 1]
    #   sampled_token_ranks: [num_reqs]
    logprobs: Optional[LogprobsLists]

    # req_id -> (token_ids, logprobs, ranks)
    #   token_ids: [prompt_len, num_prompt_logprobs]
    #   logprobs:  [prompt_len, num_prompt_logprobs]
    #   ranks:     [prompt_len]
    prompt_logprobs_dict: dict[str, Optional[LogprobsTensors]]

    # [num_reqs, hidden_size]
    pooler_output: list[Optional[torch.Tensor]]

    kv_connector_output: Optional[KVConnectorOutput] = None

    # req_id -> num_nans_in_logits
    num_nans_in_logits: Optional[dict[str, int]] = None

Robert Shaw's avatar
Robert Shaw committed
117

118
119
120
121
122
123
124
125
126
127
128
129
130
131
# ModelRunnerOutput wrapper for async scheduling.
class AsyncModelRunnerOutput(ABC):
    """Abstract handle to a ModelRunnerOutput that may not be ready yet."""

    @abstractmethod
    def get_output(self) -> ModelRunnerOutput:
        """Block until the results are ready and return them.

        Waiting might involve copying device tensors to the host.
        This method should only be called once per AsyncModelRunnerOutput.
        """
        ...


132
133
134
135
136
137
138
139
140
@dataclass
class DraftTokenIds:
    """Draft token ids grouped by request.

    The two lists are parallel: one entry per request in each.
    """

    # One request id per request: [num_reqs]
    req_ids: list[str]
    # One list of draft token ids per request: num_reqs x num_draft_tokens
    draft_token_ids: list[list[int]]


Robert Shaw's avatar
Robert Shaw committed
141
142
143
144
145
# Module-level ModelRunnerOutput with no requests: every container field is
# empty and every optional field is None.
# NOTE(review): the lists/dicts below are shared by everyone who uses this
# constant -- presumably callers treat it as read-only; confirm before
# mutating.
EMPTY_MODEL_RUNNER_OUTPUT = ModelRunnerOutput(req_ids=[],
                                              req_id_to_index={},
                                              sampled_token_ids=[],
                                              logprobs=None,
                                              prompt_logprobs_dict={},
                                              pooler_output=[],
                                              num_nans_in_logits=None)