"vllm/entrypoints/openai/engine/serving.py" did not exist on "5c057e068fc64214e34e8249ca302bb957c13305"
utils.py 9.06 KB
Newer Older
1
# SPDX-License-Identifier: Apache-2.0
2
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3

4
import mimetypes
5
6
from collections import defaultdict
from collections.abc import Generator, Sequence
7
from itertools import groupby
8
from typing import TYPE_CHECKING, Any
9

10
import numpy as np
11
import numpy.typing as npt
12
from PIL import Image
13

14
15
from vllm.utils.import_utils import LazyLoader

16
from .hasher import MultiModalHasher
17
18
from .inputs import (
    BatchedTensorInputs,
19
    MultiModalFieldElem,
20
21
    MultiModalKwargsItem,
    MultiModalPlaceholderDict,
22
    MultiModalSharedField,
23
)
24
from .media import AudioMediaIO, ImageMediaIO, MediaConnector, VideoMediaIO
25

26
if TYPE_CHECKING:
27
    import torch.types
28
else:
29
    torch = LazyLoader("torch", globals(), "torch")
30

31
32
33

def encode_audio_base64(
    audio: np.ndarray,
    sampling_rate: int,
    *,
    format: str = "WAV",
) -> str:
    """
    Encode audio as base64.

    Args:
        audio: The audio samples to encode.
        sampling_rate: The sampling rate paired with `audio`.
        format: The audio format, forwarded to the encoder as `audio_format`.

    Returns:
        The string produced by `AudioMediaIO.encode_base64`.
    """
    audio_io = AudioMediaIO()
    # The encoder takes the samples and the sampling rate as one tuple.
    return audio_io.encode_base64((audio, sampling_rate), audio_format=format)


def encode_audio_url(
    audio: np.ndarray,
    sampling_rate: int,
    *,
    format: str = "WAV",
) -> str:
    """
    Build a `data:` URL holding the base64 encoding of the audio.

    The MIME type is looked up from `format` (lower-cased, used as a file
    extension); if the lookup fails, the bare string "audio" is used.
    """
    payload = encode_audio_base64(audio, sampling_rate, format=format)
    extension = f".{format.lower()}"
    mimetype = mimetypes.types_map.get(extension, "audio")
    return f"data:{mimetype};base64,{payload}"
53
54


55
56
57
58
def encode_image_base64(
    image: Image.Image,
    *,
    image_mode: str = "RGB",
    format: str = "PNG",
) -> str:
    """
    Encode a pillow image to base64 format.

    By default, the image is converted into RGB format before being encoded.

    Args:
        image: The pillow image to encode.
        image_mode: The image mode handed to `ImageMediaIO`.
        format: The image format, forwarded to the encoder as `image_format`.

    Returns:
        The string produced by `ImageMediaIO.encode_base64`.
    """
    image_io = ImageMediaIO(image_mode=image_mode)
    return image_io.encode_base64(image, image_format=format)
68
69


70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
def encode_image_url(
    image: Image.Image,
    *,
    image_mode: str = "RGB",
    format: str = "PNG",
) -> str:
    """
    Build a `data:` URL holding the base64 encoding of a pillow image.

    With the default `image_mode`, the image is converted into RGB format
    before being encoded. The MIME type is looked up from `format`
    (lower-cased, used as a file extension); if the lookup fails, the bare
    string "image" is used.
    """
    payload = encode_image_base64(image, image_mode=image_mode, format=format)
    extension = f".{format.lower()}"
    mimetype = mimetypes.types_map.get(extension, "image")
    return f"data:{mimetype};base64,{payload}"


def encode_video_base64(
    frames: npt.NDArray,
    *,
    format: str = "JPEG",
) -> str:
    """
    Encode video frames as base64.

    Args:
        frames: The video frames to encode.
        format: The format forwarded to the encoder as `video_format`
            (presumably the per-frame image format; confirm against
            `VideoMediaIO`).

    Returns:
        The string produced by `VideoMediaIO.encode_base64`.
    """
    # The video encoder is built on top of a default-configured image IO.
    image_io = ImageMediaIO()
    video_io = VideoMediaIO(image_io)
    return video_io.encode_base64(frames, video_format=format)


def encode_video_url(
    frames: npt.NDArray,
    *,
    format: str = "JPEG",
) -> str:
    """
    Build a `data:` URL holding the base64 encoding of the video frames.

    The "jpeg" format (case-insensitive) is always reported as
    "video/jpeg". Any other format is looked up by extension, with the
    bare string "video" used if the lookup fails.
    """
    payload = encode_video_base64(frames, format=format)

    fmt = format.lower()
    mimetype = (
        "video/jpeg"
        if fmt == "jpeg"
        else mimetypes.types_map.get("." + fmt, "video")
    )

    return f"data:{mimetype};base64,{payload}"
109
110


111
def argsort_mm_positions(
    mm_positions: MultiModalPlaceholderDict,
) -> list[tuple[str, int]]:
    """
    Given a `MultiModalPlaceholderDict`, output a sequence of keys to
    sort the dictionary by `offset` (starting index in the input sequence)
    in ascending order.

    Args:
        mm_positions: Mapping from modality to a sequence of items, each
            exposing an `offset` attribute.

    Returns:
        A list of `(modality, idx)`, which can be used to access an item
        by `mm_positions[modality][idx]`.
    """
    flat_items = [
        (modality, idx, item.offset)
        for modality, items in mm_positions.items()
        for idx, item in enumerate(items)
    ]

    # Sort on the offset only: the sort is stable, so items sharing an
    # offset keep their original iteration order.
    flat_items.sort(key=lambda entry: entry[2])

    return [(modality, idx) for modality, idx, _ in flat_items]
132
133


134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
def _get_group_hash(elem: MultiModalFieldElem):
    """
    Return a hash of `elem.data` when the element's field is a
    `MultiModalSharedField`, and `None` for every other field type.
    """
    if isinstance(elem.field, MultiModalSharedField):
        return MultiModalHasher.hash_kwargs(data=elem.data)

    return None


def _batch_mm_items(
    items: Sequence[MultiModalKwargsItem],
    *,
    device: torch.types.Device = None,
    pin_memory: bool = False,
):
    """
    Collect the elements of all `items` per key and reduce each collection
    into a single batched value.

    The reduction for a key is delegated to the `field` of the first
    element collected under that key.
    """
    elems_by_key = defaultdict[str, list[MultiModalFieldElem]](list)
    for item in items:
        for key, elem in item.items():
            elems_by_key[key].append(elem)

    batched = {}
    for key, key_elems in elems_by_key.items():
        reducer = key_elems[0].field
        batched[key] = reducer.reduce_data(
            key_elems,
            device=device,
            pin_memory=pin_memory,
        )

    return batched


def group_and_batch_mm_items(
    items: Sequence[MultiModalKwargsItem],
    *,
    device: torch.types.Device = None,
    pin_memory: bool = False,
) -> Generator[tuple[int, BatchedTensorInputs]]:
    """
    Batch runs of consecutive, mutually compatible items (which may come
    from different requests).

    A new group is started whenever the next item would make the batch
    invalid, i.e. when:
    - Its set of fields differs (e.g. mixed image and embedding inputs).
    - Its values in `MultiModalSharedField` differ.

    Args:
        items: List of `MultiModalKwargsItem`.
        device: The device to place the grouped tensors on.
        pin_memory: Whether to pin memory for faster host-to-device transfer.

    Yields:
        A tuple `(num_items, grouped_kwargs)`, where:
        - `num_items` is the number of items in the group;
        - `grouped_kwargs` is a dictionary of keyword arguments to pass
          to the model.
    """
    # One signature per item: its (key, shared-field hash) pairs in key
    # order. Adjacent items with equal signatures can share a batch.
    signatures = []
    for item in items:
        ordered_fields = sorted(item.items(), key=lambda kv: kv[0])
        signatures.append(
            tuple((key, _get_group_hash(elem)) for key, elem in ordered_fields)
        )

    run_lengths = [len(list(run)) for _, run in groupby(signatures)]

    offset = 0
    for run_length in run_lengths:
        next_offset = offset + run_length
        batch = _batch_mm_items(
            items[offset:next_offset],
            device=device,
            pin_memory=pin_memory,
        )

        yield run_length, batch

        offset = next_offset

    assert offset == len(items)


210
def group_mm_kwargs_by_modality(
    mm_kwargs: list[tuple[str, MultiModalKwargsItem]],
    *,
    device: torch.types.Device = None,
    pin_memory: bool = False,
) -> Generator[tuple[str, int, BatchedTensorInputs], None, None]:
    """
    Group consecutive items (possibly from different requests) into batches.

    Items must be split across groups if any of the following occurs,
    as the batch would otherwise be invalid:
    - They have different fields (e.g. mixed image and embedding inputs).
    - They have different values in `MultiModalSharedField`.

    To simplify the implementation of `embed_multimodal`, we add another
    restriction that the items in a batch must belong to the same modality.

    Args:
        mm_kwargs: List of `(modality, item)`.
        device: The device to place the grouped tensors on.
        pin_memory: Whether to pin memory for faster host-to-device transfer.

    Yields:
        A tuple `(modality, num_items, grouped_kwargs)`, where:
        - `modality` is the modality of the batch;
        - `num_items` is the corresponding number of items;
        - `grouped_kwargs` is a dictionary of keyword arguments to pass
          to the model.
    """
    # `groupby` only merges adjacent entries, so the input order is kept
    # and a modality that reappears later starts a new group.
    for modality, group in groupby(mm_kwargs, key=lambda x: x[0]):
        items_lst = [item for _, item in group]

        for num_items, mm_kwargs_batch in group_and_batch_mm_items(
            items_lst,
            device=device,
            pin_memory=pin_memory,
        ):
            yield modality, num_items, mm_kwargs_batch
247
248


249
250
def fetch_audio(
    audio_url: str,
    audio_io_kwargs: dict[str, Any] | None = None,
) -> tuple[np.ndarray, int | float]:
    """
    Fetch audio from a URL through a `MediaConnector`.

    Args:
        audio_url: URL of the audio file to fetch.
        audio_io_kwargs: Additional kwargs passed to handle audio IO.

    Returns:
        The result of `MediaConnector.fetch_audio` for `audio_url`
        (annotated as an array plus a number, presumably the sampling
        rate; confirm against `MediaConnector`).

    Warning:
        This method has direct access to local files and is only intended
        to be called by user code. Never call this from the online server!
    """
    # A missing or empty kwargs dict is forwarded as `None` instead of an
    # empty per-modality entry.
    media_io_kwargs = {"audio": audio_io_kwargs} if audio_io_kwargs else None

    media_connector = MediaConnector(
        media_io_kwargs=media_io_kwargs,
        # NOTE(review): "/" presumably allows any local path; this is
        # what the warning above refers to.
        allowed_local_media_path="/",
    )
    return media_connector.fetch_audio(audio_url)


def fetch_image(
    image_url: str,
    image_io_kwargs: dict[str, Any] | None = None,
) -> Image.Image:
    """
    Fetch an image from a URL through a `MediaConnector`.

    Args:
        image_url: URL of the image file to fetch.
        image_io_kwargs: Additional kwargs passed to handle image IO.

    Returns:
        The result of `MediaConnector.fetch_image` for `image_url`.

    Warning:
        This method has direct access to local files and is only intended
        to be called by user code. Never call this from the online server!
    """
    # A missing or empty kwargs dict is forwarded as `None` instead of an
    # empty per-modality entry.
    media_io_kwargs = {"image": image_io_kwargs} if image_io_kwargs else None

    media_connector = MediaConnector(
        media_io_kwargs=media_io_kwargs,
        # NOTE(review): "/" presumably allows any local path; this is
        # what the warning above refers to.
        allowed_local_media_path="/",
    )
    return media_connector.fetch_image(image_url)


def fetch_video(
    video_url: str,
    video_io_kwargs: dict[str, Any] | None = None,
) -> tuple[npt.NDArray, dict[str, Any]]:
    """
    Fetch a video from a URL through a `MediaConnector`.

    Args:
        video_url: URL of the video file to fetch.
        video_io_kwargs: Additional kwargs passed to handle video IO.

    Returns:
        The result of `MediaConnector.fetch_video` for `video_url`
        (annotated as an array plus a dictionary, presumably the frames
        and their metadata; confirm against `MediaConnector`).

    Warning:
        This method has direct access to local files and is only intended
        to be called by user code. Never call this from the online server!
    """
    # A missing or empty kwargs dict is forwarded as `None` instead of an
    # empty per-modality entry.
    media_io_kwargs = {"video": video_io_kwargs} if video_io_kwargs else None

    media_connector = MediaConnector(
        media_io_kwargs=media_io_kwargs,
        # NOTE(review): "/" presumably allows any local path; this is
        # what the warning above refers to.
        allowed_local_media_path="/",
    )
    return media_connector.fetch_video(video_url)