Unverified commit 9ae2f603, authored by Cyrus Leung, committed by GitHub
Browse files

[Misc] Various cleanups for MM input processing (#29970)


Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
parent 80f8af4b
......@@ -795,14 +795,12 @@ The following example demonstrates how to pass image embeddings to the OpenAI se
??? code
```python
from vllm.utils.serial_utils import tensor2base64
image_embedding = torch.load(...)
grid_thw = torch.load(...) # Required by Qwen/Qwen2-VL-2B-Instruct
buffer = io.BytesIO()
torch.save(image_embedding, buffer)
buffer.seek(0)
binary_data = buffer.read()
base64_image_embedding = base64.b64encode(binary_data).decode('utf-8')
base64_image_embedding = tensor2base64(image_embedding)
client = OpenAI(
# defaults to os.environ.get("OPENAI_API_KEY")
......
......@@ -28,13 +28,11 @@ Dependencies:
- openai
"""
import base64
import io
import torch
import transformers
from openai import OpenAI
from vllm.utils.serial_utils import tensor2base64
def main():
client = OpenAI(
......@@ -58,11 +56,7 @@ def main():
prompt_embeds = embedding_layer(token_ids).squeeze(0)
# Prompt embeddings
buffer = io.BytesIO()
torch.save(prompt_embeds, buffer)
buffer.seek(0)
binary_data = buffer.read()
encoded_embeds = base64.b64encode(binary_data).decode("utf-8")
encoded_embeds = tensor2base64(prompt_embeds)
completion = client.completions.create(
model=model_name,
......
......@@ -2,64 +2,47 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import base64
import io
import numpy as np
import pytest
import requests
import torch
from ...utils import RemoteOpenAIServer
from vllm.utils.serial_utils import tensor2base64
MODEL_NAME = "ibm-nasa-geospatial/Prithvi-EO-2.0-300M-TL-Sen1Floods11"
DTYPE = "float16"
from ...utils import RemoteOpenAIServer
def _terratorch_dummy_messages():
    """Build the chat messages for a dummy image-embeddings request.

    Returns:
        A list with one ``"user"`` message whose content is a single
        ``"image_embeds"`` part. The part carries two float16 tensors filled
        with 1.0, ``pixel_values`` of shape (6, 512, 512) and
        ``location_coords`` of shape (1, 2), each encoded with
        ``tensor2base64``.
    """
    pixel_values = torch.full((6, 512, 512), 1.0, dtype=torch.float16)
    location_coords = torch.full((1, 2), 1.0, dtype=torch.float16)

    return [
        {
            "role": "user",
            "content": [
                {
                    "type": "image_embeds",
                    "image_embeds": {
                        "pixel_values": tensor2base64(pixel_values),
                        "location_coords": tensor2base64(location_coords),
                    },
                }
            ],
        }
    ]


def _terratorch_dummy_inputs(model_name: str):
    """Build a full pooling request payload around the dummy messages.

    Kept so existing callers that pass the whole payload as ``json=`` keep
    working; the message construction lives in
    ``_terratorch_dummy_messages``.

    Args:
        model_name: Value placed under the ``"model"`` key.

    Returns:
        A dict with the keys ``"model"``, ``"additional_data"``,
        ``"encoding_format"`` and ``"messages"``.
    """
    return {
        "model": model_name,
        "additional_data": {"prompt_token_ids": [1]},
        "encoding_format": "base64",
        "messages": _terratorch_dummy_messages(),
    }
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_single_request(model_name: str):
@pytest.mark.parametrize(
"model_name", ["ibm-nasa-geospatial/Prithvi-EO-2.0-300M-TL-Sen1Floods11"]
)
def test_single_request(model_name: str):
args = [
"--runner",
"pooling",
# use half precision for speed and memory savings in CI environment
"--dtype",
DTYPE,
"float16",
"--enforce-eager",
"--trust-remote-code",
"--max-num-seqs",
......@@ -70,11 +53,15 @@ async def test_single_request(model_name: str):
"--enable-mm-embeds",
]
with RemoteOpenAIServer(MODEL_NAME, args) as server:
prompt = _terratorch_dummy_inputs(model_name)
# test single pooling
response = requests.post(server.url_for("pooling"), json=prompt)
with RemoteOpenAIServer(model_name, args) as server:
response = requests.post(
server.url_for("pooling"),
json={
"model": model_name,
"messages": _terratorch_dummy_messages(),
"encoding_format": "base64",
},
)
response.raise_for_status()
output = response.json()["data"][0]["data"]
......
This diff is collapsed.
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import base64
import io
import json
import openai # use the official client for correctness check
......@@ -13,6 +11,7 @@ from transformers import AutoConfig
from tests.conftest import ImageTestAssets
from tests.utils import RemoteOpenAIServer
from vllm.utils.serial_utils import tensor2base64
# any model with a chat template should work here
MODEL_NAME = "llava-hf/llava-1.5-7b-hf"
......@@ -50,18 +49,6 @@ async def client_with_image_embeds(server_with_image_embeds):
yield async_client
def encode_image_embedding_to_base64(image_embedding) -> str:
    """Encode an image embedding to a base64 string.

    Args:
        image_embedding: The object to serialize with ``torch.save``
            (the tests in this file pass a tensor).

    Returns:
        The ``torch.save`` payload, base64-encoded and decoded as UTF-8 text.
    """
    # The context manager releases the in-memory buffer as soon as the
    # payload has been copied out.
    with io.BytesIO() as buffer:
        torch.save(image_embedding, buffer)
        # getvalue() ignores the stream position, so no seek(0) is needed.
        binary_data = buffer.getvalue()

    return base64.b64encode(binary_data).decode("utf-8")
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("dtype", [torch.half, torch.float16, torch.float32])
......@@ -73,7 +60,7 @@ async def test_completions_with_image_embeds(
):
# Test case: Single image embeds input
image_embeds = image_assets[0].image_embeds.to(dtype=dtype)
base64_image_embedding = encode_image_embedding_to_base64(image_embeds)
base64_image_embedding = tensor2base64(image_embeds)
chat_completion = await client_with_image_embeds.chat.completions.create(
messages=[
{"role": "system", "content": "You are a helpful assistant."},
......
......@@ -536,7 +536,7 @@ def resolve_hf_chat_template(
def _resolve_chat_template_content_format(
chat_template: str | None,
tools: list[dict[str, Any]] | None,
tokenizer: TokenizerLike,
tokenizer: TokenizerLike | None,
*,
model_config: ModelConfig,
) -> _ChatTemplateContentFormat:
......@@ -593,7 +593,7 @@ def resolve_chat_template_content_format(
chat_template: str | None,
tools: list[dict[str, Any]] | None,
given_format: ChatTemplateContentFormatOption,
tokenizer: TokenizerLike,
tokenizer: TokenizerLike | None,
*,
model_config: ModelConfig,
) -> _ChatTemplateContentFormat:
......@@ -627,11 +627,10 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]):
maximum per prompt.
"""
def __init__(self, model_config: ModelConfig, tokenizer: TokenizerLike):
def __init__(self, model_config: ModelConfig):
super().__init__()
self._model_config = model_config
self._tokenizer = tokenizer
self._items_by_modality = defaultdict[str, list[_T | None]](list)
self._uuids_by_modality = defaultdict[str, list[str | None]](list)
......@@ -1612,7 +1611,6 @@ def _postprocess_messages(messages: list[ConversationMessage]) -> None:
def parse_chat_messages(
messages: list[ChatCompletionMessageParam],
model_config: ModelConfig,
tokenizer: TokenizerLike,
content_format: _ChatTemplateContentFormat,
) -> tuple[
list[ConversationMessage],
......@@ -1620,7 +1618,7 @@ def parse_chat_messages(
MultiModalUUIDDict | None,
]:
conversation: list[ConversationMessage] = []
mm_tracker = MultiModalItemTracker(model_config, tokenizer)
mm_tracker = MultiModalItemTracker(model_config)
for msg in messages:
sub_messages = _parse_chat_message_content(
......@@ -1644,7 +1642,6 @@ def parse_chat_messages(
def parse_chat_messages_futures(
messages: list[ChatCompletionMessageParam],
model_config: ModelConfig,
tokenizer: TokenizerLike,
content_format: _ChatTemplateContentFormat,
) -> tuple[
list[ConversationMessage],
......@@ -1652,7 +1649,7 @@ def parse_chat_messages_futures(
MultiModalUUIDDict | None,
]:
conversation: list[ConversationMessage] = []
mm_tracker = AsyncMultiModalItemTracker(model_config, tokenizer)
mm_tracker = AsyncMultiModalItemTracker(model_config)
for msg in messages:
sub_messages = _parse_chat_message_content(
......
......@@ -834,7 +834,6 @@ class LLM:
conversation, mm_data, mm_uuids = parse_chat_messages(
msgs,
model_config,
tokenizer,
content_format=resolved_content_format,
)
......
......@@ -1088,11 +1088,6 @@ class OpenAIServing:
Sequence[RequestPrompt],
list[EngineTokensPrompt],
]:
if tokenizer is None:
raise ValueError(
"Unable to get tokenizer because `skip_tokenizer_init=True`"
)
model_config = self.model_config
resolved_content_format = resolve_chat_template_content_format(
......@@ -1105,7 +1100,6 @@ class OpenAIServing:
conversation, mm_data_future, mm_uuids = parse_chat_messages_futures(
messages,
model_config,
tokenizer,
content_format=resolved_content_format,
)
......
......@@ -89,12 +89,10 @@ def parse_score_data(
data_1: str | ScoreContentPartParam,
data_2: str | ScoreContentPartParam,
model_config: ModelConfig,
tokenizer: TokenizerLike,
) -> tuple[str, str, MultiModalDataDict | None]:
mm_tracker = MultiModalItemTracker(model_config, tokenizer)
mm_tracker = MultiModalItemTracker(model_config)
content_1 = _parse_score_content(data_1, mm_tracker)
content_2 = _parse_score_content(data_2, mm_tracker)
def ensure_str(content: _ContentPart | None) -> str:
......@@ -188,7 +186,6 @@ def get_score_prompt(
data_1,
data_2,
model_config,
tokenizer,
)
from vllm.model_executor.model_loader import get_model_cls
......
......@@ -62,6 +62,7 @@ from vllm.multimodal.inputs import (
from vllm.multimodal.parse import (
DictEmbeddingItems,
ImageSize,
ModalityDataItems,
MultiModalDataItems,
MultiModalDataParser,
)
......@@ -570,7 +571,7 @@ class HunYuanVLMultiModalDataParser(MultiModalDataParser):
def _parse_image_data(
self,
data: dict[str, torch.Tensor] | ModalityData[ImageItem],
):
) -> ModalityDataItems[Any, Any] | None:
if isinstance(data, dict):
return DictEmbeddingItems(
data,
......
......@@ -1000,7 +1000,7 @@ class KeyeMultiModalDataParser(MultiModalDataParser):
def _parse_image_data(
self,
data: dict[str, torch.Tensor] | ModalityData[ImageItem],
) -> ModalityDataItems[Any, Any]:
) -> ModalityDataItems[Any, Any] | None:
if isinstance(data, dict):
return DictEmbeddingItems(
data,
......@@ -1017,7 +1017,7 @@ class KeyeMultiModalDataParser(MultiModalDataParser):
def _parse_video_data(
self,
data: dict[str, torch.Tensor] | ModalityData[VideoItem],
) -> ModalityDataItems[Any, Any]:
) -> ModalityDataItems[Any, Any] | None:
if isinstance(data, dict):
return DictEmbeddingItems(
data,
......
......@@ -333,7 +333,7 @@ class KeyeVL1_5MultiModalDataParser(MultiModalDataParser):
def _parse_image_data(
self,
data: dict[str, torch.Tensor] | ModalityData[ImageItem],
) -> ModalityDataItems[Any, Any]:
) -> ModalityDataItems[Any, Any] | None:
if isinstance(data, dict):
return DictEmbeddingItems(
data,
......@@ -350,7 +350,7 @@ class KeyeVL1_5MultiModalDataParser(MultiModalDataParser):
def _parse_video_data(
self,
data: dict[str, torch.Tensor] | ModalityData[VideoItem],
) -> ModalityDataItems[Any, Any]:
) -> ModalityDataItems[Any, Any] | None:
if isinstance(data, dict):
return DictEmbeddingItems(
data,
......
......@@ -11,6 +11,7 @@ import pybase64
import torch
from vllm.utils.import_utils import PlaceholderModule
from vllm.utils.serial_utils import tensor2base64
from .base import MediaIO
......@@ -135,8 +136,4 @@ class AudioEmbeddingMediaIO(MediaIO[torch.Tensor]):
return torch.load(filepath, weights_only=True)
def encode_base64(self, media: torch.Tensor) -> str:
    """Return ``media`` encoded as a base64 string.

    Args:
        media: The tensor to encode.

    Returns:
        The string produced by ``tensor2base64(media)``.
    """
    # Delegate to the shared helper instead of repeating the
    # torch.save -> BytesIO -> base64 steps in this method.
    return tensor2base64(media)
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import base64
import io
import sys
from dataclasses import dataclass
from typing import Literal
......@@ -52,6 +53,15 @@ Endianness = Literal["native", "big", "little"]
EncodingFormat = Literal["float", "base64", "bytes"]
def tensor2base64(x: torch.Tensor) -> str:
    """Serialize a tensor with ``torch.save`` and return it as base64 text.

    Args:
        x: The tensor to serialize.

    Returns:
        The ``torch.save`` payload of ``x``, base64-encoded and decoded as a
        UTF-8 ``str``.
    """
    with io.BytesIO() as buf:
        torch.save(x, buf)
        # getvalue() returns the entire buffer regardless of the current
        # stream position, so rewinding with seek(0) is unnecessary.
        binary_data = buf.getvalue()

    return base64.b64encode(binary_data).decode("utf-8")
def tensor2binary(
tensor: torch.Tensor, embed_dtype: EmbedDType, endianness: Endianness
) -> bytes:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment