# SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import asyncio import json import warnings from abc import ABC, abstractmethod from collections import Counter, defaultdict from collections.abc import Awaitable, Callable, Iterable from functools import cached_property, lru_cache, partial from pathlib import Path from typing import TYPE_CHECKING, Any, Generic, Literal, TypeAlias, TypeVar, cast from openai.types.chat import ( ChatCompletionAssistantMessageParam, ChatCompletionContentPartImageParam, ChatCompletionContentPartInputAudioParam, ChatCompletionContentPartRefusalParam, ChatCompletionContentPartTextParam, ChatCompletionFunctionToolParam, ChatCompletionMessageToolCallParam, ChatCompletionToolMessageParam, ) from openai.types.chat import ( ChatCompletionContentPartParam as OpenAIChatCompletionContentPartParam, ) from openai.types.chat import ( ChatCompletionMessageParam as OpenAIChatCompletionMessageParam, ) from openai.types.chat.chat_completion_content_part_input_audio_param import InputAudio from openai.types.responses import ResponseInputImageParam from openai_harmony import Message as OpenAIHarmonyMessage from PIL import Image from pydantic import BaseModel, ConfigDict, TypeAdapter # pydantic needs the TypedDict from typing_extensions from typing_extensions import Required, TypedDict from vllm import envs from vllm.config import ModelConfig from vllm.logger import init_logger from vllm.model_executor.models import SupportsMultiModal from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalDataDict, MultiModalUUIDDict from vllm.multimodal.utils import MEDIA_CONNECTOR_REGISTRY, MediaConnector from vllm.utils import random_uuid from vllm.utils.collection_utils import is_list_of from vllm.utils.import_utils import LazyLoader if TYPE_CHECKING: import torch else: torch = LazyLoader("torch", globals(), "torch") logger = init_logger(__name__) def __getattr__(name: str): if name == "resolve_hf_chat_template": from vllm.renderers.hf import resolve_chat_template warnings.warn( "`vllm.entrypoints.chat_utils.resolve_hf_chat_template` has been moved to " "`vllm.renderers.hf.resolve_chat_template`. " "The old name will be removed in v0.16.", DeprecationWarning, stacklevel=2, ) return resolve_chat_template raise AttributeError(f"module {__name__!r} has no attribute {name!r}") class ChatTemplateResolutionError(ValueError): """Raised when chat template resolution fails. This is a subclass of ValueError for backward compatibility with existing exception handlers. """ MODALITY_PLACEHOLDERS_MAP = { "image": "<##IMAGE##>", "audio": "<##AUDIO##>", "video": "<##VIDEO##>", } class AudioURL(TypedDict, total=False): url: Required[str] """ Either a URL of the audio or a data URL with base64 encoded audio data. """ class ChatCompletionContentPartAudioParam(TypedDict, total=False): audio_url: Required[AudioURL] type: Required[Literal["audio_url"]] """The type of the content part.""" class ChatCompletionContentPartImageEmbedsParam(TypedDict, total=False): image_embeds: str | dict[str, str] | None """ The image embeddings. It can be either: - A single base64 string. - A dictionary where each value is a base64 string. """ type: Required[Literal["image_embeds"]] """The type of the content part.""" uuid: str | None """ User-provided UUID of a media. User must guarantee that it is properly generated and unique for different medias. """ class ChatCompletionContentPartAudioEmbedsParam(TypedDict, total=False): audio_embeds: str | dict[str, str] | None """ The audio embeddings. It can be either: - A single base64 string representing a serialized torch tensor. - A dictionary where each value is a base64 string. """ type: Required[Literal["audio_embeds"]] """The type of the content part.""" uuid: str | None """ User-provided UUID of a media. User must guarantee that it is properly generated and unique for different medias. """ class VideoURL(TypedDict, total=False): url: Required[str] """ Either a URL of the video or a data URL with base64 encoded video data. """ class ChatCompletionContentPartVideoParam(TypedDict, total=False): video_url: Required[VideoURL] type: Required[Literal["video_url"]] """The type of the content part.""" class PILImage(BaseModel): """ A PIL.Image.Image object. """ image_pil: Image.Image model_config = ConfigDict(arbitrary_types_allowed=True) class CustomChatCompletionContentPILImageParam(TypedDict, total=False): """A simpler version of the param that only accepts a PIL image. Example: { "image_pil": ImageAsset('cherry_blossom').pil_image } """ image_pil: PILImage | None uuid: str | None """ User-provided UUID of a media. User must guarantee that it is properly generated and unique for different medias. """ class CustomChatCompletionContentSimpleImageParam(TypedDict, total=False): """A simpler version of the param that only accepts a plain image_url. This is supported by OpenAI API, although it is not documented. Example: { "image_url": "https://example.com/image.jpg" } """ image_url: str | None uuid: str | None """ User-provided UUID of a media. User must guarantee that it is properly generated and unique for different medias. """ class CustomChatCompletionContentSimpleAudioParam(TypedDict, total=False): """A simpler version of the param that only accepts a plain audio_url. Example: { "audio_url": "https://example.com/audio.mp3" } """ audio_url: str | None class CustomChatCompletionContentSimpleVideoParam(TypedDict, total=False): """A simpler version of the param that only accepts a plain audio_url. Example: { "video_url": "https://example.com/video.mp4" } """ video_url: str | None uuid: str | None """ User-provided UUID of a media. User must guarantee that it is properly generated and unique for different medias. """ class CustomThinkCompletionContentParam(TypedDict, total=False): """A Think Completion Content Param that accepts a plain text and a boolean. Example: { "thinking": "I am thinking about the answer", "closed": True, "type": "thinking" } """ thinking: Required[str] """The thinking content.""" closed: bool """Whether the thinking is closed.""" type: Required[Literal["thinking"]] """The thinking type.""" ChatCompletionContentPartParam: TypeAlias = ( OpenAIChatCompletionContentPartParam | ChatCompletionContentPartAudioParam | ChatCompletionContentPartInputAudioParam | ChatCompletionContentPartVideoParam | ChatCompletionContentPartRefusalParam | CustomChatCompletionContentPILImageParam | CustomChatCompletionContentSimpleImageParam | ChatCompletionContentPartImageEmbedsParam | ChatCompletionContentPartAudioEmbedsParam | CustomChatCompletionContentSimpleAudioParam | CustomChatCompletionContentSimpleVideoParam | str | CustomThinkCompletionContentParam ) class CustomChatCompletionMessageParam(TypedDict, total=False): """Enables custom roles in the Chat Completion API.""" role: Required[str] """The role of the message's author.""" content: str | list[ChatCompletionContentPartParam] """The contents of the message.""" name: str """An optional name for the participant. Provides the model information to differentiate between participants of the same role. """ tool_call_id: str | None """Tool call that this message is responding to.""" tool_calls: Iterable[ChatCompletionMessageToolCallParam] | None """The tool calls generated by the model, such as function calls.""" reasoning: str | None """The reasoning content for interleaved thinking.""" tools: list[ChatCompletionFunctionToolParam] | None """The tools for developer role.""" ChatCompletionMessageParam: TypeAlias = ( OpenAIChatCompletionMessageParam | CustomChatCompletionMessageParam | OpenAIHarmonyMessage ) # TODO: Make fields ReadOnly once mypy supports it class ConversationMessage(TypedDict, total=False): role: Required[str] """The role of the message's author.""" content: str | None | list[dict[str, str]] """The contents of the message""" tool_call_id: str | None """Tool call that this message is responding to.""" name: str | None """The name of the function to call""" tool_calls: Iterable[ChatCompletionMessageToolCallParam] | None """The tool calls generated by the model, such as function calls.""" reasoning: str | None """The reasoning content for interleaved thinking.""" reasoning_content: str | None """Deprecated: The reasoning content for interleaved thinking.""" tools: list[ChatCompletionFunctionToolParam] | None """The tools for developer role.""" # Passed in by user ChatTemplateContentFormatOption = Literal["auto", "string", "openai"] # After resolving "auto" ChatTemplateContentFormat = Literal["string", "openai"] ModalityStr = Literal["image", "audio", "video", "image_embeds", "audio_embeds"] _T = TypeVar("_T") def _extract_embeds(tensors: list[torch.Tensor]): if len(tensors) == 0: return tensors if len(tensors) == 1: tensors[0]._is_single_item = True # type: ignore return tensors[0] # To keep backwards compatibility for single item input first_shape = tensors[0].shape if all(t.shape == first_shape for t in tensors): return torch.stack(tensors) return tensors def _get_embeds_data(items_by_modality: dict[str, list[Any]], modality: str): embeds_key = f"{modality}_embeds" embeds = items_by_modality[embeds_key] if len(embeds) == 0: return embeds if is_list_of(embeds, torch.Tensor): return _extract_embeds(embeds) if is_list_of(embeds, dict): if not embeds: return {} first_keys = set(embeds[0].keys()) if any(set(item.keys()) != first_keys for item in embeds[1:]): raise ValueError( "All dictionaries in the list of embeddings must have the same keys." ) return {k: _extract_embeds([item[k] for item in embeds]) for k in first_keys} return embeds class BaseMultiModalItemTracker(ABC, Generic[_T]): """ Tracks multi-modal items in a given request and ensures that the number of multi-modal items in a given request does not exceed the configured maximum per prompt. """ def __init__(self, model_config: ModelConfig): super().__init__() self._model_config = model_config self._items_by_modality = defaultdict[str, list[_T | None]](list) self._uuids_by_modality = defaultdict[str, list[str | None]](list) @property def model_config(self) -> ModelConfig: return self._model_config @cached_property def model_cls(self) -> type[SupportsMultiModal]: from vllm.model_executor.model_loader import get_model_cls model_cls = get_model_cls(self.model_config) return cast(type[SupportsMultiModal], model_cls) @property def allowed_local_media_path(self): return self._model_config.allowed_local_media_path @property def allowed_media_domains(self): return self._model_config.allowed_media_domains @property def mm_registry(self): return MULTIMODAL_REGISTRY @cached_property def mm_processor(self): return self.mm_registry.create_processor(self.model_config) def add( self, modality: ModalityStr, item: _T | None, uuid: str | None = None, ) -> str | None: """ Add a multi-modal item to the current prompt and returns the placeholder string to use, if any. An optional uuid can be added which serves as a unique identifier of the media. """ input_modality = modality.replace("_embeds", "") num_items = len(self._items_by_modality[modality]) + 1 self.mm_processor.validate_num_items(input_modality, num_items) self._items_by_modality[modality].append(item) self._uuids_by_modality[modality].append(uuid) return self.model_cls.get_placeholder_str(modality, num_items) def all_mm_uuids(self) -> MultiModalUUIDDict | None: if not self._items_by_modality: return None uuids_by_modality = dict(self._uuids_by_modality) if "image" in uuids_by_modality and "image_embeds" in uuids_by_modality: raise ValueError("Mixing raw image and embedding inputs is not allowed") if "audio" in uuids_by_modality and "audio_embeds" in uuids_by_modality: raise ValueError("Mixing raw audio and embedding inputs is not allowed") mm_uuids = {} if "image_embeds" in uuids_by_modality: mm_uuids["image"] = uuids_by_modality["image_embeds"] if "image" in uuids_by_modality: mm_uuids["image"] = uuids_by_modality["image"] # UUIDs of images if "audio_embeds" in uuids_by_modality: mm_uuids["audio"] = uuids_by_modality["audio_embeds"] if "audio" in uuids_by_modality: mm_uuids["audio"] = uuids_by_modality["audio"] # UUIDs of audios if "video" in uuids_by_modality: mm_uuids["video"] = uuids_by_modality["video"] # UUIDs of videos return mm_uuids @abstractmethod def create_parser(self) -> "BaseMultiModalContentParser": raise NotImplementedError class MultiModalItemTracker(BaseMultiModalItemTracker[object]): def all_mm_data(self) -> MultiModalDataDict | None: if not self._items_by_modality: return None items_by_modality = dict(self._items_by_modality) if "image" in items_by_modality and "image_embeds" in items_by_modality: raise ValueError("Mixing raw image and embedding inputs is not allowed") if "audio" in items_by_modality and "audio_embeds" in items_by_modality: raise ValueError("Mixing raw audio and embedding inputs is not allowed") mm_inputs = {} if "image_embeds" in items_by_modality: mm_inputs["image"] = _get_embeds_data(items_by_modality, "image") if "image" in items_by_modality: mm_inputs["image"] = items_by_modality["image"] # A list of images if "audio_embeds" in items_by_modality: mm_inputs["audio"] = _get_embeds_data(items_by_modality, "audio") if "audio" in items_by_modality: mm_inputs["audio"] = items_by_modality["audio"] # A list of audios if "video" in items_by_modality: mm_inputs["video"] = items_by_modality["video"] # A list of videos return mm_inputs def create_parser(self) -> "BaseMultiModalContentParser": return MultiModalContentParser(self) class AsyncMultiModalItemTracker(BaseMultiModalItemTracker[Awaitable[object]]): async def all_mm_data(self) -> MultiModalDataDict | None: if not self._items_by_modality: return None coros_by_modality = { modality: [item or asyncio.sleep(0) for item in items] for modality, items in self._items_by_modality.items() } items_by_modality: dict[str, list[object | None]] = { modality: await asyncio.gather(*coros) for modality, coros in coros_by_modality.items() } if "image" in items_by_modality and "image_embeds" in items_by_modality: raise ValueError("Mixing raw image and embedding inputs is not allowed") if "audio" in items_by_modality and "audio_embeds" in items_by_modality: raise ValueError("Mixing raw audio and embedding inputs is not allowed") mm_inputs = {} if "image_embeds" in items_by_modality: mm_inputs["image"] = _get_embeds_data(items_by_modality, "image") if "image" in items_by_modality: mm_inputs["image"] = items_by_modality["image"] # A list of images if "audio_embeds" in items_by_modality: mm_inputs["audio"] = _get_embeds_data(items_by_modality, "audio") if "audio" in items_by_modality: mm_inputs["audio"] = items_by_modality["audio"] # A list of audios if "video" in items_by_modality: mm_inputs["video"] = items_by_modality["video"] # A list of videos return mm_inputs def create_parser(self) -> "BaseMultiModalContentParser": return AsyncMultiModalContentParser(self) class BaseMultiModalContentParser(ABC): def __init__(self) -> None: super().__init__() # stores model placeholders list with corresponding # general MM placeholder: # { # "<##IMAGE##>": ["", "", ""], # "<##AUDIO##>": ["