"vscode:/vscode.git/clone" did not exist on "4078052f09f42f898b542e18d60d15a43db67a8b"
Unverified commit ba2f0acc, authored by Cyrus Leung and committed by GitHub
Browse files

[Misc] Reorganize inputs (#35182)


Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
parent 678b3c99
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from .data import (
from .engine import (
DecoderOnlyEngineInput,
EmbedsInput,
EncoderDecoderInput,
EngineInput,
MultiModalEncDecInput,
MultiModalHashes,
MultiModalInput,
MultiModalPlaceholders,
SingletonInput,
TokensInput,
build_enc_dec_input,
embeds_input,
mm_enc_dec_input,
mm_input,
split_enc_dec_input,
tokens_input,
)
from .llm import (
DataPrompt,
DecoderOnlyInputs,
EmbedsInputs,
EmbedsPrompt,
EncoderDecoderInputs,
ExplicitEncoderDecoderPrompt,
ProcessorInputs,
ModalityData,
MultiModalDataBuiltins,
MultiModalDataDict,
MultiModalUUIDDict,
PromptType,
SingletonInputs,
SingletonPrompt,
TextPrompt,
TokenInputs,
TokensPrompt,
embeds_inputs,
token_inputs,
)
__all__ = [
"ModalityData",
"MultiModalDataBuiltins",
"MultiModalDataDict",
"MultiModalUUIDDict",
"DataPrompt",
"TextPrompt",
"TokensPrompt",
"PromptType",
"SingletonPrompt",
"ExplicitEncoderDecoderPrompt",
"TokenInputs",
"EmbedsInputs",
"EmbedsPrompt",
"token_inputs",
"embeds_inputs",
"DecoderOnlyInputs",
"EncoderDecoderInputs",
"ProcessorInputs",
"SingletonInputs",
"MultiModalHashes",
"MultiModalPlaceholders",
"TokensInput",
"EmbedsInput",
"MultiModalInput",
"MultiModalEncDecInput",
"tokens_input",
"embeds_input",
"mm_input",
"mm_enc_dec_input",
"build_enc_dec_input",
"split_enc_dec_input",
"DecoderOnlyEngineInput",
"EncoderDecoderInput",
"SingletonInput",
"EngineInput",
]
"""Schema and utilities for inputs to the engine client (`LLMEngine`/`AsyncLLM`)."""
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from collections.abc import Mapping, Sequence
from typing import TYPE_CHECKING, Literal, TypeAlias
from typing_extensions import NotRequired, TypedDict, assert_never
if TYPE_CHECKING:
import torch
from vllm.multimodal.inputs import MultiModalKwargsOptionalItems, PlaceholderRange
class _InputOptions(TypedDict):
"""
Additional options available to all
[`SingletonInput`][vllm.inputs.engine.SingletonInput] types.
"""
arrival_time: NotRequired[float]
"""The time when the input was received (before rendering)."""
cache_salt: NotRequired[str]
"""Optional cache salt to be used for prefix caching."""
class TokensInput(_InputOptions):
    """Engine input schema for a prompt given as token IDs."""

    type: Literal["token"]
    """Discriminator identifying this schema."""

    prompt_token_ids: list[int]
    """Token IDs making up the prompt."""

    prompt: NotRequired[str]
    """Text the token IDs were produced from, when it is known."""
def tokens_input(
    prompt_token_ids: list[int],
    *,
    prompt: str | None = None,
    cache_salt: str | None = None,
) -> TokensInput:
    """
    Build a [`TokensInput`][vllm.inputs.engine.TokensInput], leaving out
    every optional key whose value is `None`.
    """
    engine_input: TokensInput = {
        "type": "token",
        "prompt_token_ids": prompt_token_ids,
    }

    # Optional keys are stored only when a value was actually supplied.
    if prompt is not None:
        engine_input["prompt"] = prompt
    if cache_salt is not None:
        engine_input["cache_salt"] = cache_salt

    return engine_input
class EmbedsInput(_InputOptions):
    """Engine input schema for a prompt given as embeddings."""

    type: Literal["embeds"]
    """Discriminator identifying this schema."""

    prompt_embeds: "torch.Tensor"
    """Embeddings making up the prompt."""

    prompt: NotRequired[str]
    """Prompt text associated with the embeddings, when it is known."""
def embeds_input(
    prompt_embeds: "torch.Tensor",
    *,
    prompt: str | None = None,
    cache_salt: str | None = None,
) -> EmbedsInput:
    """
    Build an [`EmbedsInput`][vllm.inputs.engine.EmbedsInput], leaving out
    every optional key whose value is `None`.
    """
    engine_input: EmbedsInput = {
        "type": "embeds",
        "prompt_embeds": prompt_embeds,
    }

    # Optional keys are stored only when a value was actually supplied.
    if prompt is not None:
        engine_input["prompt"] = prompt
    if cache_salt is not None:
        engine_input["cache_salt"] = cache_salt

    return engine_input
# Maps a modality name to the list of hashes of its items.
MultiModalHashes: TypeAlias = Mapping[str, list[str]]
"""
A dictionary containing per-item hashes for each modality.
"""
# Maps a modality name to the placeholder ranges of its items.
# `PlaceholderRange` is quoted because it is imported only under
# `TYPE_CHECKING`.
MultiModalPlaceholders: TypeAlias = Mapping[str, Sequence["PlaceholderRange"]]
"""
A dictionary containing per-item placeholder ranges for each modality.
"""
class MultiModalInput(_InputOptions):
    """Engine input schema for a prompt that carries multi-modal data."""

    type: Literal["multimodal"]
    """Discriminator identifying this schema."""

    prompt_token_ids: list[int]
    """Processed token IDs, placeholder tokens included."""

    prompt: NotRequired[str]
    """Text the token IDs were produced from, when it is known."""

    mm_kwargs: "MultiModalKwargsOptionalItems"
    """Keyword arguments handed directly to the model after batching."""

    mm_hashes: MultiModalHashes
    """Hashes of the multi-modal data."""

    mm_placeholders: MultiModalPlaceholders
    """
    Per-modality description of the placeholder tokens found in
    `prompt_token_ids`.
    """
def mm_input(
    prompt_token_ids: list[int],
    mm_kwargs: "MultiModalKwargsOptionalItems",
    mm_hashes: MultiModalHashes,
    mm_placeholders: MultiModalPlaceholders,
    *,
    prompt: str | None = None,
    cache_salt: str | None = None,
) -> MultiModalInput:
    """
    Build a [`MultiModalInput`][vllm.inputs.engine.MultiModalInput],
    leaving out every optional key whose value is `None`.
    """
    engine_input: MultiModalInput = {
        "type": "multimodal",
        "prompt_token_ids": prompt_token_ids,
        "mm_kwargs": mm_kwargs,
        "mm_hashes": mm_hashes,
        "mm_placeholders": mm_placeholders,
    }

    # Optional keys are stored only when a value was actually supplied.
    if prompt is not None:
        engine_input["prompt"] = prompt
    if cache_salt is not None:
        engine_input["cache_salt"] = cache_salt

    return engine_input
class MultiModalEncDecInput(MultiModalInput):
    """
    Multi-modal engine input schema for encoder-decoder models.

    Note:
        For convenience, even text-only encoder-decoder models are
        currently implemented as multi-modal models.
        (Example: https://github.com/vllm-project/bart-plugin)
    """

    encoder_prompt_token_ids: list[int]
    """Processed token IDs of the encoder prompt."""

    encoder_prompt: NotRequired[str]
    """Text the encoder token IDs were produced from, when it is known."""
def mm_enc_dec_input(
    encoder_inputs: MultiModalInput,
    decoder_prompt_token_ids: list[int],
    *,
    decoder_prompt: str | None = None,
) -> MultiModalEncDecInput:
    """
    Build a
    [`MultiModalEncDecInput`][vllm.inputs.engine.MultiModalEncDecInput]
    from processed encoder inputs and the decoder token IDs.

    The encoder's token IDs and multi-modal fields are carried over; its
    `prompt` (stored as `encoder_prompt`) and `cache_salt` are copied only
    when present.
    """
    combined: MultiModalEncDecInput = {
        "type": "multimodal",
        "prompt_token_ids": decoder_prompt_token_ids,
        "encoder_prompt_token_ids": encoder_inputs["prompt_token_ids"],
        "mm_kwargs": encoder_inputs["mm_kwargs"],
        "mm_hashes": encoder_inputs["mm_hashes"],
        "mm_placeholders": encoder_inputs["mm_placeholders"],
    }

    if decoder_prompt is not None:
        combined["prompt"] = decoder_prompt

    # Optional encoder-side keys are forwarded only if the encoder has them.
    if "prompt" in encoder_inputs:
        combined["encoder_prompt"] = encoder_inputs["prompt"]
    if "cache_salt" in encoder_inputs:
        combined["cache_salt"] = encoder_inputs["cache_salt"]

    return combined
# Decoder-only requests may use the token, embeddings, or multi-modal schema.
DecoderOnlyEngineInput: TypeAlias = TokensInput | EmbedsInput | MultiModalInput
"""
A rendered [`DecoderOnlyPrompt`][vllm.inputs.llm.DecoderOnlyPrompt]
which can be passed to `LLMEngine.add_request` or `AsyncLLM.add_request`.
"""
# Encoder side of an encoder-decoder request: the embeddings schema is not
# part of this union, and multi-modal input must be the enc-dec variant.
EncoderInput: TypeAlias = TokensInput | MultiModalEncDecInput
"""
A rendered [`EncoderPrompt`][vllm.inputs.llm.EncoderPrompt]
which can be passed to `LLMEngine.add_request` or `AsyncLLM.add_request`.
"""
# Decoder side of an encoder-decoder request: the embeddings schema is not
# part of this union.
DecoderEngineInput: TypeAlias = TokensInput | MultiModalInput
"""
A rendered [`DecoderPrompt`][vllm.inputs.llm.DecoderPrompt]
which can be passed to `LLMEngine.add_request` or `AsyncLLM.add_request`.
"""
class EncoderDecoderInput(TypedDict):
    """
    The rendered form of an
    [`EncoderDecoderPrompt`][vllm.inputs.llm.EncoderDecoderPrompt],
    accepted by `LLMEngine.add_request` and `AsyncLLM.add_request`.
    """

    type: Literal["enc_dec"]

    encoder_prompt: EncoderInput
    """Inputs for the encoder part."""

    decoder_prompt: DecoderEngineInput
    """Inputs for the decoder part."""

    arrival_time: NotRequired[float]
    """Time at which the input was received (before rendering)."""
# Any single (unpaired) input: the decoder-only schemas plus the
# encoder-decoder multi-modal schema.
SingletonInput: TypeAlias = DecoderOnlyEngineInput | MultiModalEncDecInput
"""
A rendered [`SingletonPrompt`][vllm.inputs.llm.SingletonPrompt]
which can be passed to `LLMEngine.add_request` or `AsyncLLM.add_request`.
"""
# Top-level union: either a decoder-only input or an explicit
# encoder/decoder pair.
EngineInput: TypeAlias = DecoderOnlyEngineInput | EncoderDecoderInput
"""
A rendered [`PromptType`][vllm.inputs.llm.PromptType]
which can be passed to `LLMEngine.add_request` or `AsyncLLM.add_request`.
"""
def _validate_enc_input(enc_input: SingletonInput) -> EncoderInput:
    """
    Check that `enc_input` can serve as the encoder side of an
    encoder-decoder request and return it unchanged.

    Raises:
        ValueError: If `enc_input` is an embeddings input.
        RuntimeError: If `enc_input` is multi-modal but has no
            `encoder_prompt_token_ids` key.
    """
    if enc_input["type"] == "embeds":
        raise ValueError(
            "Embedding inputs are not supported for encoder-decoder models"
        )

    # Token inputs, and multi-modal inputs that already carry the encoder
    # token IDs, pass through as-is.
    if (
        enc_input["type"] != "multimodal"
        or "encoder_prompt_token_ids" in enc_input
    ):
        return enc_input  # type: ignore[return-value]

    raise RuntimeError(
        "You should register an encoder-decoder multi-modal processor "
        "for encoder-decoder models."
    )
def _validate_dec_input(dec_input: SingletonInput) -> DecoderEngineInput:
    """
    Check that `dec_input` can serve as the decoder side of an
    encoder-decoder request and return it unchanged.

    Raises:
        ValueError: If `dec_input` is an embeddings input.
    """
    if dec_input["type"] != "embeds":
        return dec_input

    raise ValueError(
        "Embedding inputs are not supported for encoder-decoder models"
    )
def _prepare_decoder_input_ids_for_generation(
    decoder_input_ids: list[int],
    decoder_start_token_id: int,
) -> list[int]:
    """
    Make sure the decoder token IDs begin with the decoder start token,
    following `GenerationMixin._prepare_decoder_input_ids_for_generation()`.

    The given list is returned as-is when it already starts with
    `decoder_start_token_id`; otherwise a new list with the start token
    prepended is returned and the given list is left untouched.

    Source:
    https://github.com/huggingface/transformers/blob/v5.1.0/src/transformers/generation/utils.py
    """
    already_prefixed = (
        len(decoder_input_ids) != 0
        and decoder_input_ids[0] == decoder_start_token_id
    )
    if already_prefixed:
        return decoder_input_ids

    return [decoder_start_token_id] + decoder_input_ids
def build_enc_dec_input(
    encoder_input: SingletonInput,
    decoder_input: SingletonInput | None,
    decoder_start_token_id: int,
    skip_decoder_start_token: bool = False,
) -> EncoderDecoderInput:
    """
    Combine rendered encoder and decoder inputs into an
    [`EncoderDecoderInput`][vllm.inputs.engine.EncoderDecoderInput].

    Args:
        encoder_input: Rendered encoder-side input.
        decoder_input: Rendered decoder-side input, or `None` to derive
            the decoder side from `encoder_input`.
        decoder_start_token_id: Token prepended to the decoder token IDs
            unless they already start with it.
        skip_decoder_start_token: If `True`, the decoder token IDs are
            left as they are.

    Returns:
        The paired encoder/decoder input. The dictionaries passed in by
        the caller are not modified.

    Raises:
        ValueError: If either input is an embeddings input.
        RuntimeError: If a multi-modal encoder input has no
            `encoder_prompt_token_ids` key.
    """
    enc_input = _validate_enc_input(encoder_input)

    dec_input: DecoderEngineInput
    if decoder_input is None:
        dec_input = enc_input
    else:
        dec_input = _validate_dec_input(decoder_input)

    enc_input_new: EncoderInput
    dec_input_new: DecoderEngineInput

    if enc_input["type"] == "multimodal":
        # The encoder receives only the encoder tokens; the multi-modal
        # fields travel with the decoder input.
        enc_input_new = tokens_input(
            enc_input["encoder_prompt_token_ids"],
            prompt=enc_input.get("encoder_prompt"),
        )
        dec_input_new = mm_input(
            prompt_token_ids=dec_input["prompt_token_ids"],
            prompt=dec_input.get("prompt"),
            mm_kwargs=enc_input["mm_kwargs"],
            mm_hashes=enc_input["mm_hashes"],
            mm_placeholders=enc_input["mm_placeholders"],
        )
    elif enc_input["type"] == "token":
        enc_input_new = tokens_input(prompt_token_ids=[])
        # Shallow-copy so that the key assignments below do not write
        # into the dictionary owned by the caller.
        dec_input_new = dec_input.copy()
    else:
        assert_never(enc_input)

    if not skip_decoder_start_token:
        dec_input_new["prompt_token_ids"] = _prepare_decoder_input_ids_for_generation(
            dec_input_new["prompt_token_ids"],
            decoder_start_token_id,
        )

    # The decoder side inherits the encoder's (non-empty) cache salt.
    if cache_salt := enc_input.get("cache_salt"):
        dec_input_new["cache_salt"] = cache_salt

    return EncoderDecoderInput(
        type="enc_dec",
        encoder_prompt=enc_input_new,
        decoder_prompt=dec_input_new,
    )
def split_enc_dec_input(
    inputs: EngineInput,
) -> tuple[SingletonInput | None, SingletonInput]:
    """
    Split an engine input into its `(encoder, decoder)` parts.

    Any input whose `type` is not `"enc_dec"` has no encoder part and is
    returned as the decoder part, paired with `None`.
    """
    if inputs["type"] != "enc_dec":
        return None, inputs

    return inputs["encoder_prompt"], inputs["decoder_prompt"]
"""Schema and utilities for input prompts to the LLM API."""
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from typing import TYPE_CHECKING, Any, Literal, TypeAlias
from collections.abc import Mapping, Sequence
from typing import TYPE_CHECKING, Any, TypeAlias, TypeVar, final
import torch
from typing_extensions import NotRequired, TypedDict, assert_never
from typing_extensions import NotRequired, TypedDict
if TYPE_CHECKING:
from vllm.multimodal.inputs import (
MultiModalDataDict,
MultiModalEncDecInputs,
MultiModalInputs,
MultiModalUUIDDict,
)
else:
MultiModalDataDict = object
MultiModalEncDecInputs = object
MultiModalInputs = object
MultiModalUUIDDict = object
# Inputs to LLM API
import torch
from vllm.multimodal.inputs import AudioItem, ImageItem, VideoItem, VisionChunk
# Type variable standing for the item type of one modality.
_T = TypeVar("_T")
# Generic alias: a single item, a list whose entries may be `None`, or `None`.
ModalityData: TypeAlias = _T | list[_T | None] | None
"""
Either a single data item, or a list of data items. Can only be None if UUID
is provided.
The number of data items allowed per modality is restricted by
`--limit-mm-per-prompt`.
"""
@final
class MultiModalDataBuiltins(TypedDict, total=False):
    """
    Type annotations for the modality types that vLLM predefines.

    The schema is declared with `total=False`, so every key is optional.
    """

    image: ModalityData["ImageItem"]
    """Input image(s)."""

    video: ModalityData["VideoItem"]
    """Input video(s)."""

    audio: ModalityData["AudioItem"]
    """Input audio(s)."""

    vision_chunk: ModalityData["VisionChunk"]
    """Input visual atom(s) - unified modality for images and video chunks."""
# Maps a modality name to its data; the item type is left open (`Any`).
MultiModalDataDict: TypeAlias = Mapping[str, ModalityData[Any]]
"""
A dictionary containing an entry for each modality type to input.
The built-in modalities are defined by
[`MultiModalDataBuiltins`][vllm.inputs.llm.MultiModalDataBuiltins].
"""
# Maps a modality name to either one UUID string or a sequence of UUIDs in
# which individual entries may be `None`.
MultiModalUUIDDict: TypeAlias = Mapping[str, Sequence[str | None] | str]
"""
A dictionary containing user-provided UUIDs for items in each modality.
If a UUID for an item is not provided, its entry will be `None` and
MultiModalHasher will compute a hash for the item.
The UUID will be used to identify the item for all caching purposes
(input processing caching, embedding caching, prefix caching, etc).
"""
class _PromptOptions(TypedDict):
"""
Additional options available to all
[`SingletonPrompt`][vllm.inputs.data.SingletonPrompt].
[`SingletonPrompt`][vllm.inputs.llm.SingletonPrompt] types.
"""
multi_modal_data: NotRequired[MultiModalDataDict | None]
......@@ -78,7 +119,7 @@ class TokensPrompt(_PromptOptions):
class EmbedsPrompt(_PromptOptions):
"""Schema for a prompt provided via token embeddings."""
prompt_embeds: torch.Tensor
prompt_embeds: "torch.Tensor"
"""The embeddings of the prompt."""
prompt: NotRequired[str]
......@@ -91,10 +132,10 @@ DecoderOnlyPrompt: TypeAlias = (
"""
Schema of a prompt for a decoder-only model:
- A text prompt (string or [`TextPrompt`][vllm.inputs.data.TextPrompt])
- A text prompt (string or [`TextPrompt`][vllm.inputs.llm.TextPrompt])
- A tokenized prompt (list of token IDs, or
[`TokensPrompt`][vllm.inputs.data.TokensPrompt])
- An embeddings prompt ([`EmbedsPrompt`][vllm.inputs.data.EmbedsPrompt])
[`TokensPrompt`][vllm.inputs.llm.TokensPrompt])
- An embeddings prompt ([`EmbedsPrompt`][vllm.inputs.llm.EmbedsPrompt])
For encoder-decoder models, passing a singleton prompt is shorthand for passing
`ExplicitEncoderDecoderPrompt(encoder_prompt=prompt, decoder_prompt=None)`.
......@@ -105,9 +146,9 @@ EncoderPrompt: TypeAlias = str | TextPrompt | list[int] | TokensPrompt
"""
Schema of a prompt for the encoder part of a encoder-decoder model:
- A text prompt (string or [`TextPrompt`][vllm.inputs.data.TextPrompt])
- A text prompt (string or [`TextPrompt`][vllm.inputs.llm.TextPrompt])
- A tokenized prompt (list of token IDs, or
[`TokensPrompt`][vllm.inputs.data.TokensPrompt])
[`TokensPrompt`][vllm.inputs.llm.TokensPrompt])
"""
......@@ -115,9 +156,9 @@ DecoderPrompt: TypeAlias = str | TextPrompt | list[int] | TokensPrompt
"""
Schema of a prompt for the decoder part of an encoder-decoder model:
- A text prompt (string or [`TextPrompt`][vllm.inputs.data.TextPrompt])
- A text prompt (string or [`TextPrompt`][vllm.inputs.llm.TextPrompt])
- A tokenized prompt (list of token IDs, or
[`TokensPrompt`][vllm.inputs.data.TokensPrompt])
[`TokensPrompt`][vllm.inputs.llm.TokensPrompt])
Note:
Multi-modal inputs are not supported for decoder prompts.
......@@ -156,7 +197,7 @@ SingletonPrompt: TypeAlias = DecoderOnlyPrompt | EncoderPrompt | DecoderPrompt
"""
Schema for a single prompt. This is as opposed to a data structure
which encapsulates multiple prompts, such as
[`ExplicitEncoderDecoderPrompt`][vllm.inputs.data.ExplicitEncoderDecoderPrompt].
[`ExplicitEncoderDecoderPrompt`][vllm.inputs.llm.ExplicitEncoderDecoderPrompt].
"""
......@@ -171,7 +212,7 @@ This is the input format accepted by most [`LLM`][vllm.entrypoints.llm.LLM] APIs
class DataPrompt(_PromptOptions):
"""
Represents generic inputs that are converted to
[`PromptType`][vllm.inputs.data.PromptType] by IO processor plugins.
[`PromptType`][vllm.inputs.llm.PromptType] by IO processor plugins.
"""
data: Any
......@@ -179,235 +220,3 @@ class DataPrompt(_PromptOptions):
data_format: str
"""The input data format."""
# Outputs of processor
class _InputOptions(TypedDict):
"""
Additional options available to all input types.
"""
arrival_time: NotRequired[float]
"""The time when the input was received (before rendering)."""
cache_salt: NotRequired[str]
"""Optional cache salt to be used for prefix caching."""
class TokenInputs(_InputOptions):
    """Input schema for a prompt given as token IDs."""

    type: Literal["token"]
    """Discriminator identifying this schema."""

    prompt_token_ids: list[int]
    """Token IDs making up the prompt."""

    prompt: NotRequired[str]
    """Text the token IDs were produced from, when it is known."""
def token_inputs(
    prompt_token_ids: list[int],
    *,
    prompt: str | None = None,
    cache_salt: str | None = None,
) -> TokenInputs:
    """
    Build a [`TokenInputs`][vllm.inputs.data.TokenInputs], leaving out
    every optional key whose value is `None`.
    """
    result: TokenInputs = {
        "type": "token",
        "prompt_token_ids": prompt_token_ids,
    }

    # Optional keys are stored only when a value was actually supplied.
    if prompt is not None:
        result["prompt"] = prompt
    if cache_salt is not None:
        result["cache_salt"] = cache_salt

    return result
class EmbedsInputs(_InputOptions):
    """Input schema for a prompt given as embeddings."""

    type: Literal["embeds"]
    """Discriminator identifying this schema."""

    prompt_embeds: torch.Tensor
    """Embeddings making up the prompt."""

    prompt: NotRequired[str]
    """Prompt text associated with the embeddings, when it is known."""
def embeds_inputs(
    prompt_embeds: torch.Tensor,
    *,
    prompt: str | None = None,
    cache_salt: str | None = None,
) -> EmbedsInputs:
    """
    Build an [`EmbedsInputs`][vllm.inputs.data.EmbedsInputs], leaving out
    every optional key whose value is `None`.
    """
    result: EmbedsInputs = {
        "type": "embeds",
        "prompt_embeds": prompt_embeds,
    }

    # Optional keys are stored only when a value was actually supplied.
    if prompt is not None:
        result["prompt"] = prompt
    if cache_salt is not None:
        result["cache_salt"] = cache_salt

    return result
# Decoder-only models may use the token, embeddings, or multi-modal schema.
DecoderOnlyInputs: TypeAlias = TokenInputs | EmbedsInputs | MultiModalInputs
"""
A processed prompt from
[`InputPreprocessor`][vllm.inputs.preprocess.InputPreprocessor]
which can be passed to
[`InputProcessor`][vllm.v1.engine.input_processor.InputProcessor]
for decoder-only models.
"""
# Encoder side: the embeddings schema is not part of this union, and
# multi-modal inputs must be the enc-dec variant.
EncoderInputs: TypeAlias = TokenInputs | MultiModalEncDecInputs
"""
A processed encoder prompt from
[`InputPreprocessor`][vllm.inputs.preprocess.InputPreprocessor]
which can be passed to
[`InputProcessor`][vllm.v1.engine.input_processor.InputProcessor]
for encoder-decoder models.
"""
# Decoder side: the embeddings schema is not part of this union.
DecoderInputs: TypeAlias = TokenInputs | MultiModalInputs
"""
A processed decoder prompt from
[`InputPreprocessor`][vllm.inputs.preprocess.InputPreprocessor]
which can be passed to
[`InputProcessor`][vllm.v1.engine.input_processor.InputProcessor]
for encoder-decoder models.
"""
class EncoderDecoderInputs(TypedDict):
    """
    A processed pair of encoder and decoder singleton prompts from
    [`InputPreprocessor`][vllm.inputs.preprocess.InputPreprocessor]
    which can be passed to
    [`InputProcessor`][vllm.v1.engine.input_processor.InputProcessor]
    for encoder-decoder models.
    """

    type: Literal["enc_dec"]

    encoder_prompt: EncoderInputs
    """Inputs for the encoder part."""

    decoder_prompt: DecoderInputs
    """Inputs for the decoder part."""

    arrival_time: NotRequired[float]
    """Time at which the input was received (before rendering)."""
# Top-level union: either decoder-only inputs or an encoder/decoder pair.
ProcessorInputs: TypeAlias = DecoderOnlyInputs | EncoderDecoderInputs
"""
A processed prompt from
[`InputPreprocessor`][vllm.inputs.preprocess.InputPreprocessor]
which can be passed to
[`InputProcessor`][vllm.v1.engine.input_processor.InputProcessor].
"""
# Any single (unpaired) input: the decoder-only schemas plus the
# encoder-decoder multi-modal schema.
SingletonInputs: TypeAlias = DecoderOnlyInputs | MultiModalEncDecInputs
"""The inputs for a single encoder/decoder prompt."""
def _validate_enc_inputs(inputs: SingletonInputs) -> EncoderInputs:
    """
    Check that `inputs` can serve as the encoder side of an
    encoder-decoder request and return it unchanged.

    Raises:
        ValueError: If `inputs` is an embeddings input.
        RuntimeError: If `inputs` is multi-modal but has no
            `encoder_prompt_token_ids` key.
    """
    if inputs["type"] == "embeds":
        raise ValueError(
            "Embedding inputs are not supported for encoder-decoder models"
        )

    # Token inputs, and multi-modal inputs that already carry the encoder
    # token IDs, pass through as-is.
    if inputs["type"] != "multimodal" or "encoder_prompt_token_ids" in inputs:
        return inputs  # type: ignore[return-value]

    raise RuntimeError(
        "You should register an encoder-decoder multi-modal processor "
        "for encoder-decoder models."
    )
def _validate_dec_inputs(inputs: SingletonInputs) -> DecoderInputs:
    """
    Check that `inputs` can serve as the decoder side of an
    encoder-decoder request and return it unchanged.

    Raises:
        ValueError: If `inputs` is an embeddings input.
    """
    if inputs["type"] != "embeds":
        return inputs

    raise ValueError(
        "Embedding inputs are not supported for encoder-decoder models"
    )
def _prepare_decoder_input_ids_for_generation(
    decoder_input_ids: list[int],
    decoder_start_token_id: int,
) -> list[int]:
    """
    Make sure the decoder token IDs begin with the decoder start token,
    following `GenerationMixin._prepare_decoder_input_ids_for_generation()`.

    The given list is returned as-is when it already starts with
    `decoder_start_token_id`; otherwise a new list with the start token
    prepended is returned and the given list is left untouched.

    Source:
    https://github.com/huggingface/transformers/blob/v5.1.0/src/transformers/generation/utils.py
    """
    starts_with_token = (
        len(decoder_input_ids) != 0
        and decoder_input_ids[0] == decoder_start_token_id
    )
    if starts_with_token:
        return decoder_input_ids

    return [decoder_start_token_id] + decoder_input_ids
def build_enc_dec_inputs(
    encoder_inputs: SingletonInputs,
    decoder_inputs: SingletonInputs | None,
    decoder_start_token_id: int,
    skip_decoder_start_token: bool = False,
) -> EncoderDecoderInputs:
    """
    Combine processed encoder and decoder inputs into an
    [`EncoderDecoderInputs`][vllm.inputs.data.EncoderDecoderInputs].

    Args:
        encoder_inputs: Processed encoder-side inputs.
        decoder_inputs: Processed decoder-side inputs, or `None` to derive
            the decoder side from `encoder_inputs`.
        decoder_start_token_id: Token prepended to the decoder token IDs
            unless they already start with it.
        skip_decoder_start_token: If `True`, the decoder token IDs are
            left as they are.

    Returns:
        The paired encoder/decoder inputs. The dictionaries passed in by
        the caller are not modified.

    Raises:
        ValueError: If either side is an embeddings input.
        RuntimeError: If multi-modal encoder inputs have no
            `encoder_prompt_token_ids` key.
    """
    enc_inputs = _validate_enc_inputs(encoder_inputs)

    dec_inputs: DecoderInputs
    if decoder_inputs is None:
        dec_inputs = enc_inputs
    else:
        dec_inputs = _validate_dec_inputs(decoder_inputs)

    enc_inputs_new: EncoderInputs
    dec_inputs_new: DecoderInputs

    if enc_inputs["type"] == "multimodal":
        from vllm.multimodal.inputs import mm_inputs

        # The encoder receives only the encoder tokens; the multi-modal
        # fields travel with the decoder inputs.
        enc_inputs_new = token_inputs(
            enc_inputs["encoder_prompt_token_ids"],
            prompt=enc_inputs.get("encoder_prompt"),
        )
        dec_inputs_new = mm_inputs(
            prompt_token_ids=dec_inputs["prompt_token_ids"],
            prompt=dec_inputs.get("prompt"),
            mm_kwargs=enc_inputs["mm_kwargs"],
            mm_hashes=enc_inputs["mm_hashes"],
            mm_placeholders=enc_inputs["mm_placeholders"],
        )
    elif enc_inputs["type"] == "token":
        enc_inputs_new = token_inputs(prompt_token_ids=[])
        # Shallow-copy so that the key assignments below do not write
        # into the dictionary owned by the caller.
        dec_inputs_new = dec_inputs.copy()
    else:
        assert_never(enc_inputs)

    if not skip_decoder_start_token:
        dec_inputs_new["prompt_token_ids"] = _prepare_decoder_input_ids_for_generation(
            dec_inputs_new["prompt_token_ids"],
            decoder_start_token_id,
        )

    # The decoder side inherits the encoder's (non-empty) cache salt.
    if cache_salt := enc_inputs.get("cache_salt"):
        dec_inputs_new["cache_salt"] = cache_salt

    return EncoderDecoderInputs(
        type="enc_dec",
        encoder_prompt=enc_inputs_new,
        decoder_prompt=dec_inputs_new,
    )
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from .data import ProcessorInputs, SingletonInputs
def split_enc_dec_inputs(
    inputs: ProcessorInputs,
) -> tuple[SingletonInputs | None, SingletonInputs]:
    """
    Split processor inputs into their `(encoder, decoder)` parts.

    Any input whose `type` is not `"enc_dec"` has no encoder part and is
    returned as the decoder part, paired with `None`.
    """
    if inputs["type"] != "enc_dec":
        return None, inputs

    return inputs["encoder_prompt"], inputs["decoder_prompt"]
......@@ -7,14 +7,9 @@ from typing import Any, overload
from typing_extensions import assert_never
from vllm.config import VllmConfig
from vllm.inputs.data import build_enc_dec_inputs
from vllm.inputs import build_enc_dec_input
from vllm.logger import init_logger
from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry
from vllm.multimodal.inputs import (
MultiModalDataDict,
MultiModalInputs,
MultiModalUUIDDict,
)
from vllm.renderers import BaseRenderer, renderer_from_config
from vllm.renderers.inputs import (
DecoderDictPrompt,
......@@ -26,20 +21,25 @@ from vllm.renderers.inputs import (
from vllm.renderers.inputs.preprocess import parse_dec_only_prompt, parse_enc_dec_prompt
from vllm.tokenizers import TokenizerLike
from .data import (
DecoderInputs,
DecoderOnlyInputs,
EmbedsInputs,
from .engine import (
DecoderEngineInput,
DecoderOnlyEngineInput,
EmbedsInput,
EncoderDecoderInput,
EncoderInput,
EngineInput,
MultiModalInput,
SingletonInput,
TokensInput,
tokens_input,
)
from .llm import (
EmbedsPrompt,
EncoderDecoderInputs,
EncoderInputs,
ProcessorInputs,
MultiModalDataDict,
MultiModalUUIDDict,
PromptType,
SingletonInputs,
TextPrompt,
TokenInputs,
TokensPrompt,
token_inputs,
)
logger = init_logger(__name__)
......@@ -95,7 +95,7 @@ class InputPreprocessor:
tokenization_kwargs: dict[str, Any] | None = None,
*,
mm_uuids: MultiModalUUIDDict | None = None,
) -> MultiModalInputs:
) -> MultiModalInput:
"""
Apply the model's multi-modal processor to a multi-modal prompt,
returning the corresponding token IDs and metadata.
......@@ -111,7 +111,7 @@ class InputPreprocessor:
def _process_embeds(
self,
parsed_content: EmbedsPrompt,
) -> EmbedsInputs:
) -> EmbedsInput:
return self.renderer._process_embeds(parsed_content)
def _truncate_inputs(
......@@ -134,12 +134,12 @@ class InputPreprocessor:
self,
parsed_content: TokensPrompt,
tokenization_kwargs: dict[str, Any] | None = None,
) -> TokenInputs | MultiModalInputs:
) -> TokensInput | MultiModalInput:
prompt_token_ids = self._truncate_inputs(
parsed_content["prompt_token_ids"], tokenization_kwargs
)
inputs: TokenInputs | MultiModalInputs
inputs: TokensInput | MultiModalInput
if multi_modal_data := parsed_content.get("multi_modal_data"):
inputs = self._process_multimodal(
prompt_token_ids,
......@@ -149,7 +149,7 @@ class InputPreprocessor:
mm_uuids=parsed_content.get("multi_modal_uuids"),
)
else:
inputs = token_inputs(prompt_token_ids)
inputs = tokens_input(prompt_token_ids)
if prompt_text := parsed_content.get("prompt"):
inputs["prompt"] = prompt_text
......@@ -162,10 +162,10 @@ class InputPreprocessor:
self,
parsed_content: TextPrompt,
tokenization_kwargs: dict[str, Any] | None = None,
) -> TokenInputs | MultiModalInputs:
) -> TokensInput | MultiModalInput:
prompt_text = parsed_content["prompt"]
inputs: TokenInputs | MultiModalInputs
inputs: TokensInput | MultiModalInput
if multi_modal_data := parsed_content.get("multi_modal_data"):
inputs = self._process_multimodal(
prompt_text,
......@@ -178,7 +178,7 @@ class InputPreprocessor:
prompt_text,
tokenization_kwargs=tokenization_kwargs,
)
inputs = token_inputs(prompt_token_ids)
inputs = tokens_input(prompt_token_ids)
inputs["prompt"] = prompt_text
......@@ -192,38 +192,27 @@ class InputPreprocessor:
self,
prompt: EncoderDictPrompt,
tokenization_kwargs: dict[str, Any] | None = None,
) -> EncoderInputs: ...
) -> EncoderInput: ...
@overload
def _prompt_to_llm_inputs( # type: ignore[misc]
self,
prompt: DecoderDictPrompt,
tokenization_kwargs: dict[str, Any] | None = None,
) -> DecoderInputs: ...
) -> DecoderEngineInput: ...
@overload
def _prompt_to_llm_inputs( # type: ignore[misc]
self,
prompt: DecoderOnlyDictPrompt,
tokenization_kwargs: dict[str, Any] | None = None,
) -> DecoderOnlyInputs: ...
) -> DecoderOnlyEngineInput: ...
def _prompt_to_llm_inputs(
self,
prompt: SingletonDictPrompt,
tokenization_kwargs: dict[str, Any] | None = None,
) -> SingletonInputs:
"""
Extract the singleton inputs from a prompt.
Arguments:
* prompt: single encoder or decoder input prompt
Returns:
* [`SingletonInputs`][vllm.inputs.data.SingletonInputs] instance
"""
) -> SingletonInput:
if "prompt_embeds" in prompt:
return self._process_embeds(prompt) # type: ignore[arg-type]
......@@ -242,22 +231,7 @@ class InputPreprocessor:
self,
prompt: EncoderDecoderDictPrompt,
tokenization_kwargs: dict[str, Any] | None = None,
) -> EncoderDecoderInputs:
"""
For encoder/decoder models only:
Process an input prompt into an
[`EncoderDecoderInputs`][vllm.inputs.data.EncoderDecoderInputs]
instance.
Arguments:
* prompt: an input prompt
Returns:
* [`EncoderDecoderInputs`][vllm.inputs.data.EncoderDecoderInputs]
instance
"""
) -> EncoderDecoderInput:
encoder_prompt = prompt["encoder_prompt"]
decoder_prompt = prompt["decoder_prompt"]
......@@ -270,12 +244,12 @@ class InputPreprocessor:
self.renderer.mm_processor.skip_decoder_start_token
)
return build_enc_dec_inputs(
encoder_inputs=self._prompt_to_llm_inputs(
return build_enc_dec_input(
encoder_input=self._prompt_to_llm_inputs(
encoder_prompt,
tokenization_kwargs=tokenization_kwargs,
),
decoder_inputs=(
decoder_input=(
None
if decoder_prompt is None
else self._prompt_to_llm_inputs(
......@@ -291,20 +265,7 @@ class InputPreprocessor:
self,
prompt: DecoderOnlyDictPrompt,
tokenization_kwargs: dict[str, Any] | None = None,
) -> DecoderOnlyInputs:
"""
For decoder-only models:
Process an input prompt into a
[`DecoderOnlyInputs`][vllm.inputs.data.DecoderOnlyInputs] instance.
Arguments:
* prompt: input prompt
Returns:
* [`DecoderOnlyInputs`][vllm.inputs.data.DecoderOnlyInputs] instance
"""
) -> DecoderOnlyEngineInput:
return self._prompt_to_llm_inputs(
prompt,
tokenization_kwargs=tokenization_kwargs,
......@@ -314,7 +275,7 @@ class InputPreprocessor:
self,
prompt: PromptType,
tokenization_kwargs: dict[str, Any] | None = None,
) -> ProcessorInputs:
) -> EngineInput:
"""Preprocess the input prompt."""
if self.model_config.is_encoder_decoder:
# Encoder-decoder model requires special mapping of
......
......@@ -12,6 +12,7 @@ from transformers.models.aria.processing_aria import AriaProcessor
from vllm.config import VllmConfig
from vllm.config.multimodal import BaseDummyOptions
from vllm.distributed import get_tensor_model_parallel_rank
from vllm.inputs import MultiModalDataDict
from vllm.model_executor.layers.activation import get_act_fn
from vllm.model_executor.layers.fused_moe import SharedFusedMoE
from vllm.model_executor.layers.linear import ColumnParallelLinear, RowParallelLinear
......@@ -24,7 +25,6 @@ from vllm.model_executor.model_loader.weight_utils import (
)
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import (
MultiModalDataDict,
MultiModalFieldConfig,
MultiModalKwargsItems,
)
......
......@@ -31,17 +31,16 @@ from transformers.models.qwen2_audio import Qwen2AudioEncoder
from vllm.config import VllmConfig
from vllm.config.multimodal import BaseDummyOptions
from vllm.inputs import ModalityData, MultiModalDataDict
from vllm.model_executor.layers.activation import get_act_fn
from vllm.model_executor.models.module_mapping import MultiModelKeys
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import (
MultiModalDataDict,
MultiModalFieldConfig,
MultiModalKwargsItems,
)
from vllm.multimodal.parse import (
DictEmbeddingItems,
ModalityData,
ModalityDataItems,
MultiModalDataItems,
MultiModalDataParser,
......
......@@ -17,9 +17,9 @@ from transformers.models.got_ocr2.image_processing_got_ocr2 import (
from vllm.config import VllmConfig
from vllm.config.multimodal import BaseDummyOptions
from vllm.inputs import MultiModalDataDict
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import (
MultiModalDataDict,
MultiModalFieldConfig,
MultiModalKwargsItems,
)
......
......@@ -15,6 +15,7 @@ import torch.nn as nn
from vllm.config import VllmConfig
from vllm.config.multimodal import BaseDummyOptions
from vllm.inputs import MultiModalDataDict
from vllm.logger import init_logger
from vllm.model_executor.layers.activation import get_act_fn
from vllm.model_executor.layers.linear import (
......@@ -24,7 +25,6 @@ from vllm.model_executor.layers.linear import (
from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import (
MultiModalDataDict,
MultiModalFieldConfig,
MultiModalKwargsItems,
)
......
......@@ -9,8 +9,8 @@ from transformers.activations import GELUActivation
from vllm.config import VllmConfig
from vllm.config.multimodal import BaseDummyOptions
from vllm.inputs import MultiModalDataDict
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import MultiModalDataDict
from .llava_next import (
LlavaDummyInputsBuilder,
......
......@@ -15,11 +15,11 @@ from transformers import (
from vllm.config import CacheConfig, VllmConfig
from vllm.config.multimodal import BaseDummyOptions
from vllm.inputs import MultiModalDataDict
from vllm.model_executor.layers.activation import get_act_fn
from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import (
MultiModalDataDict,
MultiModalFieldConfig,
MultiModalKwargsItems,
)
......
......@@ -19,6 +19,7 @@ from transformers import (
from vllm.config import CacheConfig, VllmConfig
from vllm.config.multimodal import BaseDummyOptions
from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
from vllm.inputs import MultiModalDataDict
from vllm.logger import init_logger
from vllm.model_executor.layers.activation import SiluAndMul
from vllm.model_executor.layers.attention import Attention
......@@ -43,7 +44,6 @@ from vllm.model_executor.model_loader.weight_utils import (
from vllm.model_executor.utils import set_weight_attrs
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import (
MultiModalDataDict,
MultiModalFieldConfig,
MultiModalKwargsItems,
)
......
......@@ -17,6 +17,7 @@ from transformers import (
from vllm.config import VllmConfig
from vllm.config.multimodal import BaseDummyOptions
from vllm.distributed import divide, get_tensor_model_parallel_world_size
from vllm.inputs import MultiModalDataDict, MultiModalInput
from vllm.model_executor.layers.activation import get_act_fn
from vllm.model_executor.layers.attention import Attention, MMEncoderAttention
from vllm.model_executor.layers.conv import Conv2dLayer
......@@ -32,9 +33,7 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
from vllm.model_executor.models.interfaces import SupportsQuant
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import (
MultiModalDataDict,
MultiModalFieldConfig,
MultiModalInputs,
MultiModalKwargsItems,
)
from vllm.multimodal.parse import (
......@@ -207,7 +206,7 @@ class CLIPMultiModalProcessor(BaseMultiModalProcessor[CLIPProcessingInfo]):
self,
inputs: ProcessorInputs,
timing_ctx: TimingContext,
) -> MultiModalInputs:
) -> MultiModalInput:
if inputs.mm_data_items:
if isinstance(inputs.prompt, str):
if len(inputs.prompt) > 0:
......
......@@ -19,6 +19,7 @@ from transformers.models.cohere2_vision.processing_cohere2_vision import (
from vllm.config import VllmConfig
from vllm.config.multimodal import BaseDummyOptions
from vllm.inputs import MultiModalDataDict
from vllm.model_executor.layers.activation import MulAndSilu
from vllm.model_executor.layers.linear import (
MergedColumnParallelLinear,
......@@ -28,7 +29,6 @@ from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.model_executor.layers.quantization.awq import AWQConfig
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import (
MultiModalDataDict,
MultiModalFieldConfig,
MultiModalKwargsItems,
)
......
......@@ -3,7 +3,7 @@
import math
from collections.abc import Iterable, Mapping, Sequence
from typing import Literal, cast
from typing import Literal
import numpy as np
import torch
......@@ -14,7 +14,7 @@ from transformers import PretrainedConfig
from vllm.compilation.decorators import support_torch_compile
from vllm.config import CacheConfig, ModelConfig, SpeechToTextConfig, VllmConfig
from vllm.distributed import get_tensor_model_parallel_world_size
from vllm.inputs.data import PromptType
from vllm.inputs import MultiModalDataDict, PromptType, TextPrompt
from vllm.logger import init_logger
from vllm.model_executor.layers.activation import get_act_fn
from vllm.model_executor.layers.attention import (
......@@ -32,7 +32,6 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
from vllm.model_executor.model_loader.weight_utils import default_weight_loader
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import (
MultiModalDataDict,
MultiModalFieldConfig,
MultiModalKwargsItems,
)
......@@ -2047,14 +2046,11 @@ class CohereASRForConditionalGeneration(
f"<|noitn|><|notimestamp|><|nodiarize|>"
)
prompt_text = request_prompt if request_prompt else default_prompt
prompt = {
"prompt": prompt_text,
"multi_modal_data": {
"audio": (audio, stt_config.sample_rate),
},
}
return cast(PromptType, prompt)
return TextPrompt(
prompt=prompt_text,
multi_modal_data={"audio": (audio, stt_config.sample_rate)},
)
@classmethod
def get_placeholder_str(cls, modality: str, i: int) -> str | None:
......
......@@ -16,11 +16,11 @@ from transformers import BatchFeature
from vllm.config import VllmConfig
from vllm.config.multimodal import BaseDummyOptions
from vllm.inputs import MultiModalDataDict
from vllm.model_executor.layers.pooler.tokwise import pooler_for_token_embed
from vllm.model_executor.model_loader.weight_utils import default_weight_loader
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import (
MultiModalDataDict,
MultiModalFieldConfig,
MultiModalKwargsItems,
)
......
......@@ -12,6 +12,7 @@ from transformers import BatchFeature, CLIPVisionConfig
from vllm.config import VllmConfig
from vllm.config.multimodal import BaseDummyOptions
from vllm.inputs import MultiModalDataDict
from vllm.model_executor.models.interfaces import (
MultiModalEmbeddings,
SupportsLoRA,
......@@ -27,7 +28,6 @@ from vllm.model_executor.models.utils import (
)
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import (
MultiModalDataDict,
MultiModalFieldConfig,
MultiModalKwargsItems,
NestedTensors,
......
......@@ -12,6 +12,7 @@ from transformers import BatchFeature
from vllm.config import VllmConfig
from vllm.config.multimodal import BaseDummyOptions
from vllm.inputs import MultiModalDataDict
from vllm.model_executor.models.interfaces import (
MultiModalEmbeddings,
SupportsLoRA,
......@@ -27,7 +28,6 @@ from vllm.model_executor.models.utils import (
)
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import (
MultiModalDataDict,
MultiModalFieldConfig,
MultiModalKwargsItems,
NestedTensors,
......
......@@ -17,11 +17,11 @@ from transformers import BatchFeature
from vllm.config import VllmConfig
from vllm.config.multimodal import BaseDummyOptions
from vllm.distributed import get_tensor_model_parallel_world_size
from vllm.inputs import MultiModalDataDict
from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.model_executor.models.transformers.utils import replace_linear_class
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import (
MultiModalDataDict,
MultiModalFieldConfig,
MultiModalKwargsItems,
)
......
......@@ -15,6 +15,7 @@ from vllm.distributed.parallel_state import (
get_tensor_model_parallel_rank,
get_tensor_model_parallel_world_size,
)
from vllm.inputs import MultiModalDataDict
from vllm.model_executor.layers.activation import SiluAndMul
from vllm.model_executor.layers.attention import (
MMEncoderAttention,
......@@ -54,7 +55,6 @@ from vllm.model_executor.models.utils import (
)
from vllm.model_executor.models.vision import get_vit_attn_backend
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import MultiModalDataDict
from vllm.sequence import IntermediateTensors
from vllm.transformers_utils.configs.dotsocr import DotsOCRConfig, DotsVisionConfig
from vllm.utils.tensor_schema import TensorSchema, TensorShape
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment