Unverified commit ba2f0acc, authored by Cyrus Leung and committed by GitHub
Browse files

[Misc] Reorganize inputs (#35182)


Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
parent 678b3c99
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from .data import ( from .engine import (
DecoderOnlyEngineInput,
EmbedsInput,
EncoderDecoderInput,
EngineInput,
MultiModalEncDecInput,
MultiModalHashes,
MultiModalInput,
MultiModalPlaceholders,
SingletonInput,
TokensInput,
build_enc_dec_input,
embeds_input,
mm_enc_dec_input,
mm_input,
split_enc_dec_input,
tokens_input,
)
from .llm import (
DataPrompt, DataPrompt,
DecoderOnlyInputs,
EmbedsInputs,
EmbedsPrompt, EmbedsPrompt,
EncoderDecoderInputs,
ExplicitEncoderDecoderPrompt, ExplicitEncoderDecoderPrompt,
ProcessorInputs, ModalityData,
MultiModalDataBuiltins,
MultiModalDataDict,
MultiModalUUIDDict,
PromptType, PromptType,
SingletonInputs,
SingletonPrompt, SingletonPrompt,
TextPrompt, TextPrompt,
TokenInputs,
TokensPrompt, TokensPrompt,
embeds_inputs,
token_inputs,
) )
__all__ = [ __all__ = [
"ModalityData",
"MultiModalDataBuiltins",
"MultiModalDataDict",
"MultiModalUUIDDict",
"DataPrompt", "DataPrompt",
"TextPrompt", "TextPrompt",
"TokensPrompt", "TokensPrompt",
"PromptType", "PromptType",
"SingletonPrompt", "SingletonPrompt",
"ExplicitEncoderDecoderPrompt", "ExplicitEncoderDecoderPrompt",
"TokenInputs",
"EmbedsInputs",
"EmbedsPrompt", "EmbedsPrompt",
"token_inputs", "MultiModalHashes",
"embeds_inputs", "MultiModalPlaceholders",
"DecoderOnlyInputs", "TokensInput",
"EncoderDecoderInputs", "EmbedsInput",
"ProcessorInputs", "MultiModalInput",
"SingletonInputs", "MultiModalEncDecInput",
"tokens_input",
"embeds_input",
"mm_input",
"mm_enc_dec_input",
"build_enc_dec_input",
"split_enc_dec_input",
"DecoderOnlyEngineInput",
"EncoderDecoderInput",
"SingletonInput",
"EngineInput",
] ]
"""Schema and utilities for inputs to the engine client (`LLMEngine`/`AsyncLLM`)."""
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from collections.abc import Mapping, Sequence
from typing import TYPE_CHECKING, Literal, TypeAlias
from typing_extensions import NotRequired, TypedDict, assert_never
if TYPE_CHECKING:
import torch
from vllm.multimodal.inputs import MultiModalKwargsOptionalItems, PlaceholderRange
class _InputOptions(TypedDict):
"""
Additional options available to all
[`SingletonInput`][vllm.inputs.engine.SingletonInput] types.
"""
arrival_time: NotRequired[float]
"""The time when the input was received (before rendering)."""
cache_salt: NotRequired[str]
"""Optional cache salt to be used for prefix caching."""
class TokensInput(_InputOptions):
    """Engine input that is described by prompt token IDs."""

    type: Literal["token"]
    """Tag identifying this input variant."""

    prompt_token_ids: list[int]
    """Token IDs that make up the prompt."""

    prompt: NotRequired[str]
    """Prompt text corresponding to the token IDs, when available."""
def tokens_input(
    prompt_token_ids: list[int],
    *,
    prompt: str | None = None,
    cache_salt: str | None = None,
) -> TokensInput:
    """
    Build a [`TokensInput`][vllm.inputs.engine.TokensInput].

    Optional keys are only written when the matching argument is not
    `None`, so an omitted value leaves the key absent from the result.
    """
    result = TokensInput(type="token", prompt_token_ids=prompt_token_ids)

    if prompt is not None:
        result["prompt"] = prompt

    if cache_salt is not None:
        result["cache_salt"] = cache_salt

    return result
class EmbedsInput(_InputOptions):
    """Engine input that is described by prompt embeddings."""

    type: Literal["embeds"]
    """Tag identifying this input variant."""

    prompt_embeds: "torch.Tensor"
    """Embeddings that make up the prompt."""

    prompt: NotRequired[str]
    """Prompt text corresponding to the embeddings, when available."""
def embeds_input(
    prompt_embeds: "torch.Tensor",
    *,
    prompt: str | None = None,
    cache_salt: str | None = None,
) -> EmbedsInput:
    """
    Build an [`EmbedsInput`][vllm.inputs.engine.EmbedsInput].

    Optional keys are only written when the matching argument is not
    `None`, so an omitted value leaves the key absent from the result.
    """
    result = EmbedsInput(type="embeds", prompt_embeds=prompt_embeds)

    if prompt is not None:
        result["prompt"] = prompt

    if cache_salt is not None:
        result["cache_salt"] = cache_salt

    return result
# Maps a string key (the modality, per the description below) to a list of
# hash strings.
MultiModalHashes: TypeAlias = Mapping[str, list[str]]
"""
A dictionary containing per-item hashes for each modality.
"""
# "PlaceholderRange" is quoted because it is only imported under
# TYPE_CHECKING, so the name does not exist at runtime.
MultiModalPlaceholders: TypeAlias = Mapping[str, Sequence["PlaceholderRange"]]
"""
A dictionary containing per-item placeholder ranges for each modality.
"""
class MultiModalInput(_InputOptions):
    """Engine input that carries multi-modal data alongside token IDs."""

    type: Literal["multimodal"]
    """Tag identifying this input variant."""

    prompt_token_ids: list[int]
    """Processed token IDs, including the placeholder tokens."""

    prompt: NotRequired[str]
    """Prompt text corresponding to the token IDs, when available."""

    mm_kwargs: "MultiModalKwargsOptionalItems"
    """Keyword arguments passed directly to the model after batching."""

    mm_hashes: MultiModalHashes
    """Hashes of the multi-modal data."""

    mm_placeholders: MultiModalPlaceholders
    """
    Per-modality information about the placeholder tokens found in
    `prompt_token_ids`.
    """
def mm_input(
    prompt_token_ids: list[int],
    mm_kwargs: "MultiModalKwargsOptionalItems",
    mm_hashes: MultiModalHashes,
    mm_placeholders: MultiModalPlaceholders,
    *,
    prompt: str | None = None,
    cache_salt: str | None = None,
) -> MultiModalInput:
    """
    Build a [`MultiModalInput`][vllm.inputs.engine.MultiModalInput].

    Optional keys are only written when the matching argument is not
    `None`, so an omitted value leaves the key absent from the result.
    """
    result = MultiModalInput(
        type="multimodal",
        prompt_token_ids=prompt_token_ids,
        mm_kwargs=mm_kwargs,
        mm_hashes=mm_hashes,
        mm_placeholders=mm_placeholders,
    )

    if prompt is not None:
        result["prompt"] = prompt

    if cache_salt is not None:
        result["cache_salt"] = cache_salt

    return result
class MultiModalEncDecInput(MultiModalInput):
    """
    Multi-modal engine input for encoder-decoder models.

    Note:
        Even text-only encoder-decoder models are currently implemented
        as multi-modal models for convenience.
        (Example: https://github.com/vllm-project/bart-plugin)
    """

    encoder_prompt_token_ids: list[int]
    """Processed token IDs of the encoder prompt."""

    encoder_prompt: NotRequired[str]
    """Prompt text corresponding to the encoder token IDs, when available."""
def mm_enc_dec_input(
    encoder_inputs: MultiModalInput,
    decoder_prompt_token_ids: list[int],
    *,
    decoder_prompt: str | None = None,
) -> MultiModalEncDecInput:
    """
    Build a
    [`MultiModalEncDecInput`][vllm.inputs.engine.MultiModalEncDecInput]
    from processed encoder inputs plus the decoder token IDs.

    The decoder token IDs (and optional decoder text) become the main
    `prompt_token_ids` / `prompt`, while the token IDs and text of
    `encoder_inputs` are stored under the `encoder_*` keys. The multi-modal
    fields and, if present, `cache_salt` are taken from `encoder_inputs`.
    """
    result = MultiModalEncDecInput(
        type="multimodal",
        prompt_token_ids=decoder_prompt_token_ids,
        encoder_prompt_token_ids=encoder_inputs["prompt_token_ids"],
        mm_kwargs=encoder_inputs["mm_kwargs"],
        mm_hashes=encoder_inputs["mm_hashes"],
        mm_placeholders=encoder_inputs["mm_placeholders"],
    )

    if decoder_prompt is not None:
        result["prompt"] = decoder_prompt

    # Optional encoder keys are forwarded only when they are present.
    if "prompt" in encoder_inputs:
        result["encoder_prompt"] = encoder_inputs["prompt"]

    if "cache_salt" in encoder_inputs:
        result["cache_salt"] = encoder_inputs["cache_salt"]

    return result
# The only one of these three aliases that admits `EmbedsInput`.
DecoderOnlyEngineInput: TypeAlias = TokensInput | EmbedsInput | MultiModalInput
"""
A rendered [`DecoderOnlyPrompt`][vllm.inputs.llm.DecoderOnlyPrompt]
which can be passed to `LLMEngine.add_request` or `AsyncLLM.add_request`.
"""
# Encoder side: plain tokens, or the encoder-decoder flavour of multi-modal
# input (which carries `encoder_prompt_token_ids`).
EncoderInput: TypeAlias = TokensInput | MultiModalEncDecInput
"""
A rendered [`EncoderPrompt`][vllm.inputs.llm.EncoderPrompt]
which can be passed to `LLMEngine.add_request` or `AsyncLLM.add_request`.
"""
# Decoder side of an encoder-decoder pair: tokens or multi-modal input,
# with no embeddings variant.
DecoderEngineInput: TypeAlias = TokensInput | MultiModalInput
"""
A rendered [`DecoderPrompt`][vllm.inputs.llm.DecoderPrompt]
which can be passed to `LLMEngine.add_request` or `AsyncLLM.add_request`.
"""
class EncoderDecoderInput(TypedDict):
    """
    A rendered [`EncoderDecoderPrompt`][vllm.inputs.llm.EncoderDecoderPrompt]
    which can be passed to `LLMEngine.add_request` or `AsyncLLM.add_request`.
    """

    type: Literal["enc_dec"]
    """Tag identifying this input variant."""

    encoder_prompt: EncoderInput
    """Input for the encoder portion."""

    decoder_prompt: DecoderEngineInput
    """Input for the decoder portion."""

    arrival_time: NotRequired[float]
    """Time at which the input was received, i.e. before rendering."""
# Every single-prompt variant, including the encoder-decoder flavour of
# multi-modal input; the paired `EncoderDecoderInput` is not a member.
SingletonInput: TypeAlias = DecoderOnlyEngineInput | MultiModalEncDecInput
"""
A rendered [`SingletonPrompt`][vllm.inputs.llm.SingletonPrompt]
which can be passed to `LLMEngine.add_request` or `AsyncLLM.add_request`.
"""
# Either a decoder-only input or an explicit encoder/decoder pair.
EngineInput: TypeAlias = DecoderOnlyEngineInput | EncoderDecoderInput
"""
A rendered [`PromptType`][vllm.inputs.llm.PromptType]
which can be passed to `LLMEngine.add_request` or `AsyncLLM.add_request`.
"""
def _validate_enc_input(enc_input: SingletonInput) -> EncoderInput:
    """
    Check that `enc_input` can serve as the encoder side of an
    encoder-decoder request and return it unchanged.

    Raises:
        ValueError: If the input is embeddings-based.
        RuntimeError: If the input is multi-modal but has no
            `encoder_prompt_token_ids` key.
    """
    if enc_input["type"] == "embeds":
        raise ValueError(
            "Embedding inputs are not supported for encoder-decoder models"
        )

    lacks_encoder_ids = (
        enc_input["type"] == "multimodal"
        and "encoder_prompt_token_ids" not in enc_input
    )
    if lacks_encoder_ids:
        raise RuntimeError(
            "You should register an encoder-decoder multi-modal processor "
            "for encoder-decoder models."
        )

    return enc_input  # type: ignore[return-value]
def _validate_dec_input(dec_input: SingletonInput) -> DecoderEngineInput:
    """
    Check that `dec_input` can serve as the decoder side of an
    encoder-decoder request and return it unchanged.

    Raises:
        ValueError: If the input is embeddings-based.
    """
    if dec_input["type"] != "embeds":
        return dec_input

    raise ValueError(
        "Embedding inputs are not supported for encoder-decoder models"
    )
def _prepare_decoder_input_ids_for_generation(
    decoder_input_ids: list[int],
    decoder_start_token_id: int,
) -> list[int]:
    """
    Make sure `decoder_input_ids` begins with `decoder_start_token_id`,
    following `GenerationMixin._prepare_decoder_input_ids_for_generation()`.

    The given list is returned as-is when it already starts with the start
    token; otherwise a new list with the token prepended is returned.

    Source:
    https://github.com/huggingface/transformers/blob/v5.1.0/src/transformers/generation/utils.py
    """
    already_prefixed = (
        len(decoder_input_ids) > 0
        and decoder_input_ids[0] == decoder_start_token_id
    )
    if already_prefixed:
        return decoder_input_ids

    return [decoder_start_token_id] + decoder_input_ids
def build_enc_dec_input(
    encoder_input: SingletonInput,
    decoder_input: SingletonInput | None,
    decoder_start_token_id: int,
    skip_decoder_start_token: bool = False,
) -> EncoderDecoderInput:
    """
    Combine rendered encoder and decoder inputs into an
    [`EncoderDecoderInput`][vllm.inputs.engine.EncoderDecoderInput].

    The dictionaries passed in by the caller are not modified.

    Args:
        encoder_input: Rendered input for the encoder side.
        decoder_input: Rendered input for the decoder side. When `None`,
            the validated encoder input is used for the decoder as well.
        decoder_start_token_id: Token ID that the decoder prompt must start
            with; it is prepended when missing.
        skip_decoder_start_token: If true, the decoder token IDs are left
            exactly as given.

    Raises:
        ValueError: If either input is embeddings-based.
        RuntimeError: If the encoder input is multi-modal but carries no
            `encoder_prompt_token_ids`.
    """
    enc_input = _validate_enc_input(encoder_input)

    if decoder_input is None:
        dec_input: DecoderEngineInput = enc_input
    else:
        dec_input = _validate_dec_input(decoder_input)

    enc_input_new: EncoderInput
    dec_input_new: DecoderEngineInput

    if enc_input["type"] == "multimodal":
        # The encoder keeps only its own tokens/text; the multi-modal fields
        # are attached to the decoder input instead.
        enc_input_new = tokens_input(
            enc_input["encoder_prompt_token_ids"],
            prompt=enc_input.get("encoder_prompt"),
        )
        dec_input_new = mm_input(
            prompt_token_ids=dec_input["prompt_token_ids"],
            prompt=dec_input.get("prompt"),
            mm_kwargs=enc_input["mm_kwargs"],
            mm_hashes=enc_input["mm_hashes"],
            mm_placeholders=enc_input["mm_placeholders"],
        )
    elif enc_input["type"] == "token":
        enc_input_new = tokens_input(prompt_token_ids=[])
        # Shallow-copy so that the key assignments below do not leak into
        # the dictionary owned by the caller (`decoder_input`, or
        # `encoder_input` when no decoder input was given).
        dec_input_new = dec_input.copy()
    else:
        assert_never(enc_input)

    if not skip_decoder_start_token:
        dec_input_new["prompt_token_ids"] = _prepare_decoder_input_ids_for_generation(
            dec_input_new["prompt_token_ids"],
            decoder_start_token_id,
        )

    # The salt supplied with the encoder input is applied to the decoder input.
    if cache_salt := enc_input.get("cache_salt"):
        dec_input_new["cache_salt"] = cache_salt

    return EncoderDecoderInput(
        type="enc_dec",
        encoder_prompt=enc_input_new,
        decoder_prompt=dec_input_new,
    )
def split_enc_dec_input(
    inputs: EngineInput,
) -> tuple[SingletonInput | None, SingletonInput]:
    """
    Split an engine input into an `(encoder, decoder)` pair.

    Inputs that are not tagged `"enc_dec"` have no encoder part, so they
    are returned as the decoder element with `None` for the encoder.
    """
    if inputs["type"] != "enc_dec":
        return None, inputs

    return inputs["encoder_prompt"], inputs["decoder_prompt"]
"""Schema and utilities for input prompts to the LLM API."""
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from typing import TYPE_CHECKING, Any, Literal, TypeAlias from collections.abc import Mapping, Sequence
from typing import TYPE_CHECKING, Any, TypeAlias, TypeVar, final
import torch from typing_extensions import NotRequired, TypedDict
from typing_extensions import NotRequired, TypedDict, assert_never
if TYPE_CHECKING: if TYPE_CHECKING:
from vllm.multimodal.inputs import ( import torch
MultiModalDataDict,
MultiModalEncDecInputs, from vllm.multimodal.inputs import AudioItem, ImageItem, VideoItem, VisionChunk
MultiModalInputs,
MultiModalUUIDDict,
) _T = TypeVar("_T")
else:
MultiModalDataDict = object ModalityData: TypeAlias = _T | list[_T | None] | None
MultiModalEncDecInputs = object """
MultiModalInputs = object Either a single data item, or a list of data items. Can only be None if UUID
MultiModalUUIDDict = object is provided.
The number of data items allowed per modality is restricted by
# Inputs to LLM API `--limit-mm-per-prompt`.
"""
@final
class MultiModalDataBuiltins(TypedDict, total=False):
    """Typed keys for the modalities that vLLM predefines."""

    image: ModalityData["ImageItem"]
    """Input image(s)."""

    video: ModalityData["VideoItem"]
    """Input video(s)."""

    audio: ModalityData["AudioItem"]
    """Input audio(s)."""

    vision_chunk: ModalityData["VisionChunk"]
    """Input visual atom(s) - unified modality for images and video chunks."""
# Any string key is accepted here, not only the built-in modality names.
MultiModalDataDict: TypeAlias = Mapping[str, ModalityData[Any]]
"""
A dictionary containing an entry for each modality type to input.
The built-in modalities are defined by
[`MultiModalDataBuiltins`][vllm.inputs.llm.MultiModalDataBuiltins].
"""
# Each value is either a single UUID string or a sequence whose entries may
# be `None`.
MultiModalUUIDDict: TypeAlias = Mapping[str, Sequence[str | None] | str]
"""
A dictionary containing user-provided UUIDs for items in each modality.
If a UUID for an item is not provided, its entry will be `None` and
MultiModalHasher will compute a hash for the item.
The UUID will be used to identify the item for all caching purposes
(input processing caching, embedding caching, prefix caching, etc).
"""
class _PromptOptions(TypedDict): class _PromptOptions(TypedDict):
""" """
Additional options available to all Additional options available to all
[`SingletonPrompt`][vllm.inputs.data.SingletonPrompt]. [`SingletonPrompt`][vllm.inputs.llm.SingletonPrompt] types.
""" """
multi_modal_data: NotRequired[MultiModalDataDict | None] multi_modal_data: NotRequired[MultiModalDataDict | None]
...@@ -78,7 +119,7 @@ class TokensPrompt(_PromptOptions): ...@@ -78,7 +119,7 @@ class TokensPrompt(_PromptOptions):
class EmbedsPrompt(_PromptOptions): class EmbedsPrompt(_PromptOptions):
"""Schema for a prompt provided via token embeddings.""" """Schema for a prompt provided via token embeddings."""
prompt_embeds: torch.Tensor prompt_embeds: "torch.Tensor"
"""The embeddings of the prompt.""" """The embeddings of the prompt."""
prompt: NotRequired[str] prompt: NotRequired[str]
...@@ -91,10 +132,10 @@ DecoderOnlyPrompt: TypeAlias = ( ...@@ -91,10 +132,10 @@ DecoderOnlyPrompt: TypeAlias = (
""" """
Schema of a prompt for a decoder-only model: Schema of a prompt for a decoder-only model:
- A text prompt (string or [`TextPrompt`][vllm.inputs.data.TextPrompt]) - A text prompt (string or [`TextPrompt`][vllm.inputs.llm.TextPrompt])
- A tokenized prompt (list of token IDs, or - A tokenized prompt (list of token IDs, or
[`TokensPrompt`][vllm.inputs.data.TokensPrompt]) [`TokensPrompt`][vllm.inputs.llm.TokensPrompt])
- An embeddings prompt ([`EmbedsPrompt`][vllm.inputs.data.EmbedsPrompt]) - An embeddings prompt ([`EmbedsPrompt`][vllm.inputs.llm.EmbedsPrompt])
For encoder-decoder models, passing a singleton prompt is shorthand for passing For encoder-decoder models, passing a singleton prompt is shorthand for passing
`ExplicitEncoderDecoderPrompt(encoder_prompt=prompt, decoder_prompt=None)`. `ExplicitEncoderDecoderPrompt(encoder_prompt=prompt, decoder_prompt=None)`.
...@@ -105,9 +146,9 @@ EncoderPrompt: TypeAlias = str | TextPrompt | list[int] | TokensPrompt ...@@ -105,9 +146,9 @@ EncoderPrompt: TypeAlias = str | TextPrompt | list[int] | TokensPrompt
""" """
Schema of a prompt for the encoder part of a encoder-decoder model: Schema of a prompt for the encoder part of a encoder-decoder model:
- A text prompt (string or [`TextPrompt`][vllm.inputs.data.TextPrompt]) - A text prompt (string or [`TextPrompt`][vllm.inputs.llm.TextPrompt])
- A tokenized prompt (list of token IDs, or - A tokenized prompt (list of token IDs, or
[`TokensPrompt`][vllm.inputs.data.TokensPrompt]) [`TokensPrompt`][vllm.inputs.llm.TokensPrompt])
""" """
...@@ -115,9 +156,9 @@ DecoderPrompt: TypeAlias = str | TextPrompt | list[int] | TokensPrompt ...@@ -115,9 +156,9 @@ DecoderPrompt: TypeAlias = str | TextPrompt | list[int] | TokensPrompt
""" """
Schema of a prompt for the decoder part of an encoder-decoder model: Schema of a prompt for the decoder part of an encoder-decoder model:
- A text prompt (string or [`TextPrompt`][vllm.inputs.data.TextPrompt]) - A text prompt (string or [`TextPrompt`][vllm.inputs.llm.TextPrompt])
- A tokenized prompt (list of token IDs, or - A tokenized prompt (list of token IDs, or
[`TokensPrompt`][vllm.inputs.data.TokensPrompt]) [`TokensPrompt`][vllm.inputs.llm.TokensPrompt])
Note: Note:
Multi-modal inputs are not supported for decoder prompts. Multi-modal inputs are not supported for decoder prompts.
...@@ -156,7 +197,7 @@ SingletonPrompt: TypeAlias = DecoderOnlyPrompt | EncoderPrompt | DecoderPrompt ...@@ -156,7 +197,7 @@ SingletonPrompt: TypeAlias = DecoderOnlyPrompt | EncoderPrompt | DecoderPrompt
""" """
Schema for a single prompt. This is as opposed to a data structure Schema for a single prompt. This is as opposed to a data structure
which encapsulates multiple prompts, such as which encapsulates multiple prompts, such as
[`ExplicitEncoderDecoderPrompt`][vllm.inputs.data.ExplicitEncoderDecoderPrompt]. [`ExplicitEncoderDecoderPrompt`][vllm.inputs.llm.ExplicitEncoderDecoderPrompt].
""" """
...@@ -171,7 +212,7 @@ This is the input format accepted by most [`LLM`][vllm.entrypoints.llm.LLM] APIs ...@@ -171,7 +212,7 @@ This is the input format accepted by most [`LLM`][vllm.entrypoints.llm.LLM] APIs
class DataPrompt(_PromptOptions): class DataPrompt(_PromptOptions):
""" """
Represents generic inputs that are converted to Represents generic inputs that are converted to
[`PromptType`][vllm.inputs.data.PromptType] by IO processor plugins. [`PromptType`][vllm.inputs.llm.PromptType] by IO processor plugins.
""" """
data: Any data: Any
...@@ -179,235 +220,3 @@ class DataPrompt(_PromptOptions): ...@@ -179,235 +220,3 @@ class DataPrompt(_PromptOptions):
data_format: str data_format: str
"""The input data format.""" """The input data format."""
# Outputs of processor
class _InputOptions(TypedDict):
"""
Additional options available to all input types.
"""
arrival_time: NotRequired[float]
"""The time when the input was received (before rendering)."""
cache_salt: NotRequired[str]
"""Optional cache salt to be used for prefix caching."""
class TokenInputs(_InputOptions):
    """Inputs that are described by prompt token IDs."""

    type: Literal["token"]
    """Tag identifying this inputs variant."""

    prompt_token_ids: list[int]
    """Token IDs that make up the prompt."""

    prompt: NotRequired[str]
    """Prompt text corresponding to the token IDs, when available."""
def token_inputs(
    prompt_token_ids: list[int],
    *,
    prompt: str | None = None,
    cache_salt: str | None = None,
) -> TokenInputs:
    """
    Build a [`TokenInputs`][vllm.inputs.data.TokenInputs].

    Optional keys are only written when the matching argument is not
    `None`, so an omitted value leaves the key absent from the result.
    """
    result = TokenInputs(type="token", prompt_token_ids=prompt_token_ids)

    if prompt is not None:
        result["prompt"] = prompt

    if cache_salt is not None:
        result["cache_salt"] = cache_salt

    return result
class EmbedsInputs(_InputOptions):
    """Inputs that are described by prompt embeddings."""

    type: Literal["embeds"]
    """Tag identifying this inputs variant."""

    prompt_embeds: torch.Tensor
    """Embeddings that make up the prompt."""

    prompt: NotRequired[str]
    """Prompt text corresponding to the embeddings, when available."""
def embeds_inputs(
    prompt_embeds: torch.Tensor,
    *,
    prompt: str | None = None,
    cache_salt: str | None = None,
) -> EmbedsInputs:
    """
    Build an [`EmbedsInputs`][vllm.inputs.data.EmbedsInputs].

    Optional keys are only written when the matching argument is not
    `None`, so an omitted value leaves the key absent from the result.
    """
    result = EmbedsInputs(type="embeds", prompt_embeds=prompt_embeds)

    if prompt is not None:
        result["prompt"] = prompt

    if cache_salt is not None:
        result["cache_salt"] = cache_salt

    return result
# NOTE(review): `MultiModalInputs` / `MultiModalEncDecInputs` are not defined
# in this section; presumably they come from `vllm.multimodal.inputs` - confirm.
# The only one of these three aliases that admits `EmbedsInputs`.
DecoderOnlyInputs: TypeAlias = TokenInputs | EmbedsInputs | MultiModalInputs
"""
A processed prompt from
[`InputPreprocessor`][vllm.inputs.preprocess.InputPreprocessor]
which can be passed to
[`InputProcessor`][vllm.v1.engine.input_processor.InputProcessor]
for decoder-only models.
"""
# Encoder side: plain tokens, or the encoder-decoder multi-modal inputs.
EncoderInputs: TypeAlias = TokenInputs | MultiModalEncDecInputs
"""
A processed encoder prompt from
[`InputPreprocessor`][vllm.inputs.preprocess.InputPreprocessor]
which can be passed to
[`InputProcessor`][vllm.v1.engine.input_processor.InputProcessor]
for encoder-decoder models.
"""
# Decoder side of an encoder-decoder pair: no embeddings variant.
DecoderInputs: TypeAlias = TokenInputs | MultiModalInputs
"""
A processed decoder prompt from
[`InputPreprocessor`][vllm.inputs.preprocess.InputPreprocessor]
which can be passed to
[`InputProcessor`][vllm.v1.engine.input_processor.InputProcessor]
for encoder-decoder models.
"""
class EncoderDecoderInputs(TypedDict):
    """
    A processed pair of encoder and decoder singleton prompts from
    [`InputPreprocessor`][vllm.inputs.preprocess.InputPreprocessor]
    which can be passed to
    [`InputProcessor`][vllm.v1.engine.input_processor.InputProcessor]
    for encoder-decoder models.
    """

    type: Literal["enc_dec"]
    """Tag identifying this inputs variant."""

    encoder_prompt: EncoderInputs
    """Inputs for the encoder portion."""

    decoder_prompt: DecoderInputs
    """Inputs for the decoder portion."""

    arrival_time: NotRequired[float]
    """Time at which the input was received, i.e. before rendering."""
# Either decoder-only inputs or an explicit encoder/decoder pair.
ProcessorInputs: TypeAlias = DecoderOnlyInputs | EncoderDecoderInputs
"""
A processed prompt from
[`InputPreprocessor`][vllm.inputs.preprocess.InputPreprocessor]
which can be passed to
[`InputProcessor`][vllm.v1.engine.input_processor.InputProcessor].
"""
# Every single-prompt variant; the paired `EncoderDecoderInputs` is not a member.
SingletonInputs: TypeAlias = DecoderOnlyInputs | MultiModalEncDecInputs
"""The inputs for a single encoder/decoder prompt."""
def _validate_enc_inputs(inputs: SingletonInputs) -> EncoderInputs:
    """
    Check that `inputs` can serve as the encoder side of an
    encoder-decoder request and return them unchanged.

    Raises:
        ValueError: If the inputs are embeddings-based.
        RuntimeError: If the inputs are multi-modal but have no
            `encoder_prompt_token_ids` key.
    """
    if inputs["type"] == "embeds":
        raise ValueError(
            "Embedding inputs are not supported for encoder-decoder models"
        )

    lacks_encoder_ids = (
        inputs["type"] == "multimodal"
        and "encoder_prompt_token_ids" not in inputs
    )
    if lacks_encoder_ids:
        raise RuntimeError(
            "You should register an encoder-decoder multi-modal processor "
            "for encoder-decoder models."
        )

    return inputs  # type: ignore[return-value]
def _validate_dec_inputs(inputs: SingletonInputs) -> DecoderInputs:
    """
    Check that `inputs` can serve as the decoder side of an
    encoder-decoder request and return them unchanged.

    Raises:
        ValueError: If the inputs are embeddings-based.
    """
    if inputs["type"] != "embeds":
        return inputs

    raise ValueError(
        "Embedding inputs are not supported for encoder-decoder models"
    )
def _prepare_decoder_input_ids_for_generation(
    decoder_input_ids: list[int],
    decoder_start_token_id: int,
) -> list[int]:
    """
    Make sure `decoder_input_ids` begins with `decoder_start_token_id`,
    following `GenerationMixin._prepare_decoder_input_ids_for_generation()`.

    The given list is returned as-is when it already starts with the start
    token; otherwise a new list with the token prepended is returned.

    Source:
    https://github.com/huggingface/transformers/blob/v5.1.0/src/transformers/generation/utils.py
    """
    already_prefixed = (
        len(decoder_input_ids) > 0
        and decoder_input_ids[0] == decoder_start_token_id
    )
    if already_prefixed:
        return decoder_input_ids

    return [decoder_start_token_id] + decoder_input_ids
def build_enc_dec_inputs(
    encoder_inputs: SingletonInputs,
    decoder_inputs: SingletonInputs | None,
    decoder_start_token_id: int,
    skip_decoder_start_token: bool = False,
) -> EncoderDecoderInputs:
    """
    Combine processed encoder and decoder inputs into
    [`EncoderDecoderInputs`][vllm.inputs.data.EncoderDecoderInputs].

    The dictionaries passed in by the caller are not modified.

    Args:
        encoder_inputs: Processed inputs for the encoder side.
        decoder_inputs: Processed inputs for the decoder side. When `None`,
            the validated encoder inputs are used for the decoder as well.
        decoder_start_token_id: Token ID that the decoder prompt must start
            with; it is prepended when missing.
        skip_decoder_start_token: If true, the decoder token IDs are left
            exactly as given.

    Raises:
        ValueError: If either side is embeddings-based.
        RuntimeError: If the encoder inputs are multi-modal but carry no
            `encoder_prompt_token_ids`.
    """
    enc_inputs = _validate_enc_inputs(encoder_inputs)

    if decoder_inputs is None:
        dec_inputs: DecoderInputs = enc_inputs
    else:
        dec_inputs = _validate_dec_inputs(decoder_inputs)

    enc_inputs_new: EncoderInputs
    dec_inputs_new: DecoderInputs

    if enc_inputs["type"] == "multimodal":
        from vllm.multimodal.inputs import mm_inputs

        # The encoder keeps only its own tokens/text; the multi-modal fields
        # are attached to the decoder inputs instead.
        enc_inputs_new = token_inputs(
            enc_inputs["encoder_prompt_token_ids"],
            prompt=enc_inputs.get("encoder_prompt"),
        )
        dec_inputs_new = mm_inputs(
            prompt_token_ids=dec_inputs["prompt_token_ids"],
            prompt=dec_inputs.get("prompt"),
            mm_kwargs=enc_inputs["mm_kwargs"],
            mm_hashes=enc_inputs["mm_hashes"],
            mm_placeholders=enc_inputs["mm_placeholders"],
        )
    elif enc_inputs["type"] == "token":
        enc_inputs_new = token_inputs(prompt_token_ids=[])
        # Shallow-copy so that the key assignments below do not leak into
        # the dictionary owned by the caller (`decoder_inputs`, or
        # `encoder_inputs` when no decoder inputs were given).
        dec_inputs_new = dec_inputs.copy()
    else:
        assert_never(enc_inputs)

    if not skip_decoder_start_token:
        dec_inputs_new["prompt_token_ids"] = _prepare_decoder_input_ids_for_generation(
            dec_inputs_new["prompt_token_ids"],
            decoder_start_token_id,
        )

    # The salt supplied with the encoder inputs is applied to the decoder inputs.
    if cache_salt := enc_inputs.get("cache_salt"):
        dec_inputs_new["cache_salt"] = cache_salt

    return EncoderDecoderInputs(
        type="enc_dec",
        encoder_prompt=enc_inputs_new,
        decoder_prompt=dec_inputs_new,
    )
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from .data import ProcessorInputs, SingletonInputs
def split_enc_dec_inputs(
    inputs: ProcessorInputs,
) -> tuple[SingletonInputs | None, SingletonInputs]:
    """
    Split processor inputs into an `(encoder, decoder)` pair.

    Inputs that are not tagged `"enc_dec"` have no encoder part, so they
    are returned as the decoder element with `None` for the encoder.
    """
    if inputs["type"] != "enc_dec":
        return None, inputs

    return inputs["encoder_prompt"], inputs["decoder_prompt"]
...@@ -7,14 +7,9 @@ from typing import Any, overload ...@@ -7,14 +7,9 @@ from typing import Any, overload
from typing_extensions import assert_never from typing_extensions import assert_never
from vllm.config import VllmConfig from vllm.config import VllmConfig
from vllm.inputs.data import build_enc_dec_inputs from vllm.inputs import build_enc_dec_input
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry
from vllm.multimodal.inputs import (
MultiModalDataDict,
MultiModalInputs,
MultiModalUUIDDict,
)
from vllm.renderers import BaseRenderer, renderer_from_config from vllm.renderers import BaseRenderer, renderer_from_config
from vllm.renderers.inputs import ( from vllm.renderers.inputs import (
DecoderDictPrompt, DecoderDictPrompt,
...@@ -26,20 +21,25 @@ from vllm.renderers.inputs import ( ...@@ -26,20 +21,25 @@ from vllm.renderers.inputs import (
from vllm.renderers.inputs.preprocess import parse_dec_only_prompt, parse_enc_dec_prompt from vllm.renderers.inputs.preprocess import parse_dec_only_prompt, parse_enc_dec_prompt
from vllm.tokenizers import TokenizerLike from vllm.tokenizers import TokenizerLike
from .data import ( from .engine import (
DecoderInputs, DecoderEngineInput,
DecoderOnlyInputs, DecoderOnlyEngineInput,
EmbedsInputs, EmbedsInput,
EncoderDecoderInput,
EncoderInput,
EngineInput,
MultiModalInput,
SingletonInput,
TokensInput,
tokens_input,
)
from .llm import (
EmbedsPrompt, EmbedsPrompt,
EncoderDecoderInputs, MultiModalDataDict,
EncoderInputs, MultiModalUUIDDict,
ProcessorInputs,
PromptType, PromptType,
SingletonInputs,
TextPrompt, TextPrompt,
TokenInputs,
TokensPrompt, TokensPrompt,
token_inputs,
) )
logger = init_logger(__name__) logger = init_logger(__name__)
...@@ -95,7 +95,7 @@ class InputPreprocessor: ...@@ -95,7 +95,7 @@ class InputPreprocessor:
tokenization_kwargs: dict[str, Any] | None = None, tokenization_kwargs: dict[str, Any] | None = None,
*, *,
mm_uuids: MultiModalUUIDDict | None = None, mm_uuids: MultiModalUUIDDict | None = None,
) -> MultiModalInputs: ) -> MultiModalInput:
""" """
Apply the model's multi-modal processor to a multi-modal prompt, Apply the model's multi-modal processor to a multi-modal prompt,
returning the corresponding token IDs and metadata. returning the corresponding token IDs and metadata.
...@@ -111,7 +111,7 @@ class InputPreprocessor: ...@@ -111,7 +111,7 @@ class InputPreprocessor:
def _process_embeds( def _process_embeds(
self, self,
parsed_content: EmbedsPrompt, parsed_content: EmbedsPrompt,
) -> EmbedsInputs: ) -> EmbedsInput:
return self.renderer._process_embeds(parsed_content) return self.renderer._process_embeds(parsed_content)
def _truncate_inputs( def _truncate_inputs(
...@@ -134,12 +134,12 @@ class InputPreprocessor: ...@@ -134,12 +134,12 @@ class InputPreprocessor:
self, self,
parsed_content: TokensPrompt, parsed_content: TokensPrompt,
tokenization_kwargs: dict[str, Any] | None = None, tokenization_kwargs: dict[str, Any] | None = None,
) -> TokenInputs | MultiModalInputs: ) -> TokensInput | MultiModalInput:
prompt_token_ids = self._truncate_inputs( prompt_token_ids = self._truncate_inputs(
parsed_content["prompt_token_ids"], tokenization_kwargs parsed_content["prompt_token_ids"], tokenization_kwargs
) )
inputs: TokenInputs | MultiModalInputs inputs: TokensInput | MultiModalInput
if multi_modal_data := parsed_content.get("multi_modal_data"): if multi_modal_data := parsed_content.get("multi_modal_data"):
inputs = self._process_multimodal( inputs = self._process_multimodal(
prompt_token_ids, prompt_token_ids,
...@@ -149,7 +149,7 @@ class InputPreprocessor: ...@@ -149,7 +149,7 @@ class InputPreprocessor:
mm_uuids=parsed_content.get("multi_modal_uuids"), mm_uuids=parsed_content.get("multi_modal_uuids"),
) )
else: else:
inputs = token_inputs(prompt_token_ids) inputs = tokens_input(prompt_token_ids)
if prompt_text := parsed_content.get("prompt"): if prompt_text := parsed_content.get("prompt"):
inputs["prompt"] = prompt_text inputs["prompt"] = prompt_text
...@@ -162,10 +162,10 @@ class InputPreprocessor: ...@@ -162,10 +162,10 @@ class InputPreprocessor:
self, self,
parsed_content: TextPrompt, parsed_content: TextPrompt,
tokenization_kwargs: dict[str, Any] | None = None, tokenization_kwargs: dict[str, Any] | None = None,
) -> TokenInputs | MultiModalInputs: ) -> TokensInput | MultiModalInput:
prompt_text = parsed_content["prompt"] prompt_text = parsed_content["prompt"]
inputs: TokenInputs | MultiModalInputs inputs: TokensInput | MultiModalInput
if multi_modal_data := parsed_content.get("multi_modal_data"): if multi_modal_data := parsed_content.get("multi_modal_data"):
inputs = self._process_multimodal( inputs = self._process_multimodal(
prompt_text, prompt_text,
...@@ -178,7 +178,7 @@ class InputPreprocessor: ...@@ -178,7 +178,7 @@ class InputPreprocessor:
prompt_text, prompt_text,
tokenization_kwargs=tokenization_kwargs, tokenization_kwargs=tokenization_kwargs,
) )
inputs = token_inputs(prompt_token_ids) inputs = tokens_input(prompt_token_ids)
inputs["prompt"] = prompt_text inputs["prompt"] = prompt_text
...@@ -192,38 +192,27 @@ class InputPreprocessor: ...@@ -192,38 +192,27 @@ class InputPreprocessor:
self, self,
prompt: EncoderDictPrompt, prompt: EncoderDictPrompt,
tokenization_kwargs: dict[str, Any] | None = None, tokenization_kwargs: dict[str, Any] | None = None,
) -> EncoderInputs: ... ) -> EncoderInput: ...
@overload @overload
def _prompt_to_llm_inputs( # type: ignore[misc] def _prompt_to_llm_inputs( # type: ignore[misc]
self, self,
prompt: DecoderDictPrompt, prompt: DecoderDictPrompt,
tokenization_kwargs: dict[str, Any] | None = None, tokenization_kwargs: dict[str, Any] | None = None,
) -> DecoderInputs: ... ) -> DecoderEngineInput: ...
@overload @overload
def _prompt_to_llm_inputs( # type: ignore[misc] def _prompt_to_llm_inputs( # type: ignore[misc]
self, self,
prompt: DecoderOnlyDictPrompt, prompt: DecoderOnlyDictPrompt,
tokenization_kwargs: dict[str, Any] | None = None, tokenization_kwargs: dict[str, Any] | None = None,
) -> DecoderOnlyInputs: ... ) -> DecoderOnlyEngineInput: ...
def _prompt_to_llm_inputs( def _prompt_to_llm_inputs(
self, self,
prompt: SingletonDictPrompt, prompt: SingletonDictPrompt,
tokenization_kwargs: dict[str, Any] | None = None, tokenization_kwargs: dict[str, Any] | None = None,
) -> SingletonInputs: ) -> SingletonInput:
"""
Extract the singleton inputs from a prompt.
Arguments:
* prompt: single encoder or decoder input prompt
Returns:
* [`SingletonInputs`][vllm.inputs.data.SingletonInputs] instance
"""
if "prompt_embeds" in prompt: if "prompt_embeds" in prompt:
return self._process_embeds(prompt) # type: ignore[arg-type] return self._process_embeds(prompt) # type: ignore[arg-type]
...@@ -242,22 +231,7 @@ class InputPreprocessor: ...@@ -242,22 +231,7 @@ class InputPreprocessor:
self, self,
prompt: EncoderDecoderDictPrompt, prompt: EncoderDecoderDictPrompt,
tokenization_kwargs: dict[str, Any] | None = None, tokenization_kwargs: dict[str, Any] | None = None,
) -> EncoderDecoderInputs: ) -> EncoderDecoderInput:
"""
For encoder/decoder models only:
Process an input prompt into an
[`EncoderDecoderInputs`][vllm.inputs.data.EncoderDecoderInputs]
instance.
Arguments:
* prompt: an input prompt
Returns:
* [`EncoderDecoderInputs`][vllm.inputs.data.EncoderDecoderInputs]
instance
"""
encoder_prompt = prompt["encoder_prompt"] encoder_prompt = prompt["encoder_prompt"]
decoder_prompt = prompt["decoder_prompt"] decoder_prompt = prompt["decoder_prompt"]
...@@ -270,12 +244,12 @@ class InputPreprocessor: ...@@ -270,12 +244,12 @@ class InputPreprocessor:
self.renderer.mm_processor.skip_decoder_start_token self.renderer.mm_processor.skip_decoder_start_token
) )
return build_enc_dec_inputs( return build_enc_dec_input(
encoder_inputs=self._prompt_to_llm_inputs( encoder_input=self._prompt_to_llm_inputs(
encoder_prompt, encoder_prompt,
tokenization_kwargs=tokenization_kwargs, tokenization_kwargs=tokenization_kwargs,
), ),
decoder_inputs=( decoder_input=(
None None
if decoder_prompt is None if decoder_prompt is None
else self._prompt_to_llm_inputs( else self._prompt_to_llm_inputs(
...@@ -291,20 +265,7 @@ class InputPreprocessor: ...@@ -291,20 +265,7 @@ class InputPreprocessor:
self, self,
prompt: DecoderOnlyDictPrompt, prompt: DecoderOnlyDictPrompt,
tokenization_kwargs: dict[str, Any] | None = None, tokenization_kwargs: dict[str, Any] | None = None,
) -> DecoderOnlyInputs: ) -> DecoderOnlyEngineInput:
"""
For decoder-only models:
Process an input prompt into a
[`DecoderOnlyInputs`][vllm.inputs.data.DecoderOnlyInputs] instance.
Arguments:
* prompt: input prompt
Returns:
* [`DecoderOnlyInputs`][vllm.inputs.data.DecoderOnlyInputs] instance
"""
return self._prompt_to_llm_inputs( return self._prompt_to_llm_inputs(
prompt, prompt,
tokenization_kwargs=tokenization_kwargs, tokenization_kwargs=tokenization_kwargs,
...@@ -314,7 +275,7 @@ class InputPreprocessor: ...@@ -314,7 +275,7 @@ class InputPreprocessor:
self, self,
prompt: PromptType, prompt: PromptType,
tokenization_kwargs: dict[str, Any] | None = None, tokenization_kwargs: dict[str, Any] | None = None,
) -> ProcessorInputs: ) -> EngineInput:
"""Preprocess the input prompt.""" """Preprocess the input prompt."""
if self.model_config.is_encoder_decoder: if self.model_config.is_encoder_decoder:
# Encoder-decoder model requires special mapping of # Encoder-decoder model requires special mapping of
......
...@@ -12,6 +12,7 @@ from transformers.models.aria.processing_aria import AriaProcessor ...@@ -12,6 +12,7 @@ from transformers.models.aria.processing_aria import AriaProcessor
from vllm.config import VllmConfig from vllm.config import VllmConfig
from vllm.config.multimodal import BaseDummyOptions from vllm.config.multimodal import BaseDummyOptions
from vllm.distributed import get_tensor_model_parallel_rank from vllm.distributed import get_tensor_model_parallel_rank
from vllm.inputs import MultiModalDataDict
from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.activation import get_act_fn
from vllm.model_executor.layers.fused_moe import SharedFusedMoE from vllm.model_executor.layers.fused_moe import SharedFusedMoE
from vllm.model_executor.layers.linear import ColumnParallelLinear, RowParallelLinear from vllm.model_executor.layers.linear import ColumnParallelLinear, RowParallelLinear
...@@ -24,7 +25,6 @@ from vllm.model_executor.model_loader.weight_utils import ( ...@@ -24,7 +25,6 @@ from vllm.model_executor.model_loader.weight_utils import (
) )
from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import ( from vllm.multimodal.inputs import (
MultiModalDataDict,
MultiModalFieldConfig, MultiModalFieldConfig,
MultiModalKwargsItems, MultiModalKwargsItems,
) )
......
...@@ -31,17 +31,16 @@ from transformers.models.qwen2_audio import Qwen2AudioEncoder ...@@ -31,17 +31,16 @@ from transformers.models.qwen2_audio import Qwen2AudioEncoder
from vllm.config import VllmConfig from vllm.config import VllmConfig
from vllm.config.multimodal import BaseDummyOptions from vllm.config.multimodal import BaseDummyOptions
from vllm.inputs import ModalityData, MultiModalDataDict
from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.activation import get_act_fn
from vllm.model_executor.models.module_mapping import MultiModelKeys from vllm.model_executor.models.module_mapping import MultiModelKeys
from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import ( from vllm.multimodal.inputs import (
MultiModalDataDict,
MultiModalFieldConfig, MultiModalFieldConfig,
MultiModalKwargsItems, MultiModalKwargsItems,
) )
from vllm.multimodal.parse import ( from vllm.multimodal.parse import (
DictEmbeddingItems, DictEmbeddingItems,
ModalityData,
ModalityDataItems, ModalityDataItems,
MultiModalDataItems, MultiModalDataItems,
MultiModalDataParser, MultiModalDataParser,
......
...@@ -17,9 +17,9 @@ from transformers.models.got_ocr2.image_processing_got_ocr2 import ( ...@@ -17,9 +17,9 @@ from transformers.models.got_ocr2.image_processing_got_ocr2 import (
from vllm.config import VllmConfig from vllm.config import VllmConfig
from vllm.config.multimodal import BaseDummyOptions from vllm.config.multimodal import BaseDummyOptions
from vllm.inputs import MultiModalDataDict
from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import ( from vllm.multimodal.inputs import (
MultiModalDataDict,
MultiModalFieldConfig, MultiModalFieldConfig,
MultiModalKwargsItems, MultiModalKwargsItems,
) )
......
...@@ -15,6 +15,7 @@ import torch.nn as nn ...@@ -15,6 +15,7 @@ import torch.nn as nn
from vllm.config import VllmConfig from vllm.config import VllmConfig
from vllm.config.multimodal import BaseDummyOptions from vllm.config.multimodal import BaseDummyOptions
from vllm.inputs import MultiModalDataDict
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.activation import get_act_fn
from vllm.model_executor.layers.linear import ( from vllm.model_executor.layers.linear import (
...@@ -24,7 +25,6 @@ from vllm.model_executor.layers.linear import ( ...@@ -24,7 +25,6 @@ from vllm.model_executor.layers.linear import (
from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import ( from vllm.multimodal.inputs import (
MultiModalDataDict,
MultiModalFieldConfig, MultiModalFieldConfig,
MultiModalKwargsItems, MultiModalKwargsItems,
) )
......
...@@ -9,8 +9,8 @@ from transformers.activations import GELUActivation ...@@ -9,8 +9,8 @@ from transformers.activations import GELUActivation
from vllm.config import VllmConfig from vllm.config import VllmConfig
from vllm.config.multimodal import BaseDummyOptions from vllm.config.multimodal import BaseDummyOptions
from vllm.inputs import MultiModalDataDict
from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import MultiModalDataDict
from .llava_next import ( from .llava_next import (
LlavaDummyInputsBuilder, LlavaDummyInputsBuilder,
......
...@@ -15,11 +15,11 @@ from transformers import ( ...@@ -15,11 +15,11 @@ from transformers import (
from vllm.config import CacheConfig, VllmConfig from vllm.config import CacheConfig, VllmConfig
from vllm.config.multimodal import BaseDummyOptions from vllm.config.multimodal import BaseDummyOptions
from vllm.inputs import MultiModalDataDict
from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.activation import get_act_fn
from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import ( from vllm.multimodal.inputs import (
MultiModalDataDict,
MultiModalFieldConfig, MultiModalFieldConfig,
MultiModalKwargsItems, MultiModalKwargsItems,
) )
......
...@@ -19,6 +19,7 @@ from transformers import ( ...@@ -19,6 +19,7 @@ from transformers import (
from vllm.config import CacheConfig, VllmConfig from vllm.config import CacheConfig, VllmConfig
from vllm.config.multimodal import BaseDummyOptions from vllm.config.multimodal import BaseDummyOptions
from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
from vllm.inputs import MultiModalDataDict
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.activation import SiluAndMul
from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.attention import Attention
...@@ -43,7 +44,6 @@ from vllm.model_executor.model_loader.weight_utils import ( ...@@ -43,7 +44,6 @@ from vllm.model_executor.model_loader.weight_utils import (
from vllm.model_executor.utils import set_weight_attrs from vllm.model_executor.utils import set_weight_attrs
from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import ( from vllm.multimodal.inputs import (
MultiModalDataDict,
MultiModalFieldConfig, MultiModalFieldConfig,
MultiModalKwargsItems, MultiModalKwargsItems,
) )
......
...@@ -17,6 +17,7 @@ from transformers import ( ...@@ -17,6 +17,7 @@ from transformers import (
from vllm.config import VllmConfig from vllm.config import VllmConfig
from vllm.config.multimodal import BaseDummyOptions from vllm.config.multimodal import BaseDummyOptions
from vllm.distributed import divide, get_tensor_model_parallel_world_size from vllm.distributed import divide, get_tensor_model_parallel_world_size
from vllm.inputs import MultiModalDataDict, MultiModalInput
from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.activation import get_act_fn
from vllm.model_executor.layers.attention import Attention, MMEncoderAttention from vllm.model_executor.layers.attention import Attention, MMEncoderAttention
from vllm.model_executor.layers.conv import Conv2dLayer from vllm.model_executor.layers.conv import Conv2dLayer
...@@ -32,9 +33,7 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader ...@@ -32,9 +33,7 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
from vllm.model_executor.models.interfaces import SupportsQuant from vllm.model_executor.models.interfaces import SupportsQuant
from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import ( from vllm.multimodal.inputs import (
MultiModalDataDict,
MultiModalFieldConfig, MultiModalFieldConfig,
MultiModalInputs,
MultiModalKwargsItems, MultiModalKwargsItems,
) )
from vllm.multimodal.parse import ( from vllm.multimodal.parse import (
...@@ -207,7 +206,7 @@ class CLIPMultiModalProcessor(BaseMultiModalProcessor[CLIPProcessingInfo]): ...@@ -207,7 +206,7 @@ class CLIPMultiModalProcessor(BaseMultiModalProcessor[CLIPProcessingInfo]):
self, self,
inputs: ProcessorInputs, inputs: ProcessorInputs,
timing_ctx: TimingContext, timing_ctx: TimingContext,
) -> MultiModalInputs: ) -> MultiModalInput:
if inputs.mm_data_items: if inputs.mm_data_items:
if isinstance(inputs.prompt, str): if isinstance(inputs.prompt, str):
if len(inputs.prompt) > 0: if len(inputs.prompt) > 0:
......
...@@ -19,6 +19,7 @@ from transformers.models.cohere2_vision.processing_cohere2_vision import ( ...@@ -19,6 +19,7 @@ from transformers.models.cohere2_vision.processing_cohere2_vision import (
from vllm.config import VllmConfig from vllm.config import VllmConfig
from vllm.config.multimodal import BaseDummyOptions from vllm.config.multimodal import BaseDummyOptions
from vllm.inputs import MultiModalDataDict
from vllm.model_executor.layers.activation import MulAndSilu from vllm.model_executor.layers.activation import MulAndSilu
from vllm.model_executor.layers.linear import ( from vllm.model_executor.layers.linear import (
MergedColumnParallelLinear, MergedColumnParallelLinear,
...@@ -28,7 +29,6 @@ from vllm.model_executor.layers.quantization import QuantizationConfig ...@@ -28,7 +29,6 @@ from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.model_executor.layers.quantization.awq import AWQConfig from vllm.model_executor.layers.quantization.awq import AWQConfig
from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import ( from vllm.multimodal.inputs import (
MultiModalDataDict,
MultiModalFieldConfig, MultiModalFieldConfig,
MultiModalKwargsItems, MultiModalKwargsItems,
) )
......
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
import math import math
from collections.abc import Iterable, Mapping, Sequence from collections.abc import Iterable, Mapping, Sequence
from typing import Literal, cast from typing import Literal
import numpy as np import numpy as np
import torch import torch
...@@ -14,7 +14,7 @@ from transformers import PretrainedConfig ...@@ -14,7 +14,7 @@ from transformers import PretrainedConfig
from vllm.compilation.decorators import support_torch_compile from vllm.compilation.decorators import support_torch_compile
from vllm.config import CacheConfig, ModelConfig, SpeechToTextConfig, VllmConfig from vllm.config import CacheConfig, ModelConfig, SpeechToTextConfig, VllmConfig
from vllm.distributed import get_tensor_model_parallel_world_size from vllm.distributed import get_tensor_model_parallel_world_size
from vllm.inputs.data import PromptType from vllm.inputs import MultiModalDataDict, PromptType, TextPrompt
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.activation import get_act_fn
from vllm.model_executor.layers.attention import ( from vllm.model_executor.layers.attention import (
...@@ -32,7 +32,6 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead ...@@ -32,7 +32,6 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.model_loader.weight_utils import default_weight_loader
from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import ( from vllm.multimodal.inputs import (
MultiModalDataDict,
MultiModalFieldConfig, MultiModalFieldConfig,
MultiModalKwargsItems, MultiModalKwargsItems,
) )
...@@ -2047,14 +2046,11 @@ class CohereASRForConditionalGeneration( ...@@ -2047,14 +2046,11 @@ class CohereASRForConditionalGeneration(
f"<|noitn|><|notimestamp|><|nodiarize|>" f"<|noitn|><|notimestamp|><|nodiarize|>"
) )
prompt_text = request_prompt if request_prompt else default_prompt prompt_text = request_prompt if request_prompt else default_prompt
prompt = {
"prompt": prompt_text,
"multi_modal_data": {
"audio": (audio, stt_config.sample_rate),
},
}
return cast(PromptType, prompt) return TextPrompt(
prompt=prompt_text,
multi_modal_data={"audio": (audio, stt_config.sample_rate)},
)
@classmethod @classmethod
def get_placeholder_str(cls, modality: str, i: int) -> str | None: def get_placeholder_str(cls, modality: str, i: int) -> str | None:
......
...@@ -16,11 +16,11 @@ from transformers import BatchFeature ...@@ -16,11 +16,11 @@ from transformers import BatchFeature
from vllm.config import VllmConfig from vllm.config import VllmConfig
from vllm.config.multimodal import BaseDummyOptions from vllm.config.multimodal import BaseDummyOptions
from vllm.inputs import MultiModalDataDict
from vllm.model_executor.layers.pooler.tokwise import pooler_for_token_embed from vllm.model_executor.layers.pooler.tokwise import pooler_for_token_embed
from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.model_loader.weight_utils import default_weight_loader
from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import ( from vllm.multimodal.inputs import (
MultiModalDataDict,
MultiModalFieldConfig, MultiModalFieldConfig,
MultiModalKwargsItems, MultiModalKwargsItems,
) )
......
...@@ -12,6 +12,7 @@ from transformers import BatchFeature, CLIPVisionConfig ...@@ -12,6 +12,7 @@ from transformers import BatchFeature, CLIPVisionConfig
from vllm.config import VllmConfig from vllm.config import VllmConfig
from vllm.config.multimodal import BaseDummyOptions from vllm.config.multimodal import BaseDummyOptions
from vllm.inputs import MultiModalDataDict
from vllm.model_executor.models.interfaces import ( from vllm.model_executor.models.interfaces import (
MultiModalEmbeddings, MultiModalEmbeddings,
SupportsLoRA, SupportsLoRA,
...@@ -27,7 +28,6 @@ from vllm.model_executor.models.utils import ( ...@@ -27,7 +28,6 @@ from vllm.model_executor.models.utils import (
) )
from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import ( from vllm.multimodal.inputs import (
MultiModalDataDict,
MultiModalFieldConfig, MultiModalFieldConfig,
MultiModalKwargsItems, MultiModalKwargsItems,
NestedTensors, NestedTensors,
......
...@@ -12,6 +12,7 @@ from transformers import BatchFeature ...@@ -12,6 +12,7 @@ from transformers import BatchFeature
from vllm.config import VllmConfig from vllm.config import VllmConfig
from vllm.config.multimodal import BaseDummyOptions from vllm.config.multimodal import BaseDummyOptions
from vllm.inputs import MultiModalDataDict
from vllm.model_executor.models.interfaces import ( from vllm.model_executor.models.interfaces import (
MultiModalEmbeddings, MultiModalEmbeddings,
SupportsLoRA, SupportsLoRA,
...@@ -27,7 +28,6 @@ from vllm.model_executor.models.utils import ( ...@@ -27,7 +28,6 @@ from vllm.model_executor.models.utils import (
) )
from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import ( from vllm.multimodal.inputs import (
MultiModalDataDict,
MultiModalFieldConfig, MultiModalFieldConfig,
MultiModalKwargsItems, MultiModalKwargsItems,
NestedTensors, NestedTensors,
......
...@@ -17,11 +17,11 @@ from transformers import BatchFeature ...@@ -17,11 +17,11 @@ from transformers import BatchFeature
from vllm.config import VllmConfig from vllm.config import VllmConfig
from vllm.config.multimodal import BaseDummyOptions from vllm.config.multimodal import BaseDummyOptions
from vllm.distributed import get_tensor_model_parallel_world_size from vllm.distributed import get_tensor_model_parallel_world_size
from vllm.inputs import MultiModalDataDict
from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.model_executor.models.transformers.utils import replace_linear_class from vllm.model_executor.models.transformers.utils import replace_linear_class
from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import ( from vllm.multimodal.inputs import (
MultiModalDataDict,
MultiModalFieldConfig, MultiModalFieldConfig,
MultiModalKwargsItems, MultiModalKwargsItems,
) )
......
...@@ -15,6 +15,7 @@ from vllm.distributed.parallel_state import ( ...@@ -15,6 +15,7 @@ from vllm.distributed.parallel_state import (
get_tensor_model_parallel_rank, get_tensor_model_parallel_rank,
get_tensor_model_parallel_world_size, get_tensor_model_parallel_world_size,
) )
from vllm.inputs import MultiModalDataDict
from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.activation import SiluAndMul
from vllm.model_executor.layers.attention import ( from vllm.model_executor.layers.attention import (
MMEncoderAttention, MMEncoderAttention,
...@@ -54,7 +55,6 @@ from vllm.model_executor.models.utils import ( ...@@ -54,7 +55,6 @@ from vllm.model_executor.models.utils import (
) )
from vllm.model_executor.models.vision import get_vit_attn_backend from vllm.model_executor.models.vision import get_vit_attn_backend
from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import MultiModalDataDict
from vllm.sequence import IntermediateTensors from vllm.sequence import IntermediateTensors
from vllm.transformers_utils.configs.dotsocr import DotsOCRConfig, DotsVisionConfig from vllm.transformers_utils.configs.dotsocr import DotsOCRConfig, DotsVisionConfig
from vllm.utils.tensor_schema import TensorSchema, TensorShape from vllm.utils.tensor_schema import TensorSchema, TensorShape
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment