"vllm/vscode:/vscode.git/clone" did not exist on "b7036c87a13bd94fabf9e46436d3c1e67688f729"
Unverified Commit d6484ef3 authored by Harry Mellor, committed by GitHub
Browse files

Add full API docs and improve the UX of navigating them (#17485)


Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
parent 46fae69c
...@@ -235,7 +235,7 @@ class Sampler(nn.Module): ...@@ -235,7 +235,7 @@ class Sampler(nn.Module):
* Defer Pythonization of sampling result & logprobs * Defer Pythonization of sampling result & logprobs
tensor tensor
* Encapsulate arguments required for deferred Pythonization * Encapsulate arguments required for deferred Pythonization
in the :class:`SamplerOutput` structure in the {class}`SamplerOutput` structure
Args: Args:
logits: (num_tokens, vocab_size). logits: (num_tokens, vocab_size).
......
...@@ -107,14 +107,15 @@ class TypicalAcceptanceSampler(SpecDecodeDeterministicBaseSampler): ...@@ -107,14 +107,15 @@ class TypicalAcceptanceSampler(SpecDecodeDeterministicBaseSampler):
A draft token_id x_{n+k} is accepted if it satisfies the A draft token_id x_{n+k} is accepted if it satisfies the
following condition following condition
.. math:: :::{math}
p_{\text{original}}(x_{n+k} | x_1, x_2, \dots, x_{n+k-1}) > p_{\text{original}}(x_{n+k} | x_1, x_2, \dots, x_{n+k-1}) >
\min \left( \epsilon, \delta * \exp \left( \min \left( \epsilon, \delta * \exp \left(
-H(p_{\text{original}}( -H(p_{\text{original}}(
\cdot | x_1, x_2, \ldots, x_{n+k-1})) \right) \right) \cdot | x_1, x_2, \ldots, x_{n+k-1})) \right) \right)
:::
where :math:`p_{\text{original}}` corresponds to target_probs where {math}`p_{\text{original}}` corresponds to target_probs
and :math:`\epsilon` and :math:`\delta` correspond to hyperparameters and {math}`\epsilon` and {math}`\delta` correspond to hyperparameters
specified using self._posterior_threshold and self._posterior_alpha specified using self._posterior_threshold and self._posterior_alpha
This method computes the posterior probabilities for the given This method computes the posterior probabilities for the given
......
...@@ -681,8 +681,9 @@ class Blip2ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP, ...@@ -681,8 +681,9 @@ class Blip2ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP,
batch. batch.
pixel_values: The pixels in each input image. pixel_values: The pixels in each input image.
See also: :::{seealso}
:class:`Blip2ImageInputs` {class}`Blip2ImageInputs`
:::
""" """
if intermediate_tensors is not None: if intermediate_tensors is not None:
......
...@@ -226,9 +226,9 @@ class SupportsPP(Protocol): ...@@ -226,9 +226,9 @@ class SupportsPP(Protocol):
intermediate_tensors: Optional["IntermediateTensors"], intermediate_tensors: Optional["IntermediateTensors"],
) -> Union[Tensor, "IntermediateTensors"]: ) -> Union[Tensor, "IntermediateTensors"]:
""" """
Accept :class:`IntermediateTensors` when PP rank > 0. Accept {class}`IntermediateTensors` when PP rank > 0.
Return :class:`IntermediateTensors` only for the last PP rank. Return {class}`IntermediateTensors` only for the last PP rank.
""" """
... ...
......
...@@ -721,8 +721,9 @@ class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): ...@@ -721,8 +721,9 @@ class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
batch. batch.
pixel_values: The pixels in each input image. pixel_values: The pixels in each input image.
See also: :::{seealso}
:class:`LlavaImageInputs` {class}`LlavaImageInputs`
:::
""" """
if intermediate_tensors is not None: if intermediate_tensors is not None:
inputs_embeds = None inputs_embeds = None
......
...@@ -537,7 +537,7 @@ class LlavaNextForConditionalGeneration(nn.Module, SupportsMultiModal, ...@@ -537,7 +537,7 @@ class LlavaNextForConditionalGeneration(nn.Module, SupportsMultiModal,
Unlike in LLaVA-1.5, the number of image tokens inputted to the language Unlike in LLaVA-1.5, the number of image tokens inputted to the language
model depends on the original size of the input image. Including the model depends on the original size of the input image. Including the
original image token in the input, the required number of image tokens original image token in the input, the required number of image tokens
is given by :func:`get_llava_next_image_feature_size`. is given by {func}`get_llava_next_image_feature_size`.
This way, the `positions` and `attn_metadata` are consistent This way, the `positions` and `attn_metadata` are consistent
with the `input_ids`. with the `input_ids`.
...@@ -548,8 +548,9 @@ class LlavaNextForConditionalGeneration(nn.Module, SupportsMultiModal, ...@@ -548,8 +548,9 @@ class LlavaNextForConditionalGeneration(nn.Module, SupportsMultiModal,
pixel_values: The pixels in each grid patch for each input image. pixel_values: The pixels in each grid patch for each input image.
image_sizes: The original `(height, width)` for each input image. image_sizes: The original `(height, width)` for each input image.
See also: :::{seealso}
:class:`LlavaNextImageInputs` {class}`LlavaNextImageInputs`
:::
""" """
if intermediate_tensors is not None: if intermediate_tensors is not None:
inputs_embeds = None inputs_embeds = None
......
...@@ -559,8 +559,9 @@ class Mistral3ForConditionalGeneration(nn.Module, SupportsLoRA, ...@@ -559,8 +559,9 @@ class Mistral3ForConditionalGeneration(nn.Module, SupportsLoRA,
batch. batch.
pixel_values: The pixels in each input image. pixel_values: The pixels in each input image.
See also: :::{seealso}
:class:`Mistral3ImagePixelInputs` {class}`Mistral3ImagePixelInputs`
:::
""" """
if intermediate_tensors is not None: if intermediate_tensors is not None:
inputs_embeds = None inputs_embeds = None
......
...@@ -965,7 +965,7 @@ def select_tiling( ...@@ -965,7 +965,7 @@ def select_tiling(
class MolmoProcessorWrapper: class MolmoProcessorWrapper:
""" """
Wraps :class:`MolmoProcessor` so that it can be called directly. Wraps {class}`MolmoProcessor` so that it can be called directly.
The original definition can be found here: The original definition can be found here:
https://huggingface.co/allenai/Molmo-7B-D-0924/blob/main/preprocessing_molmo.py https://huggingface.co/allenai/Molmo-7B-D-0924/blob/main/preprocessing_molmo.py
......
...@@ -12,7 +12,7 @@ import torch.nn.functional as F ...@@ -12,7 +12,7 @@ import torch.nn.functional as F
from torch import Tensor, nn from torch import Tensor, nn
class Block(nn.Module): class BlockBase(nn.Module):
"""Block abstract module""" """Block abstract module"""
def __init__(self, input_size, output_size): def __init__(self, input_size, output_size):
...@@ -1602,7 +1602,7 @@ class AttModule(nn.Module): ...@@ -1602,7 +1602,7 @@ class AttModule(nn.Module):
return x, memory, pos_emb, att_mask return x, memory, pos_emb, att_mask
class AttBlock(Block, AttModule): class AttBlock(BlockBase, AttModule):
"""Attention Block module to support both Attention and Block module.""" """Attention Block module to support both Attention and Block module."""
def memory_dims(self, max_len=False): def memory_dims(self, max_len=False):
......
...@@ -65,14 +65,14 @@ class PixtralImagePixelInputs(TypedDict): ...@@ -65,14 +65,14 @@ class PixtralImagePixelInputs(TypedDict):
""" """
Shape: `(batch_size * num_images, num_channels, image_width, image_height)` Shape: `(batch_size * num_images, num_channels, image_width, image_height)`
The result of stacking :attr:`ImageEncoding.tokens` from each prompt. The result of stacking {attr}`ImageEncoding.tokens` from each prompt.
""" """
class PixtralProcessorAdapter: class PixtralProcessorAdapter:
""" """
Provide a HF-compatible interface for Provide a HF-compatible interface for
:class:`mistral_common.tokens.tokenizers.multimodal.ImageEncoder`. {class}`mistral_common.tokens.tokenizers.multimodal.ImageEncoder`.
""" """
def __init__(self, tokenizer: MistralTokenizer) -> None: def __init__(self, tokenizer: MistralTokenizer) -> None:
......
...@@ -383,7 +383,7 @@ def _get_tokenizer_without_image_pad( ...@@ -383,7 +383,7 @@ def _get_tokenizer_without_image_pad(
tokenizer: PreTrainedTokenizer) -> PreTrainedTokenizer: tokenizer: PreTrainedTokenizer) -> PreTrainedTokenizer:
""" """
The logic of adding image pad tokens should only be applied in The logic of adding image pad tokens should only be applied in
:class:`QwenVLProcessor`, so they are patched out here. {class}`QwenVLProcessor`, so they are patched out here.
The definition of the wrapped tokenizer can be found here: The definition of the wrapped tokenizer can be found here:
https://huggingface.co/Qwen/Qwen-VL/blob/main/tokenization_qwen.py https://huggingface.co/Qwen/Qwen-VL/blob/main/tokenization_qwen.py
......
...@@ -19,7 +19,6 @@ import cloudpickle ...@@ -19,7 +19,6 @@ import cloudpickle
import torch.nn as nn import torch.nn as nn
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.utils import is_in_doc_build
from .interfaces import (has_inner_state, has_noops, is_attention_free, from .interfaces import (has_inner_state, has_noops, is_attention_free,
is_hybrid, supports_cross_encoding, is_hybrid, supports_cross_encoding,
...@@ -375,13 +374,13 @@ class _ModelRegistry: ...@@ -375,13 +374,13 @@ class _ModelRegistry:
""" """
Register an external model to be used in vLLM. Register an external model to be used in vLLM.
:code:`model_cls` can be either: `model_cls` can be either:
- A :class:`torch.nn.Module` class directly referencing the model. - A {class}`torch.nn.Module` class directly referencing the model.
- A string in the format :code:`<module>:<class>` which can be used to - A string in the format `<module>:<class>` which can be used to
lazily import the model. This is useful to avoid initializing CUDA lazily import the model. This is useful to avoid initializing CUDA
when importing the model and thus the related error when importing the model and thus the related error
:code:`RuntimeError: Cannot re-initialize CUDA in forked subprocess`. `RuntimeError: Cannot re-initialize CUDA in forked subprocess`.
""" """
if not isinstance(model_arch, str): if not isinstance(model_arch, str):
msg = f"`model_arch` should be a string, not a {type(model_arch)}" msg = f"`model_arch` should be a string, not a {type(model_arch)}"
...@@ -400,8 +399,7 @@ class _ModelRegistry: ...@@ -400,8 +399,7 @@ class _ModelRegistry:
raise ValueError(msg) raise ValueError(msg)
model = _LazyRegisteredModel(*split_str) model = _LazyRegisteredModel(*split_str)
elif isinstance(model_cls, type) and (is_in_doc_build() or issubclass( elif isinstance(model_cls, type) and issubclass(model_cls, nn.Module):
model_cls, nn.Module)):
model = _RegisteredModel.from_model_cls(model_cls) model = _RegisteredModel.from_model_cls(model_cls)
else: else:
msg = ("`model_cls` should be a string or PyTorch model class, " msg = ("`model_cls` should be a string or PyTorch model class, "
......
...@@ -66,7 +66,7 @@ class WeightsMapper: ...@@ -66,7 +66,7 @@ class WeightsMapper:
class AutoWeightsLoader: class AutoWeightsLoader:
""" """
Helper class to load weights into a :class:`torch.nn.Module`. It is able Helper class to load weights into a {class}`torch.nn.Module`. It is able
to automatically detect child modules and parameters while iterating over to automatically detect child modules and parameters while iterating over
the weights only once. the weights only once.
......
...@@ -8,11 +8,12 @@ from .registry import MultiModalRegistry ...@@ -8,11 +8,12 @@ from .registry import MultiModalRegistry
MULTIMODAL_REGISTRY = MultiModalRegistry() MULTIMODAL_REGISTRY = MultiModalRegistry()
""" """
The global :class:`~MultiModalRegistry` is used by model runners to The global {class}`~MultiModalRegistry` is used by model runners to
dispatch data processing according to the target model. dispatch data processing according to the target model.
See also: :::{seealso}
:ref:`mm-processing` {ref}`mm-processing`
:::
""" """
__all__ = [ __all__ = [
......
...@@ -64,8 +64,7 @@ class MultiModalPlaceholderMap: ...@@ -64,8 +64,7 @@ class MultiModalPlaceholderMap:
Examples: Examples:
.. code-block:: ```
Prompt: |AAAA BBBB What's in these images?| Prompt: |AAAA BBBB What's in these images?|
Positions: |.................................| Positions: |.................................|
...@@ -93,6 +92,7 @@ class MultiModalPlaceholderMap: ...@@ -93,6 +92,7 @@ class MultiModalPlaceholderMap:
images = [] images = []
src_ranges = [] src_ranges = []
dest_ranges = [] dest_ranges = []
```
""" """
seq_mm_data = seq_group.multi_modal_data seq_mm_data = seq_group.multi_modal_data
seq_mm_placeholders = seq_group.multi_modal_placeholders seq_mm_placeholders = seq_group.multi_modal_placeholders
......
...@@ -26,27 +26,27 @@ _T = TypeVar("_T") ...@@ -26,27 +26,27 @@ _T = TypeVar("_T")
HfImageItem: TypeAlias = Union[Image, np.ndarray, torch.Tensor] HfImageItem: TypeAlias = Union[Image, np.ndarray, torch.Tensor]
""" """
A :class:`transformers.image_utils.ImageInput` representing a single image A {class}`transformers.image_utils.ImageInput` representing a single image
item, which can be passed to a HuggingFace :code:`ImageProcessor`. item, which can be passed to a HuggingFace `ImageProcessor`.
""" """
HfVideoItem: TypeAlias = Union[list[Image], np.ndarray, torch.Tensor, HfVideoItem: TypeAlias = Union[list[Image], np.ndarray, torch.Tensor,
list[np.ndarray], list[torch.Tensor]] list[np.ndarray], list[torch.Tensor]]
""" """
A :class:`transformers.image_utils.VideoInput` representing a single video A {class}`transformers.image_utils.VideoInput` representing a single video
item, which can be passed to a HuggingFace :code:`VideoProcessor`. item, which can be passed to a HuggingFace `VideoProcessor`.
""" """
HfAudioItem: TypeAlias = Union[list[float], np.ndarray, torch.Tensor] HfAudioItem: TypeAlias = Union[list[float], np.ndarray, torch.Tensor]
""" """
Represents a single audio Represents a single audio
item, which can be passed to a HuggingFace :code:`AudioProcessor`. item, which can be passed to a HuggingFace `AudioProcessor`.
""" """
ImageItem: TypeAlias = Union[HfImageItem, torch.Tensor] ImageItem: TypeAlias = Union[HfImageItem, torch.Tensor]
""" """
A :class:`transformers.image_utils.ImageInput` representing a single image A {class}`transformers.image_utils.ImageInput` representing a single image
item, which can be passed to a HuggingFace :code:`ImageProcessor`. item, which can be passed to a HuggingFace `ImageProcessor`.
Alternatively, a 3-D tensor or batch of 2-D tensors, Alternatively, a 3-D tensor or batch of 2-D tensors,
which are treated as image embeddings; which are treated as image embeddings;
...@@ -55,8 +55,8 @@ these are directly passed to the model without HF processing. ...@@ -55,8 +55,8 @@ these are directly passed to the model without HF processing.
VideoItem: TypeAlias = Union[HfVideoItem, torch.Tensor] VideoItem: TypeAlias = Union[HfVideoItem, torch.Tensor]
""" """
A :class:`transformers.image_utils.VideoInput` representing a single video A {class}`transformers.image_utils.VideoInput` representing a single video
item, which can be passed to a HuggingFace :code:`VideoProcessor`. item, which can be passed to a HuggingFace `VideoProcessor`.
Alternatively, a 3-D tensor or batch of 2-D tensors, Alternatively, a 3-D tensor or batch of 2-D tensors,
which are treated as video embeddings; which are treated as video embeddings;
...@@ -67,7 +67,7 @@ AudioItem: TypeAlias = Union[HfAudioItem, tuple[np.ndarray, float], ...@@ -67,7 +67,7 @@ AudioItem: TypeAlias = Union[HfAudioItem, tuple[np.ndarray, float],
torch.Tensor] torch.Tensor]
""" """
Represents a single audio Represents a single audio
item, which can be passed to a HuggingFace :code:`AudioProcessor`. item, which can be passed to a HuggingFace `AudioProcessor`.
Alternatively, a tuple `(audio, sampling_rate)`, where the sampling rate Alternatively, a tuple `(audio, sampling_rate)`, where the sampling rate
is different from that expected by the model; is different from that expected by the model;
...@@ -83,7 +83,7 @@ ModalityData: TypeAlias = Union[_T, list[_T]] ...@@ -83,7 +83,7 @@ ModalityData: TypeAlias = Union[_T, list[_T]]
Either a single data item, or a list of data items. Either a single data item, or a list of data items.
The number of data items allowed per modality is restricted by The number of data items allowed per modality is restricted by
:code:`--limit-mm-per-prompt`. `--limit-mm-per-prompt`.
""" """
...@@ -105,7 +105,7 @@ MultiModalDataDict: TypeAlias = Mapping[str, ModalityData[Any]] ...@@ -105,7 +105,7 @@ MultiModalDataDict: TypeAlias = Mapping[str, ModalityData[Any]]
""" """
A dictionary containing an entry for each modality type to input. A dictionary containing an entry for each modality type to input.
The built-in modalities are defined by :class:`MultiModalDataBuiltins`. The built-in modalities are defined by {class}`MultiModalDataBuiltins`.
""" """
...@@ -116,14 +116,14 @@ class PlaceholderRange: ...@@ -116,14 +116,14 @@ class PlaceholderRange:
Example: Example:
Prompt: :code:`AAAA BBBB What is in these images?` Prompt: `AAAA BBBB What is in these images?`
Images A and B will have: Images A and B will have:
.. code-block:: ```
A: PlaceholderRange(offset=0, length=4) A: PlaceholderRange(offset=0, length=4)
B: PlaceholderRange(offset=5, length=4) B: PlaceholderRange(offset=5, length=4)
```
""" """
offset: int offset: int
...@@ -166,7 +166,7 @@ Uses a list instead of a tensor if the dimensions of each element do not match. ...@@ -166,7 +166,7 @@ Uses a list instead of a tensor if the dimensions of each element do not match.
def nested_tensors_equal(a: NestedTensors, b: NestedTensors) -> bool: def nested_tensors_equal(a: NestedTensors, b: NestedTensors) -> bool:
"""Equality check between :data:`NestedTensors` objects.""" """Equality check between {data}`NestedTensors` objects."""
if isinstance(a, torch.Tensor): if isinstance(a, torch.Tensor):
return isinstance(b, torch.Tensor) and torch.equal(a, b) return isinstance(b, torch.Tensor) and torch.equal(a, b)
elif isinstance(b, torch.Tensor): elif isinstance(b, torch.Tensor):
...@@ -186,7 +186,7 @@ def nested_tensors_equal(a: NestedTensors, b: NestedTensors) -> bool: ...@@ -186,7 +186,7 @@ def nested_tensors_equal(a: NestedTensors, b: NestedTensors) -> bool:
BatchedTensorInputs: TypeAlias = Mapping[str, NestedTensors] BatchedTensorInputs: TypeAlias = Mapping[str, NestedTensors]
""" """
A dictionary containing nested tensors which have been batched via A dictionary containing nested tensors which have been batched via
:meth:`MultiModalKwargs.batch`. {meth}`MultiModalKwargs.batch`.
""" """
...@@ -194,7 +194,7 @@ A dictionary containing nested tensors which have been batched via ...@@ -194,7 +194,7 @@ A dictionary containing nested tensors which have been batched via
class MultiModalFieldElem: class MultiModalFieldElem:
""" """
Represents a keyword argument corresponding to a multi-modal item Represents a keyword argument corresponding to a multi-modal item
in :class:`MultiModalKwargs`. in {class}`MultiModalKwargs`.
""" """
modality: str modality: str
...@@ -205,13 +205,13 @@ class MultiModalFieldElem: ...@@ -205,13 +205,13 @@ class MultiModalFieldElem:
key: str key: str
""" """
The key of this field in :class:`MultiModalKwargs`, The key of this field in {class}`MultiModalKwargs`,
i.e. the name of the keyword argument to be passed to the model. i.e. the name of the keyword argument to be passed to the model.
""" """
data: NestedTensors data: NestedTensors
""" """
The tensor data of this field in :class:`MultiModalKwargs`, The tensor data of this field in {class}`MultiModalKwargs`,
i.e. the value of the keyword argument to be passed to the model. i.e. the value of the keyword argument to be passed to the model.
""" """
...@@ -234,7 +234,7 @@ class MultiModalFieldElem: ...@@ -234,7 +234,7 @@ class MultiModalFieldElem:
class BaseMultiModalField(ABC): class BaseMultiModalField(ABC):
""" """
Defines how to interpret tensor data belonging to a keyword argument in Defines how to interpret tensor data belonging to a keyword argument in
:class:`MultiModalKwargs` for multiple multi-modal items, and vice versa. {class}`MultiModalKwargs` for multiple multi-modal items, and vice versa.
""" """
def _field_factory(self, *, modality: str, key: str): def _field_factory(self, *, modality: str, key: str):
...@@ -259,10 +259,10 @@ class BaseMultiModalField(ABC): ...@@ -259,10 +259,10 @@ class BaseMultiModalField(ABC):
data: NestedTensors, data: NestedTensors,
) -> Sequence[MultiModalFieldElem]: ) -> Sequence[MultiModalFieldElem]:
""" """
Construct :class:`MultiModalFieldElem` instances to represent Construct {class}`MultiModalFieldElem` instances to represent
the provided data. the provided data.
This is the inverse of :meth:`reduce_data`. This is the inverse of {meth}`reduce_data`.
""" """
raise NotImplementedError raise NotImplementedError
...@@ -272,9 +272,9 @@ class BaseMultiModalField(ABC): ...@@ -272,9 +272,9 @@ class BaseMultiModalField(ABC):
def reduce_data(self, elems: list[MultiModalFieldElem]) -> NestedTensors: def reduce_data(self, elems: list[MultiModalFieldElem]) -> NestedTensors:
""" """
Merge the data from multiple instances of :class:`MultiModalFieldElem`. Merge the data from multiple instances of {class}`MultiModalFieldElem`.
This is the inverse of :meth:`build_elems`. This is the inverse of {meth}`build_elems`.
""" """
field_types = [type(item.field) for item in elems] field_types = [type(item.field) for item in elems]
if len(set(field_types)) > 1: if len(set(field_types)) > 1:
...@@ -286,8 +286,9 @@ class BaseMultiModalField(ABC): ...@@ -286,8 +286,9 @@ class BaseMultiModalField(ABC):
@dataclass(frozen=True) @dataclass(frozen=True)
class MultiModalBatchedField(BaseMultiModalField): class MultiModalBatchedField(BaseMultiModalField):
""" """
See also: :::{seealso}
:func:`MultiModalFieldConfig.batched` {func}`MultiModalFieldConfig.batched`
:::
""" """
def build_elems( def build_elems(
...@@ -316,9 +317,10 @@ class MultiModalBatchedField(BaseMultiModalField): ...@@ -316,9 +317,10 @@ class MultiModalBatchedField(BaseMultiModalField):
@dataclass(frozen=True) @dataclass(frozen=True)
class MultiModalFlatField(BaseMultiModalField): class MultiModalFlatField(BaseMultiModalField):
""" """
See also: :::{seealso}
:func:`MultiModalFieldConfig.flat` {func}`MultiModalFieldConfig.flat`
:func:`MultiModalFieldConfig.flat_from_sizes` {func}`MultiModalFieldConfig.flat_from_sizes`
:::
""" """
slices: Union[Sequence[slice], Sequence[Sequence[slice]]] slices: Union[Sequence[slice], Sequence[Sequence[slice]]]
dim: int = 0 dim: int = 0
...@@ -358,8 +360,9 @@ class MultiModalFlatField(BaseMultiModalField): ...@@ -358,8 +360,9 @@ class MultiModalFlatField(BaseMultiModalField):
@dataclass(frozen=True) @dataclass(frozen=True)
class MultiModalSharedField(BaseMultiModalField): class MultiModalSharedField(BaseMultiModalField):
""" """
See also: :::{seealso}
:func:`MultiModalFieldConfig.shared` {func}`MultiModalFieldConfig.shared`
:::
""" """
batch_size: int batch_size: int
...@@ -390,8 +393,7 @@ class MultiModalFieldConfig: ...@@ -390,8 +393,7 @@ class MultiModalFieldConfig:
Example: Example:
.. code-block:: ```
Input: Input:
Data: [[AAAA] Data: [[AAAA]
[BBBB] [BBBB]
...@@ -401,6 +403,7 @@ class MultiModalFieldConfig: ...@@ -401,6 +403,7 @@ class MultiModalFieldConfig:
Element 1: [AAAA] Element 1: [AAAA]
Element 2: [BBBB] Element 2: [BBBB]
Element 3: [CCCC] Element 3: [CCCC]
```
""" """
return MultiModalFieldConfig( return MultiModalFieldConfig(
field=MultiModalBatchedField(), field=MultiModalBatchedField(),
...@@ -425,8 +428,7 @@ class MultiModalFieldConfig: ...@@ -425,8 +428,7 @@ class MultiModalFieldConfig:
Example: Example:
.. code-block:: ```
Given: Given:
slices: [slice(0, 3), slice(3, 7), slice(7, 9)] slices: [slice(0, 3), slice(3, 7), slice(7, 9)]
...@@ -437,9 +439,9 @@ class MultiModalFieldConfig: ...@@ -437,9 +439,9 @@ class MultiModalFieldConfig:
Element 1: [AAA] Element 1: [AAA]
Element 2: [BBBB] Element 2: [BBBB]
Element 3: [CC] Element 3: [CC]
```
.. code-block:: ```
Given: Given:
slices: [ slices: [
(slice(None), slice(0, 3)), (slice(None), slice(0, 3)),
...@@ -454,6 +456,7 @@ class MultiModalFieldConfig: ...@@ -454,6 +456,7 @@ class MultiModalFieldConfig:
Element 1: [[A],[A],[A]] Element 1: [[A],[A],[A]]
Element 2: [[B],[B],[B],[B]] Element 2: [[B],[B],[B],[B]]
Element 3: [[C],[C]] Element 3: [[C],[C]]
```
""" """
return MultiModalFieldConfig( return MultiModalFieldConfig(
field=MultiModalFlatField(slices=slices, dim=dim), field=MultiModalFlatField(slices=slices, dim=dim),
...@@ -477,8 +480,7 @@ class MultiModalFieldConfig: ...@@ -477,8 +480,7 @@ class MultiModalFieldConfig:
Example: Example:
.. code-block:: ```
Given: Given:
size_per_item: [3, 4, 2] size_per_item: [3, 4, 2]
...@@ -489,10 +491,9 @@ class MultiModalFieldConfig: ...@@ -489,10 +491,9 @@ class MultiModalFieldConfig:
Element 1: [AAA] Element 1: [AAA]
Element 2: [BBBB] Element 2: [BBBB]
Element 3: [CC] Element 3: [CC]
```
```
.. code-block::
Given: Given:
slices: [3, 4, 2] slices: [3, 4, 2]
dim: 1 dim: 1
...@@ -504,9 +505,11 @@ class MultiModalFieldConfig: ...@@ -504,9 +505,11 @@ class MultiModalFieldConfig:
Element 1: [[A],[A],[A]] Element 1: [[A],[A],[A]]
Element 2: [[B],[B],[B],[B]] Element 2: [[B],[B],[B],[B]]
Element 3: [[C],[C]] Element 3: [[C],[C]]
```
See also: :::{seealso}
:func:`MultiModalFieldConfig.flat` {func}`MultiModalFieldConfig.flat`
:::
""" """
if size_per_item.ndim != 1: if size_per_item.ndim != 1:
...@@ -535,8 +538,7 @@ class MultiModalFieldConfig: ...@@ -535,8 +538,7 @@ class MultiModalFieldConfig:
Example: Example:
.. code-block:: ```
Given: Given:
batch_size: 4 batch_size: 4
...@@ -548,6 +550,7 @@ class MultiModalFieldConfig: ...@@ -548,6 +550,7 @@ class MultiModalFieldConfig:
Element 2: [XYZ] Element 2: [XYZ]
Element 3: [XYZ] Element 3: [XYZ]
Element 4: [XYZ] Element 4: [XYZ]
```
""" """
return MultiModalFieldConfig( return MultiModalFieldConfig(
field=MultiModalSharedField(batch_size), field=MultiModalSharedField(batch_size),
...@@ -570,8 +573,8 @@ class MultiModalFieldConfig: ...@@ -570,8 +573,8 @@ class MultiModalFieldConfig:
class MultiModalKwargsItem(UserDict[str, MultiModalFieldElem]): class MultiModalKwargsItem(UserDict[str, MultiModalFieldElem]):
""" """
A collection of :class:`MultiModalFieldElem` A collection of {class}`MultiModalFieldElem`
corresponding to a data item in :class:`MultiModalDataItems`. corresponding to a data item in {class}`MultiModalDataItems`.
""" """
@staticmethod @staticmethod
...@@ -590,11 +593,11 @@ class MultiModalKwargsItem(UserDict[str, MultiModalFieldElem]): ...@@ -590,11 +593,11 @@ class MultiModalKwargsItem(UserDict[str, MultiModalFieldElem]):
class MultiModalKwargs(UserDict[str, NestedTensors]): class MultiModalKwargs(UserDict[str, NestedTensors]):
""" """
A dictionary that represents the keyword arguments to A dictionary that represents the keyword arguments to
:meth:`~torch.nn.Module.forward`. {meth}`~torch.nn.Module.forward`.
The metadata :code:`items` enables us to obtain the keyword arguments The metadata `items` enables us to obtain the keyword arguments
corresponding to each data item in :class:`MultiModalDataItems`, via corresponding to each data item in {class}`MultiModalDataItems`, via
:meth:`get_item` and :meth:`get_items`. {meth}`get_item` and {meth}`get_items`.
""" """
@staticmethod @staticmethod
...@@ -633,7 +636,7 @@ class MultiModalKwargs(UserDict[str, NestedTensors]): ...@@ -633,7 +636,7 @@ class MultiModalKwargs(UserDict[str, NestedTensors]):
@staticmethod @staticmethod
def from_items(items: Sequence[MultiModalKwargsItem]): def from_items(items: Sequence[MultiModalKwargsItem]):
"""Construct a new :class:`MultiModalKwargs` from multiple items.""" """Construct a new {class}`MultiModalKwargs` from multiple items."""
elems_by_key = defaultdict[str, list[MultiModalFieldElem]](list) elems_by_key = defaultdict[str, list[MultiModalFieldElem]](list)
for item in items: for item in items:
for key, elem in item.items(): for key, elem in item.items():
...@@ -798,7 +801,7 @@ A dictionary containing placeholder ranges for each modality. ...@@ -798,7 +801,7 @@ A dictionary containing placeholder ranges for each modality.
class MultiModalInputs(TypedDict): class MultiModalInputs(TypedDict):
""" """
Represents the outputs of Represents the outputs of
:class:`vllm.multimodal.processing.BaseMultiModalProcessor`, {class}`vllm.multimodal.processing.BaseMultiModalProcessor`,
ready to be passed to vLLM internals. ready to be passed to vLLM internals.
""" """
...@@ -823,7 +826,7 @@ class MultiModalInputs(TypedDict): ...@@ -823,7 +826,7 @@ class MultiModalInputs(TypedDict):
mm_placeholders: MultiModalPlaceholderDict mm_placeholders: MultiModalPlaceholderDict
""" """
For each modality, information about the placeholder tokens in For each modality, information about the placeholder tokens in
:code:`prompt_token_ids`. `prompt_token_ids`.
""" """
cache_salt: NotRequired[str] cache_salt: NotRequired[str]
...@@ -834,7 +837,7 @@ class MultiModalInputs(TypedDict): ...@@ -834,7 +837,7 @@ class MultiModalInputs(TypedDict):
class MultiModalEncDecInputs(MultiModalInputs): class MultiModalEncDecInputs(MultiModalInputs):
""" """
Represents the outputs of :class:`vllm.multimodal.EncDecMultiModalProcessor` Represents the outputs of {class}`vllm.multimodal.EncDecMultiModalProcessor`
ready to be passed to vLLM internals. ready to be passed to vLLM internals.
""" """
......
...@@ -25,7 +25,7 @@ _I = TypeVar("_I") ...@@ -25,7 +25,7 @@ _I = TypeVar("_I")
class ModalityDataItems(ABC, Generic[_T, _I]): class ModalityDataItems(ABC, Generic[_T, _I]):
""" """
Represents data items for a modality in :class:`MultiModalDataItems`. Represents data items for a modality in {class}`MultiModalDataItems`.
""" """
def __init__(self, data: _T, modality: str) -> None: def __init__(self, data: _T, modality: str) -> None:
...@@ -246,7 +246,7 @@ _D = TypeVar("_D", bound=ModalityDataItems[Any, Any]) ...@@ -246,7 +246,7 @@ _D = TypeVar("_D", bound=ModalityDataItems[Any, Any])
class MultiModalDataItems(UserDict[str, ModalityDataItems[Any, Any]]): class MultiModalDataItems(UserDict[str, ModalityDataItems[Any, Any]]):
""" """
As :data:`~vllm.multimodal.inputs.MultiModalDataDict`, but normalized As {data}`~vllm.multimodal.inputs.MultiModalDataDict`, but normalized
such that each entry corresponds to a list. such that each entry corresponds to a list.
""" """
...@@ -254,7 +254,7 @@ class MultiModalDataItems(UserDict[str, ModalityDataItems[Any, Any]]): ...@@ -254,7 +254,7 @@ class MultiModalDataItems(UserDict[str, ModalityDataItems[Any, Any]]):
""" """
Get the number of data items belonging to a modality. Get the number of data items belonging to a modality.
If `strict=False`, return `0` instead of raising :exc:`KeyError` If `strict=False`, return `0` instead of raising {exc}`KeyError`
even if the modality is not found. even if the modality is not found.
""" """
if modality not in self: if modality not in self:
...@@ -300,8 +300,8 @@ ModalityDataParser: TypeAlias = Callable[[ModalityData[Any]], ...@@ -300,8 +300,8 @@ ModalityDataParser: TypeAlias = Callable[[ModalityData[Any]],
class MultiModalDataParser: class MultiModalDataParser:
""" """
Parses :data:`~vllm.multimodal.inputs.MultiModalDataDict` into Parses {data}`~vllm.multimodal.inputs.MultiModalDataDict` into
:class:`MultiModalDataItems`. {class}`MultiModalDataItems`.
Args: Args:
target_sr (float, optional): Enables automatic resampling of audio target_sr (float, optional): Enables automatic resampling of audio
......
...@@ -111,13 +111,13 @@ class PromptUpdateDetails(Generic[_S]): ...@@ -111,13 +111,13 @@ class PromptUpdateDetails(Generic[_S]):
is_embed: Optional[Callable[["_BoundPromptSequence"], torch.Tensor]] = None is_embed: Optional[Callable[["_BoundPromptSequence"], torch.Tensor]] = None
""" """
Given :attr:`full`, return a boolean mask of shape `(len(full),)` Given {attr}`full`, return a boolean mask of shape `(len(full),)`
indicating which positions of `full` to assign embeddings to. indicating which positions of `full` to assign embeddings to.
`None` (default) means to assign embeddings to all positions of `full`. `None` (default) means to assign embeddings to all positions of `full`.
The embeddings are obtained by calling The embeddings are obtained by calling
:class:`SupportsMultiModal.get_multimodal_embeddings`. {class}`SupportsMultiModal.get_multimodal_embeddings`.
""" """
@staticmethod @staticmethod
...@@ -156,13 +156,13 @@ PromptUpdateInfo = Union[PromptSeq, PromptUpdateDetails] ...@@ -156,13 +156,13 @@ PromptUpdateInfo = Union[PromptSeq, PromptUpdateDetails]
The token sequence or text that are part of the update. The token sequence or text that are part of the update.
If only part of the content corresponds to feature placeholders, you can If only part of the content corresponds to feature placeholders, you can
use :class:`PromptUpdateDetails` to specify which part. use {class}`PromptUpdateDetails` to specify which part.
""" """
PromptUpdateContent = Union[Callable[[int], PromptUpdateInfo], PromptUpdateContent = Union[Callable[[int], PromptUpdateInfo],
PromptUpdateInfo] PromptUpdateInfo]
""" """
Given the index of the processed item within :attr:`modality`, Given the index of the processed item within {attr}`modality`,
output the corresponding token sequence (or text). output the corresponding token sequence (or text).
For convenience, you can directly pass in the token sequence (or text) For convenience, you can directly pass in the token sequence (or text)
...@@ -216,49 +216,49 @@ class PromptInsertion(PromptUpdate): ...@@ -216,49 +216,49 @@ class PromptInsertion(PromptUpdate):
For each image, insert a number of ``<image>`` feature placeholders For each image, insert a number of ``<image>`` feature placeholders
equal to the feature size of the vision encoder after the ``<s>`` token: equal to the feature size of the vision encoder after the ``<s>`` token:
.. code-block:: python ```python
PromptInsertion( PromptInsertion(
modality="image", modality="image",
target="<s>", target="<s>",
insertion="<image>" * image_feature_size, insertion="<image>" * image_feature_size,
) )
```
Insert these tokens at the start of the prompt: Insert these tokens at the start of the prompt:
.. code-block:: python ```python
PromptInsertion( PromptInsertion(
modality="image", modality="image",
target=PromptIndexTargets.start(), target=PromptIndexTargets.start(),
insertion="<image>" * image_feature_size, insertion="<image>" * image_feature_size,
) )
```
Insert these tokens after a prefix ``Images:``: Insert these tokens after a prefix ``Images:``:
.. code-block:: python ```python
PromptInsertion( PromptInsertion(
modality="image", modality="image",
target=PromptIndexTargets.prefix("Images:"), target=PromptIndexTargets.prefix("Images:"),
insertion="<image>" * image_feature_size, insertion="<image>" * image_feature_size,
) )
```
Insert these tokens at the end of the prompt: Insert these tokens at the end of the prompt:
.. code-block:: python ```python
PromptInsertion( PromptInsertion(
modality="image", modality="image",
target=PromptIndexTargets.end(), target=PromptIndexTargets.end(),
insertion="<image>" * image_feature_size, insertion="<image>" * image_feature_size,
) )
```
""" """
insertion: PromptUpdateContent = field(repr=False) insertion: PromptUpdateContent = field(repr=False)
""" """
Given the index of the processed item within :attr:`modality`, Given the index of the processed item within {attr}`modality`,
output the token sequence (or text) to insert right after :attr:`target`. output the token sequence (or text) to insert right after {attr}`target`.
For convenience, you can directly pass in the token sequence (or text) For convenience, you can directly pass in the token sequence (or text)
instead of a function if it does not depend on the input. instead of a function if it does not depend on the input.
...@@ -284,20 +284,19 @@ class PromptReplacement(PromptUpdate): ...@@ -284,20 +284,19 @@ class PromptReplacement(PromptUpdate):
with a number of ``<image>`` feature placeholders with a number of ``<image>`` feature placeholders
equal to the feature size of the vision encoder: equal to the feature size of the vision encoder:
.. code-block:: python ```python
PromptReplacement( PromptReplacement(
modality="image", modality="image",
target="<image>", target="<image>",
replacement="<image>" * image_feature_size, replacement="<image>" * image_feature_size,
) )
```
As above, but further pad the feature placeholders with ``<image_bos>`` As above, but further pad the feature placeholders with ``<image_bos>``
and ``<image_eos>``, which are not supposed to be passed to the vision and ``<image_eos>``, which are not supposed to be passed to the vision
encoder: encoder:
.. code-block:: python ```python
PromptReplacement( PromptReplacement(
modality="image", modality="image",
target="<image>", target="<image>",
...@@ -310,12 +309,12 @@ class PromptReplacement(PromptUpdate): ...@@ -310,12 +309,12 @@ class PromptReplacement(PromptUpdate):
features="<image>" * image_feature_size, features="<image>" * image_feature_size,
), ),
) )
```
To avoid unnecessary tokenization during prompt replacement, To avoid unnecessary tokenization during prompt replacement,
we recommend passing token sequences instead of text: we recommend passing token sequences instead of text:
.. code-block:: python ```python
PromptReplacement( PromptReplacement(
modality="image", modality="image",
target=[image_token_id], target=[image_token_id],
...@@ -325,12 +324,13 @@ class PromptReplacement(PromptUpdate): ...@@ -325,12 +324,13 @@ class PromptReplacement(PromptUpdate):
features=[image_token_id] * image_feature_size, features=[image_token_id] * image_feature_size,
), ),
) )
```
""" """
replacement: PromptUpdateContent = field(repr=False) replacement: PromptUpdateContent = field(repr=False)
""" """
Given the index of the processed item within :attr:`modality`, Given the index of the processed item within {attr}`modality`,
output the token sequence (or text) to replace :attr:`target`. output the token sequence (or text) to replace {attr}`target`.
For convenience, you can directly pass in the token sequence (or text) For convenience, you can directly pass in the token sequence (or text)
instead of a function if it does not depend on the input. instead of a function if it does not depend on the input.
...@@ -384,14 +384,14 @@ _M = TypeVar("_M", bound=Union[_HasModalityAttr, _HasModalityProp]) ...@@ -384,14 +384,14 @@ _M = TypeVar("_M", bound=Union[_HasModalityAttr, _HasModalityProp])
def full_groupby_modality(values: Iterable[_M]) -> ItemsView[str, list[_M]]: def full_groupby_modality(values: Iterable[_M]) -> ItemsView[str, list[_M]]:
"""Convenience function to apply :func:`full_groupby` based on modality.""" """Convenience function to apply {func}`full_groupby` based on modality."""
return full_groupby(values, key=lambda x: x.modality) return full_groupby(values, key=lambda x: x.modality)
@dataclass @dataclass
class _BoundPromptSequence: class _BoundPromptSequence:
""" """
A :data:`_PromptSeq` bound to a tokenizer to automatically A {data}`_PromptSeq` bound to a tokenizer to automatically
convert between token sequence and text representations. convert between token sequence and text representations.
""" """
tokenizer: AnyTokenizer = field(repr=False) tokenizer: AnyTokenizer = field(repr=False)
...@@ -443,8 +443,8 @@ class _BoundPromptContent: ...@@ -443,8 +443,8 @@ class _BoundPromptContent:
@dataclass @dataclass
class BoundPromptUpdate: class BoundPromptUpdate:
""" """
A :class:`PromptUpdate` bound to a tokenizer to automatically convert A {class}`PromptUpdate` bound to a tokenizer to automatically convert
:attr:`target` and the result of :meth:`get_content` between {attr}`target` and the result of {meth}`get_content` between
token sequence and text representations. token sequence and text representations.
""" """
_origin: PromptUpdate _origin: PromptUpdate
...@@ -479,7 +479,7 @@ class BoundPromptUpdate: ...@@ -479,7 +479,7 @@ class BoundPromptUpdate:
def get_content(self, item_idx: int) -> _BoundPromptContent: def get_content(self, item_idx: int) -> _BoundPromptContent:
""" """
Given the index of the processed item within :attr:`modality`, Given the index of the processed item within {attr}`modality`,
output the token sequence (or text) to update. output the token sequence (or text) to update.
""" """
content = self.content content = self.content
...@@ -516,7 +516,7 @@ def iter_token_matches( ...@@ -516,7 +516,7 @@ def iter_token_matches(
match_ids: list[int], match_ids: list[int],
) -> Generator[_TokenMatch]: ) -> Generator[_TokenMatch]:
""" """
Yield each occurrence of :code:`match_ids` in :code:`token_ids`. Yield each occurrence of `match_ids` in `token_ids`.
Note that empty matches are ignored. Note that empty matches are ignored.
""" """
...@@ -545,8 +545,8 @@ def replace_token_matches( ...@@ -545,8 +545,8 @@ def replace_token_matches(
new_ids: list[int], new_ids: list[int],
) -> list[int]: ) -> list[int]:
""" """
Replace each occurrence of :code:`match_ids` in :code:`token_ids` Replace each occurrence of `match_ids` in `token_ids`
with :code:`new_ids`. with `new_ids`.
Note that empty matches are ignored. Note that empty matches are ignored.
""" """
...@@ -654,7 +654,7 @@ def find_token_matches( ...@@ -654,7 +654,7 @@ def find_token_matches(
prompt: list[int], prompt: list[int],
prompt_updates: Sequence[BoundPromptUpdate], prompt_updates: Sequence[BoundPromptUpdate],
) -> Sequence[PromptTargetMatch]: ) -> Sequence[PromptTargetMatch]:
"""Return each target of :code:`prompt_updates` found in :code:`prompt`.""" """Return each target of `prompt_updates` found in `prompt`."""
def get_matches(update: BoundPromptUpdate): def get_matches(update: BoundPromptUpdate):
target = update.target target = update.target
...@@ -680,7 +680,7 @@ def find_text_matches( ...@@ -680,7 +680,7 @@ def find_text_matches(
prompt: str, prompt: str,
prompt_updates: Sequence[BoundPromptUpdate], prompt_updates: Sequence[BoundPromptUpdate],
) -> Sequence[PromptTargetMatch]: ) -> Sequence[PromptTargetMatch]:
"""Return each target of :code:`prompt_updates` found in :code:`prompt`.""" """Return each target of `prompt_updates` found in `prompt`."""
def get_matches(update: BoundPromptUpdate): def get_matches(update: BoundPromptUpdate):
target = update.target target = update.target
...@@ -707,7 +707,7 @@ def _resolve_matches( ...@@ -707,7 +707,7 @@ def _resolve_matches(
mm_matches: Mapping[str, Sequence[PromptTargetMatch]], mm_matches: Mapping[str, Sequence[PromptTargetMatch]],
) -> list[PromptTargetMatch]: ) -> list[PromptTargetMatch]:
""" """
Resolve :code:`mm_matches` to ensure that there are no overlapping matches, Resolve `mm_matches` to ensure that there are no overlapping matches,
and sort them such that earlier matches take priority over later ones. and sort them such that earlier matches take priority over later ones.
""" """
matches = [m for matches in mm_matches.values() for m in matches] matches = [m for matches in mm_matches.values() for m in matches]
...@@ -731,7 +731,7 @@ def _apply_matches( ...@@ -731,7 +731,7 @@ def _apply_matches(
mm_matches: Mapping[str, Sequence[PromptTargetMatch]], mm_matches: Mapping[str, Sequence[PromptTargetMatch]],
mm_item_counts: Mapping[str, int], mm_item_counts: Mapping[str, int],
) -> list[_S]: ) -> list[_S]:
"""Apply the updates in :code:`mm_matches` to :code:`prompt`.""" """Apply the updates in `mm_matches` to `prompt`."""
out_seqs = list[Union[str, list[int]]]() out_seqs = list[Union[str, list[int]]]()
prev_end_idx = 0 prev_end_idx = 0
next_idx_by_modality = defaultdict[str, int](lambda: 0) next_idx_by_modality = defaultdict[str, int](lambda: 0)
...@@ -780,7 +780,7 @@ def apply_token_matches( ...@@ -780,7 +780,7 @@ def apply_token_matches(
mm_matches: Mapping[str, Sequence[PromptTargetMatch]], mm_matches: Mapping[str, Sequence[PromptTargetMatch]],
mm_item_counts: Mapping[str, int], mm_item_counts: Mapping[str, int],
) -> list[int]: ) -> list[int]:
"""Apply the updates in :code:`mm_matches` to :code:`prompt`.""" """Apply the updates in `mm_matches` to `prompt`."""
if not mm_matches: if not mm_matches:
return prompt return prompt
...@@ -794,7 +794,7 @@ def apply_text_matches( ...@@ -794,7 +794,7 @@ def apply_text_matches(
mm_matches: Mapping[str, Sequence[PromptTargetMatch]], mm_matches: Mapping[str, Sequence[PromptTargetMatch]],
mm_item_counts: Mapping[str, int], mm_item_counts: Mapping[str, int],
) -> str: ) -> str:
"""Apply the updates in :code:`mm_matches` to :code:`prompt`.""" """Apply the updates in `mm_matches` to `prompt`."""
if not mm_matches: if not mm_matches:
return prompt return prompt
...@@ -809,7 +809,7 @@ def _iter_placeholders( ...@@ -809,7 +809,7 @@ def _iter_placeholders(
mm_item_counts: Mapping[str, int], mm_item_counts: Mapping[str, int],
) -> Iterable[PlaceholderFeaturesInfo]: ) -> Iterable[PlaceholderFeaturesInfo]:
""" """
Yield each set of placeholder tokens found in :code:`prompt`. Yield each set of placeholder tokens found in `prompt`.
Matches are exclusive even when multiple modalities share Matches are exclusive even when multiple modalities share
the same placeholder tokens. In that case, the modality that the same placeholder tokens. In that case, the modality that
...@@ -1016,7 +1016,7 @@ class ProcessingCache: ...@@ -1016,7 +1016,7 @@ class ProcessingCache:
) -> None: ) -> None:
""" """
Put a processed multi-modal item into the cache Put a processed multi-modal item into the cache
according to its dependencies (see :meth:`get`). according to its dependencies (see {meth}`get`).
""" """
cache_key = MultiModalHasher.hash_kwargs(model_id=model_id, cache_key = MultiModalHasher.hash_kwargs(model_id=model_id,
**{modality: input_item}, **{modality: input_item},
...@@ -1083,7 +1083,7 @@ _I = TypeVar("_I", bound=BaseProcessingInfo) ...@@ -1083,7 +1083,7 @@ _I = TypeVar("_I", bound=BaseProcessingInfo)
MultiModalHashes = dict[str, list[str]] MultiModalHashes = dict[str, list[str]]
""" """
A collection of hashes with a similar structure as :class:`MultiModalKwargs`. A collection of hashes with a similar structure as {class}`MultiModalKwargs`.
""" """
...@@ -1091,7 +1091,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): ...@@ -1091,7 +1091,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
""" """
Abstract base class to process multi-modal inputs to be used in vLLM. Abstract base class to process multi-modal inputs to be used in vLLM.
Not to be confused with :class:`transformers.ProcessorMixin`. Not to be confused with {class}`transformers.ProcessorMixin`.
""" """
def __init__(self, def __init__(self,
...@@ -1118,10 +1118,10 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): ...@@ -1118,10 +1118,10 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
def _get_data_parser(self) -> MultiModalDataParser: def _get_data_parser(self) -> MultiModalDataParser:
""" """
Construct a parser to preprocess multi-modal data items Construct a parser to preprocess multi-modal data items
before passing them to :meth:`_get_hf_mm_data`. before passing them to {meth}`_get_hf_mm_data`.
You can support additional modalities by creating a subclass You can support additional modalities by creating a subclass
of :class:`MultiModalDataParser` that has additional subparsers. of {class}`MultiModalDataParser` that has additional subparsers.
""" """
return MultiModalDataParser() return MultiModalDataParser()
...@@ -1130,8 +1130,8 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): ...@@ -1130,8 +1130,8 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
mm_data: MultiModalDataDict, mm_data: MultiModalDataDict,
) -> MultiModalDataItems: ) -> MultiModalDataItems:
""" """
Normalize :class:`MultiModalDataDict` to :class:`MultiModalDataItems` Normalize {class}`MultiModalDataDict` to {class}`MultiModalDataItems`
before passing them to :meth:`_get_hf_mm_data`. before passing them to {meth}`_get_hf_mm_data`.
""" """
mm_items = self.data_parser.parse_mm_data(mm_data) mm_items = self.data_parser.parse_mm_data(mm_data)
supported_mm_limits = self.info.get_supported_mm_limits() supported_mm_limits = self.info.get_supported_mm_limits()
...@@ -1183,7 +1183,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): ...@@ -1183,7 +1183,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
inputs. inputs.
Moreover, this information is critical to determine the token positions Moreover, this information is critical to determine the token positions
in order to construct :class:`~vllm.multimodal.inputs.PlaceholderRange` in order to construct {class}`~vllm.multimodal.inputs.PlaceholderRange`
for each multi-modal item. for each multi-modal item.
""" """
raise NotImplementedError raise NotImplementedError
...@@ -1237,8 +1237,8 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): ...@@ -1237,8 +1237,8 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
""" """
Return whether the HF processor applies prompt updates. Return whether the HF processor applies prompt updates.
For most HF processors, this should be :code:`True` when multi-modal For most HF processors, this should be `True` when multi-modal
data items are passed, but :code:`False` when multi-modal embeddings data items are passed, but `False` when multi-modal embeddings
are passed. are passed.
""" """
return not any( return not any(
...@@ -1307,7 +1307,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): ...@@ -1307,7 +1307,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
Most HF processors accept prompt text but not prompt tokens. Most HF processors accept prompt text but not prompt tokens.
If the HF processor adds or removes tokens that are not related to If the HF processor adds or removes tokens that are not related to
multi-modal data, you should override this method so it is consistent multi-modal data, you should override this method so it is consistent
with the output of :meth:`_apply_hf_processor_text_only` on the with the output of {meth}`_apply_hf_processor_text_only` on the
corresponding text. corresponding text.
""" """
return prompt_tokens return prompt_tokens
...@@ -1322,7 +1322,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): ...@@ -1322,7 +1322,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
Since HF processor requires that text and multi-modal items Since HF processor requires that text and multi-modal items
correspond to each other, we generate dummy text using correspond to each other, we generate dummy text using
:class:`DummyInputsBuilder` to go along with the multi-modal data. {class}`DummyInputsBuilder` to go along with the multi-modal data.
""" """
mm_counts = mm_items.get_all_counts() mm_counts = mm_items.get_all_counts()
...@@ -1346,10 +1346,10 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): ...@@ -1346,10 +1346,10 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
Apply the HF processor on the prompt text and multi-modal data. Apply the HF processor on the prompt text and multi-modal data.
In addition, return whether prompt updates have been applied In addition, return whether prompt updates have been applied
(for most HF processors, this should be :code:`True`). (for most HF processors, this should be `True`).
Note: Note:
If :code:`enable_hf_prompt_update=False`, we use HF processor If `enable_hf_prompt_update=False`, we use HF processor
to perform prompt updates if available; HF processor requires to perform prompt updates if available; HF processor requires
that the prompt corresponds to multi-modal items. that the prompt corresponds to multi-modal items.
""" """
......
...@@ -25,7 +25,7 @@ logger = init_logger(__name__) ...@@ -25,7 +25,7 @@ logger = init_logger(__name__)
class ProcessorInputs: class ProcessorInputs:
""" """
Represents the keyword arguments to Represents the keyword arguments to
:meth:`vllm.multimodal.processing.BaseMultiModalProcessor.apply`. {meth}`vllm.multimodal.processing.BaseMultiModalProcessor.apply`.
""" """
prompt_text: str prompt_text: str
mm_data: MultiModalDataDict mm_data: MultiModalDataDict
...@@ -63,7 +63,7 @@ class BaseDummyInputsBuilder(ABC, Generic[_I]): ...@@ -63,7 +63,7 @@ class BaseDummyInputsBuilder(ABC, Generic[_I]):
# TODO: @abstractmethod after transition # TODO: @abstractmethod after transition
def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str: def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
""" """
Build the text input corresponding to :code:`mm_counts`. Build the text input corresponding to `mm_counts`.
""" """
if (type(self).get_dummy_processor_inputs == if (type(self).get_dummy_processor_inputs ==
BaseDummyInputsBuilder.get_dummy_processor_inputs): BaseDummyInputsBuilder.get_dummy_processor_inputs):
......
...@@ -29,7 +29,7 @@ _I_co = TypeVar("_I_co", bound=BaseProcessingInfo, covariant=True) ...@@ -29,7 +29,7 @@ _I_co = TypeVar("_I_co", bound=BaseProcessingInfo, covariant=True)
class ProcessingInfoFactory(Protocol[_I_co]): class ProcessingInfoFactory(Protocol[_I_co]):
"""Constructs a :class:`BaseProcessingInfo` instance from the context.""" """Constructs a {class}`BaseProcessingInfo` instance from the context."""
def __call__( def __call__(
self, self,
...@@ -40,7 +40,7 @@ class ProcessingInfoFactory(Protocol[_I_co]): ...@@ -40,7 +40,7 @@ class ProcessingInfoFactory(Protocol[_I_co]):
class DummyInputsBuilderFactory(Protocol[_I]): class DummyInputsBuilderFactory(Protocol[_I]):
""" """
Constructs a :class:`BaseDummyInputsBuilder` instance from the context. Constructs a {class}`BaseDummyInputsBuilder` instance from the context.
""" """
def __call__(self, info: _I) -> BaseDummyInputsBuilder[_I]: def __call__(self, info: _I) -> BaseDummyInputsBuilder[_I]:
...@@ -48,7 +48,7 @@ class DummyInputsBuilderFactory(Protocol[_I]): ...@@ -48,7 +48,7 @@ class DummyInputsBuilderFactory(Protocol[_I]):
class MultiModalProcessorFactory(Protocol[_I]): class MultiModalProcessorFactory(Protocol[_I]):
"""Constructs a :class:`BaseMultiModalProcessor` instance from the context.""" """Constructs a {class}`BaseMultiModalProcessor` instance from the context."""
def __call__( def __call__(
self, self,
...@@ -150,7 +150,7 @@ class MultiModalRegistry: ...@@ -150,7 +150,7 @@ class MultiModalRegistry:
Get the maximum number of tokens from each modality Get the maximum number of tokens from each modality
for profiling the memory usage of a model. for profiling the memory usage of a model.
See :meth:`MultiModalPlugin.get_max_multimodal_tokens` for more details. See {meth}`MultiModalPlugin.get_max_multimodal_tokens` for more details.
""" """
mm_limits = self.get_mm_limits_per_prompt(model_config) mm_limits = self.get_mm_limits_per_prompt(model_config)
...@@ -165,7 +165,7 @@ class MultiModalRegistry: ...@@ -165,7 +165,7 @@ class MultiModalRegistry:
Get the maximum number of multi-modal tokens Get the maximum number of multi-modal tokens
for profiling the memory usage of a model. for profiling the memory usage of a model.
See :meth:`MultiModalPlugin.get_max_multimodal_tokens` for more details. See {meth}`MultiModalPlugin.get_max_multimodal_tokens` for more details.
""" """
return sum(self.get_max_tokens_by_modality(model_config).values()) return sum(self.get_max_tokens_by_modality(model_config).values())
...@@ -208,8 +208,9 @@ class MultiModalRegistry: ...@@ -208,8 +208,9 @@ class MultiModalRegistry:
When the model receives multi-modal data, the provided function is When the model receives multi-modal data, the provided function is
invoked to transform the data into a dictionary of model inputs. invoked to transform the data into a dictionary of model inputs.
See also: :::{seealso}
:ref:`mm-processing` {ref}`mm-processing`
:::
""" """
def wrapper(model_cls: N) -> N: def wrapper(model_cls: N) -> N:
...@@ -253,8 +254,9 @@ class MultiModalRegistry: ...@@ -253,8 +254,9 @@ class MultiModalRegistry:
""" """
Create a multi-modal processor for a specific model and tokenizer. Create a multi-modal processor for a specific model and tokenizer.
See also: :::{seealso}
:ref:`mm-processing` {ref}`mm-processing`
:::
""" """
if not model_config.is_multimodal_model: if not model_config.is_multimodal_model:
raise ValueError(f"{model_config.model} is not a multimodal model") raise ValueError(f"{model_config.model} is not a multimodal model")
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment