Unverified commit d6484ef3, authored by Harry Mellor, committed by GitHub
Browse files

Add full API docs and improve the UX of navigating them (#17485)


Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
parent 46fae69c
......@@ -235,7 +235,7 @@ class Sampler(nn.Module):
* Defer Pythonization of sampling result & logprobs
tensor
* Encapsulate arguments required for deferred Pythonization
in the :class:`SamplerOutput` structure
in the {class}`SamplerOutput` structure
Args:
logits: (num_tokens, vocab_size).
......
......@@ -107,14 +107,15 @@ class TypicalAcceptanceSampler(SpecDecodeDeterministicBaseSampler):
A draft token_id x_{n+k} is accepted if it satisfies the
following condition
.. math::
p_{\text{original}}(x_{n+k} | x_1, x_2, \dots, x_{n+k-1}) >
\min \left( \epsilon, \delta * \exp \left(
-H(p_{\text{original}}(
\cdot | x_1, x_2, \ldots, x_{n+k-1})) \right) \right)
:::{math}
p_{\text{original}}(x_{n+k} | x_1, x_2, \dots, x_{n+k-1}) >
\min \left( \epsilon, \delta * \exp \left(
-H(p_{\text{original}}(
\cdot | x_1, x_2, \ldots, x_{n+k-1})) \right) \right)
:::
where :math:`p_{\text{original}}` corresponds to target_probs
and :math:`\epsilon` and :math:`\delta` correspond to hyperparameters
where {math}`p_{\text{original}}` corresponds to target_probs
and {math}`\epsilon` and {math}`\delta` correspond to hyperparameters
specified using self._posterior_threshold and self._posterior_alpha
This method computes the posterior probabilities for the given
......
......@@ -681,8 +681,9 @@ class Blip2ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP,
batch.
pixel_values: The pixels in each input image.
See also:
:class:`Blip2ImageInputs`
:::{seealso}
{class}`Blip2ImageInputs`
:::
"""
if intermediate_tensors is not None:
......
......@@ -226,9 +226,9 @@ class SupportsPP(Protocol):
intermediate_tensors: Optional["IntermediateTensors"],
) -> Union[Tensor, "IntermediateTensors"]:
"""
Accept :class:`IntermediateTensors` when PP rank > 0.
Accept {class}`IntermediateTensors` when PP rank > 0.
Return :class:`IntermediateTensors` only for the last PP rank.
Return {class}`IntermediateTensors` only for the last PP rank.
"""
...
......
......@@ -721,8 +721,9 @@ class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
batch.
pixel_values: The pixels in each input image.
See also:
:class:`LlavaImageInputs`
:::{seealso}
{class}`LlavaImageInputs`
:::
"""
if intermediate_tensors is not None:
inputs_embeds = None
......
......@@ -537,7 +537,7 @@ class LlavaNextForConditionalGeneration(nn.Module, SupportsMultiModal,
Unlike in LLaVA-1.5, the number of image tokens inputted to the language
model depends on the original size of the input image. Including the
original image token in the input, the required number of image tokens
is given by :func:`get_llava_next_image_feature_size`.
is given by {func}`get_llava_next_image_feature_size`.
This way, the `positions` and `attn_metadata` are consistent
with the `input_ids`.
......@@ -548,8 +548,9 @@ class LlavaNextForConditionalGeneration(nn.Module, SupportsMultiModal,
pixel_values: The pixels in each grid patch for each input image.
image_sizes: The original `(height, width)` for each input image.
See also:
:class:`LlavaNextImageInputs`
:::{seealso}
{class}`LlavaNextImageInputs`
:::
"""
if intermediate_tensors is not None:
inputs_embeds = None
......
......@@ -559,8 +559,9 @@ class Mistral3ForConditionalGeneration(nn.Module, SupportsLoRA,
batch.
pixel_values: The pixels in each input image.
See also:
:class:`Mistral3ImagePixelInputs`
:::{seealso}
{class}`Mistral3ImagePixelInputs`
:::
"""
if intermediate_tensors is not None:
inputs_embeds = None
......
......@@ -965,7 +965,7 @@ def select_tiling(
class MolmoProcessorWrapper:
"""
Wraps :class:`MolmoProcessor` so that it can be called directly.
Wraps {class}`MolmoProcessor` so that it can be called directly.
The original definition can be found here:
https://huggingface.co/allenai/Molmo-7B-D-0924/blob/main/preprocessing_molmo.py
......
......@@ -12,7 +12,7 @@ import torch.nn.functional as F
from torch import Tensor, nn
class Block(nn.Module):
class BlockBase(nn.Module):
"""Block abstract module"""
def __init__(self, input_size, output_size):
......@@ -1602,7 +1602,7 @@ class AttModule(nn.Module):
return x, memory, pos_emb, att_mask
class AttBlock(Block, AttModule):
class AttBlock(BlockBase, AttModule):
"""Attention Block module to support both Attention and Block module."""
def memory_dims(self, max_len=False):
......
......@@ -65,14 +65,14 @@ class PixtralImagePixelInputs(TypedDict):
"""
Shape: `(batch_size * num_images, num_channels, image_width, image_height)`
The result of stacking :attr:`ImageEncoding.tokens` from each prompt.
The result of stacking {attr}`ImageEncoding.tokens` from each prompt.
"""
class PixtralProcessorAdapter:
"""
Provide a HF-compatible interface for
:class:`mistral_common.tokens.tokenizers.multimodal.ImageEncoder`.
{class}`mistral_common.tokens.tokenizers.multimodal.ImageEncoder`.
"""
def __init__(self, tokenizer: MistralTokenizer) -> None:
......
......@@ -383,7 +383,7 @@ def _get_tokenizer_without_image_pad(
tokenizer: PreTrainedTokenizer) -> PreTrainedTokenizer:
"""
The logic of adding image pad tokens should only be applied in
:class:`QwenVLProcessor`, so they are patched out here.
{class}`QwenVLProcessor`, so they are patched out here.
The definition of the wrapped tokenizer can be found here:
https://huggingface.co/Qwen/Qwen-VL/blob/main/tokenization_qwen.py
......
......@@ -19,7 +19,6 @@ import cloudpickle
import torch.nn as nn
from vllm.logger import init_logger
from vllm.utils import is_in_doc_build
from .interfaces import (has_inner_state, has_noops, is_attention_free,
is_hybrid, supports_cross_encoding,
......@@ -375,13 +374,13 @@ class _ModelRegistry:
"""
Register an external model to be used in vLLM.
:code:`model_cls` can be either:
`model_cls` can be either:
- A :class:`torch.nn.Module` class directly referencing the model.
- A string in the format :code:`<module>:<class>` which can be used to
- A {class}`torch.nn.Module` class directly referencing the model.
- A string in the format `<module>:<class>` which can be used to
lazily import the model. This is useful to avoid initializing CUDA
when importing the model and thus the related error
:code:`RuntimeError: Cannot re-initialize CUDA in forked subprocess`.
`RuntimeError: Cannot re-initialize CUDA in forked subprocess`.
"""
if not isinstance(model_arch, str):
msg = f"`model_arch` should be a string, not a {type(model_arch)}"
......@@ -400,8 +399,7 @@ class _ModelRegistry:
raise ValueError(msg)
model = _LazyRegisteredModel(*split_str)
elif isinstance(model_cls, type) and (is_in_doc_build() or issubclass(
model_cls, nn.Module)):
elif isinstance(model_cls, type) and issubclass(model_cls, nn.Module):
model = _RegisteredModel.from_model_cls(model_cls)
else:
msg = ("`model_cls` should be a string or PyTorch model class, "
......
......@@ -66,7 +66,7 @@ class WeightsMapper:
class AutoWeightsLoader:
"""
Helper class to load weights into a :class:`torch.nn.Module`. It is able
Helper class to load weights into a {class}`torch.nn.Module`. It is able
to automatically detect child modules and parameters while iterating over
the weights only once.
......
......@@ -8,11 +8,12 @@ from .registry import MultiModalRegistry
MULTIMODAL_REGISTRY = MultiModalRegistry()
"""
The global :class:`~MultiModalRegistry` is used by model runners to
The global {class}`~MultiModalRegistry` is used by model runners to
dispatch data processing according to the target model.
See also:
:ref:`mm-processing`
:::{seealso}
{ref}`mm-processing`
:::
"""
__all__ = [
......
......@@ -64,35 +64,35 @@ class MultiModalPlaceholderMap:
Examples:
.. code-block::
```
Prompt: |AAAA BBBB What's in these images?|
Positions: |.................................|
Prompt: |AAAA BBBB What's in these images?|
Positions: |.................................|
images = [A, B]
src_ranges = [(0, 4), (4, 8)]
dest_ranges = [(0, 4), (5, 9)]
images = [A, B]
src_ranges = [(0, 4), (4, 8)]
dest_ranges = [(0, 4), (5, 9)]
Prompt: |AAAA BBBB What's in these images?|
Positions: | ..... |
Prompt: |AAAA BBBB What's in these images?|
Positions: | ..... |
images = [A, B]
src_ranges = [(2, 4), (4, 6)]
dest_ranges = [(0, 2), (3, 5)]
images = [A, B]
src_ranges = [(2, 4), (4, 6)]
dest_ranges = [(0, 2), (3, 5)]
Prompt: |AAAA BBBB What's in these images?|
Positions: | ......... |
Prompt: |AAAA BBBB What's in these images?|
Positions: | ......... |
images = [B]
src_ranges = [(0, 4)]
dest_ranges = [(0, 4)]
images = [B]
src_ranges = [(0, 4)]
dest_ranges = [(0, 4)]
Prompt: |AAAA BBBB What's in these images?|
Positions: | .......................|
Prompt: |AAAA BBBB What's in these images?|
Positions: | .......................|
images = []
src_ranges = []
dest_ranges = []
images = []
src_ranges = []
dest_ranges = []
```
"""
seq_mm_data = seq_group.multi_modal_data
seq_mm_placeholders = seq_group.multi_modal_placeholders
......
......@@ -26,27 +26,27 @@ _T = TypeVar("_T")
HfImageItem: TypeAlias = Union[Image, np.ndarray, torch.Tensor]
"""
A :class:`transformers.image_utils.ImageInput` representing a single image
item, which can be passed to a HuggingFace :code:`ImageProcessor`.
A {class}`transformers.image_utils.ImageInput` representing a single image
item, which can be passed to a HuggingFace `ImageProcessor`.
"""
HfVideoItem: TypeAlias = Union[list[Image], np.ndarray, torch.Tensor,
list[np.ndarray], list[torch.Tensor]]
"""
A :class:`transformers.image_utils.VideoInput` representing a single video
item, which can be passed to a HuggingFace :code:`VideoProcessor`.
A {class}`transformers.image_utils.VideoInput` representing a single video
item, which can be passed to a HuggingFace `VideoProcessor`.
"""
HfAudioItem: TypeAlias = Union[list[float], np.ndarray, torch.Tensor]
"""
Represents a single audio
item, which can be passed to a HuggingFace :code:`AudioProcessor`.
item, which can be passed to a HuggingFace `AudioProcessor`.
"""
ImageItem: TypeAlias = Union[HfImageItem, torch.Tensor]
"""
A :class:`transformers.image_utils.ImageInput` representing a single image
item, which can be passed to a HuggingFace :code:`ImageProcessor`.
A {class}`transformers.image_utils.ImageInput` representing a single image
item, which can be passed to a HuggingFace `ImageProcessor`.
Alternatively, a 3-D tensor or batch of 2-D tensors,
which are treated as image embeddings;
......@@ -55,8 +55,8 @@ these are directly passed to the model without HF processing.
VideoItem: TypeAlias = Union[HfVideoItem, torch.Tensor]
"""
A :class:`transformers.image_utils.VideoInput` representing a single video
item, which can be passed to a HuggingFace :code:`VideoProcessor`.
A {class}`transformers.image_utils.VideoInput` representing a single video
item, which can be passed to a HuggingFace `VideoProcessor`.
Alternatively, a 3-D tensor or batch of 2-D tensors,
which are treated as video embeddings;
......@@ -67,7 +67,7 @@ AudioItem: TypeAlias = Union[HfAudioItem, tuple[np.ndarray, float],
torch.Tensor]
"""
Represents a single audio
item, which can be passed to a HuggingFace :code:`AudioProcessor`.
item, which can be passed to a HuggingFace `AudioProcessor`.
Alternatively, a tuple `(audio, sampling_rate)`, where the sampling rate
is different from that expected by the model;
......@@ -83,7 +83,7 @@ ModalityData: TypeAlias = Union[_T, list[_T]]
Either a single data item, or a list of data items.
The number of data items allowed per modality is restricted by
:code:`--limit-mm-per-prompt`.
`--limit-mm-per-prompt`.
"""
......@@ -105,7 +105,7 @@ MultiModalDataDict: TypeAlias = Mapping[str, ModalityData[Any]]
"""
A dictionary containing an entry for each modality type to input.
The built-in modalities are defined by :class:`MultiModalDataBuiltins`.
The built-in modalities are defined by {class}`MultiModalDataBuiltins`.
"""
......@@ -116,14 +116,14 @@ class PlaceholderRange:
Example:
Prompt: :code:`AAAA BBBB What is in these images?`
Prompt: `AAAA BBBB What is in these images?`
Images A and B will have:
Images A and B will have:
.. code-block::
A: PlaceholderRange(offset=0, length=4)
B: PlaceholderRange(offset=5, length=4)
```
A: PlaceholderRange(offset=0, length=4)
B: PlaceholderRange(offset=5, length=4)
```
"""
offset: int
......@@ -166,7 +166,7 @@ Uses a list instead of a tensor if the dimensions of each element do not match.
def nested_tensors_equal(a: NestedTensors, b: NestedTensors) -> bool:
"""Equality check between :data:`NestedTensors` objects."""
"""Equality check between {data}`NestedTensors` objects."""
if isinstance(a, torch.Tensor):
return isinstance(b, torch.Tensor) and torch.equal(a, b)
elif isinstance(b, torch.Tensor):
......@@ -186,7 +186,7 @@ def nested_tensors_equal(a: NestedTensors, b: NestedTensors) -> bool:
BatchedTensorInputs: TypeAlias = Mapping[str, NestedTensors]
"""
A dictionary containing nested tensors which have been batched via
:meth:`MultiModalKwargs.batch`.
{meth}`MultiModalKwargs.batch`.
"""
......@@ -194,7 +194,7 @@ A dictionary containing nested tensors which have been batched via
class MultiModalFieldElem:
"""
Represents a keyword argument corresponding to a multi-modal item
in :class:`MultiModalKwargs`.
in {class}`MultiModalKwargs`.
"""
modality: str
......@@ -205,13 +205,13 @@ class MultiModalFieldElem:
key: str
"""
The key of this field in :class:`MultiModalKwargs`,
The key of this field in {class}`MultiModalKwargs`,
i.e. the name of the keyword argument to be passed to the model.
"""
data: NestedTensors
"""
The tensor data of this field in :class:`MultiModalKwargs`,
The tensor data of this field in {class}`MultiModalKwargs`,
i.e. the value of the keyword argument to be passed to the model.
"""
......@@ -234,7 +234,7 @@ class MultiModalFieldElem:
class BaseMultiModalField(ABC):
"""
Defines how to interpret tensor data belonging to a keyword argument in
:class:`MultiModalKwargs` for multiple multi-modal items, and vice versa.
{class}`MultiModalKwargs` for multiple multi-modal items, and vice versa.
"""
def _field_factory(self, *, modality: str, key: str):
......@@ -259,10 +259,10 @@ class BaseMultiModalField(ABC):
data: NestedTensors,
) -> Sequence[MultiModalFieldElem]:
"""
Construct :class:`MultiModalFieldElem` instances to represent
Construct {class}`MultiModalFieldElem` instances to represent
the provided data.
This is the inverse of :meth:`reduce_data`.
This is the inverse of {meth}`reduce_data`.
"""
raise NotImplementedError
......@@ -272,9 +272,9 @@ class BaseMultiModalField(ABC):
def reduce_data(self, elems: list[MultiModalFieldElem]) -> NestedTensors:
"""
Merge the data from multiple instances of :class:`MultiModalFieldElem`.
Merge the data from multiple instances of {class}`MultiModalFieldElem`.
This is the inverse of :meth:`build_elems`.
This is the inverse of {meth}`build_elems`.
"""
field_types = [type(item.field) for item in elems]
if len(set(field_types)) > 1:
......@@ -286,8 +286,9 @@ class BaseMultiModalField(ABC):
@dataclass(frozen=True)
class MultiModalBatchedField(BaseMultiModalField):
"""
See also:
:func:`MultiModalFieldConfig.batched`
:::{seealso}
{func}`MultiModalFieldConfig.batched`
:::
"""
def build_elems(
......@@ -316,9 +317,10 @@ class MultiModalBatchedField(BaseMultiModalField):
@dataclass(frozen=True)
class MultiModalFlatField(BaseMultiModalField):
"""
See also:
:func:`MultiModalFieldConfig.flat`
:func:`MultiModalFieldConfig.flat_from_sizes`
:::{seealso}
{func}`MultiModalFieldConfig.flat`
{func}`MultiModalFieldConfig.flat_from_sizes`
:::
"""
slices: Union[Sequence[slice], Sequence[Sequence[slice]]]
dim: int = 0
......@@ -358,8 +360,9 @@ class MultiModalFlatField(BaseMultiModalField):
@dataclass(frozen=True)
class MultiModalSharedField(BaseMultiModalField):
"""
See also:
:func:`MultiModalFieldConfig.shared`
:::{seealso}
{func}`MultiModalFieldConfig.shared`
:::
"""
batch_size: int
......@@ -390,17 +393,17 @@ class MultiModalFieldConfig:
Example:
.. code-block::
Input:
Data: [[AAAA]
[BBBB]
[CCCC]]
Output:
Element 1: [AAAA]
Element 2: [BBBB]
Element 3: [CCCC]
```
Input:
Data: [[AAAA]
[BBBB]
[CCCC]]
Output:
Element 1: [AAAA]
Element 2: [BBBB]
Element 3: [CCCC]
```
"""
return MultiModalFieldConfig(
field=MultiModalBatchedField(),
......@@ -425,35 +428,35 @@ class MultiModalFieldConfig:
Example:
.. code-block::
Given:
slices: [slice(0, 3), slice(3, 7), slice(7, 9)]
Input:
Data: [AAABBBBCC]
Output:
Element 1: [AAA]
Element 2: [BBBB]
Element 3: [CC]
.. code-block::
Given:
slices: [
(slice(None), slice(0, 3)),
(slice(None), slice(3, 7)),
(slice(None), slice(7, 9))]
dim: 1
Input:
Data: [[A],[A],[A],[B],[B],[B],[B],[C],[C]]
Output:
Element 1: [[A],[A],[A]]
Element 2: [[B],[B],[B],[B]]
Element 3: [[C],[C]]
```
Given:
slices: [slice(0, 3), slice(3, 7), slice(7, 9)]
Input:
Data: [AAABBBBCC]
Output:
Element 1: [AAA]
Element 2: [BBBB]
Element 3: [CC]
```
```
Given:
slices: [
(slice(None), slice(0, 3)),
(slice(None), slice(3, 7)),
(slice(None), slice(7, 9))]
dim: 1
Input:
Data: [[A],[A],[A],[B],[B],[B],[B],[C],[C]]
Output:
Element 1: [[A],[A],[A]]
Element 2: [[B],[B],[B],[B]]
Element 3: [[C],[C]]
```
"""
return MultiModalFieldConfig(
field=MultiModalFlatField(slices=slices, dim=dim),
......@@ -477,36 +480,36 @@ class MultiModalFieldConfig:
Example:
.. code-block::
Given:
size_per_item: [3, 4, 2]
Input:
Data: [AAABBBBCC]
Output:
Element 1: [AAA]
Element 2: [BBBB]
Element 3: [CC]
.. code-block::
Given:
slices: [3, 4, 2]
dim: 1
Input:
Data: [[A],[A],[A],[B],[B],[B],[B],[C],[C]]
Output:
Element 1: [[A],[A],[A]]
Element 2: [[B],[B],[B],[B]]
Element 3: [[C],[C]]
See also:
:func:`MultiModalFieldConfig.flat`
```
Given:
size_per_item: [3, 4, 2]
Input:
Data: [AAABBBBCC]
Output:
Element 1: [AAA]
Element 2: [BBBB]
Element 3: [CC]
```
```
Given:
size_per_item: [3, 4, 2]
dim: 1
Input:
Data: [[A],[A],[A],[B],[B],[B],[B],[C],[C]]
Output:
Element 1: [[A],[A],[A]]
Element 2: [[B],[B],[B],[B]]
Element 3: [[C],[C]]
```
:::{seealso}
{func}`MultiModalFieldConfig.flat`
:::
"""
if size_per_item.ndim != 1:
......@@ -535,19 +538,19 @@ class MultiModalFieldConfig:
Example:
.. code-block::
Given:
batch_size: 4
```
Given:
batch_size: 4
Input:
Data: [XYZ]
Input:
Data: [XYZ]
Output:
Element 1: [XYZ]
Element 2: [XYZ]
Element 3: [XYZ]
Element 4: [XYZ]
Output:
Element 1: [XYZ]
Element 2: [XYZ]
Element 3: [XYZ]
Element 4: [XYZ]
```
"""
return MultiModalFieldConfig(
field=MultiModalSharedField(batch_size),
......@@ -570,8 +573,8 @@ class MultiModalFieldConfig:
class MultiModalKwargsItem(UserDict[str, MultiModalFieldElem]):
"""
A collection of :class:`MultiModalFieldElem`
corresponding to a data item in :class:`MultiModalDataItems`.
A collection of {class}`MultiModalFieldElem`
corresponding to a data item in {class}`MultiModalDataItems`.
"""
@staticmethod
......@@ -590,11 +593,11 @@ class MultiModalKwargsItem(UserDict[str, MultiModalFieldElem]):
class MultiModalKwargs(UserDict[str, NestedTensors]):
"""
A dictionary that represents the keyword arguments to
:meth:`~torch.nn.Module.forward`.
{meth}`~torch.nn.Module.forward`.
The metadata :code:`items` enables us to obtain the keyword arguments
corresponding to each data item in :class:`MultiModalDataItems`, via
:meth:`get_item` and :meth:`get_items`.
The metadata `items` enables us to obtain the keyword arguments
corresponding to each data item in {class}`MultiModalDataItems`, via
{meth}`get_item` and {meth}`get_items`.
"""
@staticmethod
......@@ -633,7 +636,7 @@ class MultiModalKwargs(UserDict[str, NestedTensors]):
@staticmethod
def from_items(items: Sequence[MultiModalKwargsItem]):
"""Construct a new :class:`MultiModalKwargs` from multiple items."""
"""Construct a new {class}`MultiModalKwargs` from multiple items."""
elems_by_key = defaultdict[str, list[MultiModalFieldElem]](list)
for item in items:
for key, elem in item.items():
......@@ -798,7 +801,7 @@ A dictionary containing placeholder ranges for each modality.
class MultiModalInputs(TypedDict):
"""
Represents the outputs of
:class:`vllm.multimodal.processing.BaseMultiModalProcessor`,
{class}`vllm.multimodal.processing.BaseMultiModalProcessor`,
ready to be passed to vLLM internals.
"""
......@@ -823,7 +826,7 @@ class MultiModalInputs(TypedDict):
mm_placeholders: MultiModalPlaceholderDict
"""
For each modality, information about the placeholder tokens in
:code:`prompt_token_ids`.
`prompt_token_ids`.
"""
cache_salt: NotRequired[str]
......@@ -834,7 +837,7 @@ class MultiModalInputs(TypedDict):
class MultiModalEncDecInputs(MultiModalInputs):
"""
Represents the outputs of :class:`vllm.multimodal.EncDecMultiModalProcessor`
Represents the outputs of
{class}`vllm.multimodal.processing.EncDecMultiModalProcessor`
ready to be passed to vLLM internals.
"""
......
......@@ -25,7 +25,7 @@ _I = TypeVar("_I")
class ModalityDataItems(ABC, Generic[_T, _I]):
"""
Represents data items for a modality in :class:`MultiModalDataItems`.
Represents data items for a modality in {class}`MultiModalDataItems`.
"""
def __init__(self, data: _T, modality: str) -> None:
......@@ -246,7 +246,7 @@ _D = TypeVar("_D", bound=ModalityDataItems[Any, Any])
class MultiModalDataItems(UserDict[str, ModalityDataItems[Any, Any]]):
"""
As :data:`~vllm.multimodal.inputs.MultiModalDataDict`, but normalized
As {data}`~vllm.multimodal.inputs.MultiModalDataDict`, but normalized
such that each entry corresponds to a list.
"""
......@@ -254,7 +254,7 @@ class MultiModalDataItems(UserDict[str, ModalityDataItems[Any, Any]]):
"""
Get the number of data items belonging to a modality.
If `strict=False`, return `0` instead of raising :exc:`KeyError`
If `strict=False`, return `0` instead of raising {exc}`KeyError`
even if the modality is not found.
"""
if modality not in self:
......@@ -300,8 +300,8 @@ ModalityDataParser: TypeAlias = Callable[[ModalityData[Any]],
class MultiModalDataParser:
"""
Parses :data:`~vllm.multimodal.inputs.MultiModalDataDict` into
:class:`MultiModalDataItems`.
Parses {data}`~vllm.multimodal.inputs.MultiModalDataDict` into
{class}`MultiModalDataItems`.
Args:
target_sr (float, optional): Enables automatic resampling of audio
......
......@@ -111,13 +111,13 @@ class PromptUpdateDetails(Generic[_S]):
is_embed: Optional[Callable[["_BoundPromptSequence"], torch.Tensor]] = None
"""
Given :attr:`full`, return a boolean mask of shape `(len(full),)`
Given {attr}`full`, return a boolean mask of shape `(len(full),)`
indicating which positions of `full` to assign embeddings to.
`None` (default) means to assign embeddings to all positions of `full`.
The embeddings are obtained by calling
:class:`SupportsMultiModal.get_multimodal_embeddings`.
{meth}`SupportsMultiModal.get_multimodal_embeddings`.
"""
@staticmethod
......@@ -156,13 +156,13 @@ PromptUpdateInfo = Union[PromptSeq, PromptUpdateDetails]
The token sequence or text that are part of the update.
If only part of the content corresponds to feature placeholders, you can
use :class:`PromptUpdateDetails` to specify which part.
use {class}`PromptUpdateDetails` to specify which part.
"""
PromptUpdateContent = Union[Callable[[int], PromptUpdateInfo],
PromptUpdateInfo]
"""
Given the index of the processed item within :attr:`modality`,
Given the index of the processed item within {attr}`modality`,
output the corresponding token sequence (or text).
For convenience, you can directly pass in the token sequence (or text)
......@@ -213,52 +213,52 @@ class PromptInsertion(PromptUpdate):
Example:
For each image, insert a number of ``<image>`` feature placeholders
equal to the feature size of the vision encoder after the ``<s>`` token:
.. code-block:: python
PromptInsertion(
modality="image",
target="<s>",
insertion="<image>" * image_feature_size,
)
Insert these tokens at the start of the prompt:
.. code-block:: python
PromptInsertion(
modality="image",
target=PromptIndexTargets.start(),
insertion="<image>" * image_feature_size,
)
Insert these tokens after a prefix ``Images:``:
.. code-block:: python
PromptInsertion(
modality="image",
target=PromptIndexTargets.prefix("Images:"),
insertion="<image>" * image_feature_size,
)
Insert these tokens at the end of the prompt:
.. code-block:: python
PromptInsertion(
modality="image",
target=PromptIndexTargets.end(),
insertion="<image>" * image_feature_size,
)
For each image, insert a number of ``<image>`` feature placeholders
equal to the feature size of the vision encoder after the ``<s>`` token:
```python
PromptInsertion(
modality="image",
target="<s>",
insertion="<image>" * image_feature_size,
)
```
Insert these tokens at the start of the prompt:
```python
PromptInsertion(
modality="image",
target=PromptIndexTargets.start(),
insertion="<image>" * image_feature_size,
)
```
Insert these tokens after a prefix ``Images:``:
```python
PromptInsertion(
modality="image",
target=PromptIndexTargets.prefix("Images:"),
insertion="<image>" * image_feature_size,
)
```
Insert these tokens at the end of the prompt:
```python
PromptInsertion(
modality="image",
target=PromptIndexTargets.end(),
insertion="<image>" * image_feature_size,
)
```
"""
insertion: PromptUpdateContent = field(repr=False)
"""
Given the index of the processed item within :attr:`modality`,
output the token sequence (or text) to insert right after :attr:`target`.
Given the index of the processed item within {attr}`modality`,
output the token sequence (or text) to insert right after {attr}`target`.
For convenience, you can directly pass in the token sequence (or text)
instead of a function if it does not depend on the input.
......@@ -280,57 +280,57 @@ class PromptReplacement(PromptUpdate):
Example:
For each image, replace one ``<image>`` input placeholder in the prompt
with a number of ``<image>`` feature placeholders
equal to the feature size of the vision encoder:
.. code-block:: python
PromptReplacement(
modality="image",
target="<image>",
replacement="<image>" * image_feature_size,
)
As above, but further pad the feature placeholders with ``<image_bos>``
and `<image_eos>``, which are not supposed to be passed to the vision
encoder:
.. code-block:: python
PromptReplacement(
modality="image",
target="<image>",
replacement=PromptUpdateDetails(
full="".join([
"<image_bos>",
"<image>" * image_feature_size,
"<image_eos>",
]),
features="<image>" * image_feature_size,
),
)
To avoid unnecessary tokenization during prompt replacement,
we recommended passing token sequences instead of text:
.. code-block:: python
PromptReplacement(
modality="image",
target=[image_token_id],
replacement=PromptUpdateDetails(
full=([image_bos_id] + [image_token_id] * image_feature_size
+ [image_eos_id]),
features=[image_token_id] * image_feature_size,
),
)
For each image, replace one ``<image>`` input placeholder in the prompt
with a number of ``<image>`` feature placeholders
equal to the feature size of the vision encoder:
```python
PromptReplacement(
modality="image",
target="<image>",
replacement="<image>" * image_feature_size,
)
```
As above, but further pad the feature placeholders with ``<image_bos>``
and ``<image_eos>``, which are not supposed to be passed to the vision
encoder:
```python
PromptReplacement(
modality="image",
target="<image>",
replacement=PromptUpdateDetails(
full="".join([
"<image_bos>",
"<image>" * image_feature_size,
"<image_eos>",
]),
features="<image>" * image_feature_size,
),
)
```
To avoid unnecessary tokenization during prompt replacement,
we recommend passing token sequences instead of text:
```python
PromptReplacement(
modality="image",
target=[image_token_id],
replacement=PromptUpdateDetails(
full=([image_bos_id] + [image_token_id] * image_feature_size
+ [image_eos_id]),
features=[image_token_id] * image_feature_size,
),
)
```
"""
replacement: PromptUpdateContent = field(repr=False)
"""
Given the index of the processed item within :attr:`modality`,
output the token sequence (or text) to replace :attr:`target`.
Given the index of the processed item within {attr}`modality`,
output the token sequence (or text) to replace {attr}`target`.
For convenience, you can directly pass in the token sequence (or text)
instead of a function if it does not depend on the input.
......@@ -384,14 +384,14 @@ _M = TypeVar("_M", bound=Union[_HasModalityAttr, _HasModalityProp])
def full_groupby_modality(values: Iterable[_M]) -> ItemsView[str, list[_M]]:
"""Convenience function to apply :func:`full_groupby` based on modality."""
"""Convenience function to apply {func}`full_groupby` based on modality."""
return full_groupby(values, key=lambda x: x.modality)
@dataclass
class _BoundPromptSequence:
"""
A :data:`_PromptSeq` bound to a tokenizer to automatically
A {data}`_PromptSeq` bound to a tokenizer to automatically
convert between token sequence and text representations.
"""
tokenizer: AnyTokenizer = field(repr=False)
......@@ -443,8 +443,8 @@ class _BoundPromptContent:
@dataclass
class BoundPromptUpdate:
"""
A :class:`PromptUpdate` bound to a tokenizer to automatically convert
:attr:`target` and the result of :meth:`get_content` between
A {class}`PromptUpdate` bound to a tokenizer to automatically convert
{attr}`target` and the result of {meth}`get_content` between
token sequence and text representations.
"""
_origin: PromptUpdate
......@@ -479,7 +479,7 @@ class BoundPromptUpdate:
def get_content(self, item_idx: int) -> _BoundPromptContent:
"""
Given the index of the processed item within :attr:`modality`,
Given the index of the processed item within {attr}`modality`,
output the token sequence (or text) to update.
"""
content = self.content
......@@ -516,7 +516,7 @@ def iter_token_matches(
match_ids: list[int],
) -> Generator[_TokenMatch]:
"""
Yield each occurrence of :code:`match_ids` in :code:`token_ids`.
Yield each occurrence of `match_ids` in `token_ids`.
Note that empty matches are ignored.
"""
......@@ -545,8 +545,8 @@ def replace_token_matches(
new_ids: list[int],
) -> list[int]:
"""
Replace each occurrence of :code:`match_ids` in :code:`token_ids`
with :code:`new_ids`.
Replace each occurrence of `match_ids` in `token_ids`
with `new_ids`.
Note that empty matches are ignored.
"""
......@@ -654,7 +654,7 @@ def find_token_matches(
prompt: list[int],
prompt_updates: Sequence[BoundPromptUpdate],
) -> Sequence[PromptTargetMatch]:
"""Return each target of :code:`prompt_updates` found in :code:`prompt`."""
"""Return each target of `prompt_updates` found in `prompt`."""
def get_matches(update: BoundPromptUpdate):
target = update.target
......@@ -680,7 +680,7 @@ def find_text_matches(
prompt: str,
prompt_updates: Sequence[BoundPromptUpdate],
) -> Sequence[PromptTargetMatch]:
"""Return each target of :code:`prompt_updates` found in :code:`prompt`."""
"""Return each target of `prompt_updates` found in `prompt`."""
def get_matches(update: BoundPromptUpdate):
target = update.target
......@@ -707,7 +707,7 @@ def _resolve_matches(
mm_matches: Mapping[str, Sequence[PromptTargetMatch]],
) -> list[PromptTargetMatch]:
"""
Resolve :code:`mm_matches` to ensure that there are no overlapping matches,
Resolve `mm_matches` to ensure that there are no overlapping matches,
and sort them such that earlier matches take priority over later ones.
"""
matches = [m for matches in mm_matches.values() for m in matches]
......@@ -731,7 +731,7 @@ def _apply_matches(
mm_matches: Mapping[str, Sequence[PromptTargetMatch]],
mm_item_counts: Mapping[str, int],
) -> list[_S]:
"""Apply the updates in :code:`mm_matches` to :code:`prompt`."""
"""Apply the updates in `mm_matches` to `prompt`."""
out_seqs = list[Union[str, list[int]]]()
prev_end_idx = 0
next_idx_by_modality = defaultdict[str, int](lambda: 0)
......@@ -780,7 +780,7 @@ def apply_token_matches(
mm_matches: Mapping[str, Sequence[PromptTargetMatch]],
mm_item_counts: Mapping[str, int],
) -> list[int]:
"""Apply the updates in :code:`mm_matches` to :code:`prompt`."""
"""Apply the updates in `mm_matches` to `prompt`."""
if not mm_matches:
return prompt
......@@ -794,7 +794,7 @@ def apply_text_matches(
mm_matches: Mapping[str, Sequence[PromptTargetMatch]],
mm_item_counts: Mapping[str, int],
) -> str:
"""Apply the updates in :code:`mm_matches` to :code:`prompt`."""
"""Apply the updates in `mm_matches` to `prompt`."""
if not mm_matches:
return prompt
......@@ -809,7 +809,7 @@ def _iter_placeholders(
mm_item_counts: Mapping[str, int],
) -> Iterable[PlaceholderFeaturesInfo]:
"""
Yield each set of placeholder tokens found in :code:`prompt`.
Yield each set of placeholder tokens found in `prompt`.
Matches are exclusive even when multiple modalities share
the same placeholder tokens. In that case, the modality that
......@@ -1016,7 +1016,7 @@ class ProcessingCache:
) -> None:
"""
Put a processed multi-modal item into the cache
according to its dependencies (see :meth:`get`).
according to its dependencies (see {meth}`get`).
"""
cache_key = MultiModalHasher.hash_kwargs(model_id=model_id,
**{modality: input_item},
......@@ -1083,7 +1083,7 @@ _I = TypeVar("_I", bound=BaseProcessingInfo)
MultiModalHashes = dict[str, list[str]]
"""
A collection of hashes with a similar structure as :class:`MultiModalKwargs`.
A collection of hashes with a similar structure as {class}`MultiModalKwargs`.
"""
......@@ -1091,7 +1091,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
"""
Abstract base class to process multi-modal inputs to be used in vLLM.
Not to be confused with :class:`transformers.ProcessorMixin`.
Not to be confused with {class}`transformers.ProcessorMixin`.
"""
def __init__(self,
......@@ -1118,10 +1118,10 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
def _get_data_parser(self) -> MultiModalDataParser:
"""
Construct a parser to preprocess multi-modal data items
before passing them to :meth:`_get_hf_mm_data`.
before passing them to {meth}`_get_hf_mm_data`.
You can support additional modalities by creating a subclass
of :class:`MultiModalDataParser` that has additional subparsers.
of {class}`MultiModalDataParser` that has additional subparsers.
"""
return MultiModalDataParser()
......@@ -1130,8 +1130,8 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
mm_data: MultiModalDataDict,
) -> MultiModalDataItems:
"""
Normalize :class:`MultiModalDataDict` to :class:`MultiModalDataItems`
before passing them to :meth:`_get_hf_mm_data`.
Normalize {class}`MultiModalDataDict` to {class}`MultiModalDataItems`
before passing them to {meth}`_get_hf_mm_data`.
"""
mm_items = self.data_parser.parse_mm_data(mm_data)
supported_mm_limits = self.info.get_supported_mm_limits()
......@@ -1183,7 +1183,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
inputs.
Moreover, this information is critical to determine the token positions
in order to construct :class:`~vllm-multimodal.input.PlaceholderRange`
in order to construct {class}`~vllm.multimodal.inputs.PlaceholderRange`
for each multi-modal item.
"""
raise NotImplementedError
......@@ -1237,8 +1237,8 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
"""
Return whether the HF processor applies prompt updates.
For most HF processors, this should be :code:`True` when multi-modal
data items are passed, but :code:`False` when multi-modal embeddings
For most HF processors, this should be `True` when multi-modal
data items are passed, but `False` when multi-modal embeddings
are passed.
"""
return not any(
......@@ -1307,7 +1307,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
Most HF processors accept prompt text but not prompt tokens.
If the HF processor adds or removes tokens that are not related to
multi-modal data, you should override this method so it is consistent
with the output of :meth:`_apply_hf_processor_text_only` on the
with the output of {meth}`_apply_hf_processor_text_only` on the
corresponding text.
"""
return prompt_tokens
......@@ -1322,7 +1322,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
Since HF processor requires that text and multi-modal items
correspond to each other, we generate dummy text using
:class:`DummyInputsBuilder` to go along with the multi-modal data.
{class}`DummyInputsBuilder` to go along with the multi-modal data.
"""
mm_counts = mm_items.get_all_counts()
......@@ -1346,10 +1346,10 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
Apply the HF processor on the prompt text and multi-modal data.
In addition, return whether prompt updates have been applied
(for most HF processors, this should be :code:`True`).
(for most HF processors, this should be `True`).
Note:
If :code:`enable_hf_prompt_update=False`, we use HF processor
If `enable_hf_prompt_update=False`, we use HF processor
to perform prompt updates if available; HF processor requires
that the prompt corresponds to multi-modal items.
"""
......
......@@ -25,7 +25,7 @@ logger = init_logger(__name__)
class ProcessorInputs:
"""
Represents the keyword arguments to
:meth:`vllm.multimodal.processing.BaseMultiModalProcessor.apply`.
{meth}`vllm.multimodal.processing.BaseMultiModalProcessor.apply`.
"""
prompt_text: str
mm_data: MultiModalDataDict
......@@ -63,7 +63,7 @@ class BaseDummyInputsBuilder(ABC, Generic[_I]):
# TODO: @abstractmethod after transition
def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
"""
Build the text input corresponding to :code:`mm_counts`.
Build the text input corresponding to `mm_counts`.
"""
if (type(self).get_dummy_processor_inputs ==
BaseDummyInputsBuilder.get_dummy_processor_inputs):
......
......@@ -29,7 +29,7 @@ _I_co = TypeVar("_I_co", bound=BaseProcessingInfo, covariant=True)
class ProcessingInfoFactory(Protocol[_I_co]):
"""Constructs a :class:`MultiModalProcessor` instance from the context."""
"""Constructs a {class}`BaseProcessingInfo` instance from the context."""
def __call__(
self,
......@@ -40,7 +40,7 @@ class ProcessingInfoFactory(Protocol[_I_co]):
class DummyInputsBuilderFactory(Protocol[_I]):
"""
Constructs a :class:`BaseDummyInputsBuilder` instance from the context.
Constructs a {class}`BaseDummyInputsBuilder` instance from the context.
"""
def __call__(self, info: _I) -> BaseDummyInputsBuilder[_I]:
......@@ -48,7 +48,7 @@ class DummyInputsBuilderFactory(Protocol[_I]):
class MultiModalProcessorFactory(Protocol[_I]):
"""Constructs a :class:`MultiModalProcessor` instance from the context."""
"""Constructs a {class}`BaseMultiModalProcessor` instance from the context."""
def __call__(
self,
......@@ -150,7 +150,7 @@ class MultiModalRegistry:
Get the maximum number of tokens from each modality
for profiling the memory usage of a model.
See :meth:`MultiModalPlugin.get_max_multimodal_tokens` for more details.
See {meth}`MultiModalPlugin.get_max_multimodal_tokens` for more details.
"""
mm_limits = self.get_mm_limits_per_prompt(model_config)
......@@ -165,7 +165,7 @@ class MultiModalRegistry:
Get the maximum number of multi-modal tokens
for profiling the memory usage of a model.
See :meth:`MultiModalPlugin.get_max_multimodal_tokens` for more details.
See {meth}`MultiModalPlugin.get_max_multimodal_tokens` for more details.
"""
return sum(self.get_max_tokens_by_modality(model_config).values())
......@@ -208,8 +208,9 @@ class MultiModalRegistry:
When the model receives multi-modal data, the provided function is
invoked to transform the data into a dictionary of model inputs.
See also:
:ref:`mm-processing`
:::{seealso}
{ref}`mm-processing`
:::
"""
def wrapper(model_cls: N) -> N:
......@@ -253,8 +254,9 @@ class MultiModalRegistry:
"""
Create a multi-modal processor for a specific model and tokenizer.
See also:
:ref:`mm-processing`
:::{seealso}
{ref}`mm-processing`
:::
"""
if not model_config.is_multimodal_model:
raise ValueError(f"{model_config.model} is not a multimodal model")
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment