Unverified commit 5984499e, authored by Cyrus Leung and committed by GitHub
Browse files

[Doc] Expand Multimodal API Reference (#11852)


Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
parent ca47e176
...@@ -2,10 +2,6 @@ ...@@ -2,10 +2,6 @@
# Multi-Modality # Multi-Modality
```{eval-rst}
.. currentmodule:: vllm.multimodal
```
vLLM provides experimental support for multi-modal models through the {mod}`vllm.multimodal` package. vLLM provides experimental support for multi-modal models through the {mod}`vllm.multimodal` package.
Multi-modal inputs can be passed alongside text and token prompts to [supported models](#supported-mm-models) Multi-modal inputs can be passed alongside text and token prompts to [supported models](#supported-mm-models)
...@@ -13,61 +9,20 @@ via the `multi_modal_data` field in {class}`vllm.inputs.PromptType`. ...@@ -13,61 +9,20 @@ via the `multi_modal_data` field in {class}`vllm.inputs.PromptType`.
Looking to add your own multi-modal model? Please follow the instructions listed [here](#enabling-multimodal-inputs). Looking to add your own multi-modal model? Please follow the instructions listed [here](#enabling-multimodal-inputs).
## Module Contents ## Module Contents
```{eval-rst}
.. automodule:: vllm.multimodal
```
### Registry
```{eval-rst} ```{eval-rst}
.. autodata:: vllm.multimodal.MULTIMODAL_REGISTRY .. autodata:: vllm.multimodal.MULTIMODAL_REGISTRY
``` ```
```{eval-rst} ## Submodules
.. autoclass:: vllm.multimodal.MultiModalRegistry
:members:
:show-inheritance:
```
### Base Classes
```{eval-rst}
.. automodule:: vllm.multimodal.base
:members:
:show-inheritance:
```
### Input Classes ```{toctree}
:maxdepth: 1
```{eval-rst} inputs
.. automodule:: vllm.multimodal.inputs parse
:members: processing
:show-inheritance: profiling
``` registry
### Audio Classes
```{eval-rst}
.. automodule:: vllm.multimodal.audio
:members:
:show-inheritance:
```
### Image Classes
```{eval-rst}
.. automodule:: vllm.multimodal.image
:members:
:show-inheritance:
```
### Video Classes
```{eval-rst}
.. automodule:: vllm.multimodal.video
:members:
:show-inheritance:
``` ```
# Input Definitions
## User-facing inputs
```{eval-rst}
.. autodata:: vllm.multimodal.MultiModalDataDict
```
## Internal data structures
```{eval-rst}
.. autoclass:: vllm.multimodal.inputs.PlaceholderRange
:members:
:show-inheritance:
```
```{eval-rst}
.. autodata:: vllm.multimodal.inputs.NestedTensors
```
```{eval-rst}
.. autoclass:: vllm.multimodal.inputs.MultiModalFieldElem
:members:
:show-inheritance:
```
```{eval-rst}
.. autoclass:: vllm.multimodal.inputs.MultiModalFieldConfig
:members:
:show-inheritance:
```
```{eval-rst}
.. autoclass:: vllm.multimodal.inputs.MultiModalKwargsItem
:members:
:show-inheritance:
```
```{eval-rst}
.. autoclass:: vllm.multimodal.inputs.MultiModalKwargs
:members:
:show-inheritance:
```
```{eval-rst}
.. autoclass:: vllm.multimodal.inputs.MultiModalInputsV2
:members:
:show-inheritance:
```
# Data Parsing
## Module Contents
```{eval-rst}
.. automodule:: vllm.multimodal.parse
:members:
:member-order: bysource
```
# Data Processing
## Module Contents
```{eval-rst}
.. automodule:: vllm.multimodal.processing
:members:
:member-order: bysource
```
# Memory Profiling
## Module Contents
```{eval-rst}
.. automodule:: vllm.multimodal.profiling
:members:
:member-order: bysource
```
# Registry
## Module Contents
```{eval-rst}
.. automodule:: vllm.multimodal.registry
:members:
:member-order: bysource
```
...@@ -13,14 +13,16 @@ from vllm.utils import is_list_of ...@@ -13,14 +13,16 @@ from vllm.utils import is_list_of
from .audio import resample_audio from .audio import resample_audio
from .inputs import (AudioItem, HfAudioItem, HfImageItem, HfVideoItem, from .inputs import (AudioItem, HfAudioItem, HfImageItem, HfVideoItem,
ImageItem, ModalityData, MultiModalDataDict, ImageItem, ModalityData, MultiModalDataDict, VideoItem)
NestedTensors, VideoItem)
_T = TypeVar("_T") _T = TypeVar("_T")
_I = TypeVar("_I") _I = TypeVar("_I")
class ModalityDataItems(ABC, Generic[_T, _I]): class ModalityDataItems(ABC, Generic[_T, _I]):
"""
Represents data items for a modality in :class:`MultiModalDataItems`.
"""
def __init__(self, data: _T, modality: str) -> None: def __init__(self, data: _T, modality: str) -> None:
super().__init__() super().__init__()
...@@ -69,6 +71,7 @@ class ModalityDataItems(ABC, Generic[_T, _I]): ...@@ -69,6 +71,7 @@ class ModalityDataItems(ABC, Generic[_T, _I]):
class ProcessorBatchItems(ModalityDataItems[Sequence[_T], _T]): class ProcessorBatchItems(ModalityDataItems[Sequence[_T], _T]):
"""Base class for data items that are arranged in a list."""
def get_count(self) -> int: def get_count(self) -> int:
return len(self.data) return len(self.data)
...@@ -83,7 +86,12 @@ class ProcessorBatchItems(ModalityDataItems[Sequence[_T], _T]): ...@@ -83,7 +86,12 @@ class ProcessorBatchItems(ModalityDataItems[Sequence[_T], _T]):
return {} return {}
class EmbeddingItems(ModalityDataItems[NestedTensors, torch.Tensor]): class EmbeddingItems(ModalityDataItems[Union[torch.Tensor, list[torch.Tensor]],
torch.Tensor]):
"""
Base class for data items that are expressed as a batched embedding tensor,
or a list of embedding tensors (one per item).
"""
def get_count(self) -> int: def get_count(self) -> int:
return len(self.data) return len(self.data)
...@@ -109,7 +117,7 @@ class AudioProcessorItems(ProcessorBatchItems[HfAudioItem]): ...@@ -109,7 +117,7 @@ class AudioProcessorItems(ProcessorBatchItems[HfAudioItem]):
class AudioEmbeddingItems(EmbeddingItems): class AudioEmbeddingItems(EmbeddingItems):
def __init__(self, data: NestedTensors) -> None: def __init__(self, data: Union[torch.Tensor, list[torch.Tensor]]) -> None:
super().__init__(data, "audio") super().__init__(data, "audio")
...@@ -137,7 +145,7 @@ class ImageProcessorItems(ProcessorBatchItems[HfImageItem]): ...@@ -137,7 +145,7 @@ class ImageProcessorItems(ProcessorBatchItems[HfImageItem]):
class ImageEmbeddingItems(EmbeddingItems): class ImageEmbeddingItems(EmbeddingItems):
def __init__(self, data: NestedTensors) -> None: def __init__(self, data: Union[torch.Tensor, list[torch.Tensor]]) -> None:
super().__init__(data, "image") super().__init__(data, "image")
...@@ -163,7 +171,7 @@ class VideoProcessorItems(ProcessorBatchItems[HfVideoItem]): ...@@ -163,7 +171,7 @@ class VideoProcessorItems(ProcessorBatchItems[HfVideoItem]):
class VideoEmbeddingItems(EmbeddingItems): class VideoEmbeddingItems(EmbeddingItems):
def __init__(self, data: NestedTensors) -> None: def __init__(self, data: Union[torch.Tensor, list[torch.Tensor]]) -> None:
super().__init__(data, "video") super().__init__(data, "video")
...@@ -172,8 +180,8 @@ _D = TypeVar("_D", bound=ModalityDataItems[Any, Any]) ...@@ -172,8 +180,8 @@ _D = TypeVar("_D", bound=ModalityDataItems[Any, Any])
class MultiModalDataItems(UserDict[str, ModalityDataItems[Any, Any]]): class MultiModalDataItems(UserDict[str, ModalityDataItems[Any, Any]]):
""" """
As :class:`MultiModalDataDict`, but normalized such that each entry As :data:`~vllm.multimodal.inputs.MultiModalDataDict`, but normalized
corresponds to a list. such that each entry corresponds to a list.
""" """
def get_count(self, modality: str, *, strict: bool = True) -> int: def get_count(self, modality: str, *, strict: bool = True) -> int:
...@@ -226,7 +234,8 @@ ModalityDataParser: TypeAlias = Callable[[ModalityData[Any]], ...@@ -226,7 +234,8 @@ ModalityDataParser: TypeAlias = Callable[[ModalityData[Any]],
class MultiModalDataParser: class MultiModalDataParser:
""" """
Parses :class:`MultiModalDataDict` into :class:`MultiModalDataItems`. Parses :data:`~vllm.multimodal.inputs.MultiModalDataDict` into
:class:`MultiModalDataItems`.
Args: Args:
target_sr (float, optional): Enables automatic resampling of audio target_sr (float, optional): Enables automatic resampling of audio
...@@ -238,7 +247,9 @@ class MultiModalDataParser: ...@@ -238,7 +247,9 @@ class MultiModalDataParser:
self.target_sr = target_sr self.target_sr = target_sr
def _is_embeddings(self, data: object) -> TypeGuard[NestedTensors]: def _is_embeddings(
self, data: object
) -> TypeGuard[Union[torch.Tensor, list[torch.Tensor]]]:
if isinstance(data, torch.Tensor): if isinstance(data, torch.Tensor):
return data.ndim == 3 return data.ndim == 3
if is_list_of(data, torch.Tensor): if is_list_of(data, torch.Tensor):
......
...@@ -33,20 +33,24 @@ _PromptSeq = Union[str, list[int]] ...@@ -33,20 +33,24 @@ _PromptSeq = Union[str, list[int]]
@dataclass @dataclass
class PromptReplacement: class PromptReplacement:
"""
Defines how to replace portions of an input prompt with placeholder tokens.
"""
modality: str modality: str
"""The modality for which the replacement is made.""" """The modality for which the replacement is made."""
target: _PromptSeq target: _PromptSeq
"""The text or token sequence to find and replace.""" """The token sequence (or text) to find and replace."""
replacement: Union[Callable[[int], _PromptSeq], replacement: Union[Callable[[int], _PromptSeq],
_PromptSeq] = field(repr=False) _PromptSeq] = field(repr=False)
""" """
Given the index of the processed item within :attr:`modality`, output the Given the index of the processed item within :attr:`modality`,
replacement text or token sequence. output the replacement token sequence (or text).
For convenience, you can pass in the replacement instead of a function For convenience, you can directly pass in the replacement token sequence
if it does not depend on the input. (or text) instead of a function if it does not depend on the input.
""" """
def bind(self, tokenizer: AnyTokenizer) -> "BoundPromptReplacement": def bind(self, tokenizer: AnyTokenizer) -> "BoundPromptReplacement":
...@@ -132,6 +136,11 @@ class _BoundPromptSequence: ...@@ -132,6 +136,11 @@ class _BoundPromptSequence:
@dataclass @dataclass
class BoundPromptReplacement: class BoundPromptReplacement:
"""
A :class:`PromptReplacement` bound to a tokenizer to automatically
convert :attr:`target` and the result of :meth:`get_replacement` between
token sequence and text representations.
"""
tokenizer: AnyTokenizer = field(repr=False) tokenizer: AnyTokenizer = field(repr=False)
modality: str modality: str
...@@ -144,6 +153,7 @@ class BoundPromptReplacement: ...@@ -144,6 +153,7 @@ class BoundPromptReplacement:
@property @property
def target(self) -> _BoundPromptSequence: def target(self) -> _BoundPromptSequence:
"""The token sequence (or text) to find and replace."""
target = self._target target = self._target
return _BoundPromptSequence( return _BoundPromptSequence(
...@@ -153,6 +163,10 @@ class BoundPromptReplacement: ...@@ -153,6 +163,10 @@ class BoundPromptReplacement:
) )
def get_replacement(self, item_idx: int) -> _BoundPromptSequence: def get_replacement(self, item_idx: int) -> _BoundPromptSequence:
"""
Given the index of the processed item within :attr:`modality`,
output the replacement token sequence (or text).
"""
replacement = self._replacement replacement = self._replacement
if callable(replacement): if callable(replacement):
cache_key = item_idx cache_key = item_idx
...@@ -528,7 +542,7 @@ class ProcessingCache: ...@@ -528,7 +542,7 @@ class ProcessingCache:
class BaseProcessingInfo: class BaseProcessingInfo:
"""Base class containing information to perform processing.""" """Base class to provide the information necessary for data processing."""
def __init__(self, ctx: InputProcessingContext) -> None: def __init__(self, ctx: InputProcessingContext) -> None:
super().__init__() super().__init__()
......
...@@ -19,7 +19,10 @@ logger = init_logger(__name__) ...@@ -19,7 +19,10 @@ logger = init_logger(__name__)
@dataclass @dataclass
class ProcessorInputs: class ProcessorInputs:
"""Keyword arguments to :meth:`BaseMultiModalProcessor`.""" """
Represents the keyword arguments to
:meth:`vllm.multimodal.processing.BaseMultiModalProcessor.apply`.
"""
prompt_text: str prompt_text: str
mm_data: MultiModalDataDict mm_data: MultiModalDataDict
hf_processor_mm_kwargs: Mapping[str, object] = field(default_factory=dict) hf_processor_mm_kwargs: Mapping[str, object] = field(default_factory=dict)
...@@ -47,7 +50,7 @@ class BaseDummyInputsBuilder(ABC, Generic[_I]): ...@@ -47,7 +50,7 @@ class BaseDummyInputsBuilder(ABC, Generic[_I]):
) -> ProcessorInputs: ) -> ProcessorInputs:
""" """
Build the input which, after processing, results in Build the input which, after processing, results in
`self.info.get_mm_max_tokens_per_item()` placeholder tokens. :code:`self.info.get_mm_max_tokens_per_item()` placeholder tokens.
""" """
raise NotImplementedError raise NotImplementedError
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment