Add full API docs and improve the UX of navigating them (#17485)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>

Add full API docs and improve the UX of navigating them (#17485)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
d6484ef3 · Harry Mellor · GitHub · 46fae69c · d6484ef3 · d6484ef3
Unverified Commit d6484ef3 authored May 04, 2025 by Harry Mellor Committed by GitHub May 03, 2025
20 changed files
--- a/vllm/model_executor/layers/sampler.py
+++ b/vllm/model_executor/layers/sampler.py
@@ -235,7 +235,7 @@ class Sampler(nn.Module):
        * Defer Pythonization of sampling result & logprobs
          tensor
        * Encapsulate arguments required for deferred Pythonization
-          in the :class:`SamplerOutput` structure
+          in the {class}`SamplerOutput` structure
        Args:
            logits: (num_tokens, vocab_size).

--- a/vllm/model_executor/layers/typical_acceptance_sampler.py
+++ b/vllm/model_executor/layers/typical_acceptance_sampler.py
@@ -107,14 +107,15 @@ class TypicalAcceptanceSampler(SpecDecodeDeterministicBaseSampler):
        A draft token_id x_{n+k} is accepted if it satisfies the
        following condition
-        .. math::
+        :::{math}
        p_{\text{original}}(x_{n+k} | x_1, x_2, \dots, x_{n+k-1}) > 
        \min \left( \epsilon, \delta * \exp \left(
            -H(p_{\text{original}}(
                \cdot | x_1, x_2, \ldots, x_{n+k-1})) \right) \right)
+        :::
-        where :math:`p_{\text{original}}` corresponds to target_probs 
+        where {math}`p_{\text{original}}` corresponds to target_probs 
-        and :math:`\epsilon` and :math:`\delta` correspond to hyperparameters
+        and {math}`\epsilon` and {math}`\delta` correspond to hyperparameters
        specified using self._posterior_threshold and self._posterior_alpha
        This method computes the posterior probabilities for the given

--- a/vllm/model_executor/models/blip2.py
+++ b/vllm/model_executor/models/blip2.py
@@ -681,8 +681,9 @@ class Blip2ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP,
                batch.
            pixel_values: The pixels in each input image.
-        See also:
+        :::{seealso}
-            :class:`Blip2ImageInputs`
+        {class}`Blip2ImageInputs`
+        :::
        """
        if intermediate_tensors is not None:

--- a/vllm/model_executor/models/interfaces.py
+++ b/vllm/model_executor/models/interfaces.py
@@ -226,9 +226,9 @@ class SupportsPP(Protocol):
        intermediate_tensors: Optional["IntermediateTensors"],
    ) -> Union[Tensor, "IntermediateTensors"]:
        """
-        Accept :class:`IntermediateTensors` when PP rank > 0.
+        Accept {class}`IntermediateTensors` when PP rank > 0.
-        Return :class:`IntermediateTensors` only for the last PP rank.
+        Return {class}`IntermediateTensors` only for the last PP rank.
        """
        ...

--- a/vllm/model_executor/models/llava.py
+++ b/vllm/model_executor/models/llava.py
@@ -721,8 +721,9 @@ class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
                batch.
            pixel_values: The pixels in each input image.
-        See also:
+        :::{seealso}
-            :class:`LlavaImageInputs`
+        {class}`LlavaImageInputs`
+        :::
        """
        if intermediate_tensors is not None:
            inputs_embeds = None

--- a/vllm/model_executor/models/llava_next.py
+++ b/vllm/model_executor/models/llava_next.py
@@ -537,7 +537,7 @@ class LlavaNextForConditionalGeneration(nn.Module, SupportsMultiModal,
        Unlike in LLaVA-1.5, the number of image tokens inputted to the language
        model depends on the original size of the input image. Including the
        original image token in the input, the required number of image tokens
-        is given by :func:`get_llava_next_image_feature_size`.
+        is given by {func}`get_llava_next_image_feature_size`.
        This way, the `positions` and `attn_metadata` are consistent
        with the `input_ids`.
@@ -548,8 +548,9 @@ class LlavaNextForConditionalGeneration(nn.Module, SupportsMultiModal,
            pixel_values: The pixels in each grid patch for each input image.
            image_sizes: The original `(height, width)` for each input image.
-        See also:
+        :::{seealso}
-            :class:`LlavaNextImageInputs`
+        {class}`LlavaNextImageInputs`
+        :::
        """
        if intermediate_tensors is not None:
            inputs_embeds = None

--- a/vllm/model_executor/models/mistral3.py
+++ b/vllm/model_executor/models/mistral3.py
@@ -559,8 +559,9 @@ class Mistral3ForConditionalGeneration(nn.Module, SupportsLoRA,
                batch.
            pixel_values: The pixels in each input image.
-        See also:
+        :::{seealso}
-            :class:`Mistral3ImagePixelInputs`
+        {class}`Mistral3ImagePixelInputs`
+        :::
        """
        if intermediate_tensors is not None:
            inputs_embeds = None

--- a/vllm/model_executor/models/molmo.py
+++ b/vllm/model_executor/models/molmo.py
@@ -965,7 +965,7 @@ def select_tiling(
 class MolmoProcessorWrapper:
    """
-    Wraps :class:`MolmoProcessor` so that it can be called directly.
+    Wraps {class}`MolmoProcessor` so that it can be called directly.
    The original definition can be found here:
    https://huggingface.co/allenai/Molmo-7B-D-0924/blob/main/preprocessing_molmo.py

--- a/vllm/model_executor/models/phi4mm_utils.py
+++ b/vllm/model_executor/models/phi4mm_utils.py
@@ -12,7 +12,7 @@ import torch.nn.functional as F
 from torch import Tensor, nn
-class Block(nn.Module):
+class BlockBase(nn.Module):
    """Block abstract module"""
    def __init__(self, input_size, output_size):
@@ -1602,7 +1602,7 @@ class AttModule(nn.Module):
        return x, memory, pos_emb, att_mask
-class AttBlock(Block, AttModule):
+class AttBlock(BlockBase, AttModule):
    """Attention Block module to support both Attention and Block module."""
    def memory_dims(self, max_len=False):

--- a/vllm/model_executor/models/pixtral.py
+++ b/vllm/model_executor/models/pixtral.py
@@ -65,14 +65,14 @@ class PixtralImagePixelInputs(TypedDict):
    """
    Shape: `(batch_size * num_images, num_channels, image_width, image_height)`
-    The result of stacking :attr:`ImageEncoding.tokens` from each prompt.
+    The result of stacking {attr}`ImageEncoding.tokens` from each prompt.
    """
 class PixtralProcessorAdapter:
    """
    Provide a HF-compatible interface for
-    :class:`mistral_common.tokens.tokenizers.multimodal.ImageEncoder`.
+    {class}`mistral_common.tokens.tokenizers.multimodal.ImageEncoder`.
    """
    def __init__(self, tokenizer: MistralTokenizer) -> None:

--- a/vllm/model_executor/models/qwen_vl.py
+++ b/vllm/model_executor/models/qwen_vl.py
@@ -383,7 +383,7 @@ def _get_tokenizer_without_image_pad(
        tokenizer: PreTrainedTokenizer) -> PreTrainedTokenizer:
    """
    The logic of adding image pad tokens should only be applied in
-    :class:`QwenVLProcessor`, so they are patched out here.
+    {class}`QwenVLProcessor`, so they are patched out here.
    The definition of the wrapped tokenizer can be found here:
    https://huggingface.co/Qwen/Qwen-VL/blob/main/tokenization_qwen.py

--- a/vllm/model_executor/models/registry.py
+++ b/vllm/model_executor/models/registry.py
@@ -19,7 +19,6 @@ import cloudpickle
 import torch.nn as nn
 from vllm.logger import init_logger
-from vllm.utils import is_in_doc_build
 from .interfaces import (has_inner_state, has_noops, is_attention_free,
                         is_hybrid, supports_cross_encoding,
@@ -375,13 +374,13 @@ class _ModelRegistry:
        """
        Register an external model to be used in vLLM.
-        :code:`model_cls` can be either:
+        `model_cls` can be either:
-        - A :class:`torch.nn.Module` class directly referencing the model.
+        - A {class}`torch.nn.Module` class directly referencing the model.
-        - A string in the format :code:`<module>:<class>` which can be used to
+        - A string in the format `<module>:<class>` which can be used to
          lazily import the model. This is useful to avoid initializing CUDA
          when importing the model and thus the related error
-          :code:`RuntimeError: Cannot re-initialize CUDA in forked subprocess`.
+          `RuntimeError: Cannot re-initialize CUDA in forked subprocess`.
        """
        if not isinstance(model_arch, str):
            msg = f"`model_arch` should be a string, not a {type(model_arch)}"
@@ -400,8 +399,7 @@ class _ModelRegistry:
                raise ValueError(msg)
            model = _LazyRegisteredModel(*split_str)
-        elif isinstance(model_cls, type) and (is_in_doc_build() or issubclass(
+        elif isinstance(model_cls, type) and issubclass(model_cls, nn.Module):
-                model_cls, nn.Module)):
            model = _RegisteredModel.from_model_cls(model_cls)
        else:
            msg = ("`model_cls` should be a string or PyTorch model class, "

--- a/vllm/model_executor/models/utils.py
+++ b/vllm/model_executor/models/utils.py
@@ -66,7 +66,7 @@ class WeightsMapper:
 class AutoWeightsLoader:
    """
-    Helper class to load weights into a :class:`torch.nn.Module`. It is able
+    Helper class to load weights into a {class}`torch.nn.Module`. It is able
    to automatically detect child modules and parameters while iterating over
    the weights only once.

--- a/vllm/multimodal/__init__.py
+++ b/vllm/multimodal/__init__.py
@@ -8,11 +8,12 @@ from .registry import MultiModalRegistry
 MULTIMODAL_REGISTRY = MultiModalRegistry()
 """
-The global :class:`~MultiModalRegistry` is used by model runners to
+The global {class}`~MultiModalRegistry` is used by model runners to
 dispatch data processing according to the target model.
-See also:
+:::{seealso}
-    :ref:`mm-processing`
+{ref}`mm-processing`
+:::
 """
 __all__ = [

--- a/vllm/multimodal/base.py
+++ b/vllm/multimodal/base.py
@@ -64,8 +64,7 @@ class MultiModalPlaceholderMap:
        Examples:
-        .. code-block::
+        ```
        Prompt:    |AAAA BBBB What's in these images?|
        Positions: |.................................|
@@ -93,6 +92,7 @@ class MultiModalPlaceholderMap:
            images      = []
            src_ranges  = []
            dest_ranges = []
+        ```
        """
        seq_mm_data = seq_group.multi_modal_data
        seq_mm_placeholders = seq_group.multi_modal_placeholders

--- a/vllm/multimodal/inputs.py
+++ b/vllm/multimodal/inputs.py
@@ -26,27 +26,27 @@ _T = TypeVar("_T")
 HfImageItem: TypeAlias = Union[Image, np.ndarray, torch.Tensor]
 """
-A :class:`transformers.image_utils.ImageInput` representing a single image
+A {class}`transformers.image_utils.ImageInput` representing a single image
-item, which can be passed to a HuggingFace :code:`ImageProcessor`.
+item, which can be passed to a HuggingFace `ImageProcessor`.
 """
 HfVideoItem: TypeAlias = Union[list[Image], np.ndarray, torch.Tensor,
                               list[np.ndarray], list[torch.Tensor]]
 """
-A :class:`transformers.image_utils.VideoInput` representing a single video
+A {class}`transformers.image_utils.VideoInput` representing a single video
-item, which can be passed to a HuggingFace :code:`VideoProcessor`.
+item, which can be passed to a HuggingFace `VideoProcessor`.
 """
 HfAudioItem: TypeAlias = Union[list[float], np.ndarray, torch.Tensor]
 """
 Represents a single audio
-item, which can be passed to a HuggingFace :code:`AudioProcessor`.
+item, which can be passed to a HuggingFace `AudioProcessor`.
 """
 ImageItem: TypeAlias = Union[HfImageItem, torch.Tensor]
 """
-A :class:`transformers.image_utils.ImageInput` representing a single image
+A {class}`transformers.image_utils.ImageInput` representing a single image
-item, which can be passed to a HuggingFace :code:`ImageProcessor`.
+item, which can be passed to a HuggingFace `ImageProcessor`.
 Alternatively, a 3-D tensor or batch of 2-D tensors,
 which are treated as image embeddings;
@@ -55,8 +55,8 @@ these are directly passed to the model without HF processing.
 VideoItem: TypeAlias = Union[HfVideoItem, torch.Tensor]
 """
-A :class:`transformers.image_utils.VideoInput` representing a single video
+A {class}`transformers.image_utils.VideoInput` representing a single video
-item, which can be passed to a HuggingFace :code:`VideoProcessor`.
+item, which can be passed to a HuggingFace `VideoProcessor`.
 Alternatively, a 3-D tensor or batch of 2-D tensors,
 which are treated as video embeddings;
@@ -67,7 +67,7 @@ AudioItem: TypeAlias = Union[HfAudioItem, tuple[np.ndarray, float],
                             torch.Tensor]
 """
 Represents a single audio
-item, which can be passed to a HuggingFace :code:`AudioProcessor`.
+item, which can be passed to a HuggingFace `AudioProcessor`.
 Alternatively, a tuple `(audio, sampling_rate)`, where the sampling rate
 is different from that expected by the model;
@@ -83,7 +83,7 @@ ModalityData: TypeAlias = Union[_T, list[_T]]
 Either a single data item, or a list of data items.
 The number of data items allowed per modality is restricted by
-:code:`--limit-mm-per-prompt`.
+`--limit-mm-per-prompt`.
 """
@@ -105,7 +105,7 @@ MultiModalDataDict: TypeAlias = Mapping[str, ModalityData[Any]]
 """
 A dictionary containing an entry for each modality type to input.
-The built-in modalities are defined by :class:`MultiModalDataBuiltins`.
+The built-in modalities are defined by {class}`MultiModalDataBuiltins`.
 """
@@ -116,14 +116,14 @@ class PlaceholderRange:
    Example:
-        Prompt: :code:`AAAA BBBB What is in these images?`
+    Prompt: `AAAA BBBB What is in these images?`
    Images A and B will have:
-        .. code-block::
+    ```
    A: PlaceholderRange(offset=0, length=4)
    B: PlaceholderRange(offset=5, length=4)
+    ```
    """
    offset: int
@@ -166,7 +166,7 @@ Uses a list instead of a tensor if the dimensions of each element do not match.
 def nested_tensors_equal(a: NestedTensors, b: NestedTensors) -> bool:
-    """Equality check between :data:`NestedTensors` objects."""
+    """Equality check between {data}`NestedTensors` objects."""
    if isinstance(a, torch.Tensor):
        return isinstance(b, torch.Tensor) and torch.equal(a, b)
    elif isinstance(b, torch.Tensor):
@@ -186,7 +186,7 @@ def nested_tensors_equal(a: NestedTensors, b: NestedTensors) -> bool:
 BatchedTensorInputs: TypeAlias = Mapping[str, NestedTensors]
 """
 A dictionary containing nested tensors which have been batched via
-:meth:`MultiModalKwargs.batch`.
+{meth}`MultiModalKwargs.batch`.
 """
@@ -194,7 +194,7 @@ A dictionary containing nested tensors which have been batched via
 class MultiModalFieldElem:
    """
    Represents a keyword argument corresponding to a multi-modal item
-    in :class:`MultiModalKwargs`.
+    in {class}`MultiModalKwargs`.
    """
    modality: str
@@ -205,13 +205,13 @@ class MultiModalFieldElem:
    key: str
    """
-    The key of this field in :class:`MultiModalKwargs`,
+    The key of this field in {class}`MultiModalKwargs`,
    i.e. the name of the keyword argument to be passed to the model.
    """
    data: NestedTensors
    """
-    The tensor data of this field in :class:`MultiModalKwargs`,
+    The tensor data of this field in {class}`MultiModalKwargs`,
    i.e. the value of the keyword argument to be passed to the model.
    """
@@ -234,7 +234,7 @@ class MultiModalFieldElem:
 class BaseMultiModalField(ABC):
    """
    Defines how to interpret tensor data belonging to a keyword argument in
-    :class:`MultiModalKwargs` for multiple multi-modal items, and vice versa.
+    {class}`MultiModalKwargs` for multiple multi-modal items, and vice versa.
    """
    def _field_factory(self, *, modality: str, key: str):
@@ -259,10 +259,10 @@ class BaseMultiModalField(ABC):
        data: NestedTensors,
    ) -> Sequence[MultiModalFieldElem]:
        """
-        Construct :class:`MultiModalFieldElem` instances to represent
+        Construct {class}`MultiModalFieldElem` instances to represent
        the provided data.
-        This is the inverse of :meth:`reduce_data`.
+        This is the inverse of {meth}`reduce_data`.
        """
        raise NotImplementedError
@@ -272,9 +272,9 @@ class BaseMultiModalField(ABC):
    def reduce_data(self, elems: list[MultiModalFieldElem]) -> NestedTensors:
        """
-        Merge the data from multiple instances of :class:`MultiModalFieldElem`.
+        Merge the data from multiple instances of {class}`MultiModalFieldElem`.
-        This is the inverse of :meth:`build_elems`.
+        This is the inverse of {meth}`build_elems`.
        """
        field_types = [type(item.field) for item in elems]
        if len(set(field_types)) > 1:
@@ -286,8 +286,9 @@ class BaseMultiModalField(ABC):
 @dataclass(frozen=True)
 class MultiModalBatchedField(BaseMultiModalField):
    """
-    See also:
+    :::{seealso}
-        :func:`MultiModalFieldConfig.batched`
+    {func}`MultiModalFieldConfig.batched`
+    :::
    """
    def build_elems(
@@ -316,9 +317,10 @@ class MultiModalBatchedField(BaseMultiModalField):
 @dataclass(frozen=True)
 class MultiModalFlatField(BaseMultiModalField):
    """
-    See also:
+    :::{seealso}
-        :func:`MultiModalFieldConfig.flat`
+    {func}`MultiModalFieldConfig.flat`
-        :func:`MultiModalFieldConfig.flat_from_sizes`
+    {func}`MultiModalFieldConfig.flat_from_sizes`
+    :::
    """
    slices: Union[Sequence[slice], Sequence[Sequence[slice]]]
    dim: int = 0
@@ -358,8 +360,9 @@ class MultiModalFlatField(BaseMultiModalField):
 @dataclass(frozen=True)
 class MultiModalSharedField(BaseMultiModalField):
    """
-    See also:
+    :::{seealso}
-        :func:`MultiModalFieldConfig.shared`
+    {func}`MultiModalFieldConfig.shared`
+    :::
    """
    batch_size: int
@@ -390,8 +393,7 @@ class MultiModalFieldConfig:
        Example:
-            .. code-block::
+        ```
        Input:
            Data: [[AAAA]
                [BBBB]
@@ -401,6 +403,7 @@ class MultiModalFieldConfig:
            Element 1: [AAAA]
            Element 2: [BBBB]
            Element 3: [CCCC]
+        ```
        """
        return MultiModalFieldConfig(
            field=MultiModalBatchedField(),
@@ -425,8 +428,7 @@ class MultiModalFieldConfig:
        Example:
-            .. code-block::
+        ```
        Given:
            slices: [slice(0, 3), slice(3, 7), slice(7, 9)]
@@ -437,9 +439,9 @@ class MultiModalFieldConfig:
            Element 1: [AAA]
            Element 2: [BBBB]
            Element 3: [CC]
+        ```
-            .. code-block::
+        ```
        Given:
            slices: [
                (slice(None), slice(0, 3)),
@@ -454,6 +456,7 @@ class MultiModalFieldConfig:
            Element 1: [[A],[A],[A]]
            Element 2: [[B],[B],[B],[B]]
            Element 3: [[C],[C]]
+        ```
        """
        return MultiModalFieldConfig(
            field=MultiModalFlatField(slices=slices, dim=dim),
@@ -477,8 +480,7 @@ class MultiModalFieldConfig:
        Example:
-            .. code-block::
+        ```
        Given:
            size_per_item: [3, 4, 2]
@@ -489,10 +491,9 @@ class MultiModalFieldConfig:
            Element 1: [AAA]
            Element 2: [BBBB]
            Element 3: [CC]
+        ```
+        ```
-            .. code-block::
        Given:
            slices: [3, 4, 2]
            dim: 1
@@ -504,9 +505,11 @@ class MultiModalFieldConfig:
            Element 1: [[A],[A],[A]]
            Element 2: [[B],[B],[B],[B]]
            Element 3: [[C],[C]]
+        ```
-        See also:
+        :::{seealso}
-            :func:`MultiModalFieldConfig.flat`
+        {func}`MultiModalFieldConfig.flat`
+        :::
        """
        if size_per_item.ndim != 1:
@@ -535,8 +538,7 @@ class MultiModalFieldConfig:
        Example:
-            .. code-block::
+        ```
        Given:
            batch_size: 4
@@ -548,6 +550,7 @@ class MultiModalFieldConfig:
            Element 2: [XYZ]
            Element 3: [XYZ]
            Element 4: [XYZ]
+        ```
        """
        return MultiModalFieldConfig(
            field=MultiModalSharedField(batch_size),
@@ -570,8 +573,8 @@ class MultiModalFieldConfig:
 class MultiModalKwargsItem(UserDict[str, MultiModalFieldElem]):
    """
-    A collection of :class:`MultiModalFieldElem`
+    A collection of {class}`MultiModalFieldElem`
-    corresponding to a data item in :class:`MultiModalDataItems`.
+    corresponding to a data item in {class}`MultiModalDataItems`.
    """
    @staticmethod
@@ -590,11 +593,11 @@ class MultiModalKwargsItem(UserDict[str, MultiModalFieldElem]):
 class MultiModalKwargs(UserDict[str, NestedTensors]):
    """
    A dictionary that represents the keyword arguments to
-    :meth:`~torch.nn.Module.forward`.
+    {meth}`~torch.nn.Module.forward`.
-    The metadata :code:`items` enables us to obtain the keyword arguments
+    The metadata `items` enables us to obtain the keyword arguments
-    corresponding to each data item in :class:`MultiModalDataItems`, via
+    corresponding to each data item in {class}`MultiModalDataItems`, via
-    :meth:`get_item` and :meth:`get_items`.
+    {meth}`get_item` and {meth}`get_items`.
    """
    @staticmethod
@@ -633,7 +636,7 @@ class MultiModalKwargs(UserDict[str, NestedTensors]):
    @staticmethod
    def from_items(items: Sequence[MultiModalKwargsItem]):
-        """Construct a new :class:`MultiModalKwargs` from multiple items."""
+        """Construct a new {class}`MultiModalKwargs` from multiple items."""
        elems_by_key = defaultdict[str, list[MultiModalFieldElem]](list)
        for item in items:
            for key, elem in item.items():
@@ -798,7 +801,7 @@ A dictionary containing placeholder ranges for each modality.
 class MultiModalInputs(TypedDict):
    """
    Represents the outputs of
-    :class:`vllm.multimodal.processing.BaseMultiModalProcessor`,
+    {class}`vllm.multimodal.processing.BaseMultiModalProcessor`,
    ready to be passed to vLLM internals.
    """
@@ -823,7 +826,7 @@ class MultiModalInputs(TypedDict):
    mm_placeholders: MultiModalPlaceholderDict
    """
    For each modality, information about the placeholder tokens in
-    :code:`prompt_token_ids`.
+    `prompt_token_ids`.
    """
    cache_salt: NotRequired[str]
@@ -834,7 +837,7 @@ class MultiModalInputs(TypedDict):
 class MultiModalEncDecInputs(MultiModalInputs):
    """
-    Represents the outputs of :class:`vllm.multimodal.EncDecMultiModalProcessor`
+    Represents the outputs of {class}`vllm.multimodal.EncDecMultiModalProcessor`
    ready to be passed to vLLM internals.
    """

--- a/vllm/multimodal/parse.py
+++ b/vllm/multimodal/parse.py
@@ -25,7 +25,7 @@ _I = TypeVar("_I")
 class ModalityDataItems(ABC, Generic[_T, _I]):
    """
-    Represents data items for a modality in :class:`MultiModalDataItems`.
+    Represents data items for a modality in {class}`MultiModalDataItems`.
    """
    def __init__(self, data: _T, modality: str) -> None:
@@ -246,7 +246,7 @@ _D = TypeVar("_D", bound=ModalityDataItems[Any, Any])
 class MultiModalDataItems(UserDict[str, ModalityDataItems[Any, Any]]):
    """
-    As :data:`~vllm.multimodal.inputs.MultiModalDataDict`, but normalized
+    As {data}`~vllm.multimodal.inputs.MultiModalDataDict`, but normalized
    such that each entry corresponds to a list.
    """
@@ -254,7 +254,7 @@ class MultiModalDataItems(UserDict[str, ModalityDataItems[Any, Any]]):
        """
        Get the number of data items belonging to a modality.
-        If `strict=False`, return `0` instead of raising :exc:`KeyError`
+        If `strict=False`, return `0` instead of raising {exc}`KeyError`
        even if the modality is not found.
        """
        if modality not in self:
@@ -300,8 +300,8 @@ ModalityDataParser: TypeAlias = Callable[[ModalityData[Any]],
 class MultiModalDataParser:
    """
-    Parses :data:`~vllm.multimodal.inputs.MultiModalDataDict` into
+    Parses {data}`~vllm.multimodal.inputs.MultiModalDataDict` into
-    :class:`MultiModalDataItems`.
+    {class}`MultiModalDataItems`.
    Args:
        target_sr (float, optional): Enables automatic resampling of audio

--- a/vllm/multimodal/processing.py
+++ b/vllm/multimodal/processing.py
@@ -111,13 +111,13 @@ class PromptUpdateDetails(Generic[_S]):
    is_embed: Optional[Callable[["_BoundPromptSequence"], torch.Tensor]] = None
    """
-    Given :attr:`full`, return a boolean mask of shape `(len(full),)`
+    Given {attr}`full`, return a boolean mask of shape `(len(full),)`
    indicating which positions of `full` to assign embeddings to.
    `None` (default) means to assign embeddings to all positions of `full`.
    The embeddings are obtained by calling
-    :class:`SupportsMultiModal.get_multimodal_embeddings`.
+    {meth}`SupportsMultiModal.get_multimodal_embeddings`.
    """
    @staticmethod
@@ -156,13 +156,13 @@ PromptUpdateInfo = Union[PromptSeq, PromptUpdateDetails]
 The token sequence or text that are part of the update.
 If only part of the content corresponds to feature placeholders, you can
-use :class:`PromptUpdateDetails` to specify which part.
+use {class}`PromptUpdateDetails` to specify which part.
 """
 PromptUpdateContent = Union[Callable[[int], PromptUpdateInfo],
                            PromptUpdateInfo]
 """
-Given the index of the processed item within :attr:`modality`,
+Given the index of the processed item within {attr}`modality`,
 output the corresponding token sequence (or text).
 For convenience, you can directly pass in the token sequence (or text)
@@ -216,49 +216,49 @@ class PromptInsertion(PromptUpdate):
    For each image, insert a number of ``<image>`` feature placeholders
    equal to the feature size of the vision encoder after the ``<s>`` token:
-        .. code-block:: python
+    ```python
    PromptInsertion(
        modality="image",
        target="<s>",
        insertion="<image>" * image_feature_size,
    )
+    ```
    Insert these tokens at the start of the prompt:
-        .. code-block:: python
+    ```python
    PromptInsertion(
        modality="image",
        target=PromptIndexTargets.start(),
        insertion="<image>" * image_feature_size,
    )
+    ```
    Insert these tokens after a prefix ``Images:``:
-        .. code-block:: python
+    ```python
    PromptInsertion(
        modality="image",
        target=PromptIndexTargets.prefix("Images:"),
        insertion="<image>" * image_feature_size,
    )
+    ```
    Insert these tokens at the end of the prompt:
-        .. code-block:: python
+    ```python
    PromptInsertion(
        modality="image",
        target=PromptIndexTargets.end(),
        insertion="<image>" * image_feature_size,
    )
+    ```
    """
    insertion: PromptUpdateContent = field(repr=False)
    """
-    Given the index of the processed item within :attr:`modality`,
+    Given the index of the processed item within {attr}`modality`,
-    output the token sequence (or text) to insert right after :attr:`target`.
+    output the token sequence (or text) to insert right after {attr}`target`.
    For convenience, you can directly pass in the token sequence (or text)
    instead of a function if it does not depend on the input.
@@ -284,20 +284,19 @@ class PromptReplacement(PromptUpdate):
    with a number of ``<image>`` feature placeholders
    equal to the feature size of the vision encoder:
-        .. code-block:: python
+    ```python
    PromptReplacement(
        modality="image",
        target="<image>",
        replacement="<image>" * image_feature_size,
    )
+    ```
    As above, but further pad the feature placeholders with ``<image_bos>``
    and ``<image_eos>``, which are not supposed to be passed to the vision
    encoder:
-        .. code-block:: python
+    ```python
    PromptReplacement(
        modality="image",
        target="<image>",
@@ -310,12 +309,12 @@ class PromptReplacement(PromptUpdate):
            features="<image>" * image_feature_size,
        ),
    )
+    ```
    To avoid unnecessary tokenization during prompt replacement,
    we recommend passing token sequences instead of text:
-        .. code-block:: python
+    ```python
    PromptReplacement(
        modality="image",
        target=[image_token_id],
@@ -325,12 +324,13 @@ class PromptReplacement(PromptUpdate):
            features=[image_token_id] * image_feature_size,
        ),
    )
+    ```
    """
    replacement: PromptUpdateContent = field(repr=False)
    """
-    Given the index of the processed item within :attr:`modality`,
+    Given the index of the processed item within {attr}`modality`,
-    output the token sequence (or text) to replace :attr:`target`.
+    output the token sequence (or text) to replace {attr}`target`.
    For convenience, you can directly pass in the token sequence (or text)
    instead of a function if it does not depend on the input.
@@ -384,14 +384,14 @@ _M = TypeVar("_M", bound=Union[_HasModalityAttr, _HasModalityProp])
 def full_groupby_modality(values: Iterable[_M]) -> ItemsView[str, list[_M]]:
-    """Convenience function to apply :func:`full_groupby` based on modality."""
+    """Convenience function to apply {func}`full_groupby` based on modality."""
    return full_groupby(values, key=lambda x: x.modality)
 @dataclass
 class _BoundPromptSequence:
    """
-    A :data:`_PromptSeq` bound to a tokenizer to automatically
+    A {data}`_PromptSeq` bound to a tokenizer to automatically
    convert between token sequence and text representations.
    """
    tokenizer: AnyTokenizer = field(repr=False)
@@ -443,8 +443,8 @@ class _BoundPromptContent:
 @dataclass
 class BoundPromptUpdate:
    """
-    A :class:`PromptUpdate` bound to a tokenizer to automatically convert
+    A {class}`PromptUpdate` bound to a tokenizer to automatically convert
-    :attr:`target` and the result of :meth:`get_content` between
+    {attr}`target` and the result of {meth}`get_content` between
    token sequence and text representations.
    """
    _origin: PromptUpdate
@@ -479,7 +479,7 @@ class BoundPromptUpdate:
    def get_content(self, item_idx: int) -> _BoundPromptContent:
        """
-        Given the index of the processed item within :attr:`modality`,
+        Given the index of the processed item within {attr}`modality`,
        output the token sequence (or text) to update.
        """
        content = self.content
@@ -516,7 +516,7 @@ def iter_token_matches(
    match_ids: list[int],
 ) -> Generator[_TokenMatch]:
    """
-    Yield each occurrence of :code:`match_ids` in :code:`token_ids`.
+    Yield each occurrence of `match_ids` in `token_ids`.
    Note that empty matches are ignored.
    """
@@ -545,8 +545,8 @@ def replace_token_matches(
    new_ids: list[int],
 ) -> list[int]:
    """
-    Replace each occurrence of :code:`match_ids` in :code:`token_ids`
+    Replace each occurrence of `match_ids` in `token_ids`
-    with :code:`new_ids`.
+    with `new_ids`.
    Note that empty matches are ignored.
    """
@@ -654,7 +654,7 @@ def find_token_matches(
    prompt: list[int],
    prompt_updates: Sequence[BoundPromptUpdate],
 ) -> Sequence[PromptTargetMatch]:
-    """Return each target of :code:`prompt_updates` found in :code:`prompt`."""
+    """Return each target of `prompt_updates` found in `prompt`."""
    def get_matches(update: BoundPromptUpdate):
        target = update.target
@@ -680,7 +680,7 @@ def find_text_matches(
    prompt: str,
    prompt_updates: Sequence[BoundPromptUpdate],
 ) -> Sequence[PromptTargetMatch]:
-    """Return each target of :code:`prompt_updates` found in :code:`prompt`."""
+    """Return each target of `prompt_updates` found in `prompt`."""
    def get_matches(update: BoundPromptUpdate):
        target = update.target
@@ -707,7 +707,7 @@ def _resolve_matches(
    mm_matches: Mapping[str, Sequence[PromptTargetMatch]],
 ) -> list[PromptTargetMatch]:
    """
-    Resolve :code:`mm_matches` to ensure that there are no overlapping matches,
+    Resolve `mm_matches` to ensure that there are no overlapping matches,
    and sort them such that earlier matches take priority over later ones.
    """
    matches = [m for matches in mm_matches.values() for m in matches]
@@ -731,7 +731,7 @@ def _apply_matches(
    mm_matches: Mapping[str, Sequence[PromptTargetMatch]],
    mm_item_counts: Mapping[str, int],
 ) -> list[_S]:
-    """Apply the updates in :code:`mm_matches` to :code:`prompt`."""
+    """Apply the updates in `mm_matches` to `prompt`."""
    out_seqs = list[Union[str, list[int]]]()
    prev_end_idx = 0
    next_idx_by_modality = defaultdict[str, int](lambda: 0)
@@ -780,7 +780,7 @@ def apply_token_matches(
    mm_matches: Mapping[str, Sequence[PromptTargetMatch]],
    mm_item_counts: Mapping[str, int],
 ) -> list[int]:
-    """Apply the updates in :code:`mm_matches` to :code:`prompt`."""
+    """Apply the updates in `mm_matches` to `prompt`."""
    if not mm_matches:
        return prompt
@@ -794,7 +794,7 @@ def apply_text_matches(
    mm_matches: Mapping[str, Sequence[PromptTargetMatch]],
    mm_item_counts: Mapping[str, int],
 ) -> str:
-    """Apply the updates in :code:`mm_matches` to :code:`prompt`."""
+    """Apply the updates in `mm_matches` to `prompt`."""
    if not mm_matches:
        return prompt
@@ -809,7 +809,7 @@ def _iter_placeholders(
    mm_item_counts: Mapping[str, int],
 ) -> Iterable[PlaceholderFeaturesInfo]:
    """
-    Yield each set of placeholder tokens found in :code:`prompt`.
+    Yield each set of placeholder tokens found in `prompt`.
    Matches are exclusive even when multiple modalities share
    the same placeholder tokens. In that case, the modality that
@@ -1016,7 +1016,7 @@ class ProcessingCache:
    ) -> None:
        """
        Put a processed multi-modal item into the cache
-        according to its dependencies (see :meth:`get`).
+        according to its dependencies (see {meth}`get`).
        """
        cache_key = MultiModalHasher.hash_kwargs(model_id=model_id,
                                                 **{modality: input_item},
@@ -1083,7 +1083,7 @@ _I = TypeVar("_I", bound=BaseProcessingInfo)
 MultiModalHashes = dict[str, list[str]]
 """
-A collection of hashes with a similar structure as :class:`MultiModalKwargs`.
+A collection of hashes with a similar structure to {class}`MultiModalKwargs`.
 """
@@ -1091,7 +1091,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
    """
    Abstract base class to process multi-modal inputs to be used in vLLM.
-    Not to be confused with :class:`transformers.ProcessorMixin`.
+    Not to be confused with {class}`transformers.ProcessorMixin`.
    """
    def __init__(self,
@@ -1118,10 +1118,10 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
    def _get_data_parser(self) -> MultiModalDataParser:
        """
        Construct a parser to preprocess multi-modal data items
-        before passing them to :meth:`_get_hf_mm_data`.
+        before passing them to {meth}`_get_hf_mm_data`.
        You can support additional modalities by creating a subclass
-        of :class:`MultiModalDataParser` that has additional subparsers.
+        of {class}`MultiModalDataParser` that has additional subparsers.
        """
        return MultiModalDataParser()
@@ -1130,8 +1130,8 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
        mm_data: MultiModalDataDict,
    ) -> MultiModalDataItems:
        """
-        Normalize :class:`MultiModalDataDict` to :class:`MultiModalDataItems`
+        Normalize {class}`MultiModalDataDict` to {class}`MultiModalDataItems`
-        before passing them to :meth:`_get_hf_mm_data`.
+        before passing them to {meth}`_get_hf_mm_data`.
        """
        mm_items = self.data_parser.parse_mm_data(mm_data)
        supported_mm_limits = self.info.get_supported_mm_limits()
@@ -1183,7 +1183,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
        inputs.
        Moreover, this information is critical to determine the token positions
-        in order to construct  :class:`~vllm-multimodal.input.PlaceholderRange`
+        in order to construct {class}`~vllm.multimodal.inputs.PlaceholderRange`
        for each multi-modal item.
        """
        raise NotImplementedError
@@ -1237,8 +1237,8 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
        """
        Return whether the HF processor applies prompt updates.
-        For most HF processors, this should be :code:`True` when multi-modal
+        For most HF processors, this should be `True` when multi-modal
-        data items are passed, but :code:`False` when multi-modal embeddings
+        data items are passed, but `False` when multi-modal embeddings
        are passed.
        """
        return not any(
@@ -1307,7 +1307,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
        Most HF processors accept prompt text but not prompt tokens.
        If the HF processor adds or removes tokens that are not related to
        multi-modal data, you should override this method so it is consistent
-        with the output of :meth:`_apply_hf_processor_text_only` on the
+        with the output of {meth}`_apply_hf_processor_text_only` on the
        corresponding text.
        """
        return prompt_tokens
@@ -1322,7 +1322,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
        Since HF processor requires that text and multi-modal items
        correspond to each other, we generate dummy text using
-        :class:`DummyInputsBuilder` to go along with the multi-modal data.
+        {class}`DummyInputsBuilder` to go along with the multi-modal data.
        """
        mm_counts = mm_items.get_all_counts()
@@ -1346,10 +1346,10 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
        Apply the HF processor on the prompt text and multi-modal data.
        In addition, return whether prompt updates have been applied
-        (for most HF processors, this should be :code:`True`).
+        (for most HF processors, this should be `True`).
        Note:
-            If :code:`enable_hf_prompt_update=False`, we use HF processor
+            If `enable_hf_prompt_update=False`, we use HF processor
            to perform prompt updates if available; HF processor requires
            that the prompt corresponds to multi-modal items.
        """

--- a/vllm/multimodal/profiling.py
+++ b/vllm/multimodal/profiling.py
@@ -25,7 +25,7 @@ logger = init_logger(__name__)
 class ProcessorInputs:
    """
    Represents the keyword arguments to
-    :meth:`vllm.multimodal.processing.BaseMultiModalProcessor.apply`.
+    {meth}`vllm.multimodal.processing.BaseMultiModalProcessor.apply`.
    """
    prompt_text: str
    mm_data: MultiModalDataDict
@@ -63,7 +63,7 @@ class BaseDummyInputsBuilder(ABC, Generic[_I]):
    # TODO: @abstractmethod after transition
    def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
        """
-        Build the text input corresponding to :code:`mm_counts`.
+        Build the text input corresponding to `mm_counts`.
        """
        if (type(self).get_dummy_processor_inputs ==
                BaseDummyInputsBuilder.get_dummy_processor_inputs):

--- a/vllm/multimodal/registry.py
+++ b/vllm/multimodal/registry.py
@@ -29,7 +29,7 @@ _I_co = TypeVar("_I_co", bound=BaseProcessingInfo, covariant=True)
 class ProcessingInfoFactory(Protocol[_I_co]):
-    """Constructs a :class:`MultiModalProcessor` instance from the context."""
+    """Constructs a {class}`BaseProcessingInfo` instance from the context."""
    def __call__(
        self,
@@ -40,7 +40,7 @@ class ProcessingInfoFactory(Protocol[_I_co]):
 class DummyInputsBuilderFactory(Protocol[_I]):
    """
-    Constructs a :class:`BaseDummyInputsBuilder` instance from the context.
+    Constructs a {class}`BaseDummyInputsBuilder` instance from the context.
    """
    def __call__(self, info: _I) -> BaseDummyInputsBuilder[_I]:
@@ -48,7 +48,7 @@ class DummyInputsBuilderFactory(Protocol[_I]):
 class MultiModalProcessorFactory(Protocol[_I]):
-    """Constructs a :class:`MultiModalProcessor` instance from the context."""
+    """Constructs a {class}`MultiModalProcessor` instance from the context."""
    def __call__(
        self,
@@ -150,7 +150,7 @@ class MultiModalRegistry:
        Get the maximum number of tokens from each modality
        for profiling the memory usage of a model.
-        See :meth:`MultiModalPlugin.get_max_multimodal_tokens` for more details.
+        See {meth}`MultiModalPlugin.get_max_multimodal_tokens` for more details.
        """
        mm_limits = self.get_mm_limits_per_prompt(model_config)
@@ -165,7 +165,7 @@ class MultiModalRegistry:
        Get the maximum number of multi-modal tokens
        for profiling the memory usage of a model.
-        See :meth:`MultiModalPlugin.get_max_multimodal_tokens` for more details.
+        See {meth}`MultiModalPlugin.get_max_multimodal_tokens` for more details.
        """
        return sum(self.get_max_tokens_by_modality(model_config).values())
@@ -208,8 +208,9 @@ class MultiModalRegistry:
        When the model receives multi-modal data, the provided function is
        invoked to transform the data into a dictionary of model inputs.
-        See also:
+        :::{seealso}
-            :ref:`mm-processing`
+        {ref}`mm-processing`
+        :::
        """
        def wrapper(model_cls: N) -> N:
@@ -253,8 +254,9 @@ class MultiModalRegistry:
        """
        Create a multi-modal processor for a specific model and tokenizer.
-        See also:
+        :::{seealso}
-            :ref:`mm-processing`
+        {ref}`mm-processing`
+        :::
        """
        if not model_config.is_multimodal_model:
            raise ValueError(f"{model_config.model} is not a multimodal model")