[Doc] Convert Sphinx directives ( `{class}`, `{meth}`, `{attr}`, ...) to...

[Doc] Convert Sphinx directives ( `{class}`, `{meth}`, `{attr}`, ...) to MkDocs format for better documentation linking (#18663) Signed-off-by: Zerohertz <ohg3417@gmail.com>

[Doc] Convert Sphinx directives ( `{class}`, `{meth}`, `{attr}`, ...) to...
[Doc] Convert Sphinx directives ( `{class}`, `{meth}`, `{attr}`, ...) to MkDocs format for better documentation linking (#18663) Signed-off-by: Zerohertz <ohg3417@gmail.com>
a68e293c · Hyogeun Oh (오효근) · GitHub · 68811079 · a68e293c · a68e293c
Unverified Commit a68e293c authored May 27, 2025 by Hyogeun Oh (오효근) Committed by GitHub May 27, 2025
17 changed files
--- a/vllm/model_executor/models/molmo.py
+++ b/vllm/model_executor/models/molmo.py
@@ -965,7 +965,7 @@ def select_tiling(

 class MolmoProcessorWrapper:
    """
-    Wraps {class}`MolmoProcessor` so that it can be called directly.
+    Wraps `MolmoProcessor` so that it can be called directly.

    The original definition can be found here:
    https://huggingface.co/allenai/Molmo-7B-D-0924/blob/main/preprocessing_molmo.py

--- a/vllm/model_executor/models/pixtral.py
+++ b/vllm/model_executor/models/pixtral.py
@@ -67,14 +67,14 @@ class PixtralImagePixelInputs(TypedDict):
    """
    Shape: `(batch_size * num_images, num_channels, image_width, image_height)`

-    The result of stacking {attr}`ImageEncoding.tokens` from each prompt.
+    The result of stacking `ImageEncoding.tokens` from each prompt.
    """


 class PixtralProcessorAdapter:
    """
    Provide a HF-compatible interface for
-    {class}`mistral_common.tokens.tokenizers.multimodal.ImageEncoder`.
+    `mistral_common.tokens.tokenizers.multimodal.ImageEncoder`.
    """

    def __init__(self, tokenizer: MistralTokenizer) -> None:

--- a/vllm/model_executor/models/qwen_vl.py
+++ b/vllm/model_executor/models/qwen_vl.py
@@ -382,7 +382,8 @@ def _get_tokenizer_without_image_pad(
        tokenizer: PreTrainedTokenizer) -> PreTrainedTokenizer:
    """
    The logic of adding image pad tokens should only be applied in
-    {class}`QwenVLProcessor`, so they are patched out here.
+    [`QwenVLProcessor`][vllm.model_executor.models.qwen_vl.QwenVLProcessor],
+    so they are patched out here.

    The definition of the wrapped tokenizer can be found here:
    https://huggingface.co/Qwen/Qwen-VL/blob/main/tokenization_qwen.py

--- a/vllm/model_executor/models/registry.py
+++ b/vllm/model_executor/models/registry.py
@@ -383,7 +383,7 @@ class _ModelRegistry:

        `model_cls` can be either:

-        - A {class}`torch.nn.Module` class directly referencing the model.
+        - A [`torch.nn.Module`][] class directly referencing the model.
        - A string in the format `<module>:<class>` which can be used to
          lazily import the model. This is useful to avoid initializing CUDA
          when importing the model and thus the related error

--- a/vllm/model_executor/models/utils.py
+++ b/vllm/model_executor/models/utils.py
@@ -66,7 +66,7 @@ class WeightsMapper:

 class AutoWeightsLoader:
    """
-    Helper class to load weights into a {class}`torch.nn.Module`. It is able
+    Helper class to load weights into a [`torch.nn.Module`][]. It is able
    to automatically detect child modules and parameters while iterating over
    the weights only once.


--- a/vllm/multimodal/__init__.py
+++ b/vllm/multimodal/__init__.py
@@ -8,11 +8,12 @@ from .registry import MultiModalRegistry

 MULTIMODAL_REGISTRY = MultiModalRegistry()
 """
-The global {class}`~MultiModalRegistry` is used by model runners to
-dispatch data processing according to the target model.
+The global [`MultiModalRegistry`][vllm.multimodal.registry.MultiModalRegistry]
+is used by model runners to dispatch data processing according to the target
+model.

 Info:
-    [mm-processing][]
+    [mm_processing](../../../design/mm_processing.html)
 """

 __all__ = [

--- a/vllm/multimodal/inputs.py
+++ b/vllm/multimodal/inputs.py
@@ -29,14 +29,14 @@ _T = TypeVar("_T")

 HfImageItem: TypeAlias = Union["Image", np.ndarray, "torch.Tensor"]
 """
-A {class}`transformers.image_utils.ImageInput` representing a single image
+A `transformers.image_utils.ImageInput` representing a single image
 item, which can be passed to a HuggingFace `ImageProcessor`.
 """

 HfVideoItem: TypeAlias = Union[list["Image"], np.ndarray, "torch.Tensor",
                               list[np.ndarray], list["torch.Tensor"]]
 """
-A {class}`transformers.image_utils.VideoInput` representing a single video
+A `transformers.image_utils.VideoInput` representing a single video
 item, which can be passed to a HuggingFace `VideoProcessor`.
 """

@@ -48,7 +48,7 @@ item, which can be passed to a HuggingFace `AudioProcessor`.

 ImageItem: TypeAlias = Union[HfImageItem, "torch.Tensor"]
 """
-A {class}`transformers.image_utils.ImageInput` representing a single image
+A `transformers.image_utils.ImageInput` representing a single image
 item, which can be passed to a HuggingFace `ImageProcessor`.

 Alternatively, a 3-D tensor or batch of 2-D tensors,
@@ -58,7 +58,7 @@ these are directly passed to the model without HF processing.

 VideoItem: TypeAlias = Union[HfVideoItem, "torch.Tensor"]
 """
-A {class}`transformers.image_utils.VideoInput` representing a single video
+A `transformers.image_utils.VideoInput` representing a single video
 item, which can be passed to a HuggingFace `VideoProcessor`.

 Alternatively, a 3-D tensor or batch of 2-D tensors,
@@ -108,7 +108,8 @@ MultiModalDataDict: TypeAlias = Mapping[str, ModalityData[Any]]
 """
 A dictionary containing an entry for each modality type to input.

-The built-in modalities are defined by {class}`MultiModalDataBuiltins`.
+The built-in modalities are defined by
+[`MultiModalDataBuiltins`][vllm.multimodal.inputs.MultiModalDataBuiltins].
 """


@@ -169,7 +170,8 @@ Uses a list instead of a tensor if the dimensions of each element do not match.


 def nested_tensors_equal(a: NestedTensors, b: NestedTensors) -> bool:
-    """Equality check between {data}`NestedTensors` objects."""
+    """Equality check between
+    [`NestedTensors`][vllm.multimodal.inputs.NestedTensors] objects."""
    if isinstance(a, torch.Tensor):
        return isinstance(b, torch.Tensor) and torch.equal(a, b)
    elif isinstance(b, torch.Tensor):
@@ -189,7 +191,7 @@ def nested_tensors_equal(a: NestedTensors, b: NestedTensors) -> bool:
 BatchedTensorInputs: TypeAlias = Mapping[str, NestedTensors]
 """
 A dictionary containing nested tensors which have been batched via
-{meth}`MultiModalKwargs.batch`.
+[`MultiModalKwargs.batch`][vllm.multimodal.inputs.MultiModalKwargs.batch].
 """


@@ -197,7 +199,7 @@ A dictionary containing nested tensors which have been batched via
 class MultiModalFieldElem:
    """
    Represents a keyword argument corresponding to a multi-modal item
-    in {class}`MultiModalKwargs`.
+    in [`MultiModalKwargs`][vllm.multimodal.inputs.MultiModalKwargs].
    """

    modality: str
@@ -208,13 +210,15 @@ class MultiModalFieldElem:

    key: str
    """
-    The key of this field in {class}`MultiModalKwargs`,
+    The key of this field in
+    [`MultiModalKwargs`][vllm.multimodal.inputs.MultiModalKwargs],
    i.e. the name of the keyword argument to be passed to the model.
    """

    data: NestedTensors
    """
-    The tensor data of this field in {class}`MultiModalKwargs`,
+    The tensor data of this field in
+    [`MultiModalKwargs`][vllm.multimodal.inputs.MultiModalKwargs],
    i.e. the value of the keyword argument to be passed to the model.
    """

@@ -237,7 +241,8 @@ class MultiModalFieldElem:
 class BaseMultiModalField(ABC):
    """
    Defines how to interpret tensor data belonging to a keyword argument in
-    {class}`MultiModalKwargs` for multiple multi-modal items, and vice versa.
+    [`MultiModalKwargs`][vllm.multimodal.inputs.MultiModalKwargs] for multiple
+    multi-modal items, and vice versa.
    """

    def _field_factory(self, *, modality: str, key: str):
@@ -262,10 +267,12 @@ class BaseMultiModalField(ABC):
        data: NestedTensors,
    ) -> Sequence[MultiModalFieldElem]:
        """
-        Construct {class}`MultiModalFieldElem` instances to represent
-        the provided data.
+        Construct
+        [`MultiModalFieldElem`][vllm.multimodal.inputs.MultiModalFieldElem]
+        instances to represent the provided data.

-        This is the inverse of {meth}`reduce_data`.
+        This is the inverse of
+        [`reduce_data`][vllm.multimodal.inputs.BaseMultiModalField.reduce_data].
        """
        raise NotImplementedError

@@ -275,9 +282,11 @@ class BaseMultiModalField(ABC):

    def reduce_data(self, elems: list[MultiModalFieldElem]) -> NestedTensors:
        """
-        Merge the data from multiple instances of {class}`MultiModalFieldElem`.
+        Merge the data from multiple instances of
+        [`MultiModalFieldElem`][vllm.multimodal.inputs.MultiModalFieldElem].

-        This is the inverse of {meth}`build_elems`.
+        This is the inverse of
+        [`build_elems`][vllm.multimodal.inputs.BaseMultiModalField.build_elems].
        """
        field_types = [type(item.field) for item in elems]
        if len(set(field_types)) > 1:
@@ -290,7 +299,7 @@ class BaseMultiModalField(ABC):
 class MultiModalBatchedField(BaseMultiModalField):
    """
    Info:
-        [MultiModalFieldConfig.batched][]
+        [`MultiModalFieldConfig.batched`][vllm.multimodal.inputs.MultiModalFieldConfig.batched]
    """

    def build_elems(
@@ -320,8 +329,8 @@ class MultiModalBatchedField(BaseMultiModalField):
 class MultiModalFlatField(BaseMultiModalField):
    """
    Info:
-        [MultiModalFieldConfig.flat][]
-        [MultiModalFieldConfig.flat_from_sizes][]
+        [`MultiModalFieldConfig.flat`][vllm.multimodal.inputs.MultiModalFieldConfig.flat]
+        [`MultiModalFieldConfig.flat_from_sizes`][vllm.multimodal.inputs.MultiModalFieldConfig.flat_from_sizes]
    """
    slices: Union[Sequence[slice], Sequence[Sequence[slice]]]
    dim: int = 0
@@ -362,7 +371,7 @@ class MultiModalFlatField(BaseMultiModalField):
 class MultiModalSharedField(BaseMultiModalField):
    """
    Info:
-        [MultiModalFieldConfig.shared][]
+        [`MultiModalFieldConfig.shared`][vllm.multimodal.inputs.MultiModalFieldConfig.shared]
    """
    batch_size: int

@@ -508,7 +517,7 @@ class MultiModalFieldConfig:
        ```

        Info:
-            [MultiModalFieldConfig.flat][]
+            [`MultiModalFieldConfig.flat`][vllm.multimodal.inputs.MultiModalFieldConfig.flat]
        """

        if size_per_item.ndim != 1:
@@ -572,8 +581,10 @@ class MultiModalFieldConfig:

 class MultiModalKwargsItem(UserDict[str, MultiModalFieldElem]):
    """
-    A collection of {class}`MultiModalFieldElem`
-    corresponding to a data item in {class}`MultiModalDataItems`.
+    A collection of
+    [`MultiModalFieldElem`][vllm.multimodal.inputs.MultiModalFieldElem]
+    corresponding to a data item in
+    [`MultiModalDataItems`][vllm.multimodal.parse.MultiModalDataItems].
    """

    @staticmethod
@@ -592,11 +603,13 @@ class MultiModalKwargsItem(UserDict[str, MultiModalFieldElem]):
 class MultiModalKwargs(UserDict[str, NestedTensors]):
    """
    A dictionary that represents the keyword arguments to
-    {meth}`~torch.nn.Module.forward`.
+    [`torch.nn.Module.forward`][].

    The metadata `items` enables us to obtain the keyword arguments
-    corresponding to each data item in {class}`MultiModalDataItems`, via
-    {meth}`get_item` and {meth}`get_items`.
+    corresponding to each data item in
+    [`MultiModalDataItems`][vllm.multimodal.parse.MultiModalDataItems], via
+    [`get_item`][vllm.multimodal.inputs.MultiModalKwargs.get_item] and
+    [`get_items`][vllm.multimodal.inputs.MultiModalKwargs.get_items].
    """

    @staticmethod
@@ -635,7 +648,9 @@ class MultiModalKwargs(UserDict[str, NestedTensors]):

    @staticmethod
    def from_items(items: Sequence[MultiModalKwargsItem]):
-        """Construct a new {class}`MultiModalKwargs` from multiple items."""
+        """Construct a new
+        [`MultiModalKwargs`][vllm.multimodal.inputs.MultiModalKwargs]
+        from multiple items."""
        elems_by_key = defaultdict[str, list[MultiModalFieldElem]](list)
        for item in items:
            for key, elem in item.items():
@@ -800,7 +815,7 @@ A dictionary containing placeholder ranges for each modality.
 class MultiModalInputs(TypedDict):
    """
    Represents the outputs of
-    {class}`vllm.multimodal.processing.BaseMultiModalProcessor`,
+    [`BaseMultiModalProcessor`][vllm.multimodal.processing.BaseMultiModalProcessor],
    ready to be passed to vLLM internals.
    """

@@ -836,7 +851,8 @@ class MultiModalInputs(TypedDict):

 class MultiModalEncDecInputs(MultiModalInputs):
    """
-    Represents the outputs of {class}`vllm.multimodal.EncDecMultiModalProcessor`
+    Represents the outputs of
+    [`EncDecMultiModalProcessor`][vllm.multimodal.processing.EncDecMultiModalProcessor],
    ready to be passed to vLLM internals.
    """


--- a/vllm/multimodal/parse.py
+++ b/vllm/multimodal/parse.py
@@ -28,7 +28,8 @@ else:

 class ModalityDataItems(ABC, Generic[_T, _I]):
    """
-    Represents data items for a modality in {class}`MultiModalDataItems`.
+    Represents data items for a modality in
+    [`MultiModalDataItems`][vllm.multimodal.parse.MultiModalDataItems].
    """

    def __init__(self, data: _T, modality: str) -> None:
@@ -251,15 +252,15 @@ _D = TypeVar("_D", bound=ModalityDataItems[Any, Any])

 class MultiModalDataItems(UserDict[str, ModalityDataItems[Any, Any]]):
    """
-    As {data}`~vllm.multimodal.inputs.MultiModalDataDict`, but normalized
-    such that each entry corresponds to a list.
+    As [`MultiModalDataDict`][vllm.multimodal.inputs.MultiModalDataDict], but
+    normalized such that each entry corresponds to a list.
    """

    def get_count(self, modality: str, *, strict: bool = True) -> int:
        """
        Get the number of data items belonging to a modality.

-        If `strict=False`, return `0` instead of raising {exc}`KeyError`
+        If `strict=False`, return `0` instead of raising [`KeyError`][]
        even if the modality is not found.
        """
        if modality not in self:
@@ -305,8 +306,8 @@ ModalityDataParser: TypeAlias = Callable[[ModalityData[Any]],

 class MultiModalDataParser:
    """
-    Parses {data}`~vllm.multimodal.inputs.MultiModalDataDict` into
-    {class}`MultiModalDataItems`.
+    Parses [`MultiModalDataDict`][vllm.multimodal.inputs.MultiModalDataDict]
+    into [`MultiModalDataItems`][vllm.multimodal.parse.MultiModalDataItems].

    Args:
        target_sr (float, optional): Enables automatic resampling of audio

--- a/vllm/multimodal/processing.py
+++ b/vllm/multimodal/processing.py
@@ -114,13 +114,14 @@ class PromptUpdateDetails(Generic[_S]):

    is_embed: Optional[Callable[["_BoundPromptSequence"], torch.Tensor]] = None
    """
-    Given {attr}`full`, return a boolean mask of shape `(len(full),)`
-    indicating which positions of `full` to assign embeddings to.
+    Given [`full`][vllm.multimodal.processing.PromptUpdateDetails.full],
+    return a boolean mask of shape `(len(full),)` indicating which positions
+    of `full` to assign embeddings to.

    `None` (default) means to assign embeddings to all positions of `full`.

    The embeddings are obtained by calling
-    {class}`SupportsMultiModal.get_multimodal_embeddings`.
+    [`SupportsMultiModal.get_multimodal_embeddings`][vllm.model_executor.models.interfaces.SupportsMultiModal.get_multimodal_embeddings].
    """

    @staticmethod
@@ -159,13 +160,15 @@ PromptUpdateInfo = Union[PromptSeq, PromptUpdateDetails]
 The token sequence or text that are part of the update.

 If only part of the content corresponds to feature placeholders, you can
-use {class}`PromptUpdateDetails` to specify which part.
+use [`PromptUpdateDetails`][vllm.multimodal.processing.PromptUpdateDetails] to
+specify which part.
 """

 PromptUpdateContent = Union[Callable[[int], PromptUpdateInfo],
                            PromptUpdateInfo]
 """
-Given the index of the processed item within {attr}`modality`,
+Given the index of the processed item within
+[`modality`][vllm.multimodal.processing.PromptUpdate.modality],
 output the corresponding token sequence (or text).

 For convenience, you can directly pass in the token sequence (or text)
@@ -260,8 +263,10 @@ class PromptInsertion(PromptUpdate):

    insertion: PromptUpdateContent = field(repr=False)
    """
-    Given the index of the processed item within {attr}`modality`,
-    output the token sequence (or text) to insert right after {attr}`target`.
+    Given the index of the processed item within
+    [`modality`][vllm.multimodal.processing.PromptUpdate.modality],
+    output the token sequence (or text) to insert right after
+    [`target`][vllm.multimodal.processing.PromptUpdate.target].

    For convenience, you can directly pass in the token sequence (or text)
    instead of a function if it does not depend on the input.
@@ -332,8 +337,10 @@ class PromptReplacement(PromptUpdate):

    replacement: PromptUpdateContent = field(repr=False)
    """
-    Given the index of the processed item within {attr}`modality`,
-    output the token sequence (or text) to replace {attr}`target`.
+    Given the index of the processed item within
+    [`modality`][vllm.multimodal.processing.PromptUpdate.modality],
+    output the token sequence (or text) to replace
+    [`target`][vllm.multimodal.processing.PromptUpdate.target].

    For convenience, you can directly pass in the token sequence (or text)
    instead of a function if it does not depend on the input.
@@ -387,14 +394,16 @@ _M = TypeVar("_M", bound=Union[_HasModalityAttr, _HasModalityProp])


 def full_groupby_modality(values: Iterable[_M]) -> ItemsView[str, list[_M]]:
-    """Convenience function to apply [full_groupby][] based on modality."""
+    """Convenience function to apply [`full_groupby`][vllm.utils.full_groupby]
+    based on modality."""
    return full_groupby(values, key=lambda x: x.modality)


 @dataclass
 class _BoundPromptSequence:
    """
-    A {data}`_PromptSeq` bound to a tokenizer to automatically
+    A [`PromptSeq`][vllm.multimodal.processing.PromptSeq] bound
+    to a tokenizer to automatically
    convert between token sequence and text representations.
    """
    tokenizer: AnyTokenizer = field(repr=False)
@@ -446,9 +455,11 @@ class _BoundPromptContent:
 @dataclass
 class BoundPromptUpdate:
    """
-    A {class}`PromptUpdate` bound to a tokenizer to automatically convert
-    {attr}`target` and the result of {meth}`get_content` between
-    token sequence and text representations.
+    A [`PromptUpdate`][vllm.multimodal.processing.PromptUpdate] bound
+    to a tokenizer to automatically convert
+    [`target`][vllm.multimodal.processing.PromptUpdate.target] and the result of
+    [`get_content`][vllm.multimodal.processing.BoundPromptUpdate.get_content]
+    between token sequence and text representations.
    """
    _origin: PromptUpdate
    tokenizer: AnyTokenizer = field(repr=False)
@@ -482,7 +493,8 @@ class BoundPromptUpdate:

    def get_content(self, item_idx: int) -> _BoundPromptContent:
        """
-        Given the index of the processed item within {attr}`modality`,
+        Given the index of the processed item within
+        [`modality`][vllm.multimodal.processing.PromptUpdate.modality],
        output the token sequence (or text) to update.
        """
        content = self.content
@@ -1019,7 +1031,8 @@ class ProcessingCache:
    ) -> None:
        """
        Put a processed multi-modal item into the cache
-        according to its dependencies (see {meth}`get`).
+        according to its dependencies
+        (see [`get`][vllm.multimodal.processing.ProcessingCache.get]).
        """
        cache_key = MultiModalHasher.hash_kwargs(model_id=model_id,
                                                 **{modality: input_item},
@@ -1091,7 +1104,8 @@ _I = TypeVar("_I", bound=BaseProcessingInfo)

 MultiModalHashes = dict[str, list[str]]
 """
-A collection of hashes with a similar structure as {class}`MultiModalKwargs`.
+A collection of hashes with a similar structure as
+[`MultiModalKwargs`][vllm.multimodal.inputs.MultiModalKwargs].
 """


@@ -1099,7 +1113,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
    """
    Abstract base class to process multi-modal inputs to be used in vLLM.

-    Not to be confused with {class}`transformers.ProcessorMixin`.
+    Not to be confused with `transformers.ProcessorMixin`.
    """

    def __init__(self,
@@ -1126,10 +1140,12 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
    def _get_data_parser(self) -> MultiModalDataParser:
        """
        Construct a parser to preprocess multi-modal data items
-        before passing them to {meth}`_get_hf_mm_data`.
+        before passing them to
+        [`_get_hf_mm_data`][vllm.multimodal.processing.BaseMultiModalProcessor._get_hf_mm_data].

        You can support additional modalities by creating a subclass
-        of {class}`MultiModalDataParser` that has additional subparsers.
+        of [`MultiModalDataParser`][vllm.multimodal.parse.MultiModalDataParser]
+        that has additional subparsers.
        """
        return MultiModalDataParser()

@@ -1138,8 +1154,11 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
        mm_data: MultiModalDataDict,
    ) -> MultiModalDataItems:
        """
-        Normalize {class}`MultiModalDataDict` to {class}`MultiModalDataItems`
-        before passing them to {meth}`_get_hf_mm_data`.
+        Normalize
+        [`MultiModalDataDict`][vllm.multimodal.inputs.MultiModalDataDict]
+        to [`MultiModalDataItems`][vllm.multimodal.parse.MultiModalDataItems]
+        before passing them to
+        [`_get_hf_mm_data`][vllm.multimodal.processing.BaseMultiModalProcessor._get_hf_mm_data].
        """
        mm_items = self.data_parser.parse_mm_data(mm_data)
        supported_mm_limits = self.info.get_supported_mm_limits()
@@ -1191,7 +1210,8 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
        inputs.

        Moreover, this information is critical to determine the token positions
-        in order to construct  {class}`~vllm-multimodal.input.PlaceholderRange`
+        in order to construct
+        [`PlaceholderRange`][vllm.multimodal.inputs.PlaceholderRange]
        for each multi-modal item.
        """
        raise NotImplementedError
@@ -1315,7 +1335,9 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
        Most HF processors accept prompt text but not prompt tokens.
        If the HF processor adds or removes tokens that are not related to
        multi-modal data, you should override this method so it is consistent
-        with the output of {meth}`_apply_hf_processor_text_only` on the
+        with the output of
+        [`_apply_hf_processor_text_only`][vllm.multimodal.processing.BaseMultiModalProcessor._apply_hf_processor_text_only]
+        on the
        corresponding text.
        """
        return prompt_tokens
@@ -1330,7 +1352,8 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):

        Since HF processor requires that text and multi-modal items
        correspond to each other, we generate dummy text using
-        {class}`DummyInputsBuilder` to go along with the multi-modal data.
+        [`BaseDummyInputsBuilder`][vllm.multimodal.profiling.BaseDummyInputsBuilder]
+        to go along with the multi-modal data.
        """
        mm_counts = mm_items.get_all_counts()


--- a/vllm/multimodal/profiling.py
+++ b/vllm/multimodal/profiling.py
@@ -25,7 +25,7 @@ logger = init_logger(__name__)
 class ProcessorInputs:
    """
    Represents the keyword arguments to
-    {meth}`vllm.multimodal.processing.BaseMultiModalProcessor.apply`.
+    [`vllm.multimodal.processing.BaseMultiModalProcessor.apply`][].
    """
    prompt: Union[str, list[int]]
    mm_data: MultiModalDataDict

--- a/vllm/multimodal/registry.py
+++ b/vllm/multimodal/registry.py
@@ -29,7 +29,11 @@ _I_co = TypeVar("_I_co", bound=BaseProcessingInfo, covariant=True)


 class ProcessingInfoFactory(Protocol[_I_co]):
-    """Constructs a {class}`MultiModalProcessor` instance from the context."""
+    """
+    Constructs a
+    [`BaseMultiModalProcessor`][vllm.multimodal.processing.BaseMultiModalProcessor]
+    instance from the context.
+    """

    def __call__(
        self,
@@ -40,7 +44,9 @@ class ProcessingInfoFactory(Protocol[_I_co]):

 class DummyInputsBuilderFactory(Protocol[_I]):
    """
-    Constructs a {class}`BaseDummyInputsBuilder` instance from the context.
+    Constructs a
+    [`BaseDummyInputsBuilder`][vllm.multimodal.profiling.BaseDummyInputsBuilder]
+    instance from the context.
    """

    def __call__(self, info: _I) -> BaseDummyInputsBuilder[_I]:
@@ -48,7 +54,11 @@ class DummyInputsBuilderFactory(Protocol[_I]):


 class MultiModalProcessorFactory(Protocol[_I]):
-    """Constructs a {class}`MultiModalProcessor` instance from the context."""
+    """
+    Constructs a
+    [`BaseMultiModalProcessor`][vllm.multimodal.processing.BaseMultiModalProcessor]
+    instance from the context.
+    """

    def __call__(
        self,
@@ -155,8 +165,6 @@ class MultiModalRegistry:
        """
        Get the maximum number of tokens from each modality
        for profiling the memory usage of a model.
-
-        See {meth}`MultiModalPlugin.get_max_multimodal_tokens` for more details.
        """
        mm_limits = self.get_mm_limits_per_prompt(model_config)

@@ -170,8 +178,6 @@ class MultiModalRegistry:
        """
        Get the maximum number of multi-modal tokens
        for profiling the memory usage of a model.
-
-        See {meth}`MultiModalPlugin.get_max_multimodal_tokens` for more details.
        """
        return sum(self.get_max_tokens_by_modality(model_config).values())

@@ -213,9 +219,6 @@ class MultiModalRegistry:

        When the model receives multi-modal data, the provided function is
        invoked to transform the data into a dictionary of model inputs.
-
-        Info:
-            [mm-processing][]
        """

        def wrapper(model_cls: N) -> N:
@@ -258,9 +261,6 @@ class MultiModalRegistry:
    ) -> BaseMultiModalProcessor[BaseProcessingInfo]:
        """
        Create a multi-modal processor for a specific model and tokenizer.
-
-        Info:
-            [mm-processing][]
        """
        if not model_config.is_multimodal_model:
            raise ValueError(f"{model_config.model} is not a multimodal model")

--- a/vllm/multimodal/utils.py
+++ b/vllm/multimodal/utils.py
@@ -259,7 +259,8 @@ class MediaConnector:


 global_media_connector = MediaConnector()
-"""The global {class}`MediaConnector` instance used by vLLM."""
+"""The global [`MediaConnector`][vllm.multimodal.utils.MediaConnector]
+instance used by vLLM."""

 fetch_audio = global_media_connector.fetch_audio
 fetch_image = global_media_connector.fetch_image

--- a/vllm/platforms/interface.py
+++ b/vllm/platforms/interface.py
@@ -84,7 +84,7 @@ class DeviceCapability(NamedTuple):

    def to_int(self) -> int:
        """
-        Express device capability as an integer ``<major><minor>``.
+        Express device capability as an integer `<major><minor>`.

        It is assumed that the minor version is always a single digit.
        """
@@ -206,10 +206,11 @@ class Platform:
        """
        Test whether this platform is compatible with a device capability.

-        The ``capability`` argument can either be:
+        The `capability` argument can either be:

-        - A tuple ``(major, minor)``.
-        - An integer ``<major><minor>``. (See {meth}`DeviceCapability.to_int`)
+        - A tuple `(major, minor)`.
+        - An integer `<major><minor>`. (See
+          [`DeviceCapability.to_int`][vllm.platforms.interface.DeviceCapability.to_int])
        """
        current_capability = cls.get_device_capability(device_id=device_id)
        if current_capability is None:

--- a/vllm/sequence.py
+++ b/vllm/sequence.py
@@ -27,7 +27,7 @@ VLLM_INVALID_TOKEN_ID = -1


 def array_full(token_id: int, count: int):
-    """{class}`array` equivalent of [numpy.full][]."""
+    """[`array`][array.array] equivalent of [numpy.full][]."""
    return array(VLLM_TOKEN_ID_ARRAY_TYPE, [token_id]) * count


@@ -192,8 +192,8 @@ class SequenceData(msgspec.Struct,
    def from_prompt_token_counts(
            *token_counts: tuple[int, int]) -> "SequenceData":
        """
-        Construct a {class}`SequenceData` instance by concatenating
-        prompt token sequences.
+        Construct a [`SequenceData`][vllm.sequence.SequenceData] instance
+        by concatenating prompt token sequences.

        Each tuple represents one token sequence, expressed in the form
        `(token_id, count)`.
@@ -216,8 +216,8 @@ class SequenceData(msgspec.Struct,
        prompt_embeds: Optional[torch.Tensor] = None,
    ) -> "SequenceData":
        """
-        Construct a {class}`SequenceData` instance from prompt and output
-        token sequences.
+        Construct a [`SequenceData`][vllm.sequence.SequenceData] instance
+        from prompt and output token sequences.
        """
        prompt_token_ids_arr = array(VLLM_TOKEN_ID_ARRAY_TYPE,
                                     prompt_token_ids)
@@ -452,9 +452,11 @@ class SequenceData(msgspec.Struct,
 class Sequence:
    """Stores the data, status, and block information of a sequence.

-    The sequence is constructed from the {data}`DecoderOnlyInputs`
-    (for decoder-only) or {data}`EncoderDecoderInputs` (for encoder-decoder)
-    instance passed in through the `inputs` constructor argument.
+    The sequence is constructed from the
+    [`DecoderOnlyInputs`][vllm.inputs.data.DecoderOnlyInputs] (for decoder-only)
+    or [`EncoderDecoderInputs`][vllm.inputs.data.EncoderDecoderInputs]
+    (for encoder-decoder) instance passed in through the `inputs`
+    constructor argument.

    Args:
        seq_id: The ID of the sequence.

--- a/vllm/utils.py
+++ b/vllm/utils.py
@@ -1005,7 +1005,7 @@ def flatten_2d_lists(lists: Iterable[Iterable[T]]) -> list[T]:

 def full_groupby(values: Iterable[_V], *, key: Callable[[_V], _K]):
    """
-    Unlike {class}`itertools.groupby`, groups are not broken by
+    Unlike [`itertools.groupby`][], groups are not broken by
    non-contiguous data.
    """
    groups = defaultdict[_K, list[_V]](list)
@@ -1926,7 +1926,8 @@ class _PlaceholderBase:
    Disallows downstream usage of placeholder modules.

    We need to explicitly override each dunder method because
-    {meth}`__getattr__` is not called when they are accessed.
+    [`__getattr__`][vllm.utils._PlaceholderBase.__getattr__]
+    is not called when they are accessed.

    Info:
        [Special method lookup](https://docs.python.org/3/reference/datamodel.html#special-lookup)

--- a/vllm/v1/worker/utils.py
+++ b/vllm/v1/worker/utils.py
@@ -10,7 +10,7 @@ def sanity_check_mm_encoder_outputs(
 ) -> None:
    """
    Perform sanity checks for the result of
-    {meth}`vllm.model_executor.models.SupportsMultiModal.get_multimodal_embeddings`.
+    [`SupportsMultiModal.get_multimodal_embeddings`][vllm.model_executor.models.interfaces.SupportsMultiModal.get_multimodal_embeddings].
    """
    assert isinstance(mm_embeddings, (list, tuple, torch.Tensor)), (
        "Expected multimodal embeddings to be a list/tuple of 2D tensors, "
@@ -39,7 +39,7 @@ def scatter_mm_placeholders(
    Scatter the multimodal embeddings into a contiguous tensor that represents
    the placeholder tokens.

-    {class}`vllm.multimodal.processing.PromptUpdateDetails.is_embed`.
+    See [`vllm.multimodal.processing.PromptUpdateDetails.is_embed`][].

    Args:
        embeds: The multimodal embeddings.

--- a/vllm/worker/multi_step_model_runner.py
+++ b/vllm/worker/multi_step_model_runner.py
@@ -733,12 +733,13 @@ def _pythonize_sampler_output(
    logprobs_tensor: Optional[torch.Tensor],
    cache: Optional[PythonizationCache],
 ) -> None:
-    """ This function is only called when the output tensors are ready. 
-    See {class}`ModelOutput`. 
-    
-    Modifies `output.outputs` and `pinned_sampled_token_buffer` in-place, 
+    """ This function is only called when the output tensors are ready.
+    See [`ModelOutput`][vllm.worker.multi_step_model_runner.ModelOutput].
+
+    Modifies `output.outputs` and `pinned_sampled_token_buffer` in-place,
    adding a Pythonized output data structure
-    ({class}`CompletionSequenceGroupOutput`) for each {class}`SequenceGroup`.
+    ([`CompletionSequenceGroupOutput`][vllm.sequence.CompletionSequenceGroupOutput])
+    for each [`SequenceGroup`][vllm.sequence.SequenceGroup].

    Args:
      model_input