Add full API docs and improve the UX of navigating them (#17485)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>

Add full API docs and improve the UX of navigating them (#17485)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
d6484ef3 · Harry Mellor · GitHub · 46fae69c · d6484ef3 · d6484ef3
Unverified Commit d6484ef3 authored May 04, 2025 by Harry Mellor Committed by GitHub May 03, 2025
20 changed files
--- a/vllm/multimodal/utils.py
+++ b/vllm/multimodal/utils.py
@@ -2,7 +2,7 @@
 from itertools import groupby
 from pathlib import Path
-from typing import TYPE_CHECKING, Optional, TypeVar, Union
+from typing import TYPE_CHECKING, Any, Optional, TypeVar, Union
 from urllib.parse import ParseResult, urlparse
 import numpy as np
@@ -24,6 +24,10 @@ _M = TypeVar("_M")
 if TYPE_CHECKING:
    from .hasher import MultiModalHashDict
    from .inputs import MultiModalKwargs, MultiModalPlaceholderDict
+else:
+    MultiModalHashDict = Any
+    MultiModalKwargs = Any
+    MultiModalPlaceholderDict = Any
 class MediaConnector:
@@ -255,7 +259,7 @@ class MediaConnector:
 global_media_connector = MediaConnector()
-"""The global :class:`MediaConnector` instance used by vLLM."""
+"""The global {class}`MediaConnector` instance used by vLLM."""
 fetch_audio = global_media_connector.fetch_audio
 fetch_image = global_media_connector.fetch_image
@@ -293,24 +297,24 @@ def encode_video_base64(frames: npt.NDArray) -> str:
 def merge_and_sort_multimodal_metadata(
-    mm_positions: "MultiModalPlaceholderDict",
+    mm_positions: MultiModalPlaceholderDict,
-    mm_hashes: Optional["MultiModalHashDict"],
+    mm_hashes: Optional[MultiModalHashDict],
 ) -> tuple[list[str], list[PlaceholderRange], Optional[list[str]]]:
    """Given a MultiModalPlaceholderDict, merge all PlaceholderRange
    objects from all available modalities into a single list of 
    PlaceholderRange, sorted by their offset (starting index in the input
    sequence) in the ascending order.
-    Optionally if a MultiModalHashDict is given, same operation will be 
+    Optionally if a `MultiModalHashDict` is given, same operation will be
    applied to the object and the sorted list of hashes will be returned.
    Returns:
-        list[str]: List of item modalities in order of their positions in
+        list[str]: List of item modalities in order of their positions in the
-            the input sequence.
+        input sequence.
        list[PlaceholderRange]: Sorted list of all PlaceholderRanges from
        mm_positions.
-        Optional[list[str]]: Sorted list of all hashes from mm_hashes if 
+        Optional[list[str]]: Sorted list of all hashes from mm_hashes if given,
-            given, None otherwise.
+        None otherwise.
    """
    modalities = list(mm_positions.keys())
@@ -352,7 +356,7 @@ def merge_and_sort_multimodal_metadata(
 def group_mm_inputs_by_modality(
-        mm_inputs: list["MultiModalKwargs"]) -> list[list["MultiModalKwargs"]]:
+        mm_inputs: list[MultiModalKwargs]) -> list[list[MultiModalKwargs]]:
    """Group consecutive MultiModalKwargs from mm_inputs with the same modality
    together into the same list for batching purpose. For MultiModalKwargs with
    multiple modalities, put them into their own list.
@@ -361,13 +365,14 @@ def group_mm_inputs_by_modality(
        mm_inputs: List of MultiModalKwargs.
    Returns:
-        list[list[MultiModalKwargs]]: List of list of MultiModalKwargs, each 
+        list[list[vllm.multimodal.MultiModalKwargs]]: List of list of
-        inner list contains consecutive MultiModalKwargs with same modality.
+        `MultiModalKwargs`, each inner list contains consecutive
+        `MultiModalKwargs` with same modality.
    """
    if not mm_inputs:
        return []
-    def modality_group_func(mm_input: "MultiModalKwargs") -> Union[str, int]:
+    def modality_group_func(mm_input: MultiModalKwargs) -> Union[str, int]:
        # If the input has multiple modalities, return an id as the unique key
        # for the mm_input input.
        if len(mm_input.modalities) > 1:

--- a/vllm/platforms/cpu.py
+++ b/vllm/platforms/cpu.py
@@ -19,8 +19,6 @@ if TYPE_CHECKING:
 else:
    VllmConfig = None
-logger = init_logger(__name__)
 class CpuPlatform(Platform):
    _enum = PlatformEnum.CPU

--- a/vllm/platforms/cuda.py
+++ b/vllm/platforms/cuda.py
@@ -454,10 +454,4 @@ finally:
 CudaPlatform = NvmlCudaPlatform if nvml_available else NonNvmlCudaPlatform
-try:
+CudaPlatform.log_warnings()
-    from sphinx.ext.autodoc.mock import _MockModule
-    if not isinstance(pynvml, _MockModule):
-        CudaPlatform.log_warnings()
-except ModuleNotFoundError:
-    CudaPlatform.log_warnings()
--- a/vllm/platforms/interface.py
+++ b/vllm/platforms/interface.py
@@ -146,7 +146,7 @@ class Platform:
        return self._enum == PlatformEnum.OOT
    def is_cuda_alike(self) -> bool:
-        """Stateless version of :func:`torch.cuda.is_available`."""
+        """Stateless version of {func}`torch.cuda.is_available`."""
        return self._enum in (PlatformEnum.CUDA, PlatformEnum.ROCM)
    def is_sleep_mode_available(self) -> bool:
@@ -165,7 +165,7 @@ class Platform:
        cls,
        device_id: int = 0,
    ) -> Optional[DeviceCapability]:
-        """Stateless version of :func:`torch.cuda.get_device_capability`."""
+        """Stateless version of {func}`torch.cuda.get_device_capability`."""
        return None
    @classmethod
@@ -180,7 +180,7 @@ class Platform:
        The ``capability`` argument can either be:
        - A tuple ``(major, minor)``.
-        - An integer ``<major><minor>``. (See :meth:`DeviceCapability.to_int`)
+        - An integer ``<major><minor>``. (See {meth}`DeviceCapability.to_int`)
        """
        current_capability = cls.get_device_capability(device_id=device_id)
        if current_capability is None:

--- a/vllm/profiler/__init__.py
+++ b/vllm/profiler/__init__.py
-# SPDX-License-Identifier: Apache-2.0
-from .layerwise_profile import layerwise_profile
-__all__ = [
-    "layerwise_profile",
-]
--- a/vllm/sequence.py
+++ b/vllm/sequence.py
@@ -27,7 +27,7 @@ VLLM_INVALID_TOKEN_ID = -1
 def array_full(token_id: int, count: int):
-    """:class:`array` equivalent of :func:`numpy.full`."""
+    """{class}`array` equivalent of {func}`numpy.full`."""
    return array(VLLM_TOKEN_ID_ARRAY_TYPE, [token_id]) * count
@@ -192,11 +192,11 @@ class SequenceData(msgspec.Struct,
    def from_prompt_token_counts(
            *token_counts: tuple[int, int]) -> "SequenceData":
        """
-        Construct a :class:`SequenceData` instance by concatenating
+        Construct a {class}`SequenceData` instance by concatenating
        prompt token sequences.
        Each tuple represents one token sequence, expressed in the form
-        :code:`(token_id, count)`.
+        `(token_id, count)`.
        """
        if len(token_counts) == 0:
            return SequenceData.from_seqs([])
@@ -216,7 +216,7 @@ class SequenceData(msgspec.Struct,
        prompt_embeds: Optional[torch.Tensor] = None,
    ) -> "SequenceData":
        """
-        Construct a :class:`SequenceData` instance from prompt and output
+        Construct a {class}`SequenceData` instance from prompt and output
        token sequences.
        """
        prompt_token_ids_arr = array(VLLM_TOKEN_ID_ARRAY_TYPE,
@@ -452,9 +452,9 @@ class SequenceData(msgspec.Struct,
 class Sequence:
    """Stores the data, status, and block information of a sequence.
-    The sequence is constructed from the :data:`DecoderOnlyInputs`
+    The sequence is constructed from the {data}`DecoderOnlyInputs`
-    (for decoder-only) or :data:`EncoderDecoderInputs` (for encoder-decoder)
+    (for decoder-only) or {data}`EncoderDecoderInputs` (for encoder-decoder)
-    instance passed in through the :code:`inputs` constructor argument.
+    instance passed in through the `inputs` constructor argument.
    Args:
        seq_id: The ID of the sequence.

--- a/vllm/spec_decode/smaller_tp_proposer_worker.py
+++ b/vllm/spec_decode/smaller_tp_proposer_worker.py
@@ -52,7 +52,8 @@ class SmallerTpProposerWorker(ProposerWorkerBase):
        """Create a SmallerTpProposerWorker.
        Args:
-            worker (MultiStepWorker): an actual worker wrapped with this class
+            worker (~vllm.spec_decode.multi_step_worker.MultiStepWorker): an
+            actual worker wrapped with this class
            draft_ranks (List[int]): if this value is given, only the GPU ranks
            written in this value participate in draft generation
        """

--- a/vllm/transformers_utils/configs/dbrx.py
+++ b/vllm/transformers_utils/configs/dbrx.py
@@ -196,8 +196,7 @@ class DbrxConfig(PretrainedConfig):
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        output_router_logits (`bool`, *optional*, defaults to `False`):
-            Whether or not the router logits should be returned by the model. Enabling this will also
+            Whether or not the router logits should be returned by the model. Enabling this will also allow the model to output the auxiliary loss.
-            allow the model to output the auxiliary loss. See [here]() for more details
        router_aux_loss_coef (`float`, *optional*, defaults to 0.001):
            The aux loss factor for the total loss.

--- a/vllm/transformers_utils/configs/exaone.py
+++ b/vllm/transformers_utils/configs/exaone.py
@@ -35,22 +35,22 @@ class ExaoneConfig(PretrainedConfig):
    Instantiating a configuration with the defaults will yield a similar
    configuration to that of the Exaone
-    Configuration objects inherit from :class:`~transformers.PretrainedConfig`
+    Configuration objects inherit from {class}`~transformers.PretrainedConfig`
    and can be used to control the model outputs. Read the documentation from
    {class}`~transformers.PretrainedConfig` for more information.
    Args:
-        vocab_size (:obj:`int`, `optional`, defaults to 50257):
+        vocab_size ({obj}`int`, `optional`, defaults to 50257):
            Vocabulary size of the GPT Lingvo model. Defines the number of
-            different tokens that can be represented by the :obj:`inputs_ids`
+            different tokens that can be represented by the {obj}`inputs_ids`
-            passed when calling :class:`~transformers.ExaoneModel`. Vocabulary
+            passed when calling {class}`~transformers.ExaoneModel`. Vocabulary
            size of the model.
            Defines the different tokens that can be represented by the
            `inputs_ids` passed to the forward method of
            {class}`~transformers.EXAONEModel`.
-        hidden_size (:obj:`int`, `optional`, defaults to 2048):
+        hidden_size ({obj}`int`, `optional`, defaults to 2048):
            Dimensionality of the encoder layers and the pooler layer.
-        num_layers (:obj:`int`, `optional`, defaults to 24):
+        num_layers ({obj}`int`, `optional`, defaults to 24):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 32):
            Number of attention heads for each attention layer in the
@@ -68,37 +68,37 @@ class ExaoneConfig(PretrainedConfig):
            specified, will default to `num_attention_heads`.
        rotary_pct (`float`, *optional*, defaults to 0.25):
            percentage of hidden dimensions to allocate to rotary embeddings
-        intermediate_size (:obj:`int`, `optional`, defaults to 8192):
+        intermediate_size ({obj}`int`, `optional`, defaults to 8192):
            Dimensionality of the "intermediate" (i.e., feed-forward) layer in
            the Transformer encoder.
-        activation_function (:obj:`str` or :obj:`function`, `optional`,
+        activation_function ({obj}`str` or {obj}`function`, `optional`,
-        defaults to :obj:`"gelu_new"`):
+        defaults to {obj}`"gelu_new"`):
            The non-linear activation function (function or string) in the
-            encoder and pooler. If string, :obj:`"gelu"`, :obj:`"relu"`,
+            encoder and pooler. If string, {obj}`"gelu"`, {obj}`"relu"`,
-            :obj:`"selu"` and :obj:`"gelu_new"` are supported.
+            {obj}`"selu"` and {obj}`"gelu_new"` are supported.
-        embed_dropout (:obj:`float`, `optional`, defaults to 0.0):
+        embed_dropout ({obj}`float`, `optional`, defaults to 0.0):
            The dropout probability for all fully connected layers in the
            embeddings, encoder, and pooler.
-        attention_dropout (:obj:`float`, `optional`, defaults to 0.0):
+        attention_dropout ({obj}`float`, `optional`, defaults to 0.0):
            The dropout ratio for the attention probabilities.
-        max_position_embeddings (:obj:`int`, `optional`, defaults to 2048):
+        max_position_embeddings ({obj}`int`, `optional`, defaults to 2048):
            The maximum sequence length that this model might ever be used with.
            Typically set this to something large just in case
            (e.g., 512 or 1024 or 2048).
-        type_vocab_size (:obj:`int`, `optional`, defaults to 2):
+        type_vocab_size ({obj}`int`, `optional`, defaults to 2):
-            The vocabulary size of the :obj:`token_type_ids` passed when calling
+            The vocabulary size of the {obj}`token_type_ids` passed when calling
-            :class:`~transformers.EXAONEModel`.
+            {class}`~transformers.EXAONEModel`.
-        initializer_range (:obj:`float`, `optional`, defaults to 0.02):
+        initializer_range ({obj}`float`, `optional`, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for
            initializing all weight matrices.
-        layer_norm_epsilon (:obj:`float`, `optional`, defaults to 1e-5):
+        layer_norm_epsilon ({obj}`float`, `optional`, defaults to 1e-5):
            The epsilon used by the layer normalization layers.
-        use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        use_cache ({obj}`bool`, `optional`, defaults to {obj}`True`):
            Whether or not the model should return the last key/values
            attentions (not used by all models).
            Only relevant if ``config.is_decoder=True``.
-        gradient_checkpointing (:obj:`bool`, `optional`,
+        gradient_checkpointing ({obj}`bool`, `optional`,
-        defaults to :obj:`False`):
+        defaults to {obj}`False`):
            If True, use gradient checkpointing to save memory at the expense
            of slower backward pass.
        Example::

--- a/vllm/transformers_utils/tokenizer.py
+++ b/vllm/transformers_utils/tokenizer.py
@@ -39,9 +39,9 @@ def decode_tokens(
 ) -> str:
    """
    Backend-agnostic equivalent of HF's
-    :code:`tokenizer.decode(token_ids, ...)`.
+    `tokenizer.decode(token_ids, ...)`.
-    :code:`skip_special_tokens=None` means to use the backend's default
+    `skip_special_tokens=None` means to use the backend's default
    settings.
    """
    if skip_special_tokens is not None:
@@ -61,9 +61,9 @@ def encode_tokens(
 ) -> list[int]:
    """
    Backend-agnostic equivalent of HF's
-    :code:`tokenizer.encode(text, ...)`.
+    `tokenizer.encode(text, ...)`.
-    :code:`add_special_tokens=None` means to use the backend's default
+    `add_special_tokens=None` means to use the backend's default
    settings.
    """

--- a/vllm/utils.py
+++ b/vllm/utils.py
@@ -309,8 +309,8 @@ class LRUCache(cachetools.LRUCache[_K, _V], Generic[_K, _V]):
        """
        Gets the cumulative number of hits and queries against this cache.
-        If :code:`delta=True`, instead gets these statistics
+        If `delta=True`, instead gets these statistics
-        since the last call that also passed :code:`delta=True`.
+        since the last call that also passed `delta=True`.
        """
        info = CacheInfo(hits=self._hits, total=self._total)
@@ -983,7 +983,7 @@ def flatten_2d_lists(lists: Iterable[Iterable[T]]) -> list[T]:
 def full_groupby(values: Iterable[_V], *, key: Callable[[_V], _K]):
    """
-    Unlike :class:`itertools.groupby`, groups are not broken by
+    Unlike {class}`itertools.groupby`, groups are not broken by
    non-contiguous data.
    """
    groups = defaultdict[_K, list[_V]](list)
@@ -1773,14 +1773,6 @@ def get_cuda_view_from_cpu_tensor(cpu_tensor: torch.Tensor) -> torch.Tensor:
    return torch.ops._C.get_cuda_view_from_cpu_tensor(cpu_tensor)
-def is_in_doc_build() -> bool:
-    try:
-        from sphinx.ext.autodoc.mock import _MockModule
-        return isinstance(torch, _MockModule)
-    except ModuleNotFoundError:
-        return False
 def import_from_path(module_name: str, file_path: Union[str, os.PathLike]):
    """
    Import a Python file according to its file path.
@@ -1820,10 +1812,11 @@ class _PlaceholderBase:
    Disallows downstream usage of placeholder modules.
    We need to explicitly override each dunder method because
-    :meth:`__getattr__` is not called when they are accessed.
+    {meth}`__getattr__` is not called when they are accessed.
-    See also:
+    :::{seealso}
    [Special method lookup](https://docs.python.org/3/reference/datamodel.html#special-lookup)
+    :::
    """
    def __getattr__(self, key: str) -> Never:
@@ -2052,9 +2045,6 @@ def direct_register_custom_op(
    library object. If you want to bind the operator to a different library,
    make sure the library object is alive when the operator is used.
    """
-    if is_in_doc_build():
-        return
    if not supports_custom_op():
        from vllm.platforms import current_platform
        assert not current_platform.is_cuda_alike(), (

--- a/vllm/v1/attention/backends/mla/common.py
+++ b/vllm/v1/attention/backends/mla/common.py
 # SPDX-License-Identifier: Apache-2.0
 """
+# MLA Common Components
 This file implements common components for MLA implementations.
 First we define:

--- a/vllm/v1/core/kv_cache_manager.py
+++ b/vllm/v1/core/kv_cache_manager.py
@@ -180,6 +180,7 @@ class KVCacheManager:
                as eagle.
        Blocks layout:
+        ```
        -----------------------------------------------------------------------
        | < computed > | < new computed > |    < new >    | < pre-allocated > |
        -----------------------------------------------------------------------
@@ -189,6 +190,7 @@ class KVCacheManager:
        ------------------------------------------------
                                          | <new full> |
                                          --------------
+        ```
        The following *_blocks are illustrated in this layout.
        Returns:

--- a/vllm/v1/engine/output_processor.py
+++ b/vllm/v1/engine/output_processor.py
@@ -308,7 +308,7 @@ class OutputProcessor:
            * If there is no queue (for usage with LLMEngine), 
              return a list of RequestOutput objects.
-        ****************** NOTE FOR DEVELOPERS ******************
+        NOTE FOR DEVELOPERS
        vLLM V1 minimizes the number of python loops over the full
        batch to ensure system overheads are minimized. This is the 
@@ -316,8 +316,6 @@ class OutputProcessor:
        If you need to touch every element of the batch, do it from
        within the loop below.
-        **********************************************************
        """
        request_outputs: list[RequestOutput] = []

--- a/vllm/v1/sample/rejection_sampler.py
+++ b/vllm/v1/sample/rejection_sampler.py
@@ -75,7 +75,7 @@ class RejectionSampler(nn.Module):
                outside of the rejection sampler with the default sampling
                strategy. It allows for more flexibility in the sampling
                process such as top_p, top_k sampling.
-            sampling_metadata (SamplingMetadata):
+            sampling_metadata (vllm.v1.sample.metadata.SamplingMetadata):
                Additional metadata needed for sampling, such as temperature,
                top-k/top-p parameters, or other relevant information.
        Returns:

--- a/vllm/v1/worker/gpu_worker.py
+++ b/vllm/v1/worker/gpu_worker.py
@@ -170,9 +170,10 @@ class Worker(WorkerBase):
        Then, it calculates the free memory that can be used for KV cache in
        bytes.
-        .. tip::
+        :::{tip}
        You may limit the usage of GPU memory
        by adjusting the `gpu_memory_utilization` parameter.
+        :::
        """
        torch.cuda.empty_cache()
        torch.cuda.reset_peak_memory_stats()

--- a/vllm/v1/worker/utils.py
+++ b/vllm/v1/worker/utils.py
@@ -10,7 +10,7 @@ def sanity_check_mm_encoder_outputs(
 ) -> None:
    """
    Perform sanity checks for the result of
-    :meth:`vllm.model_executor.models.SupportsMultiModal.get_multimodal_embeddings`.
+    {meth}`vllm.model_executor.models.SupportsMultiModal.get_multimodal_embeddings`.
    """
    assert isinstance(mm_embeddings, (list, tuple, torch.Tensor)), (
        "Expected multimodal embeddings to be a list/tuple of 2D tensors, "
@@ -39,7 +39,7 @@ def scatter_mm_placeholders(
    Scatter the multimodal embeddings into a contiguous tensor that represents
    the placeholder tokens.
-    :class:`vllm.multimodal.processing.PromptUpdateDetails.is_embed`.
+    {class}`vllm.multimodal.processing.PromptUpdateDetails.is_embed`.
    Args:
        embeds: The multimodal embeddings.
@@ -66,7 +66,7 @@ def gather_mm_placeholders(
    """
    Reconstructs the embeddings from the placeholder tokens.
-    This is the operation of :func:`scatter_mm_placeholders`.
+    This is the inverse operation of {func}`scatter_mm_placeholders`.
    """
    if is_embed is None:
        return placeholders

--- a/vllm/worker/hpu_worker.py
+++ b/vllm/worker/hpu_worker.py
@@ -201,9 +201,10 @@ class HPUWorker(LocalOrDistributedWorkerBase):
        Then, it calculates the maximum possible number of GPU and CPU blocks
        that can be allocated with the remaining free memory.
-        .. tip::
+        :::{tip}
        You may limit the usage of GPU memory
        by adjusting the `gpu_memory_utilization` parameter.
+        :::
        """
        # Profile the memory usage of the model and get the maximum number of
        # cache blocks that can be allocated with the remaining free memory.

--- a/vllm/worker/multi_step_model_runner.py
+++ b/vllm/worker/multi_step_model_runner.py
@@ -734,11 +734,11 @@ def _pythonize_sampler_output(
    cache: Optional[PythonizationCache],
 ) -> None:
    """ This function is only called when the output tensors are ready. 
-    See :class:`ModelOutput`. 
+    See {class}`ModelOutput`. 
    Modifies `output.outputs` and `pinned_sampled_token_buffer` in-place, 
    adding a Pythonized output data structure
-    (:class:`CompletionSequenceGroupOutput`) for each :class:`SequenceGroup`.
+    ({class}`CompletionSequenceGroupOutput`) for each {class}`SequenceGroup`.
    Args:
      model_input

--- a/vllm/worker/worker.py
+++ b/vllm/worker/worker.py
@@ -230,9 +230,10 @@ class Worker(LocalOrDistributedWorkerBase):
        Then, it calculates the maximum possible number of GPU and CPU blocks
        that can be allocated with the remaining free memory.
-        .. tip::
+        :::{tip}
        You may limit the usage of GPU memory
        by adjusting the `gpu_memory_utilization` parameter.
+        :::
        """
        # Profile the memory usage of the model and get the maximum number of
        # cache blocks that can be allocated with the remaining free memory.