Add full API docs and improve the UX of navigating them (#17485)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>

Add full API docs and improve the UX of navigating them (#17485)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
d6484ef3 · Harry Mellor · GitHub · 46fae69c · d6484ef3 · d6484ef3
Unverified Commit d6484ef3 authored May 04, 2025 by Harry Mellor Committed by GitHub May 03, 2025
20 changed files
--- a/docs/source/api/summary.md
+++ b/docs/source/api/summary.md
+# Summary
+
+(configuration)=
+
+## Configuration
+
+API documentation for vLLM's configuration classes.
+
+```{autodoc2-summary}
+    vllm.config.ModelConfig
+    vllm.config.CacheConfig
+    vllm.config.TokenizerPoolConfig
+    vllm.config.LoadConfig
+    vllm.config.ParallelConfig
+    vllm.config.SchedulerConfig
+    vllm.config.DeviceConfig
+    vllm.config.SpeculativeConfig
+    vllm.config.LoRAConfig
+    vllm.config.PromptAdapterConfig
+    vllm.config.MultiModalConfig
+    vllm.config.PoolerConfig
+    vllm.config.DecodingConfig
+    vllm.config.ObservabilityConfig
+    vllm.config.KVTransferConfig
+    vllm.config.CompilationConfig
+    vllm.config.VllmConfig
+```
+
+(offline-inference-api)=
+
+## Offline Inference
+
+LLM Class.
+
+```{autodoc2-summary}
+    vllm.LLM
+```
+
+LLM Inputs.
+
+```{autodoc2-summary}
+    vllm.inputs.PromptType
+    vllm.inputs.TextPrompt
+    vllm.inputs.TokensPrompt
+```
+
+## vLLM Engines
+
+Engine classes for offline and online inference.
+
+```{autodoc2-summary}
+    vllm.LLMEngine
+    vllm.AsyncLLMEngine
+```
+
+## Inference Parameters
+
+Inference parameters for vLLM APIs.
+
+(sampling-params)=
+(pooling-params)=
+
+```{autodoc2-summary}
+    vllm.SamplingParams
+    vllm.PoolingParams
+```
+
+(multi-modality)=
+
+## Multi-Modality
+
+vLLM provides experimental support for multi-modal models through the {mod}`vllm.multimodal` package.
+
+Multi-modal inputs can be passed alongside text and token prompts to [supported models](#supported-mm-models)
+via the `multi_modal_data` field in {class}`vllm.inputs.PromptType`.
+
+Looking to add your own multi-modal model? Please follow the instructions listed [here](#supports-multimodal).
+
+```{autodoc2-summary}
+    vllm.multimodal.MULTIMODAL_REGISTRY
+```
+
+### Inputs
+
+User-facing inputs.
+
+```{autodoc2-summary}
+    vllm.multimodal.inputs.MultiModalDataDict
+```
+
+Internal data structures.
+
+```{autodoc2-summary}
+    vllm.multimodal.inputs.PlaceholderRange
+    vllm.multimodal.inputs.NestedTensors
+    vllm.multimodal.inputs.MultiModalFieldElem
+    vllm.multimodal.inputs.MultiModalFieldConfig
+    vllm.multimodal.inputs.MultiModalKwargsItem
+    vllm.multimodal.inputs.MultiModalKwargs
+    vllm.multimodal.inputs.MultiModalInputs
+```
+
+### Data Parsing
+
+```{autodoc2-summary}
+    vllm.multimodal.parse
+```
+
+### Data Processing
+
+```{autodoc2-summary}
+    vllm.multimodal.processing
+```
+
+### Memory Profiling
+
+```{autodoc2-summary}
+    vllm.multimodal.profiling
+```
+
+### Registry
+
+```{autodoc2-summary}
+    vllm.multimodal.registry
+```
+
+## Model Development
+
+```{autodoc2-summary}
+    vllm.model_executor.models.interfaces_base
+    vllm.model_executor.models.interfaces
+    vllm.model_executor.models.adapters
+```
--- a/docs/source/autodoc2_docstring_parser.py
+++ b/docs/source/autodoc2_docstring_parser.py
+# SPDX-License-Identifier: Apache-2.0
+from docutils import nodes
+from myst_parser.parsers.sphinx_ import MystParser
+from sphinx.ext.napoleon import docstring
+
+
+class NapoleonParser(MystParser):
+
+    def parse(self, input_string: str, document: nodes.document) -> None:
+        # Get the Sphinx configuration
+        config = document.settings.env.config
+
+        parsed_content = str(
+            docstring.GoogleDocstring(
+                str(docstring.NumpyDocstring(input_string, config)),
+                config,
+            ))
+        return super().parse(parsed_content, document)
+
+
+Parser = NapoleonParser
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -13,16 +13,17 @@
 # documentation root, use os.path.abspath to make it absolute, like shown here.

 import datetime
-import inspect
 import logging
 import os
+import re
 import sys
+from pathlib import Path

 import requests
-from sphinx.ext import autodoc

 logger = logging.getLogger(__name__)
-sys.path.append(os.path.abspath("../.."))
+REPO_ROOT = Path(__file__).resolve().parent.parent.parent
+sys.path.append(os.path.abspath(REPO_ROOT))

 # -- Project information -----------------------------------------------------

@@ -40,8 +41,7 @@ extensions = [
    "sphinx.ext.linkcode",
    "sphinx.ext.intersphinx",
    "sphinx_copybutton",
-    "sphinx.ext.autodoc",
-    "sphinx.ext.autosummary",
+    "autodoc2",
    "myst_parser",
    "sphinxarg.ext",
    "sphinx_design",
@@ -49,7 +49,22 @@ extensions = [
 ]
 myst_enable_extensions = [
    "colon_fence",
+    "fieldlist",
 ]
+autodoc2_packages = [
+    {
+        "path": "../../vllm",
+        "exclude_dirs": ["__pycache__", "third_party"],
+    },
+]
+autodoc2_output_dir = "api"
+autodoc2_render_plugin = "myst"
+autodoc2_hidden_objects = ["dunder", "private", "inherited"]
+autodoc2_docstring_parser_regexes = [
+    (".*", "docs.source.autodoc2_docstring_parser"),
+]
+autodoc2_sort_names = True
+autodoc2_index_template = None

 # Add any paths that contain templates here, relative to this directory.
 templates_path = ['_templates']
@@ -77,6 +92,11 @@ html_theme_options = {
    'repository_url': 'https://github.com/vllm-project/vllm',
    'use_repository_button': True,
    'use_edit_page_button': True,
+    # Prevents the full API being added to the left sidebar of every page.
+    # Reduces build time by 2.5x and reduces build size from ~225MB to ~95MB.
+    'collapse_navbar': True,
+    # Makes API visible in the right sidebar on API reference pages.
+    'show_toc_level': 3,
 }
 # Add any paths that contain custom static files (such as style sheets) here,
 # relative to this directory. They are copied after the builtin static files,
@@ -164,73 +184,64 @@ def linkcode_resolve(domain, info):
        return None
    if not info['module']:
        return None
-    filename = info['module'].replace('.', '/')
-    module = info['module']
-
-    # try to determine the correct file and line number to link to
-    obj = sys.modules[module]
-
-    # get as specific as we can
-    lineno: int = 0
-    filename: str = ""
-    try:
-        for part in info['fullname'].split('.'):
-            obj = getattr(obj, part)
-
-            # Skip decorator wrappers by checking if the object is a function
-            # and has a __wrapped__ attribute (which decorators typically set)
-            while hasattr(obj, '__wrapped__'):
-                obj = obj.__wrapped__
-
-            if not (inspect.isclass(obj) or inspect.isfunction(obj)
-                    or inspect.ismethod(obj)):
-                obj = obj.__class__  # Get the class of the instance
-
-            lineno = inspect.getsourcelines(obj)[1]
-            filename = (inspect.getsourcefile(obj)
-                        or f"{filename}.py").split("vllm/", 1)[1]
-    except Exception:
-        # For some things, like a class member, won't work, so
-        # we'll use the line number of the parent (the class)
-        pass
-
-    if filename.startswith("checkouts/"):
+
+    # Get path from module name
+    file = Path(f"{info['module'].replace('.', '/')}.py")
+    path = REPO_ROOT / file
+    if not path.exists():
+        path = REPO_ROOT / file.with_suffix("") / "__init__.py"
+    if not path.exists():
+        return None
+
+    # Get the line number of the object
+    with open(path) as f:
+        lines = f.readlines()
+    name = info['fullname'].split(".")[-1]
+    pattern = fr"^( {{4}})*((def|class) )?{name}\b.*"
+    for lineno, line in enumerate(lines, 1):
+        if not line or line.startswith("#"):
+            continue
+        if re.match(pattern, line):
+            break
+
+    # If the line number is not found, return None
+    if lineno == len(lines):
+        return None
+
+    # If the line number is found, create the URL
+    filename = path.relative_to(REPO_ROOT)
+    if "checkouts" in path.parts:
        # a PR build on readthedocs
-        pr_number = filename.split("/")[1]
-        filename = filename.split("/", 2)[2]
+        pr_number = REPO_ROOT.name
        base, branch = get_repo_base_and_branch(pr_number)
        if base and branch:
            return f"https://github.com/{base}/blob/{branch}/{filename}#L{lineno}"
-
    # Otherwise, link to the source file on the main branch
    return f"https://github.com/vllm-project/vllm/blob/main/{filename}#L{lineno}"


-# Mock out external dependencies here, otherwise the autodoc pages may be blank.
+# Mock out external dependencies here, otherwise sphinx-argparse won't work.
 autodoc_mock_imports = [
+    "huggingface_hub",
+    "pydantic",
+    "zmq",
+    "cloudpickle",
+    "aiohttp",
+    "starlette",
    "blake3",
-    "compressed_tensors",
    "cpuinfo",
-    "cv2",
-    "torch",
    "transformers",
    "psutil",
-    "prometheus_client",
-    "sentencepiece",
    "vllm._C",
    "PIL",
    "numpy",
-    'triton',
    "tqdm",
-    "tensorizer",
-    "pynvml",
-    "outlines",
-    "xgrammar",
-    "librosa",
-    "soundfile",
-    "gguf",
-    "lark",
-    "decord",
+    # The mocks below are required by
+    # docs/source/serving/openai_compatible_server.md's
+    # vllm.entrypoints.openai.cli_args
+    "openai",
+    "fastapi",
+    "partial_json_parser",
 ]

 for mock_target in autodoc_mock_imports:
@@ -241,18 +252,6 @@ for mock_target in autodoc_mock_imports:
            "been loaded into sys.modules when the sphinx build starts.",
            mock_target)

-
-class MockedClassDocumenter(autodoc.ClassDocumenter):
-    """Remove note about base class when a class is derived from object."""
-
-    def add_line(self, line: str, source: str, *lineno: int) -> None:
-        if line == "   Bases: :py:class:`object`":
-            return
-        super().add_line(line, source, *lineno)
-
-
-autodoc.ClassDocumenter = MockedClassDocumenter
-
 intersphinx_mapping = {
    "python": ("https://docs.python.org/3", None),
    "typing_extensions":
@@ -264,7 +263,4 @@ intersphinx_mapping = {
    "psutil": ("https://psutil.readthedocs.io/en/stable", None),
 }

-autodoc_preserve_defaults = True
-autodoc_warningiserror = True
-
 navigation_with_keys = False
--- a/docs/source/design/arch_overview.md
+++ b/docs/source/design/arch_overview.md
@@ -52,8 +52,8 @@ for output in outputs:
    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
 ```

-More API details can be found in the {doc}`Offline Inference
-</api/offline_inference/index>` section of the API docs.
+More API details can be found in the [Offline Inference]
+(#offline-inference-api) section of the API docs.

 The code for the `LLM` class can be found in <gh-file:vllm/entrypoints/llm.py>.


--- a/docs/source/features/compatibility_matrix.md
+++ b/docs/source/features/compatibility_matrix.md
@@ -42,7 +42,7 @@ Check the ❌ or 🟠 with links to see tracking issue for unsupported feature/h
  * [APC](#automatic-prefix-caching)
  * [LoRA](#lora-adapter)
  * <abbr title="Prompt Adapter">prmpt adptr</abbr>
-  * [SD](#spec_decode)
+  * [SD](#spec-decode)
  * CUDA graph
  * <abbr title="Pooling Models">pooling</abbr>
  * <abbr title="Encoder-Decoder Models">enc-dec</abbr>
@@ -122,7 +122,7 @@ Check the ❌ or 🟠 with links to see tracking issue for unsupported feature/h
  *
  *
  *
- * [SD](#spec_decode)
+- * [SD](#spec-decode)
  * ✅
  * ✅
  * ❌
@@ -377,7 +377,7 @@ Check the ❌ or 🟠 with links to see tracking issue for unsupported feature/h
  * ✅
  * [❌](gh-issue:8475)
  * ✅
- * [SD](#spec_decode)
+- * [SD](#spec-decode)
  * ✅
  * ✅
  * ✅

--- a/docs/source/index.md
+++ b/docs/source/index.md
@@ -194,11 +194,8 @@ contributing/vulnerability_management
 :caption: API Reference
 :maxdepth: 2

-api/offline_inference/index
-api/engine/index
-api/inference_params
-api/multimodal/index
-api/model/index
+api/summary
+api/vllm/vllm
 :::

 % Latest news and acknowledgements

--- a/docs/source/models/generative_models.md
+++ b/docs/source/models/generative_models.md
@@ -14,7 +14,7 @@ Usually, this is automatically inferred so you don't have to specify it.
 ## Offline Inference

 The {class}`~vllm.LLM` class provides various methods for offline inference.
-See [Engine Arguments](#engine-args) for a list of options when initializing the model.
+See <project:#configuration> for a list of options when initializing the model.

 ### `LLM.generate`


--- a/docs/source/models/pooling_models.md
+++ b/docs/source/models/pooling_models.md
@@ -60,7 +60,7 @@ which takes priority over both the model's and Sentence Transformers's defaults.
 ## Offline Inference

 The {class}`~vllm.LLM` class provides various methods for offline inference.
-See [Engine Arguments](#engine-args) for a list of options when initializing the model.
+See <project:#configuration> for a list of options when initializing the model.

 ### `LLM.encode`


--- a/docs/source/serving/offline_inference.md
+++ b/docs/source/serving/offline_inference.md
@@ -25,7 +25,7 @@ The available APIs depend on the type of model that is being run:
 Please refer to the above pages for more details about each API.

 :::{seealso}
-[API Reference](/api/offline_inference/index)
+[API Reference](#offline-inference-api)
 :::

 (configuration-options)=
@@ -33,7 +33,7 @@ Please refer to the above pages for more details about each API.
 ## Configuration Options

 This section lists the most common options for running the vLLM engine.
-For a full list, refer to the [Engine Arguments](#engine-args) page.
+For a full list, refer to the <project:#configuration> page.

 (model-resolution)=


--- a/examples/offline_inference/profiling.py
+++ b/examples/offline_inference/profiling.py
@@ -14,7 +14,7 @@ import tqdm

 from vllm import LLM, SamplingParams
 from vllm.engine.arg_utils import EngineArgs
-from vllm.profiler import layerwise_profile
+from vllm.profiler.layerwise_profile import layerwise_profile
 from vllm.utils import FlexibleArgumentParser

 BATCH_SIZE_DEFAULT = 1

--- a/requirements/docs.txt
+++ b/requirements/docs.txt
 sphinx==8.2.3
 sphinx-argparse==0.5.2
+sphinx-autodoc2==0.5.0
 sphinx-book-theme==1.1.4
 sphinx-copybutton==0.5.2
 sphinx-design==0.6.1
 sphinx-togglebutton==0.3.2
 myst-parser==4.0.1
 msgspec
-cloudpickle
 commonmark # Required by sphinx-argparse when using :markdownhelp:

 # packages to install to build the documentation
 cachetools
-pydantic >= 2.8
 -f https://download.pytorch.org/whl/cpu
 torch
\ No newline at end of file
-py-cpuinfo
-transformers
-mistral_common >= 1.5.4
-aiohttp
-starlette
-scipy
-openai # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args
-fastapi # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args
-partial-json-parser # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args
-requests
-zmq
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -112,11 +112,11 @@ class AudioTestAssets(list[AudioAsset]):


 IMAGE_ASSETS = ImageTestAssets()
-"""Singleton instance of :class:`ImageTestAssets`."""
+"""Singleton instance of {class}`ImageTestAssets`."""
 VIDEO_ASSETS = VideoTestAssets()
-"""Singleton instance of :class:`VideoTestAssets`."""
+"""Singleton instance of {class}`VideoTestAssets`."""
 AUDIO_ASSETS = AudioTestAssets()
-"""Singleton instance of :class:`AudioTestAssets`."""
+"""Singleton instance of {class}`AudioTestAssets`."""


 @pytest.fixture(scope="function", autouse=True)
@@ -724,7 +724,7 @@ def hf_runner():
 class VllmRunner:
    """
    The default value of some arguments have been modified from
-    :class:`~vllm.LLM` as follows:
+    {class}`~vllm.LLM` as follows:

    - `trust_remote_code`: Set to `True` instead of `False` for convenience.
    - `seed`: Set to `0` instead of `None` for test reproducibility.

--- a/tests/tokenization/test_get_eos.py
+++ b/tests/tokenization/test_get_eos.py
@@ -2,7 +2,7 @@
 """
 This test file includes some cases where it is inappropriate to
 only get the `eos_token_id` from the tokenizer as defined by
-:meth:`vllm.LLMEngine._get_eos_token_id`.
+{meth}`vllm.LLMEngine._get_eos_token_id`.
 """
 from vllm.transformers_utils.config import try_get_generation_config
 from vllm.transformers_utils.tokenizer import get_tokenizer

--- a/tests/utils.py
+++ b/tests/utils.py
@@ -952,7 +952,7 @@ def get_client_text_logprob_generations(
        completions: list[Completion]) -> list[TextTextLogprobs]:
    '''Operates on the output of a request made to an Open-AI-protocol
    completions endpoint; obtains top-rank logprobs for each token in
-    each :class:`SequenceGroup`
+    each {class}`SequenceGroup`
    '''
    text_generations = get_client_text_generations(completions)
    text = ''.join(text_generations)

--- a/tests/v1/core/test_scheduler.py
+++ b/tests/v1/core/test_scheduler.py
@@ -44,7 +44,7 @@ def create_scheduler(
                             (None)

    Returns:
-      :class:`Scheduler` instance
+      {class}`Scheduler` instance
    '''
    if max_model_len is None:
        max_model_len = max_num_batched_tokens

--- a/vllm/attention/backends/mla/common.py
+++ b/vllm/attention/backends/mla/common.py
 # SPDX-License-Identifier: Apache-2.0
 """
+# MLA Common Components
+
 This file implements common components for MLA implementations.

 First we define:

--- a/vllm/attention/backends/utils.py
+++ b/vllm/attention/backends/utils.py
@@ -550,7 +550,7 @@ def get_num_prefill_decode_query_kv_tokens(
    based on the attention metadata and the specified attention type.

    Args:
-        attn_metadata (FlashAttentionMetadata): Attention Metadata object.
+        attn_metadata (AttentionMetadata): Attention Metadata object.
        attn_type (AttentionType): The type of attention being used.
    Returns:
        Tuple[int, int, int]: A tuple containing three integers:

--- a/vllm/compilation/compiler_interface.py
+++ b/vllm/compilation/compiler_interface.py
@@ -39,7 +39,7 @@ class CompilerInterface:
        Gather all the relevant information from the vLLM config,
        to compute a hash so that we can cache the compiled model.

-        See :meth:`VllmConfig.compute_hash` to check what information
+        See {meth}`VllmConfig.compute_hash` to check what information
        is already considered by default. This function should only
        consider the information that is specific to the compiler.
        """

--- a/vllm/config.py
+++ b/vllm/config.py
@@ -1911,10 +1911,10 @@ class SchedulerConfig:

    cuda_graph_sizes: list[int] = field(default_factory=lambda: [512])
    """Cuda graph capture sizes, default is 512.
-    1. if one value is provided, then the capture list would follow the pattern:
-        [1, 2, 4] + [i for i in range(8, cuda_graph_sizes + 1, 8)]
-    2. more than one value (e.g. 1 2 128) is provided,
-        then the capture list will follow the provided list."""
+    1. if one value is provided, then the capture list would follow the
+    pattern: [1, 2, 4] + [i for i in range(8, cuda_graph_sizes + 1, 8)]
+    2. more than one value (e.g. 1 2 128) is provided, then the capture list
+    will follow the provided list."""

    delay_factor: float = 0.0
    """Apply a delay (of delay factor multiplied by previous
@@ -2888,7 +2888,7 @@ class PoolerConfig:
    pooling_type: Optional[str] = None
    """
    The pooling method of the pooling model. This should be a key in
-    :class:`vllm.model_executor.layers.pooler.PoolingType`.
+    {class}`vllm.model_executor.layers.pooler.PoolingType`.
    """

    normalize: Optional[bool] = None

--- a/vllm/connections.py
+++ b/vllm/connections.py
@@ -167,4 +167,4 @@ class HTTPConnection:


 global_http_connection = HTTPConnection()
-"""The global :class:`HTTPConnection` instance used by vLLM."""
+"""The global {class}`HTTPConnection` instance used by vLLM."""