[Doc] Convert Sphinx directives ( `{class}`, `{meth}`, `{attr}`, ...) to...

[Doc] Convert Sphinx directives ( `{class}`, `{meth}`, `{attr}`, ...) to MkDocs format for better documentation linking (#18663) Signed-off-by: Zerohertz <ohg3417@gmail.com>

[Doc] Convert Sphinx directives ( `{class}`, `{meth}`, `{attr}`, ...) to...
[Doc] Convert Sphinx directives ( `{class}`, `{meth}`, `{attr}`, ...) to MkDocs format for better documentation linking (#18663) Signed-off-by: Zerohertz <ohg3417@gmail.com>
a68e293c · Hyogeun Oh (오효근) · GitHub · 68811079 · a68e293c · a68e293c
Unverified Commit a68e293c authored May 27, 2025 by Hyogeun Oh (오효근) Committed by GitHub May 27, 2025
20 changed files
--- a/vllm/compilation/compiler_interface.py
+++ b/vllm/compilation/compiler_interface.py
@@ -39,7 +39,8 @@ class CompilerInterface:
        Gather all the relevant information from the vLLM config,
        to compute a hash so that we can cache the compiled model.
-        See {meth}`VllmConfig.compute_hash` to check what information
+        See [`VllmConfig.compute_hash`][vllm.config.VllmConfig.compute_hash]
+        to check what information
        is already considered by default. This function should only
        consider the information that is specific to the compiler.
        """

--- a/vllm/config.py
+++ b/vllm/config.py
@@ -2986,7 +2986,7 @@ class PoolerConfig:
    pooling_type: Optional[str] = None
    """
    The pooling method of the pooling model. This should be a key in
-    {class}`vllm.model_executor.layers.pooler.PoolingType`.
+    [`vllm.model_executor.layers.pooler.PoolingType`][].
    """
    normalize: Optional[bool] = None
@@ -3697,23 +3697,27 @@ class CompilationConfig:
    """Configuration for compilation. It has three parts:
    - Top-level Compilation control:
-        - {attr}`level`
+        - [`level`][vllm.config.CompilationConfig.level]
-        - {attr}`debug_dump_path`
+        - [`debug_dump_path`][vllm.config.CompilationConfig.debug_dump_path]
-        - {attr}`cache_dir`
+        - [`cache_dir`][vllm.config.CompilationConfig.cache_dir]
-        - {attr}`backend`
+        - [`backend`][vllm.config.CompilationConfig.backend]
-        - {attr}`custom_ops`
+        - [`custom_ops`][vllm.config.CompilationConfig.custom_ops]
-        - {attr}`splitting_ops`
+        - [`splitting_ops`][vllm.config.CompilationConfig.splitting_ops]
    - CudaGraph capture:
-        - {attr}`use_cudagraph`
+        - [`use_cudagraph`][vllm.config.CompilationConfig.use_cudagraph]
-        - {attr}`cudagraph_capture_sizes`
+        - [`cudagraph_capture_sizes`]
-        - {attr}`cudagraph_num_of_warmups`
+        [vllm.config.CompilationConfig.cudagraph_capture_sizes]
-        - {attr}`cudagraph_copy_inputs`
+        - [`cudagraph_num_of_warmups`]
-        - {attr}`full_cuda_graph`
+        [vllm.config.CompilationConfig.cudagraph_num_of_warmups]
+        - [`cudagraph_copy_inputs`]
+        [vllm.config.CompilationConfig.cudagraph_copy_inputs]
+        - [`full_cuda_graph`][vllm.config.CompilationConfig.full_cuda_graph]
    - Inductor compilation:
-        - {attr}`use_inductor`
+        - [`use_inductor`][vllm.config.CompilationConfig.use_inductor]
-        - {attr}`compile_sizes`
+        - [`compile_sizes`][vllm.config.CompilationConfig.compile_sizes]
-        - {attr}`inductor_compile_config`
+        - [`inductor_compile_config`]
-        - {attr}`inductor_passes`
+        [vllm.config.CompilationConfig.inductor_compile_config]
+        - [`inductor_passes`][vllm.config.CompilationConfig.inductor_passes]
        - custom inductor passes
    Why we have different sizes for cudagraph and inductor:

--- a/vllm/connections.py
+++ b/vllm/connections.py
@@ -167,4 +167,7 @@ class HTTPConnection:
 global_http_connection = HTTPConnection()
-"""The global {class}`HTTPConnection` instance used by vLLM."""
+"""
+The global [`HTTPConnection`][vllm.connections.HTTPConnection] instance used
+by vLLM.
+"""
--- a/vllm/engine/async_llm_engine.py
+++ b/vllm/engine/async_llm_engine.py
@@ -475,7 +475,8 @@ class _AsyncLLMEngine(LLMEngine):
            *,
            inputs: Optional[PromptType] = None,  # DEPRECATED
    ) -> None:
-        """Async version of {meth}`add_request`."""
+        """Async version of
+        [`add_request`][vllm.engine.llm_engine.LLMEngine.add_request]."""
        if inputs is not None:
            prompt = inputs
        assert prompt is not None and params is not None
@@ -582,20 +583,21 @@ async def build_guided_decoding_logits_processor_async(
 class AsyncLLMEngine(EngineClient):
-    """An asynchronous wrapper for {class}`LLMEngine`.
+    """An asynchronous wrapper for [`LLMEngine`][vllm.LLMEngine].
-    This class is used to wrap the {class}`LLMEngine` class to make it
+    This class is used to wrap the [`LLMEngine`][vllm.LLMEngine] class to
-    asynchronous. It uses asyncio to create a background loop that keeps
+    make it asynchronous. It uses asyncio to create a background loop that keeps
-    processing incoming requests. The {class}`LLMEngine` is kicked by the
+    processing incoming requests. The [`LLMEngine`][vllm.LLMEngine] is kicked
-    generate method when there are requests in the waiting queue. The generate
+    by the generate method when there are requests in the waiting queue. The
-    method yields the outputs from the {class}`LLMEngine` to the caller.
+    generate method yields the outputs from the [`LLMEngine`][vllm.LLMEngine]
+    to the caller.
    Args:
        log_requests: Whether to log the requests.
        start_engine_loop: If True, the background task to run the engine
            will be automatically started in the generate call.
-        *args: Arguments for {class}`LLMEngine`.
+        *args: Arguments for [`LLMEngine`][vllm.LLMEngine].
-        **kwargs: Arguments for {class}`LLMEngine`.
+        **kwargs: Arguments for [`LLMEngine`][vllm.LLMEngine].
    """
    _engine_class: Type[_AsyncLLMEngine] = _AsyncLLMEngine
@@ -985,8 +987,9 @@ class AsyncLLMEngine(EngineClient):
        from the LLMEngine to the caller.
        Args:
-            prompt: The prompt to the LLM. See {class}`~vllm.inputs.PromptType`
+            prompt: The prompt to the LLM. See
-                for more details about the format of each input.
+                [`PromptType`][vllm.inputs.PromptType] for more details about
+                the format of each input.
            sampling_params: The sampling parameters of the request.
            request_id: The unique id of the request.
            lora_request: LoRA request to use for generation, if any.
@@ -1003,7 +1006,7 @@ class AsyncLLMEngine(EngineClient):
        Details:
            - If the engine is not running, start the background loop,
              which iteratively invokes
-              {meth}`~vllm.engine.async_llm_engine.AsyncLLMEngine.engine_step`
+              [`engine_step`][vllm.engine.async_llm_engine.AsyncLLMEngine.engine_step]
              to process the waiting requests.
            - Add the request to the engine's `RequestTracker`.
              On the next background loop, this request will be sent to
@@ -1075,8 +1078,9 @@ class AsyncLLMEngine(EngineClient):
        from the LLMEngine to the caller.
        Args:
-            prompt: The prompt to the LLM. See {class}`~vllm.inputs.PromptType`
+            prompt: The prompt to the LLM. See
-                for more details about the format of each input.
+                [`PromptType`][vllm.inputs.PromptType] for more details about
+                the format of each input.
            pooling_params: The pooling parameters of the request.
            request_id: The unique id of the request.
            lora_request: LoRA request to use for generation, if any.
@@ -1089,15 +1093,15 @@ class AsyncLLMEngine(EngineClient):
            for the request.
        Details:
-        - If the engine is not running, start the background loop,
+            - If the engine is not running, start the background loop,
-            which iteratively invokes
+                which iteratively invokes
-            {meth}`~vllm.engine.async_llm_engine.AsyncLLMEngine.engine_step`
+                [`vllm.engine.async_llm_engine.AsyncLLMEngine.engine_step`][]
-            to process the waiting requests.
+                to process the waiting requests.
-        - Add the request to the engine's `RequestTracker`.
+            - Add the request to the engine's `RequestTracker`.
-            On the next background loop, this request will be sent to
+                On the next background loop, this request will be sent to
-            the underlying engine.
+                the underlying engine.
-            Also, a corresponding `AsyncStream` will be created.
+                Also, a corresponding `AsyncStream` will be created.
-        - Wait for the request outputs from `AsyncStream` and yield them.
+            - Wait for the request outputs from `AsyncStream` and yield them.
        Example:
        ```

--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -130,11 +130,11 @@ class LLMEngine:
    iteration-level scheduling and efficient memory management to maximize the
    serving throughput.
-    The [LLM][vllm.LLM] class wraps this class for offline batched inference
+    The [`LLM`][vllm.LLM] class wraps this class for offline batched inference
-    and the [AsyncLLMEngine][] class wraps this class for online serving.
+    and the [`AsyncLLMEngine`][vllm.engine.async_llm_engine.AsyncLLMEngine]
+    class wraps this class for online serving.
-    The config arguments are derived from [EngineArgs][vllm.EngineArgs]. (See
+    The config arguments are derived from [`EngineArgs`][vllm.EngineArgs].
-    [engine-args][])
    Args:
        vllm_config: The configuration for initializing and running vLLM.

--- a/vllm/engine/multiprocessing/client.py
+++ b/vllm/engine/multiprocessing/client.py
@@ -492,8 +492,9 @@ class MQLLMEngineClient(EngineClient):
        from the LLMEngine to the caller.
        Args:
-            prompt: The prompt to the LLM. See {class}`~vllm.inputs.PromptType`
+            prompt: The prompt to the LLM. See
-                for more details about the format of each input.
+                [`PromptType`][vllm.inputs.PromptType] for more details about
+                the format of each input.
            sampling_params: The sampling parameters of the request.
            request_id: The unique id of the request.
            lora_request: LoRA request to use for generation, if any.
@@ -561,8 +562,9 @@ class MQLLMEngineClient(EngineClient):
        from the LLMEngine to the caller.
        Args:
-            prompt: The prompt to the LLM. See {class}`~vllm.inputs.PromptType`
+            prompt: The prompt to the LLM. See
-                for more details about the format of each input.
+                [`PromptType`][vllm.inputs.PromptType] for more details about
+                the format of each input.
            pooling_params: The pooling parameters of the request.
            request_id: The unique id of the request.
            lora_request: LoRA request to use for generation, if any.

--- a/vllm/engine/multiprocessing/engine.py
+++ b/vllm/engine/multiprocessing/engine.py
@@ -42,19 +42,22 @@ HEALTHY_RESPONSE = (pickle.dumps(VLLM_RPC_SUCCESS_STR), )
 class MQLLMEngine:
-    """A multiprocessing wrapper for {class}`LLMEngine`.
+    """A multiprocessing wrapper for
+    [`LLMEngine`][vllm.engine.llm_engine.LLMEngine].
-    This class is used to wrap the {class}`LLMEngine` class to enable use
+    This class is used to wrap the
+    [`LLMEngine`][vllm.engine.llm_engine.LLMEngine] class to enable use
    in concurrent manner. It runs a background loop and uses zeromq to
    receive new requests and stream outputs incrementally via ipc.
-    The {class}`LLMEngine` generate or encode process is kicked off when a new
+    The [`LLMEngine`][vllm.engine.llm_engine.LLMEngine] generate or encode
-    RPCProcessRequest is received by the input_socket.
+    process is kicked off when a new RPCProcessRequest is received by the
+    input_socket.
    The self.engine_loop checks the input_socket for new requests,
    adds them to the LLMEngine if there are any, calls the internal
-    {class}`LLMEngine.step()`, and sends the RequestOutputs back over
+    [`LLMEngine.step()`][vllm.engine.llm_engine.LLMEngine.step], and sends
-    the output_socket.
+    the RequestOutputs back over the output_socket.
    If use_async_sockets is set, the logic associated with reading new
    requests from the socket and sending data to the socket is passed
@@ -65,8 +68,8 @@ class MQLLMEngine:
        ipc_path: Base path for zeromq interprocess messaging
        use_async_sockets: Whether to make send/recv async with GPU
        log_requests: Whether to log the requests.
-        *args: Arguments for {class}`LLMEngine`.
+        *args: Arguments for [`LLMEngine`][vllm.engine.llm_engine.LLMEngine].
-        **kwargs: Arguments for {class}`LLMEngine`.
+        **kwargs: Arguments for [`LLMEngine`][vllm.engine.llm_engine.LLMEngine].
    """
    def __init__(self,

--- a/vllm/engine/output_processor/multi_step.py
+++ b/vllm/engine/output_processor/multi_step.py
@@ -56,8 +56,11 @@ class MultiStepOutputProcessor(SequenceGroupOutputProcessor):
        scheduled computation.
        Args:
-          seq_group: the outputs are associated with this {class}`SequenceGroup`
+          seq_group: the outputs are associated with this
-          outputs: the {class}`SequenceGroupOutput`s for all scheduler steps
+              [`SequenceGroup`][vllm.sequence.SequenceGroup]
+          outputs: the
+              [`SequenceGroupOutput`][vllm.sequence.SequenceGroupOutput]s
+              for all scheduler steps
        """
        for output in outputs:
            # Concatenate single-step prompt logprob processing results.

--- a/vllm/engine/output_processor/single_step.py
+++ b/vllm/engine/output_processor/single_step.py
@@ -19,17 +19,21 @@ logger = init_logger(__name__)
 def single_step_process_prompt_logprob(
        sg_output_proc: SequenceGroupOutputProcessor, seq_group: SequenceGroup,
        output: CompletionSequenceGroupOutput) -> None:
-    """Process prompt logprobs associated with the {class}`SequenceGroupOutput`
+    """Process prompt logprobs associated with the
-    for a given step.
+    [`SequenceGroupOutput`][vllm.sequence.SequenceGroupOutput] for a given step.
    Do nothing if the output has no prompt logprobs.
    Account for the fact that transformers do not compute first-token logprobs.
    Args:
-      sg_output_proc: {class}`SequenceGroupOutputProcessor` instance
+      sg_output_proc:
-      seq_group: the output is associated with this {class}`SequenceGroup`
+          [`SequenceGroupOutputProcessor`][vllm.engine.output_processor.interfaces.SequenceGroupOutputProcessor]
-      output: the {class}`SequenceGroupOutput` for a single scheduler step
+          instance
+      seq_group: the output is associated with this
+          [`SequenceGroup`][vllm.sequence.SequenceGroup]
+      output: the [`SequenceGroupOutput`][vllm.sequence.SequenceGroupOutput]
+          for a single scheduler step
    """
    prompt_logprobs = output.prompt_logprobs
@@ -103,8 +107,11 @@ class SingleStepOutputProcessor(SequenceGroupOutputProcessor):
        scheduled computation.
        Args:
-          seq_group: the output is associated with this {class}`SequenceGroup`
+          seq_group: the output is associated with this
-          outputs: the {class}`SequenceGroupOutput` for a single scheduler step
+              [`SequenceGroup`][vllm.sequence.SequenceGroup]
+          outputs: the
+              [`SequenceGroupOutput`][vllm.sequence.SequenceGroupOutput]
+              for a single scheduler step
        """
        assert len(outputs) == 1, "Single step should only have 1 output."
        output = outputs[0]

--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@@ -129,8 +129,7 @@ class LLM:
        compilation_config: Either an integer or a dictionary. If it is an
            integer, it is used as the level of compilation optimization. If it
            is a dictionary, it can specify the full compilation configuration.
-        **kwargs: Arguments for [EngineArgs][vllm.EngineArgs]. (See
+        **kwargs: Arguments for [`EngineArgs`][vllm.EngineArgs].
-            [engine-args][])
    Note:
        This class is intended to be used for offline inference. For online
@@ -494,7 +493,7 @@ class LLM:
                `self` argument, in addition to the arguments passed in `args`
                and `kwargs`. The `self` argument will be the worker object.
            timeout: Maximum time in seconds to wait for execution. Raises a
-                {exc}`TimeoutError` on timeout. `None` means wait indefinitely.
+                [`TimeoutError`][] on timeout. `None` means wait indefinitely.
            args: Positional arguments to pass to the worker method.
            kwargs: Keyword arguments to pass to the worker method.

--- a/vllm/entrypoints/openai/serving_engine.py
+++ b/vllm/entrypoints/openai/serving_engine.py
@@ -582,7 +582,8 @@ class OpenAIServing:
        add_special_tokens: bool = True,
    ) -> TextTokensPrompt:
        """
-        A simpler implementation of {meth}`_tokenize_prompt_input_or_inputs`
+        A simpler implementation of
+        [`_tokenize_prompt_input_or_inputs`][vllm.entrypoints.openai.serving_engine.OpenAIServing._tokenize_prompt_input_or_inputs]
        that assumes single input.
        """
        return next(
@@ -603,7 +604,8 @@ class OpenAIServing:
        add_special_tokens: bool = True,
    ) -> Iterator[TextTokensPrompt]:
        """
-        A simpler implementation of {meth}`_tokenize_prompt_input_or_inputs`
+        A simpler implementation of
+        [`_tokenize_prompt_input_or_inputs`][vllm.entrypoints.openai.serving_engine.OpenAIServing._tokenize_prompt_input_or_inputs]
        that assumes multiple inputs.
        """
        for text in prompt_inputs:

--- a/vllm/executor/executor_base.py
+++ b/vllm/executor/executor_base.py
@@ -74,7 +74,7 @@ class ExecutorBase(ABC):
                `self` argument, in addition to the arguments passed in `args`
                and `kwargs`. The `self` argument will be the worker object.
            timeout: Maximum time in seconds to wait for execution. Raises a
-                {exc}`TimeoutError` on timeout. `None` means wait indefinitely.
+                [`TimeoutError`][] on timeout. `None` means wait indefinitely.
            args: Positional arguments to pass to the worker method.
            kwargs: Keyword arguments to pass to the worker method.

--- a/vllm/inputs/__init__.py
+++ b/vllm/inputs/__init__.py
@@ -10,8 +10,9 @@ from .registry import (DummyData, InputContext, InputProcessingContext,
 INPUT_REGISTRY = InputRegistry()
 """
-The global {class}`~InputRegistry` which is used by {class}`~vllm.LLMEngine`
+The global [`InputRegistry`][vllm.inputs.registry.InputRegistry] which is used
-to dispatch data processing according to the target model.
+by [`LLMEngine`][vllm.LLMEngine] to dispatch data processing according to the
+target model.
 """
 __all__ = [

--- a/vllm/inputs/data.py
+++ b/vllm/inputs/data.py
@@ -80,22 +80,24 @@ SingletonPrompt = Union[str, TextPrompt, TokensPrompt, EmbedsPrompt]
 """
 Set of possible schemas for a single prompt:
- A text prompt ({class}`str` or {class}`TextPrompt`)
+- A text prompt ([`str`][] or [`TextPrompt`][vllm.inputs.data.TextPrompt])
- A tokenized prompt ({class}`TokensPrompt`)
+- A tokenized prompt ([`TokensPrompt`][vllm.inputs.data.TokensPrompt])
- An embeddings prompt ({class}`EmbedsPrompt`)
+- An embeddings prompt ([`EmbedsPrompt`][vllm.inputs.data.EmbedsPrompt])
 Note that "singleton" is as opposed to a data structure
 which encapsulates multiple prompts, i.e. of the sort
 which may be utilized for encoder/decoder models when
 the user desires to express both the encoder & decoder
-prompts explicitly, i.e. {class}`ExplicitEncoderDecoderPrompt`
+prompts explicitly, i.e. 
+[`ExplicitEncoderDecoderPrompt`][vllm.inputs.data.ExplicitEncoderDecoderPrompt]
-A prompt of type {class}`SingletonPrompt` may be employed
+A prompt of type [`SingletonPrompt`][vllm.inputs.data.SingletonPrompt] may be 
-as (1) input to a decoder-only model, (2) input to
+employed as (1) input to a decoder-only model, (2) input to
 the encoder of an encoder/decoder model, in the scenario
 where the decoder-prompt is not specified explicitly, or
 (3) as a member of a larger data structure encapsulating
-more than one prompt, i.e. {class}`ExplicitEncoderDecoderPrompt`
+more than one prompt, i.e. 
+[`ExplicitEncoderDecoderPrompt`][vllm.inputs.data.ExplicitEncoderDecoderPrompt]
 """
@@ -126,18 +128,20 @@ class ExplicitEncoderDecoderPrompt(TypedDict, Generic[_T1_co, _T2_co]):
    comprising an explicit encoder prompt and a decoder prompt.
    The encoder and decoder prompts, respectively, may be formatted
-    according to any of the {class}`SingletonPrompt` schemas,
+    according to any of the
+    [`SingletonPrompt`][vllm.inputs.data.SingletonPrompt] schemas,
    and are not required to have the same schema.
    Only the encoder prompt may have multi-modal data. mm_processor_kwargs
    should be at the top-level, and should not be set in the encoder/decoder
    prompts, since they are agnostic to the encoder/decoder.
-    Note that an {class}`ExplicitEncoderDecoderPrompt` may not
+    Note that an
-    be used as an input to a decoder-only model,
+    [`ExplicitEncoderDecoderPrompt`][vllm.inputs.data.ExplicitEncoderDecoderPrompt]
+    may not be used as an input to a decoder-only model,
    and that the `encoder_prompt` and `decoder_prompt`
    fields of this data structure themselves must be
-    {class}`SingletonPrompt` instances.
+    [`SingletonPrompt`][vllm.inputs.data.SingletonPrompt] instances.
    """
    encoder_prompt: _T1_co
@@ -152,11 +156,11 @@ PromptType = Union[SingletonPrompt, ExplicitEncoderDecoderPrompt]
 Set of possible schemas for an LLM input, including
 both decoder-only and encoder/decoder input types:
- A text prompt ({class}`str` or {class}`TextPrompt`)
+- A text prompt ([`str`][] or [`TextPrompt`][vllm.inputs.data.TextPrompt])
- A tokenized prompt ({class}`TokensPrompt`)
+- A tokenized prompt ([`TokensPrompt`][vllm.inputs.data.TokensPrompt])
- An embeddings prompt ({class}`EmbedsPrompt`)
+- An embeddings prompt ([`EmbedsPrompt`][vllm.inputs.data.EmbedsPrompt])
 - A single data structure containing both an encoder and a decoder prompt
-  ({class}`ExplicitEncoderDecoderPrompt`)
+  ([`ExplicitEncoderDecoderPrompt`][vllm.inputs.data.ExplicitEncoderDecoderPrompt])
 """
@@ -189,7 +193,8 @@ def token_inputs(
    prompt: Optional[str] = None,
    cache_salt: Optional[str] = None,
 ) -> TokenInputs:
-    """Construct {class}`TokenInputs` from optional values."""
+    """Construct [`TokenInputs`][vllm.inputs.data.TokenInputs] from optional
+    values."""
    inputs = TokenInputs(type="token", prompt_token_ids=prompt_token_ids)
    if prompt is not None:
@@ -221,7 +226,8 @@ def embeds_inputs(
    prompt_embeds: torch.Tensor,
    cache_salt: Optional[str] = None,
 ) -> EmbedsInputs:
-    """Construct :class:`EmbedsInputs` from optional values."""
+    """Construct [`EmbedsInputs`][vllm.inputs.data.EmbedsInputs] from optional
+    values."""
    inputs = EmbedsInputs(type="embeds", prompt_embeds=prompt_embeds)
    if cache_salt is not None:
@@ -232,7 +238,7 @@ def embeds_inputs(
 DecoderOnlyInputs = Union[TokenInputs, EmbedsInputs, "MultiModalInputs"]
 """
-The inputs in {class}`~vllm.LLMEngine` before they are
+The inputs in [`LLMEngine`][vllm.engine.llm_engine.LLMEngine] before they are
 passed to the model executor.
 This specifies the data required for decoder-only models.
 """
@@ -240,11 +246,12 @@ This specifies the data required for decoder-only models.
 class EncoderDecoderInputs(TypedDict):
    """
-    The inputs in {class}`~vllm.LLMEngine` before they are
+    The inputs in [`LLMEngine`][vllm.engine.llm_engine.LLMEngine] before they
-    passed to the model executor.
+    are passed to the model executor.
    This specifies the required data for encoder-decoder models.
    """
    encoder: Union[TokenInputs, "MultiModalInputs"]
    """The inputs for the encoder portion."""
@@ -254,13 +261,13 @@ class EncoderDecoderInputs(TypedDict):
 SingletonInputs = Union[TokenInputs, EmbedsInputs, "MultiModalInputs"]
 """
-A processed {class}`SingletonPrompt` which can be passed to
+A processed [`SingletonPrompt`][vllm.inputs.data.SingletonPrompt] which can be 
-{class}`vllm.sequence.Sequence`.
+passed to [`vllm.sequence.Sequence`][].
 """
 ProcessorInputs = Union[DecoderOnlyInputs, EncoderDecoderInputs]
 """
-The inputs to {data}`vllm.inputs.InputProcessor`.
+The outputs from [`vllm.inputs.preprocess.InputPreprocessor`][].
 """
 _T1 = TypeVar("_T1", bound=SingletonPrompt, default=SingletonPrompt)
@@ -277,7 +284,8 @@ def build_explicit_enc_dec_prompt(
    return ExplicitEncoderDecoderPrompt(
        encoder_prompt=encoder_prompt,
        decoder_prompt=decoder_prompt,
-        mm_processor_kwargs=mm_processor_kwargs)
+        mm_processor_kwargs=mm_processor_kwargs,
+    )
 def zip_enc_dec_prompts(
@@ -288,7 +296,8 @@ def zip_enc_dec_prompts(
 ) -> list[ExplicitEncoderDecoderPrompt[_T1, _T2]]:
    """
    Zip encoder and decoder prompts together into a list of
-    {class}`ExplicitEncoderDecoderPrompt` instances.
+    [`ExplicitEncoderDecoderPrompt`][vllm.inputs.data.ExplicitEncoderDecoderPrompt]
+    instances.
    ``mm_processor_kwargs`` may also be provided; if a dict is passed, the same
    dictionary will be used for every encoder/decoder prompt. If an iterable is
@@ -299,10 +308,11 @@ def zip_enc_dec_prompts(
    if isinstance(mm_processor_kwargs, dict):
        return [
            build_explicit_enc_dec_prompt(
-                encoder_prompt, decoder_prompt,
+                encoder_prompt,
-                cast(dict[str, Any], mm_processor_kwargs))
+                decoder_prompt,
-            for (encoder_prompt,
+                cast(dict[str, Any], mm_processor_kwargs),
-                 decoder_prompt) in zip(enc_prompts, dec_prompts)
+            ) for (encoder_prompt,
+                   decoder_prompt) in zip(enc_prompts, dec_prompts)
        ]
    return [
        build_explicit_enc_dec_prompt(encoder_prompt, decoder_prompt,

--- a/vllm/inputs/parse.py
+++ b/vllm/inputs/parse.py
@@ -23,13 +23,13 @@ class ParsedTokens(TypedDict):
 @overload
 def parse_and_batch_prompt(
-        prompt: Union[str, list[str]]) -> Sequence[ParsedText]:
+    prompt: Union[str, list[str]], ) -> Sequence[ParsedText]:
    ...
 @overload
 def parse_and_batch_prompt(
-        prompt: Union[list[int], list[list[int]]]) -> Sequence[ParsedTokens]:
+    prompt: Union[list[int], list[list[int]]], ) -> Sequence[ParsedTokens]:
    ...
@@ -86,7 +86,7 @@ class ParsedTokensPrompt(TypedDict):
 class ParsedEmbedsPrompt(TypedDict):
-    type: Literal['embeds']
+    type: Literal["embeds"]
    content: EmbedsPrompt
@@ -133,7 +133,7 @@ def parse_singleton_prompt(prompt: SingletonPrompt) -> ParsedSingletonPrompt:
 def is_explicit_encoder_decoder_prompt(
-        prompt: PromptType) -> TypeIs[ExplicitEncoderDecoderPrompt]:
+    prompt: PromptType, ) -> TypeIs[ExplicitEncoderDecoderPrompt]:
    return isinstance(prompt, dict) and "encoder_prompt" in prompt

--- a/vllm/inputs/preprocess.py
+++ b/vllm/inputs/preprocess.py
@@ -67,11 +67,11 @@ class InputPreprocessor:
        return self.tokenizer.get_lora_tokenizer(lora_request).eos_token_id
    def get_decoder_start_token_id(self) -> Optional[int]:
-        '''
+        """
        Obtain the decoder start token id employed by an encoder/decoder
        model. Returns None for non-encoder/decoder models or if the
        model config is unavailable.
-        '''
+        """
        if not self.model_config.is_encoder_decoder:
            logger.warning_once(
@@ -79,14 +79,14 @@ class InputPreprocessor:
                "this is not an encoder/decoder model.")
            return None
-        if (self.model_config is None or self.model_config.hf_config is None):
+        if self.model_config is None or self.model_config.hf_config is None:
            logger.warning_once(
                "Using None for decoder start token id because "
                "model config is not available.")
            return None
        dec_start_token_id = getattr(self.model_config.hf_config,
-                                     'decoder_start_token_id', None)
+                                     "decoder_start_token_id", None)
        if dec_start_token_id is None:
            logger.warning_once(
                "Falling back on <BOS> for decoder start token "
@@ -97,7 +97,7 @@ class InputPreprocessor:
        return dec_start_token_id
    def _get_default_enc_dec_decoder_prompt(self) -> list[int]:
-        '''
+        """
        Specifically for encoder/decoder models:
        generate a default decoder prompt for when
        the user specifies only the encoder prompt.
@@ -126,7 +126,7 @@ class InputPreprocessor:
        Returns:
        * prompt_token_ids
-        '''
+        """
        bos_token_id = self.get_bos_token_id()
        assert bos_token_id is not None
@@ -224,7 +224,10 @@ class InputPreprocessor:
        lora_request: Optional[LoRARequest],
        tokenization_kwargs: Optional[dict[str, Any]] = None,
    ) -> list[int]:
-        """Async version of {meth}`_tokenize_prompt`."""
+        """
+        Async version of
+        [`_tokenize_prompt`][vllm.inputs.preprocess.InputPreprocessor._tokenize_prompt].
+        """
        tokenizer = self.get_tokenizer_group()
        tokenization_kwargs = self._get_tokenization_kw(tokenization_kwargs)
@@ -287,7 +290,10 @@ class InputPreprocessor:
        lora_request: Optional[LoRARequest],
        return_mm_hashes: bool = False,
    ) -> MultiModalInputs:
-        """Async version of {meth}`_process_multimodal`."""
+        """
+        Async version of
+        [`_process_multimodal`][vllm.inputs.preprocess.InputPreprocessor._process_multimodal].
+        """
        tokenizer = await self._get_mm_tokenizer_async(lora_request)
        mm_processor = self.mm_registry.create_processor(self.model_config,
@@ -472,7 +478,7 @@ class InputPreprocessor:
        Returns:
-        * {class}`SingletonInputs` instance
+        * [`SingletonInputs`][vllm.inputs.data.SingletonInputs] instance
        """
        parsed = parse_singleton_prompt(prompt)
@@ -508,7 +514,10 @@ class InputPreprocessor:
        lora_request: Optional[LoRARequest] = None,
        return_mm_hashes: bool = False,
    ) -> SingletonInputs:
-        """Async version of {meth}`_prompt_to_llm_inputs`."""
+        """
+        Async version of
+        [`_prompt_to_llm_inputs`][vllm.inputs.preprocess.InputPreprocessor._prompt_to_llm_inputs].
+        """
        parsed = parse_singleton_prompt(prompt)
        if parsed["type"] == "embeds":
@@ -644,7 +653,9 @@ class InputPreprocessor:
    ) -> EncoderDecoderInputs:
        """
        For encoder/decoder models only:
-        Process an input prompt into an {class}`EncoderDecoderInputs` instance.
+        Process an input prompt into an
+        [`EncoderDecoderInputs`][vllm.inputs.data.EncoderDecoderInputs]
+        instance.
        There are two types of input prompts:
        singleton prompts which carry only the
@@ -670,7 +681,8 @@ class InputPreprocessor:
        Returns:
-        * {class}`EncoderDecoderInputs` instance
+        * [`EncoderDecoderInputs`][vllm.inputs.data.EncoderDecoderInputs]
+          instance
        """
        encoder_inputs: SingletonInputs
        decoder_inputs: Optional[SingletonInputs]
@@ -710,7 +722,10 @@ class InputPreprocessor:
        prompt: PromptType,
        tokenization_kwargs: Optional[dict[str, Any]] = None,
    ) -> EncoderDecoderInputs:
-        """Async version of {meth}`_process_encoder_decoder_prompt`."""
+        """
+        Async version of
+        [`_process_encoder_decoder_prompt`][vllm.inputs.preprocess.InputPreprocessor._process_encoder_decoder_prompt].
+        """
        encoder_inputs: SingletonInputs
        decoder_inputs: Optional[SingletonInputs]
@@ -778,7 +793,8 @@ class InputPreprocessor:
    ) -> DecoderOnlyInputs:
        """
        For decoder-only models:
-        Process an input prompt into an {class}`DecoderOnlyInputs` instance.
+        Process an input prompt into a
+        [`DecoderOnlyInputs`][vllm.inputs.data.DecoderOnlyInputs] instance.
        Arguments:
@@ -789,7 +805,7 @@ class InputPreprocessor:
        Returns:
-        * {class}`DecoderOnlyInputs` instance
+        * [`DecoderOnlyInputs`][vllm.inputs.data.DecoderOnlyInputs] instance
        """
        prompt_comps = self._prompt_to_llm_inputs(
@@ -812,7 +828,10 @@ class InputPreprocessor:
        prompt_adapter_request: Optional[PromptAdapterRequest] = None,
        return_mm_hashes: bool = False,
    ) -> DecoderOnlyInputs:
-        """Async version of {meth}`_process_decoder_only_prompt`."""
+        """
+        Async version of
+        [`_process_decoder_only_prompt`][vllm.inputs.preprocess.InputPreprocessor._process_decoder_only_prompt].
+        """
        prompt_comps = await self._prompt_to_llm_inputs_async(
            prompt,
            tokenization_kwargs=tokenization_kwargs,
@@ -863,7 +882,10 @@ class InputPreprocessor:
        prompt_adapter_request: Optional[PromptAdapterRequest] = None,
        return_mm_hashes: bool = False,
    ) -> ProcessorInputs:
-        """Async version of {meth}`preprocess`."""
+        """
+        Async version of
+        [`preprocess`][vllm.inputs.preprocess.InputPreprocessor.preprocess].
+        """
        if self.model_config.is_encoder_decoder:
            assert not return_mm_hashes, (
                "Multimodal hashes for encoder-decoder models should not be ",

--- a/vllm/inputs/registry.py
+++ b/vllm/inputs/registry.py
@@ -38,7 +38,7 @@ class InputContext:
    ) -> _C:
        """
        Get the HuggingFace configuration
-        ({class}`transformers.PretrainedConfig`) of the model,
+        (`transformers.PretrainedConfig`) of the model,
        additionally checking its type.
        Raises:
@@ -79,7 +79,7 @@ class InputContext:
    ) -> _P:
        """
        Get the HuggingFace processor
-        ({class}`transformers.ProcessorMixin`) of the model,
+        (`transformers.ProcessorMixin`) of the model,
        additionally checking its type.
        Raises:

--- a/vllm/logger.py
+++ b/vllm/logger.py
@@ -68,22 +68,22 @@ class _VllmLogger(Logger):
    """
    Note:
        This class is just to provide type information.
-        We actually patch the methods directly on the {class}`logging.Logger`
+        We actually patch the methods directly on the [`logging.Logger`][]
        instance to avoid conflicting with other libraries such as
        `intel_extension_for_pytorch.utils._logger`.
    """
    def info_once(self, msg: str, *args: Hashable) -> None:
        """
-        As {meth}`info`, but subsequent calls with the same message
+        As [`info`][logging.Logger.info], but subsequent calls with
-        are silently dropped.
+        the same message are silently dropped.
        """
        _print_info_once(self, msg, *args)
    def warning_once(self, msg: str, *args: Hashable) -> None:
        """
-        As {meth}`warning`, but subsequent calls with the same message
+        As [`warning`][logging.Logger.warning], but subsequent calls with
-        are silently dropped.
+        the same message are silently dropped.
        """
        _print_warning_once(self, msg, *args)

--- a/vllm/model_executor/layers/sampler.py
+++ b/vllm/model_executor/layers/sampler.py
@@ -228,17 +228,19 @@ class Sampler(nn.Module):
    ) -> Optional[SamplerOutput]:
        """
        Single-step scheduling:
-        * Perform GPU-side sampling computation & compute
+            * Perform GPU-side sampling computation & compute
-          GPU-side logprobs tensor
+            GPU-side logprobs tensor
-        * Pythonize sampling result & logprobs tensor
+            * Pythonize sampling result & logprobs tensor
        Multi-step scheduling:
-        * Perform GPU-side sampling computation & compute
+            * Perform GPU-side sampling computation & compute
-          GPU-side logprobs tensor
+            GPU-side logprobs tensor
-        * Defer Pythonization of sampling result & logprobs
+            * Defer Pythonization of sampling result & logprobs
-          tensor
+            tensor
-        * Encapsulate arguments required for deferred Pythonization
+            * Encapsulate arguments required for deferred Pythonization
-          in the {class}`SamplerOutput` structure
+            in the
+            [`SamplerOutput`][vllm.model_executor.layers.sampler.SamplerOutput]
+            structure
        Args:
            logits: (num_tokens, vocab_size).

--- a/vllm/model_executor/models/interfaces.py
+++ b/vllm/model_executor/models/interfaces.py
@@ -226,9 +226,11 @@ class SupportsPP(Protocol):
        intermediate_tensors: Optional["IntermediateTensors"],
    ) -> Union[Tensor, "IntermediateTensors"]:
        """
-        Accept {class}`IntermediateTensors` when PP rank > 0.
+        Accept [`IntermediateTensors`][vllm.sequence.IntermediateTensors] when
+        PP rank > 0.
-        Return {class}`IntermediateTensors` only for the last PP rank.
+        Return [`IntermediateTensors`][vllm.sequence.IntermediateTensors] only
+        for the last PP rank.
        """
        ...