[Docs] Fix warnings in mkdocs build (continued) (#24791)

Signed-off-by: Zerohertz <ohg3417@gmail.com>

[Docs] Fix warnings in mkdocs build (continued) (#24791)
Signed-off-by: Zerohertz <ohg3417@gmail.com>
9a8966bc · Hyogeun Oh (오효근) · GitHub · 5febdc87 · 9a8966bc · 9a8966bc
Unverified Commit 9a8966bc authored Sep 13, 2025 by Hyogeun Oh (오효근) Committed by GitHub Sep 13, 2025
20 changed files
--- a/vllm/distributed/eplb/eplb_state.py
+++ b/vllm/distributed/eplb/eplb_state.py
@@ -337,11 +337,11 @@ class EplbState:
        Args:
            model (MixtureOfExperts): The MoE model.
            is_dummy (bool): If `True`, this is a dummy step and the load
-              metrics recorded in this forward pass will not count. Defaults
+                metrics recorded in this forward pass will not count. Defaults
-              to `False`.
+                to `False`.
            is_profile (bool): If `True`, perform a dummy rearrangement
-              with maximum communication cost. This is used in `profile_run`
+                with maximum communication cost. This is used in `profile_run`
-              to reserve enough memory for the communication buffer.
+                to reserve enough memory for the communication buffer.
            log_stats (bool): If `True`, log the expert load metrics.
        # Stats

--- a/vllm/distributed/eplb/rebalance_algo.py
+++ b/vllm/distributed/eplb/rebalance_algo.py
@@ -102,14 +102,14 @@ def rebalance_experts_hierarchical(
    num_groups: int,
    num_nodes: int,
    num_gpus: int,
-):
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    """
    Parameters:
        weight: [num_moe_layers, num_logical_experts]
        num_physical_experts: number of physical experts after replication
        num_groups: number of expert groups
-        num_nodes: number of server nodes, where the intra-node network
+        num_nodes: number of server nodes, where the intra-node network 
-        (e.g, NVLink) is faster
+            (e.g., NVLink) is faster
        num_gpus: number of GPUs, must be a multiple of `num_nodes`
    Returns:

--- a/vllm/distributed/kv_transfer/kv_connector/v1/base.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/base.py
@@ -149,7 +149,7 @@ class KVConnectorBase_V1(ABC):
    @abstractmethod
    def start_load_kv(self, forward_context: "ForwardContext",
-                      **kwargs) -> None:
+                      **kwargs: Any) -> None:
        """
        Start loading the KV cache from the connector to vLLM's paged
        KV buffer. This is called from the forward context before the
@@ -182,7 +182,8 @@ class KVConnectorBase_V1(ABC):
    @abstractmethod
    def save_kv_layer(self, layer_name: str, kv_layer: torch.Tensor,
-                      attn_metadata: "AttentionMetadata", **kwargs) -> None:
+                      attn_metadata: "AttentionMetadata",
+                      **kwargs: Any) -> None:
        """
        Start saving a layer of KV cache from vLLM's paged buffer 
        to the connector. This is called from within attention layer to

--- a/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py
@@ -30,7 +30,7 @@ class LMCacheConnectorV1(KVConnectorBase_V1):
    # Worker-side methods
    # ==============================
    def start_load_kv(self, forward_context: "ForwardContext",
-                      **kwargs) -> None:
+                      **kwargs: Any) -> None:
        """
        Start loading the KV cache from the connector to vLLM's paged
        KV buffer. This is called from the forward context before the
@@ -61,7 +61,8 @@ class LMCacheConnectorV1(KVConnectorBase_V1):
        self._lmcache_engine.wait_for_layer_load(layer_name)
    def save_kv_layer(self, layer_name: str, kv_layer: torch.Tensor,
-                      attn_metadata: "AttentionMetadata", **kwargs) -> None:
+                      attn_metadata: "AttentionMetadata",
+                      **kwargs: Any) -> None:
        """
-        Start saving the a layer of KV cache from vLLM's paged buffer 
+        Start saving a layer of KV cache from vLLM's paged buffer
        to the connector. This is called from within attention layer to

--- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py
@@ -91,7 +91,7 @@ class P2pNcclConnector(KVConnectorBase_V1):
    # ==============================
    def start_load_kv(self, forward_context: "ForwardContext",
-                      **kwargs) -> None:
+                      **kwargs: Any) -> None:
        """Start loading the KV cache from the connector buffer to vLLM's
        paged KV buffer.
@@ -212,7 +212,8 @@ class P2pNcclConnector(KVConnectorBase_V1):
        return
    def save_kv_layer(self, layer_name: str, kv_layer: torch.Tensor,
-                      attn_metadata: "AttentionMetadata", **kwargs) -> None:
+                      attn_metadata: "AttentionMetadata",
+                      **kwargs: Any) -> None:
        """Start saving the KV cache of the layer from vLLM's paged buffer
        to the connector.
@@ -278,7 +279,7 @@ class P2pNcclConnector(KVConnectorBase_V1):
    def get_finished(
            self, finished_req_ids: set[str],
-            **kwargs) -> tuple[Optional[set[str]], Optional[set[str]]]:
+            **kwargs: Any) -> tuple[Optional[set[str]], Optional[set[str]]]:
        """
        Notifies worker-side connector ids of requests that have
        finished generating tokens.

--- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/tensor_memory_pool.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/tensor_memory_pool.py
@@ -218,8 +218,9 @@ class TensorMemoryPool:
        return addr
-    def load_tensor(self, addr: int, dtype: torch.dtype,
+    def load_tensor(self, addr: int, dtype: torch.dtype, shape: tuple[int,
-                    shape: tuple[int, ...], device) -> torch.Tensor:
+                                                                      ...],
+                    device: torch.device) -> torch.Tensor:
        """Loads a tensor from pinned host memory to the specified device.
        Args:

--- a/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py
@@ -3,7 +3,7 @@
 import hashlib
 import os
 from dataclasses import dataclass
-from typing import TYPE_CHECKING, Optional
+from typing import TYPE_CHECKING, Any, Optional
 import safetensors
 import torch
@@ -90,7 +90,7 @@ class SharedStorageConnector(KVConnectorBase_V1):
        logger.info("Shared storage path is %s", self._storage_path)
    def start_load_kv(self, forward_context: "ForwardContext",
-                      **kwargs) -> None:
+                      **kwargs: Any) -> None:
        """Start loading the KV cache from the connector buffer to vLLM's 
        paged KV buffer.
@@ -191,7 +191,8 @@ class SharedStorageConnector(KVConnectorBase_V1):
        return
    def save_kv_layer(self, layer_name: str, kv_layer: torch.Tensor,
-                      attn_metadata: "AttentionMetadata", **kwargs) -> None:
+                      attn_metadata: "AttentionMetadata",
+                      **kwargs: Any) -> None:
        """Start saving the KV cache of the layer from vLLM's paged buffer 
        to the connector.

--- a/vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py
+++ b/vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py
@@ -251,8 +251,8 @@ class PyNcclPipe(KVPipeBase):
        """
        Receives a tensor and its metadata from the source rank. Blocking call.
-        Args:
+        Returns:
-            tensor: The received tensor, or `None` if no tensor is received.
+            The received tensor, or `None` if no tensor is received.
        """
        if self.transport_thread is None:
            self.transport_thread = ThreadPoolExecutor(max_workers=1)

--- a/vllm/model_executor/models/interfaces.py
+++ b/vllm/model_executor/models/interfaces.py
@@ -823,7 +823,7 @@ class SupportsEagle3(Protocol):
        Args:
            layers: Tuple of layer indices that should output auxiliary
-              hidden states.
+                hidden states.
        """
        ...

--- a/vllm/model_executor/models/keye.py
+++ b/vllm/model_executor/models/keye.py
@@ -1520,15 +1520,9 @@ class BaseKeyeModule(nn.Module):
                batch.
                **NOTE**: If mrope is enabled (default setting for Qwen2-VL
                opensource models), the shape will be `(3, seq_len)`,
-                otherwise it will be `(seq_len,).
+                otherwise it will be `(seq_len,)`.
-            pixel_values: Pixel values to be fed to a model.
+            intermediate_tensors: Intermediate tensors from prior forward pass.
-                `None` if no images are passed.
+            inputs_embeds: Optional tensor of input embeddings.
-            image_grid_thw: Tensor `(n_images, 3)` of image 3D grid in LLM.
-                `None` if no images are passed.
-            pixel_values_videos: Pixel values of videos to be fed to a model.
-                `None` if no videos are passed.
-            video_grid_thw: Tensor `(n_videos, 3)` of video 3D grid in LLM.
-                `None` if no videos are passed.
        """
        if intermediate_tensors is not None:
            inputs_embeds = None

--- a/vllm/model_executor/models/keye_vl1_5.py
+++ b/vllm/model_executor/models/keye_vl1_5.py
@@ -58,17 +58,18 @@ def split_thw(grid_thw: torch.Tensor) -> torch.Tensor:
    return torch.cat([ones, h_w], dim=1).repeat_interleave(t, dim=0)
-def get_num_patches(grid_thw: torch.Tensor, num_frames: Union[list[int],
+def get_num_patches(grid_thw: torch.Tensor,
-                                                              torch.Tensor]):
+                    num_frames: Union[list[int], torch.Tensor]) -> list[int]:
    """
    Return num_patches per video.
    Args:
-        t: tensor with shape [N, ...] where each item is a list/tensor
+        grid_thw: Tensor with shape [N, 3] containing temporal, height, width
-        cu_seqlens: list indicating the boundaries of groups
+            dimensions
+        num_frames: List or tensor indicating the number of frames per video
    Returns:
-        list of ints representing the sum of products for each group
+        List of ints representing the number of patches for each video
    Examples:
        >>> # Suppose there are 2 videos with a total of 3 grids

--- a/vllm/model_executor/models/llava.py
+++ b/vllm/model_executor/models/llava.py
@@ -732,7 +732,9 @@ class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
        Args:
            input_ids: Flattened (concatenated) input_ids corresponding to a
                batch.
-            pixel_values: The pixels in each input image.
+            positions: Position indices for the input tokens.
+            intermediate_tensors: Intermediate tensors from prior forward pass.
+            inputs_embeds: Optional tensor of input embeddings.
        Info:
            [LlavaImageInputs][]

--- a/vllm/model_executor/models/llava_next.py
+++ b/vllm/model_executor/models/llava_next.py
@@ -535,8 +535,9 @@ class LlavaNextForConditionalGeneration(nn.Module, SupportsMultiModal,
        Args:
            input_ids: Flattened (concatenated) input_ids corresponding to a
                batch.
-            pixel_values: The pixels in each grid patch for each input image.
+            positions: Position indices for the input tokens.
-            image_sizes: The original `(height, width)` for each input image.
+            intermediate_tensors: Intermediate tensors from prior forward pass.
+            inputs_embeds: Optional tensor of input embeddings.
        Info:
            [LlavaNextImageInputs][]

--- a/vllm/model_executor/models/mistral3.py
+++ b/vllm/model_executor/models/mistral3.py
@@ -578,7 +578,9 @@ class Mistral3ForConditionalGeneration(nn.Module, SupportsLoRA,
        Args:
            input_ids: Flattened (concatenated) input_ids corresponding to a
                batch.
-            pixel_values: The pixels in each input image.
+            positions: Position indices for the input tokens.
+            intermediate_tensors: Intermediate tensors from prior forward pass.
+            inputs_embeds: Optional tensor of input embeddings.
        Info:
            [Mistral3ImagePixelInputs][]

--- a/vllm/model_executor/models/mllama4.py
+++ b/vllm/model_executor/models/mllama4.py
@@ -387,11 +387,8 @@ class Llama4VisionEncoder(nn.Module):
    ) -> torch.Tensor:
        r"""
        Args:
-            inputs_embeds (`torch.FloatTensor` of shape
+            hidden_states: Input tensor of shape
-                    `(batch_size, sequence_length, hidden_size)`):
+                (batch_size, sequence_length, hidden_size).
-                Optionally, instead of passing `input_ids` you can choose to
+                Hidden states from the model embeddings, representing
-                directly pass an embedded representation. This is useful if you
+                the input tokens.
-                want more control over how to convert `input_ids` indices into
-                associated vectors than the model's internal embedding
-                lookup matrix.
        """

--- a/vllm/model_executor/models/moonvit.py
+++ b/vllm/model_executor/models/moonvit.py
@@ -70,11 +70,15 @@ def multihead_attention(
    v: torch.Tensor,
    q_cu_seqlens: Optional[torch.Tensor] = None,
    k_cu_seqlens: Optional[torch.Tensor] = None,
-):
+) -> torch.Tensor:
    """Multi-head attention using flash attention 2.
    Args:
-        q, k, v: tensor of shape (batch_size, seqlen, num_heads, head_dim),
+        q: Query tensor of shape (batch_size, seqlen, num_heads, head_dim),
+            or (tot_seqlens, num_heads, head_dim) if packing.
+        k: Key tensor of shape (batch_size, seqlen, num_heads, head_dim),
+            or (tot_seqlens, num_heads, head_dim) if packing.
+        v: Value tensor of shape (batch_size, seqlen, num_heads, head_dim),
            or (tot_seqlens, num_heads, head_dim) if packing.
        q_cu_seqlens (torch.Tensor): cumulative sequence lengths of q.
            The first element should be 0 and the last element should be q.shape[0].
@@ -123,8 +127,14 @@ def sdpa_attention(
    """SDPA attention.
    Args:
-        q, k, v: tensor of shape (batch_size, seqlen, num_heads, head_dim),
+        q: Query tensor of shape (batch_size, seqlen, num_heads, head_dim),
+            or (tot_seqlens, num_heads, head_dim) if packing.
+        k: Key tensor of shape (batch_size, seqlen, num_heads, head_dim),
+            or (tot_seqlens, num_heads, head_dim) if packing.
+        v: Value tensor of shape (batch_size, seqlen, num_heads, head_dim),
            or (tot_seqlens, num_heads, head_dim) if packing.
+        q_cu_seqlens: Optional cumulative sequence lengths of q.
+        k_cu_seqlens: Optional cumulative sequence lengths of k.
    """
    seq_length = q.shape[0]
    attention_mask = torch.zeros([1, seq_length, seq_length],
@@ -387,7 +397,7 @@ class MLP2(nn.Module):
    def __init__(self,
                 dims: list[int],
                 activation,
-                 bias=True,
+                 bias: bool = True,
                 prefix: str = "",
                 use_data_parallel: bool = False):
        super().__init__()

--- a/vllm/model_executor/models/phi4_multimodal.py
+++ b/vllm/model_executor/models/phi4_multimodal.py
@@ -374,8 +374,8 @@ class Phi4MMAudioMeanVarianceNormLayer(nn.Module):
    Typically used as a very first layer in a model.
    Args:
-        input_size: int
+        config: [Phi4MultimodalAudioConfig](https://huggingface.co/docs/transformers/model_doc/phi4_multimodal#transformers.Phi4MultimodalAudioConfig) 
-            layer input size.
+            object containing model parameters.
    """
    def __init__(self, config: Phi4MultimodalAudioConfig):

--- a/vllm/model_executor/models/qwen2_vl.py
+++ b/vllm/model_executor/models/qwen2_vl.py
@@ -1372,15 +1372,9 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal,
                batch.
                **NOTE**: If mrope is enabled (default setting for Qwen2-VL
                opensource models), the shape will be `(3, seq_len)`,
-                otherwise it will be `(seq_len,).
+                otherwise it will be `(seq_len,)`.
-            pixel_values: Pixel values to be fed to a model.
+            intermediate_tensors: Intermediate tensors from prior forward pass.
-                `None` if no images are passed.
+            inputs_embeds: Optional tensor of input embeddings.
-            image_grid_thw: Tensor `(n_images, 3)` of image 3D grid in LLM.
-                `None` if no images are passed.
-            pixel_values_videos: Pixel values of videos to be fed to a model.
-                `None` if no videos are passed.
-            video_grid_thw: Tensor `(n_videos, 3)` of video 3D grid in LLM.
-                `None` if no videos are passed.
        """
        if intermediate_tensors is not None:

--- a/vllm/model_executor/models/siglip2navit.py
+++ b/vllm/model_executor/models/siglip2navit.py
@@ -390,12 +390,9 @@ class Siglip2EncoderLayer(nn.Module):
                position_embeddings: torch.Tensor) -> tuple[torch.FloatTensor]:
        """
        Args:
-            hidden_states (`torch.FloatTensor`):
+            hidden_states: Input tensor of shape (batch, seq_len, embed_dim).
-                Input to the layer of shape `(batch, seq_len, embed_dim)`.
+            cu_seqlens: Cumulative sequence lengths tensor.
-            output_attentions (`bool`, *optional*, defaults to `False`):
+            position_embeddings: Position embeddings tensor.
-                Whether or not to return the attentions tensors of all 
-                attention layers. See `attentions` under
-                returned tensors for more detail.
        """
        residual = hidden_states
@@ -534,19 +531,9 @@ class Siglip2Encoder(nn.Module):
    ) -> torch.Tensor:
        r"""
        Args:
-            inputs_embeds (`torch.FloatTensor` of shape
+            inputs_embeds: Input tensor of shape
-                `(batch_size, sequence_length, hidden_size)`):
+                (batch_size, sequence_length, hidden_size).
-                Optionally, instead of passing `input_ids` you can choose to
+                Embedded representation of the input tokens.
-                directly pass an embedded representation. This is useful if
+            grid_thws: Grid tensor of shape (num_patches, 3)
-                you want more control over how to convert `input_ids` indices
+                containing grid dimensions.
-                into associated vectors than the model's internal embedding
-                lookup matrix.
-            grid_thws (`torch.LongTensor`):
-                grid shape (num_patches, 3)
-            output_hidden_states (`bool`, *optional*):
-                Whether or not to return the hidden states of all layers. See
-                `hidden_states` under returned tensors for more detail.
-            return_dict (`bool`, *optional*):
-                Whether or not to return a [`~utils.ModelOutput`] instead of
-                a plain tuple.
        """

--- a/vllm/model_executor/models/ultravox.py
+++ b/vllm/model_executor/models/ultravox.py
@@ -597,10 +597,11 @@ class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA):
        with the `input_ids`.
        Args:
-            audio_features: A batch of audio input chunks [B, N, 80, M].
+            input_ids: Flattened (concatenated) input_ids corresponding to a
-            audio_lens: Length of audio frames for each audio chunk [B].
+                batch.
-            audio_token_len: Length of audio tokens for each audio chunk [B'].
+            positions: Position indices for the input tokens.
-                Note: batch dim is different from batch dim in audio chunks.
+            intermediate_tensors: Intermediate tensors from prior forward pass.
+            inputs_embeds: Optional tensor of input embeddings.
        """