Unverified commit 9a8966bc, authored by Hyogeun Oh (오효근) and committed by GitHub
Browse files

[Docs] Fix warnings in mkdocs build (continued) (#24791)


Signed-off-by: Zerohertz <ohg3417@gmail.com>
parent 5febdc87
...@@ -337,11 +337,11 @@ class EplbState: ...@@ -337,11 +337,11 @@ class EplbState:
Args: Args:
model (MixtureOfExperts): The MoE model. model (MixtureOfExperts): The MoE model.
is_dummy (bool): If `True`, this is a dummy step and the load is_dummy (bool): If `True`, this is a dummy step and the load
metrics recorded in this forward pass will not count. Defaults metrics recorded in this forward pass will not count. Defaults
to `False`. to `False`.
is_profile (bool): If `True`, perform a dummy rearrangement is_profile (bool): If `True`, perform a dummy rearrangement
with maximum communication cost. This is used in `profile_run` with maximum communication cost. This is used in `profile_run`
to reserve enough memory for the communication buffer. to reserve enough memory for the communication buffer.
log_stats (bool): If `True`, log the expert load metrics. log_stats (bool): If `True`, log the expert load metrics.
# Stats # Stats
......
...@@ -102,14 +102,14 @@ def rebalance_experts_hierarchical( ...@@ -102,14 +102,14 @@ def rebalance_experts_hierarchical(
num_groups: int, num_groups: int,
num_nodes: int, num_nodes: int,
num_gpus: int, num_gpus: int,
): ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
""" """
Parameters: Parameters:
weight: [num_moe_layers, num_logical_experts] weight: [num_moe_layers, num_logical_experts]
num_physical_experts: number of physical experts after replication num_physical_experts: number of physical experts after replication
num_groups: number of expert groups num_groups: number of expert groups
num_nodes: number of server nodes, where the intra-node network num_nodes: number of server nodes, where the intra-node network
(e.g, NVLink) is faster (e.g, NVLink) is faster
num_gpus: number of GPUs, must be a multiple of `num_nodes` num_gpus: number of GPUs, must be a multiple of `num_nodes`
Returns: Returns:
......
...@@ -149,7 +149,7 @@ class KVConnectorBase_V1(ABC): ...@@ -149,7 +149,7 @@ class KVConnectorBase_V1(ABC):
@abstractmethod @abstractmethod
def start_load_kv(self, forward_context: "ForwardContext", def start_load_kv(self, forward_context: "ForwardContext",
**kwargs) -> None: **kwargs: Any) -> None:
""" """
Start loading the KV cache from the connector to vLLM's paged Start loading the KV cache from the connector to vLLM's paged
KV buffer. This is called from the forward context before the KV buffer. This is called from the forward context before the
...@@ -182,7 +182,8 @@ class KVConnectorBase_V1(ABC): ...@@ -182,7 +182,8 @@ class KVConnectorBase_V1(ABC):
@abstractmethod @abstractmethod
def save_kv_layer(self, layer_name: str, kv_layer: torch.Tensor, def save_kv_layer(self, layer_name: str, kv_layer: torch.Tensor,
attn_metadata: "AttentionMetadata", **kwargs) -> None: attn_metadata: "AttentionMetadata",
**kwargs: Any) -> None:
""" """
Start saving a layer of KV cache from vLLM's paged buffer Start saving a layer of KV cache from vLLM's paged buffer
to the connector. This is called from within attention layer to to the connector. This is called from within attention layer to
......
...@@ -30,7 +30,7 @@ class LMCacheConnectorV1(KVConnectorBase_V1): ...@@ -30,7 +30,7 @@ class LMCacheConnectorV1(KVConnectorBase_V1):
# Worker-side methods # Worker-side methods
# ============================== # ==============================
def start_load_kv(self, forward_context: "ForwardContext", def start_load_kv(self, forward_context: "ForwardContext",
**kwargs) -> None: **kwargs: Any) -> None:
""" """
Start loading the KV cache from the connector to vLLM's paged Start loading the KV cache from the connector to vLLM's paged
KV buffer. This is called from the forward context before the KV buffer. This is called from the forward context before the
...@@ -61,7 +61,8 @@ class LMCacheConnectorV1(KVConnectorBase_V1): ...@@ -61,7 +61,8 @@ class LMCacheConnectorV1(KVConnectorBase_V1):
self._lmcache_engine.wait_for_layer_load(layer_name) self._lmcache_engine.wait_for_layer_load(layer_name)
def save_kv_layer(self, layer_name: str, kv_layer: torch.Tensor, def save_kv_layer(self, layer_name: str, kv_layer: torch.Tensor,
attn_metadata: "AttentionMetadata", **kwargs) -> None: attn_metadata: "AttentionMetadata",
**kwargs: Any) -> None:
""" """
Start saving the a layer of KV cache from vLLM's paged buffer Start saving the a layer of KV cache from vLLM's paged buffer
to the connector. This is called from within attention layer to to the connector. This is called from within attention layer to
......
...@@ -91,7 +91,7 @@ class P2pNcclConnector(KVConnectorBase_V1): ...@@ -91,7 +91,7 @@ class P2pNcclConnector(KVConnectorBase_V1):
# ============================== # ==============================
def start_load_kv(self, forward_context: "ForwardContext", def start_load_kv(self, forward_context: "ForwardContext",
**kwargs) -> None: **kwargs: Any) -> None:
"""Start loading the KV cache from the connector buffer to vLLM's """Start loading the KV cache from the connector buffer to vLLM's
paged KV buffer. paged KV buffer.
...@@ -212,7 +212,8 @@ class P2pNcclConnector(KVConnectorBase_V1): ...@@ -212,7 +212,8 @@ class P2pNcclConnector(KVConnectorBase_V1):
return return
def save_kv_layer(self, layer_name: str, kv_layer: torch.Tensor, def save_kv_layer(self, layer_name: str, kv_layer: torch.Tensor,
attn_metadata: "AttentionMetadata", **kwargs) -> None: attn_metadata: "AttentionMetadata",
**kwargs: Any) -> None:
"""Start saving the KV cache of the layer from vLLM's paged buffer """Start saving the KV cache of the layer from vLLM's paged buffer
to the connector. to the connector.
...@@ -278,7 +279,7 @@ class P2pNcclConnector(KVConnectorBase_V1): ...@@ -278,7 +279,7 @@ class P2pNcclConnector(KVConnectorBase_V1):
def get_finished( def get_finished(
self, finished_req_ids: set[str], self, finished_req_ids: set[str],
**kwargs) -> tuple[Optional[set[str]], Optional[set[str]]]: **kwargs: Any) -> tuple[Optional[set[str]], Optional[set[str]]]:
""" """
Notifies worker-side connector ids of requests that have Notifies worker-side connector ids of requests that have
finished generating tokens. finished generating tokens.
......
...@@ -218,8 +218,9 @@ class TensorMemoryPool: ...@@ -218,8 +218,9 @@ class TensorMemoryPool:
return addr return addr
def load_tensor(self, addr: int, dtype: torch.dtype, def load_tensor(self, addr: int, dtype: torch.dtype, shape: tuple[int,
shape: tuple[int, ...], device) -> torch.Tensor: ...],
device: torch.device) -> torch.Tensor:
"""Loads a tensor from pinned host memory to the specified device. """Loads a tensor from pinned host memory to the specified device.
Args: Args:
......
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
import hashlib import hashlib
import os import os
from dataclasses import dataclass from dataclasses import dataclass
from typing import TYPE_CHECKING, Optional from typing import TYPE_CHECKING, Any, Optional
import safetensors import safetensors
import torch import torch
...@@ -90,7 +90,7 @@ class SharedStorageConnector(KVConnectorBase_V1): ...@@ -90,7 +90,7 @@ class SharedStorageConnector(KVConnectorBase_V1):
logger.info("Shared storage path is %s", self._storage_path) logger.info("Shared storage path is %s", self._storage_path)
def start_load_kv(self, forward_context: "ForwardContext", def start_load_kv(self, forward_context: "ForwardContext",
**kwargs) -> None: **kwargs: Any) -> None:
"""Start loading the KV cache from the connector buffer to vLLM's """Start loading the KV cache from the connector buffer to vLLM's
paged KV buffer. paged KV buffer.
...@@ -191,7 +191,8 @@ class SharedStorageConnector(KVConnectorBase_V1): ...@@ -191,7 +191,8 @@ class SharedStorageConnector(KVConnectorBase_V1):
return return
def save_kv_layer(self, layer_name: str, kv_layer: torch.Tensor, def save_kv_layer(self, layer_name: str, kv_layer: torch.Tensor,
attn_metadata: "AttentionMetadata", **kwargs) -> None: attn_metadata: "AttentionMetadata",
**kwargs: Any) -> None:
"""Start saving the KV cache of the layer from vLLM's paged buffer """Start saving the KV cache of the layer from vLLM's paged buffer
to the connector. to the connector.
......
...@@ -251,8 +251,8 @@ class PyNcclPipe(KVPipeBase): ...@@ -251,8 +251,8 @@ class PyNcclPipe(KVPipeBase):
""" """
Receives a tensor and its metadata from the source rank. Blocking call. Receives a tensor and its metadata from the source rank. Blocking call.
Args: Returns:
tensor: The received tensor, or `None` if no tensor is received. The received tensor, or `None` if no tensor is received.
""" """
if self.transport_thread is None: if self.transport_thread is None:
self.transport_thread = ThreadPoolExecutor(max_workers=1) self.transport_thread = ThreadPoolExecutor(max_workers=1)
......
...@@ -823,7 +823,7 @@ class SupportsEagle3(Protocol): ...@@ -823,7 +823,7 @@ class SupportsEagle3(Protocol):
Args: Args:
layers: Tuple of layer indices that should output auxiliary layers: Tuple of layer indices that should output auxiliary
hidden states. hidden states.
""" """
... ...
......
...@@ -1520,15 +1520,9 @@ class BaseKeyeModule(nn.Module): ...@@ -1520,15 +1520,9 @@ class BaseKeyeModule(nn.Module):
batch. batch.
**NOTE**: If mrope is enabled (default setting for Qwen2-VL **NOTE**: If mrope is enabled (default setting for Qwen2-VL
opensource models), the shape will be `(3, seq_len)`, opensource models), the shape will be `(3, seq_len)`,
otherwise it will be `(seq_len,). otherwise it will be `(seq_len,)`.
pixel_values: Pixel values to be fed to a model. intermediate_tensors: Intermediate tensors from prior forward pass.
`None` if no images are passed. inputs_embeds: Optional tensor of input embeddings.
image_grid_thw: Tensor `(n_images, 3)` of image 3D grid in LLM.
`None` if no images are passed.
pixel_values_videos: Pixel values of videos to be fed to a model.
`None` if no videos are passed.
video_grid_thw: Tensor `(n_videos, 3)` of video 3D grid in LLM.
`None` if no videos are passed.
""" """
if intermediate_tensors is not None: if intermediate_tensors is not None:
inputs_embeds = None inputs_embeds = None
......
...@@ -58,17 +58,18 @@ def split_thw(grid_thw: torch.Tensor) -> torch.Tensor: ...@@ -58,17 +58,18 @@ def split_thw(grid_thw: torch.Tensor) -> torch.Tensor:
return torch.cat([ones, h_w], dim=1).repeat_interleave(t, dim=0) return torch.cat([ones, h_w], dim=1).repeat_interleave(t, dim=0)
def get_num_patches(grid_thw: torch.Tensor, num_frames: Union[list[int], def get_num_patches(grid_thw: torch.Tensor,
torch.Tensor]): num_frames: Union[list[int], torch.Tensor]) -> list[int]:
""" """
Return num_patches per video. Return num_patches per video.
Args: Args:
t: tensor with shape [N, ...] where each item is a list/tensor grid_thw: Tensor with shape [N, 3] containing temporal, height, width
cu_seqlens: list indicating the boundaries of groups dimensions
num_frames: List or tensor indicating the number of frames per video
Returns: Returns:
list of ints representing the sum of products for each group List of ints representing the number of patches for each video
Examples: Examples:
>>> # Suppose there are 2 videos with a total of 3 grids >>> # Suppose there are 2 videos with a total of 3 grids
......
...@@ -732,7 +732,9 @@ class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): ...@@ -732,7 +732,9 @@ class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
Args: Args:
input_ids: Flattened (concatenated) input_ids corresponding to a input_ids: Flattened (concatenated) input_ids corresponding to a
batch. batch.
pixel_values: The pixels in each input image. positions: Position indices for the input tokens.
intermediate_tensors: Intermediate tensors from prior forward pass.
inputs_embeds: Optional tensor of input embeddings.
Info: Info:
[LlavaImageInputs][] [LlavaImageInputs][]
......
...@@ -535,8 +535,9 @@ class LlavaNextForConditionalGeneration(nn.Module, SupportsMultiModal, ...@@ -535,8 +535,9 @@ class LlavaNextForConditionalGeneration(nn.Module, SupportsMultiModal,
Args: Args:
input_ids: Flattened (concatenated) input_ids corresponding to a input_ids: Flattened (concatenated) input_ids corresponding to a
batch. batch.
pixel_values: The pixels in each grid patch for each input image. positions: Position indices for the input tokens.
image_sizes: The original `(height, width)` for each input image. intermediate_tensors: Intermediate tensors from prior forward pass.
inputs_embeds: Optional tensor of input embeddings.
Info: Info:
[LlavaNextImageInputs][] [LlavaNextImageInputs][]
......
...@@ -578,7 +578,9 @@ class Mistral3ForConditionalGeneration(nn.Module, SupportsLoRA, ...@@ -578,7 +578,9 @@ class Mistral3ForConditionalGeneration(nn.Module, SupportsLoRA,
Args: Args:
input_ids: Flattened (concatenated) input_ids corresponding to a input_ids: Flattened (concatenated) input_ids corresponding to a
batch. batch.
pixel_values: The pixels in each input image. positions: Position indices for the input tokens.
intermediate_tensors: Intermediate tensors from prior forward pass.
inputs_embeds: Optional tensor of input embeddings.
Info: Info:
[Mistral3ImagePixelInputs][] [Mistral3ImagePixelInputs][]
......
...@@ -387,11 +387,10 @@ class Llama4VisionEncoder(nn.Module): ...@@ -387,11 +387,10 @@ class Llama4VisionEncoder(nn.Module):
) -> torch.Tensor: ) -> torch.Tensor:
r""" r"""
Args: Args:
inputs_embeds (`torch.FloatTensor` of shape hidden_states: Input tensor of shape
`(batch_size, sequence_length, hidden_size)`): (batch_size, sequence_length, hidden_size).
Optionally, instead of passing `input_ids` you can choose to Hidden states from the model embeddings, representing
directly pass an embedded representation. This is useful if you the input tokens.
want more control over how to convert `input_ids` indices into
associated vectors than the model's internal embedding associated vectors than the model's internal embedding
lookup matrix. lookup matrix.
""" """
......
...@@ -70,11 +70,15 @@ def multihead_attention( ...@@ -70,11 +70,15 @@ def multihead_attention(
v: torch.Tensor, v: torch.Tensor,
q_cu_seqlens: Optional[torch.Tensor] = None, q_cu_seqlens: Optional[torch.Tensor] = None,
k_cu_seqlens: Optional[torch.Tensor] = None, k_cu_seqlens: Optional[torch.Tensor] = None,
): ) -> torch.Tensor:
"""Multi-head attention using flash attention 2. """Multi-head attention using flash attention 2.
Args: Args:
q, k, v: tensor of shape (batch_size, seqlen, num_heads, head_dim), q: Query tensor of shape (batch_size, seqlen, num_heads, head_dim),
or (tot_seqlens, num_heads, head_dim) if packing.
k: Key tensor of shape (batch_size, seqlen, num_heads, head_dim),
or (tot_seqlens, num_heads, head_dim) if packing.
v: Value tensor of shape (batch_size, seqlen, num_heads, head_dim),
or (tot_seqlens, num_heads, head_dim) if packing. or (tot_seqlens, num_heads, head_dim) if packing.
q_cu_seqlens (torch.Tensor): cumulative sequence lengths of q. q_cu_seqlens (torch.Tensor): cumulative sequence lengths of q.
The first element should be 0 and the last element should be q.shape[0]. The first element should be 0 and the last element should be q.shape[0].
...@@ -123,8 +127,14 @@ def sdpa_attention( ...@@ -123,8 +127,14 @@ def sdpa_attention(
"""SDPA attention. """SDPA attention.
Args: Args:
q, k, v: tensor of shape (batch_size, seqlen, num_heads, head_dim), q: Query tensor of shape (batch_size, seqlen, num_heads, head_dim),
or (tot_seqlens, num_heads, head_dim) if packing.
k: Key tensor of shape (batch_size, seqlen, num_heads, head_dim),
or (tot_seqlens, num_heads, head_dim) if packing.
v: Value tensor of shape (batch_size, seqlen, num_heads, head_dim),
or (tot_seqlens, num_heads, head_dim) if packing. or (tot_seqlens, num_heads, head_dim) if packing.
q_cu_seqlens: Optional cumulative sequence lengths of q.
k_cu_seqlens: Optional cumulative sequence lengths of k.
""" """
seq_length = q.shape[0] seq_length = q.shape[0]
attention_mask = torch.zeros([1, seq_length, seq_length], attention_mask = torch.zeros([1, seq_length, seq_length],
...@@ -387,7 +397,7 @@ class MLP2(nn.Module): ...@@ -387,7 +397,7 @@ class MLP2(nn.Module):
def __init__(self, def __init__(self,
dims: list[int], dims: list[int],
activation, activation,
bias=True, bias: bool = True,
prefix: str = "", prefix: str = "",
use_data_parallel: bool = False): use_data_parallel: bool = False):
super().__init__() super().__init__()
......
...@@ -374,8 +374,8 @@ class Phi4MMAudioMeanVarianceNormLayer(nn.Module): ...@@ -374,8 +374,8 @@ class Phi4MMAudioMeanVarianceNormLayer(nn.Module):
Typically used as a very first layer in a model. Typically used as a very first layer in a model.
Args: Args:
input_size: int config: [Phi4MultimodalAudioConfig](https://huggingface.co/docs/transformers/model_doc/phi4_multimodal#transformers.Phi4MultimodalAudioConfig)
layer input size. object containing model parameters.
""" """
def __init__(self, config: Phi4MultimodalAudioConfig): def __init__(self, config: Phi4MultimodalAudioConfig):
......
...@@ -1372,15 +1372,9 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal, ...@@ -1372,15 +1372,9 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal,
batch. batch.
**NOTE**: If mrope is enabled (default setting for Qwen2-VL **NOTE**: If mrope is enabled (default setting for Qwen2-VL
opensource models), the shape will be `(3, seq_len)`, opensource models), the shape will be `(3, seq_len)`,
otherwise it will be `(seq_len,). otherwise it will be `(seq_len,)`.
pixel_values: Pixel values to be fed to a model. intermediate_tensors: Intermediate tensors from prior forward pass.
`None` if no images are passed. inputs_embeds: Optional tensor of input embeddings.
image_grid_thw: Tensor `(n_images, 3)` of image 3D grid in LLM.
`None` if no images are passed.
pixel_values_videos: Pixel values of videos to be fed to a model.
`None` if no videos are passed.
video_grid_thw: Tensor `(n_videos, 3)` of video 3D grid in LLM.
`None` if no videos are passed.
""" """
if intermediate_tensors is not None: if intermediate_tensors is not None:
......
...@@ -390,12 +390,9 @@ class Siglip2EncoderLayer(nn.Module): ...@@ -390,12 +390,9 @@ class Siglip2EncoderLayer(nn.Module):
position_embeddings: torch.Tensor) -> tuple[torch.FloatTensor]: position_embeddings: torch.Tensor) -> tuple[torch.FloatTensor]:
""" """
Args: Args:
hidden_states (`torch.FloatTensor`): hidden_states: Input tensor of shape (batch, seq_len, embed_dim).
Input to the layer of shape `(batch, seq_len, embed_dim)`. cu_seqlens: Cumulative sequence lengths tensor.
output_attentions (`bool`, *optional*, defaults to `False`): position_embeddings: Position embeddings tensor.
Whether or not to return the attentions tensors of all
attention layers. See `attentions` under
returned tensors for more detail.
""" """
residual = hidden_states residual = hidden_states
...@@ -534,19 +531,11 @@ class Siglip2Encoder(nn.Module): ...@@ -534,19 +531,11 @@ class Siglip2Encoder(nn.Module):
) -> torch.Tensor: ) -> torch.Tensor:
r""" r"""
Args: Args:
inputs_embeds (`torch.FloatTensor` of shape inputs_embeds: Input tensor of shape
`(batch_size, sequence_length, hidden_size)`): (batch_size, sequence_length, hidden_size).
Optionally, instead of passing `input_ids` you can choose to Embedded representation of the input tokens.
directly pass an embedded representation. This is useful if grid_thws: Grid tensor of shape (num_patches, 3)
you want more control over how to convert `input_ids` indices containing grid dimensions.
into associated vectors than the model's internal embedding
lookup matrix.
grid_thws (`torch.LongTensor`):
grid shape (num_patches, 3)
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See
`hidden_states` under returned tensors for more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of Whether or not to return a [`~utils.ModelOutput`] instead of
a plain tuple. a plain tuple.
""" """
......
...@@ -597,10 +597,11 @@ class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA): ...@@ -597,10 +597,11 @@ class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA):
with the `input_ids`. with the `input_ids`.
Args: Args:
audio_features: A batch of audio input chunks [B, N, 80, M]. input_ids: Flattened (concatenated) input_ids corresponding to a
audio_lens: Length of audio frames for each audio chunk [B]. batch.
audio_token_len: Length of audio tokens for each audio chunk [B']. positions: Position indices for the input tokens.
Note: batch dim is different from batch dim in audio chunks. intermediate_tensors: Intermediate tensors from prior forward pass.
inputs_embeds: Optional tensor of input embeddings.
""" """
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment