Unverified commit ccee371e, authored by Hyogeun Oh (오효근), committed by GitHub
Browse files

[Docs] Fix warnings in `mkdocs build` (continued) (#24092)


Signed-off-by: Zerohertz <ohg3417@gmail.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
parent c0bd6a68
......@@ -755,7 +755,7 @@ class FusedMoE(CustomOp):
intermediate_size: Intermediate size of the experts
params_dtype: Data type for the parameters.
reduce_results: Whether to all_reduce the output of the layer
renomalize: Whether to renormalize the logits in the fused_moe kernel
renormalize: Whether to renormalize the logits in the fused_moe kernel
quant_config: Quantization configuration.
enable_eplb: Whether to enable expert parallelism load balancer.
"""
......
......@@ -420,9 +420,8 @@ def shuffle_weights(
Args:
*tensors: Variable number of torch.Tensor objects.
layout: A pair of integers specifying the
block sizes used to divide the tensors during shuffling.
Default is (16, 16).
layout: A pair of integers specifying the block sizes used to divide
the tensors during shuffling. Default is (16, 16).
Returns:
A Tuple of shuffled tensors.
......
......@@ -10,7 +10,7 @@ like uniform random routing.
"""
from abc import ABC, abstractmethod
from typing import Optional
from typing import Any, Optional
import torch
......@@ -50,7 +50,9 @@ class DistributionBasedRouting(RoutingStrategy):
distributions for testing different routing patterns.
"""
def __init__(self, distribution: str = "uniform", **distribution_params):
def __init__(self,
distribution: str = "uniform",
**distribution_params: Any):
"""
Initialize distribution-based routing.
......@@ -244,7 +246,7 @@ class RoutingSimulator:
cls._routing_strategies[name] = strategy
@classmethod
def get_available_strategies(cls):
def get_available_strategies(cls) -> list[str]:
"""
Get list of available routing strategy names.
......
......@@ -202,7 +202,7 @@ class BitBLASLinearMethod(LinearMethodBase):
output_size: int,
params_dtype: torch.dtype,
**extra_weight_attrs,
):
) -> None:
"""Creates quantized weights for use in linear operations.
The function initializes and returns a dictionary containing quantized
......@@ -211,7 +211,7 @@ class BitBLASLinearMethod(LinearMethodBase):
Args:
input_size_per_partition: The size of the input partition.
output_size_per_partition: The size of the output partition.
output_partition_sizes: List of output partition sizes.
input_size: The total size of the input (unused).
output_size: The total size of the output (unused).
params_dtype:
......@@ -222,9 +222,9 @@ class BitBLASLinearMethod(LinearMethodBase):
scales ('scales'), and zeros ('zeros').
Raises:
ValueError: If `params_dtype` is not `torch.float16` or if the
input size per partition is not divisible by the group size in
`quant_config`.
ValueError: If `params_dtype` is not `torch.float16` or if the input
size per partition is not divisible by the group size
in `quant_config`.
"""
del input_size, output_size # Unused arguments.
weight_loader = extra_weight_attrs["weight_loader"]
......
......@@ -265,9 +265,9 @@ class GPTQBitBLASLinearMethod(LinearMethodBase):
scales ('scales'), and zeros ('zeros').
Raises:
ValueError: If `params_dtype` is not `torch.float16` or
if the input size per partition is not divisible by the
group size in `quant_config`.
ValueError: If `params_dtype` is not `torch.float16` or if the input
size per partition is not divisible by the group size
in `quant_config`.
"""
if params_dtype != torch.float16:
raise ValueError("Parameter data type must be torch.float16, "
......
......@@ -49,8 +49,8 @@ def choose_mp_linear_kernel(
config (MPLinearLayerConfig): Description of the linear layer to be
implemented.
compute_capability (Optional[int], optional): The compute capability of
the target device, if None uses `current_platform` to get the compute
capability. Defaults to None.
the target device, if None uses `current_platform` to get
the compute capability. Defaults to None.
Raises:
ValueError: If no kernel can implement the given config.
......
......@@ -7,7 +7,7 @@
#!/usr/bin/env python3
import abc
import math
from typing import Literal, Optional
from typing import Any, Literal, Optional, Union
import numpy as np
import torch
......@@ -131,31 +131,31 @@ class ConformerEncoderLayer(nn.Module):
def __init__(
self,
d_model=512,
ext_pw_out_channel=0,
depthwise_seperable_out_channel=256,
depthwise_multiplier=1,
n_head=4,
d_ffn=2048,
ext_pw_kernel_size=1,
kernel_size=3,
dropout_rate=0.1,
causal=False,
batch_norm=False,
activation="relu",
chunk_se=0,
chunk_size=18,
conv_activation="relu",
conv_glu_type="sigmoid",
bias_in_glu=True,
linear_glu_in_convm=False,
attention_inner_dim=-1,
attention_glu_type="swish",
activation_checkpointing="",
export=False,
use_pt_scaled_dot_product_attention=False,
d_model: int = 512,
ext_pw_out_channel: int = 0,
depthwise_seperable_out_channel: int = 256,
depthwise_multiplier: int = 1,
n_head: int = 4,
d_ffn: int = 2048,
ext_pw_kernel_size: int = 1,
kernel_size: int = 3,
dropout_rate: float = 0.1,
causal: bool = False,
batch_norm: bool = False,
activation: str = "relu",
chunk_se: int = 0,
chunk_size: int = 18,
conv_activation: str = "relu",
conv_glu_type: str = "sigmoid",
bias_in_glu: bool = True,
linear_glu_in_convm: bool = False,
attention_inner_dim: int = -1,
attention_glu_type: str = "swish",
activation_checkpointing: str = "",
export: bool = False,
use_pt_scaled_dot_product_attention: bool = False,
attn_group_sizes: int = 1,
):
) -> None:
super().__init__()
self.feed_forward_in = FeedForward(
......@@ -209,24 +209,21 @@ class ConformerEncoderLayer(nn.Module):
def forward(
self,
x,
pos_k,
pos_v,
mask,
x: torch.Tensor,
pos_k: torch.Tensor,
pos_v: torch.Tensor,
mask: torch.Tensor,
relative_attention_bias: Optional[Tensor] = None,
):
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
"""ConformerEncoder forward.
Args:
x: torch.Tensor
input feature of shape (batch, max_time_in, size)
pos_k: torch.Tensor
positional key embedding.
mask: torch.Tensor
mask for x (batch, max_time_in)
relative_attention_bias: Optional[torch.Tensor]
bias added to attention logits w.r.t. relative positions
(1, n_head, time1, time2)
x: input feature of shape (batch, max_time_in, size)
pos_k: positional key embedding.
pos_v: positional value embedding.
mask: mask for x (batch, max_time_in)
relative_attention_bias: bias added to attention logits w.r.t.
relative positions (1, n_head, time1, time2)
"""
x = x + 0.5 * self.feed_forward_in(x)
norm_x = self.layer_norm_att(x)
......@@ -323,25 +320,25 @@ class TransformerEncoderBase(abc.ABC, nn.Module):
def __init__(
self,
input_size,
chunk_size,
left_chunk,
attention_dim=256,
attention_heads=4,
input_layer="nemo_conv",
cnn_out=-1,
cnn_layer_norm=False,
time_reduction=4,
dropout_rate=0.0,
padding_idx=-1,
relative_attention_bias_args=None,
positional_dropout_rate=0.0,
nemo_conv_settings=None,
input_size: int,
chunk_size: Union[int, list[int]],
left_chunk: Union[int, list[int]],
attention_dim: int = 256,
attention_heads: int = 4,
input_layer: str = "nemo_conv",
cnn_out: int = -1,
cnn_layer_norm: bool = False,
time_reduction: int = 4,
dropout_rate: float = 0.0,
padding_idx: int = -1,
relative_attention_bias_args: Optional[dict[str, Any]] = None,
positional_dropout_rate: float = 0.0,
nemo_conv_settings: Optional[dict[str, Any]] = None,
conv2d_extra_padding: Literal["feat", "feat_time", "none",
True] = "none",
attention_group_size=1,
encoder_embedding_config=None,
):
attention_group_size: int = 1,
encoder_embedding_config: Optional[dict[str, Any]] = None,
) -> None:
super().__init__()
self.input_size = input_size
self.input_layer = input_layer
......@@ -399,7 +396,10 @@ class TransformerEncoderBase(abc.ABC, nn.Module):
self.encoder_embedding = MeanVarianceNormLayer(
self.encoder_embedding_config["input_size"])
def compute_lens_change(self, feature_lens):
def compute_lens_change(
self,
feature_lens: Union[int,
torch.Tensor]) -> Union[int, torch.Tensor]:
"""feature_lens: int
return updated feature lens.
......@@ -433,10 +433,14 @@ class TransformerEncoderBase(abc.ABC, nn.Module):
return ceil_func(feature_lens / self.time_reduction)
@abc.abstractmethod
def forward(self):
def forward(self) -> Any:
"""Abstract forward method implementation."""
def _chunk_size_selection(self, chunk_size=None, left_chunk=None):
def _chunk_size_selection(
self,
chunk_size: Optional[Union[int, list[int]]] = None,
left_chunk: Optional[Union[int,
list[int]]] = None) -> tuple[int, int]:
"""If chunk size is a list, we will randomly select a chunk size."""
if chunk_size is None:
......@@ -463,7 +467,7 @@ class TransformerEncoderBase(abc.ABC, nn.Module):
return chunk_size_train_eff, left_chunk_train_eff
def _get_embed_class(self, embed):
def _get_embed_class(self, embed: nn.Module) -> nn.Module:
# pylint: disable=protected-access
is_embed_using_act_chkpt = isinstance(embed, CheckpointWrapper)
is_embed_fsdp_wrapped = isinstance(embed, FullyShardedDataParallel)
......@@ -474,13 +478,17 @@ class TransformerEncoderBase(abc.ABC, nn.Module):
embed_class = embed.module
return embed_class
def _forward_embeddings_core(self, input_tensor, masks):
def _forward_embeddings_core(
self, input_tensor: torch.Tensor,
masks: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
embed_class = self._get_embed_class(self.embed)
assert isinstance(embed_class, NemoConvSubsampling)
input_tensor, masks = self.embed(input_tensor, masks)
return input_tensor, masks
def _position_embedding(self, input_tensor):
def _position_embedding(
self, input_tensor: torch.Tensor
) -> tuple[Optional[torch.Tensor], Optional[torch.Tensor]]:
pos_k = None
pos_v = None
if self.relative_attention_bias_layer is None:
......@@ -488,7 +496,9 @@ class TransformerEncoderBase(abc.ABC, nn.Module):
input_tensor) # default to add abs sinusoid embedding
return pos_k, pos_v
def _streaming_mask(self, seq_len, batch_size, chunk_size, left_chunk):
def _streaming_mask(self, seq_len: int, batch_size: int,
chunk_size: Union[int, list[int]],
left_chunk: Union[int, list[int]]) -> torch.Tensor:
chunk_size_train_eff, left_chunk_train_eff = \
self._chunk_size_selection(chunk_size, left_chunk)
......@@ -502,11 +512,17 @@ class TransformerEncoderBase(abc.ABC, nn.Module):
[batch_size, -1, -1]))
return enc_streaming_mask
def forward_embeddings(self,
xs_pad,
masks,
chunk_size_nc=None,
left_chunk_nc=None):
def forward_embeddings(
self,
xs_pad: torch.Tensor,
masks: torch.Tensor,
chunk_size_nc: Optional[Union[int, list[int]]] = None,
left_chunk_nc: Optional[Union[int, list[int]]] = None
) -> Union[tuple[torch.Tensor, Optional[torch.Tensor],
Optional[torch.Tensor], torch.Tensor, torch.Tensor],
tuple[torch.Tensor, Optional[torch.Tensor],
Optional[torch.Tensor], torch.Tensor, torch.Tensor,
torch.Tensor]]:
"""Forwarding the inputs through the top embedding layers
Args:
......@@ -569,7 +585,7 @@ class TransformerEncoderBase(abc.ABC, nn.Module):
return input_tensor, pos_k, pos_v, hs_mask, masks
return input_tensor, pos_k, pos_v, hs_mask, masks, hs_mask_nc
def get_offset(self):
def get_offset(self) -> int:
"""Returns offset used when retaining inputs for decoding.
This is essentially how many additional frames have to be added to
......@@ -605,8 +621,6 @@ class ConformerEncoder(TransformerEncoderBase):
Some examples for the 2 cases:
left_chunk = 6
left_chunk = [12, 9, 6, 3]
left_chunk: int
number of chunks used for masking in streaming mode.
num_lang: int
This parameter is used to store the number of languages in the
lang_dict, only used for multiseed/multilingual models.
......@@ -751,46 +765,46 @@ class ConformerEncoder(TransformerEncoderBase):
def __init__( # pylint: disable-all
self,
input_size,
chunk_size,
left_chunk,
num_lang=None,
attention_dim=256,
attention_heads=4,
linear_units=2048,
num_blocks=6,
dropout_rate=0.1,
input_layer="nemo_conv",
causal=True,
batch_norm=False,
cnn_out=-1,
cnn_layer_norm=False,
ext_pw_out_channel=0,
ext_pw_kernel_size=1,
depthwise_seperable_out_channel=256,
depthwise_multiplier=1,
chunk_se=0,
kernel_size=3,
activation="relu",
conv_activation="relu",
conv_glu_type="sigmoid",
bias_in_glu=True,
linear_glu_in_convm=False,
attention_glu_type="swish",
export=False,
extra_layer_output_idx=-1,
extra_multi_layer_output_idxs=[], # noqa
activation_checkpointing="",
relative_attention_bias_args=None,
time_reduction=4,
use_pt_scaled_dot_product_attention=False,
nemo_conv_settings=None,
input_size: int,
chunk_size: Union[int, list[int]],
left_chunk: Union[int, list[int]],
num_lang: Optional[int] = None,
attention_dim: int = 256,
attention_heads: int = 4,
linear_units: int = 2048,
num_blocks: int = 6,
dropout_rate: float = 0.1,
input_layer: str = "nemo_conv",
causal: bool = True,
batch_norm: bool = False,
cnn_out: int = -1,
cnn_layer_norm: bool = False,
ext_pw_out_channel: int = 0,
ext_pw_kernel_size: int = 1,
depthwise_seperable_out_channel: int = 256,
depthwise_multiplier: int = 1,
chunk_se: int = 0,
kernel_size: int = 3,
activation: str = "relu",
conv_activation: str = "relu",
conv_glu_type: str = "sigmoid",
bias_in_glu: bool = True,
linear_glu_in_convm: bool = False,
attention_glu_type: str = "swish",
export: bool = False,
extra_layer_output_idx: int = -1,
extra_multi_layer_output_idxs: list[int] = [], # noqa
activation_checkpointing: str = "",
relative_attention_bias_args: Optional[dict[str, Any]] = None,
time_reduction: int = 4,
use_pt_scaled_dot_product_attention: bool = False,
nemo_conv_settings: Optional[dict[str, Any]] = None,
conv2d_extra_padding: Literal["feat", "feat_time", "none",
True] = "none",
replication_pad_for_subsample_embedding=False,
attention_group_size=1,
encoder_embedding_config=None,
):
replication_pad_for_subsample_embedding: bool = False,
attention_group_size: int = 1,
encoder_embedding_config: Optional[dict[str, Any]] = None,
) -> None:
super().__init__(
input_size,
chunk_size,
......@@ -852,11 +866,13 @@ class ConformerEncoder(TransformerEncoderBase):
# the device and the needed dtype:
self.register_buffer("dev_type", torch.zeros(()), persistent=False)
def init_relative_attention_bias(self, input_tensor):
def init_relative_attention_bias(
self, input_tensor: torch.Tensor) -> Optional[torch.Tensor]:
if self.relative_attention_bias_layer:
return self.relative_attention_bias_layer(input_tensor)
def calculate_hs_mask(self, xs_pad, device, mask):
def calculate_hs_mask(self, xs_pad: torch.Tensor, device: torch.device,
mask: Optional[torch.Tensor]) -> torch.Tensor:
max_audio_length = xs_pad.shape[1]
batch_size = xs_pad.shape[0]
enc_streaming_mask = self._streaming_mask(max_audio_length, batch_size,
......@@ -877,7 +893,8 @@ class ConformerEncoder(TransformerEncoderBase):
return pad_mask
@torch.jit.ignore
def forward(self, xs_pad, masks):
def forward(self, xs_pad: torch.Tensor,
masks: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
"""Conformer Forward function
Args:
......@@ -997,7 +1014,12 @@ class WindowQformer(nn.Module):
if normalize_before else None)
self.window_size = window_size
def forward(self, audio_embed, mask, embed_len=None):
def forward(
self,
audio_embed: torch.Tensor,
mask: Optional[torch.Tensor],
embed_len: Optional[int] = None
) -> tuple[torch.Tensor, Optional[int]]:
"""forward decoder"""
# audio_embed: N x T x D => N x D x T
......@@ -1042,7 +1064,7 @@ class WindowQformer(nn.Module):
class AudioEmbedding(nn.Module):
"""Audio embedding."""
def __init__(self, config: PretrainedConfig, **kwargs) -> None:
def __init__(self, config: PretrainedConfig, **kwargs: Any) -> None:
super().__init__()
self.config = config
# n_embed or hidden_size for text LM
......@@ -1148,19 +1170,18 @@ class AudioEmbedding(nn.Module):
self.input_embeds = None
self.audio_embed_sizes = None
def set_audio_embeds(self, input_embeds: torch.FloatTensor) -> None:
def set_audio_embeds(self, input_embeds: torch.Tensor) -> None:
self.input_embeds = input_embeds
def set_audio_embed_sizes(self,
audio_embed_sizes: torch.LongTensor) -> None:
def set_audio_embed_sizes(self, audio_embed_sizes: torch.Tensor) -> None:
self.audio_embed_sizes = audio_embed_sizes
def get_audio_features(
self,
input_embeds: torch.FloatTensor,
audio_attention_mask: torch.Tensor = None,
input_embeds: torch.Tensor,
audio_attention_mask: Optional[torch.Tensor] = None,
audio_projection_mode: str = "speech",
) -> torch.FloatTensor:
) -> torch.Tensor:
"""
arguments:
input_embeds: audio features (B, T, D) B: num audios in a sequence
......@@ -1214,10 +1235,10 @@ class AudioEmbedding(nn.Module):
def forward(
self,
audio_features: torch.FloatTensor,
audio_attention_mask: torch.Tensor = None,
audio_features: torch.Tensor,
audio_attention_mask: Optional[torch.Tensor] = None,
audio_projection_mode: str = "speech",
) -> torch.FloatTensor:
) -> torch.Tensor:
"""
arguments:
audio_features: audio features (T, D)
......
This diff is collapsed.
......@@ -1193,21 +1193,9 @@ class Qwen2_5_VLForConditionalGeneration(nn.Module, SupportsMultiModal,
input_ids: Flattened (concatenated) input_ids corresponding to a
batch.
positions: Flattened (concatenated) position ids corresponding to a
batch.
**NOTE**: If mrope is enabled (default setting for Qwen2.5-VL
opensource models), the shape will be `(3, seq_len)`,
batch. **NOTE**: If mrope is enabled (default setting for
Qwen2.5-VL opensource models), the shape will be `(3, seq_len)`,
otherwise it will be `(seq_len,)`.
pixel_values: Pixel values to be fed to a model.
`None` if no images are passed.
image_grid_thw: Tensor `(n_images, 3)` of image 3D grid in LLM.
`None` if no images are passed.
pixel_values_videos: Pixel values of videos to be fed to a model.
`None` if no videos are passed.
video_grid_thw: Tensor `(n_videos, 3)` of video 3D grid in LLM.
`None` if no videos are passed.
second_per_grid_ts: Tensor `(num_videos)` of video time interval (
in seconds) for each grid along the temporal dimension in the
3D position IDs. `None` if no videos are passed.
"""
if intermediate_tensors is not None:
......
......@@ -9,7 +9,7 @@ model alternates between state space model layers and attention-based layers.
"""
from collections.abc import Iterable
from itertools import cycle
from typing import Optional, Union
from typing import Any, Optional, Union
import torch
from torch import nn
......@@ -528,8 +528,6 @@ class Zamba2MambaDecoderLayer(nn.Module):
hidden_states: Input tensor [batch_size, seq_len, hidden_size]
mamba_cache_params: Parameters for Mamba's state caches
(one for conv, one for ssm)
sequence_idx: Index tensor for identifying sequences in batch
Required for proper chunked processing in prefill
transformer_hidden_states: Optional output from transformer path
Added to input if provided (used in hybrid architecture)
positions: Optional position IDs (unused in Mamba)
......@@ -591,8 +589,6 @@ class Zamba2HybridLayer(nn.Module):
Args:
shared_transformer: Transformer decoder layer for attention pathway
linear: Linear projection for transformer output before Mamba
mamba: Mamba decoder layer for state space pathway
"""
super().__init__()
self.block_idx = block_idx
......@@ -630,8 +626,6 @@ class Zamba2HybridLayer(nn.Module):
positions: Position IDs for positional embeddings
mamba_cache_params: Parameters for Mamba's state caches
(one for conv, one for ssm)
sequence_idx: Indices for identifying sequences in batch,
required for proper chunked processing in prefill
Returns:
Output tensor combining transformer and Mamba representations
......@@ -915,8 +909,8 @@ class Zamba2ForCausalLM(nn.Module, HasInnerState, IsHybrid):
prefix: Optional prefix for parameter names
Raises:
AssertionError: If prefix caching is enabled (not supported by
Mamba)
AssertionError: If prefix caching is enabled
(not supported by Mamba)
"""
config = vllm_config.model_config.hf_config
cache_config = vllm_config.cache_config
......@@ -971,7 +965,7 @@ class Zamba2ForCausalLM(nn.Module, HasInnerState, IsHybrid):
input_ids: torch.Tensor,
positions: torch.Tensor,
inputs_embeds: Optional[torch.Tensor] = None,
**kwargs) -> torch.Tensor:
**kwargs: Any) -> torch.Tensor:
"""Forward pass through the model.
Args:
......@@ -1012,9 +1006,9 @@ class Zamba2ForCausalLM(nn.Module, HasInnerState, IsHybrid):
return hidden_states
def copy_inputs_before_cuda_graphs(self, input_buffers: dict[str,
torch.Tensor],
**kwargs) -> dict[str, torch.Tensor]:
def copy_inputs_before_cuda_graphs(
self, input_buffers: dict[str, torch.Tensor],
**kwargs: Any) -> dict[str, torch.Tensor]:
"""Copy inputs before CUDA graph capture.
Args:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment