Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
9a8966bc
Unverified
Commit
9a8966bc
authored
Sep 13, 2025
by
Hyogeun Oh (오효근)
Committed by
GitHub
Sep 13, 2025
Browse files
[Docs] Fix warnings in mkdocs build (continued) (#24791)
Signed-off-by:
Zerohertz
<
ohg3417@gmail.com
>
parent
5febdc87
Changes
27
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
81 additions
and
83 deletions
+81
-83
vllm/distributed/eplb/eplb_state.py
vllm/distributed/eplb/eplb_state.py
+4
-4
vllm/distributed/eplb/rebalance_algo.py
vllm/distributed/eplb/rebalance_algo.py
+3
-3
vllm/distributed/kv_transfer/kv_connector/v1/base.py
vllm/distributed/kv_transfer/kv_connector/v1/base.py
+3
-2
vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py
...tributed/kv_transfer/kv_connector/v1/lmcache_connector.py
+3
-2
vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py
...ted/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py
+4
-3
vllm/distributed/kv_transfer/kv_connector/v1/p2p/tensor_memory_pool.py
...ted/kv_transfer/kv_connector/v1/p2p/tensor_memory_pool.py
+3
-2
vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py
...d/kv_transfer/kv_connector/v1/shared_storage_connector.py
+4
-3
vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py
vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py
+2
-2
vllm/model_executor/models/interfaces.py
vllm/model_executor/models/interfaces.py
+1
-1
vllm/model_executor/models/keye.py
vllm/model_executor/models/keye.py
+3
-9
vllm/model_executor/models/keye_vl1_5.py
vllm/model_executor/models/keye_vl1_5.py
+6
-5
vllm/model_executor/models/llava.py
vllm/model_executor/models/llava.py
+3
-1
vllm/model_executor/models/llava_next.py
vllm/model_executor/models/llava_next.py
+3
-2
vllm/model_executor/models/mistral3.py
vllm/model_executor/models/mistral3.py
+3
-1
vllm/model_executor/models/mllama4.py
vllm/model_executor/models/mllama4.py
+4
-5
vllm/model_executor/models/moonvit.py
vllm/model_executor/models/moonvit.py
+14
-4
vllm/model_executor/models/phi4_multimodal.py
vllm/model_executor/models/phi4_multimodal.py
+2
-2
vllm/model_executor/models/qwen2_vl.py
vllm/model_executor/models/qwen2_vl.py
+3
-9
vllm/model_executor/models/siglip2navit.py
vllm/model_executor/models/siglip2navit.py
+8
-19
vllm/model_executor/models/ultravox.py
vllm/model_executor/models/ultravox.py
+5
-4
No files found.
vllm/distributed/eplb/eplb_state.py
View file @
9a8966bc
...
...
@@ -337,11 +337,11 @@ class EplbState:
Args:
model (MixtureOfExperts): The MoE model.
is_dummy (bool): If `True`, this is a dummy step and the load
metrics recorded in this forward pass will not count. Defaults
to `False`.
metrics recorded in this forward pass will not count. Defaults
to `False`.
is_profile (bool): If `True`, perform a dummy rearrangement
with maximum communication cost. This is used in `profile_run`
to reserve enough memory for the communication buffer.
with maximum communication cost. This is used in `profile_run`
to reserve enough memory for the communication buffer.
log_stats (bool): If `True`, log the expert load metrics.
# Stats
...
...
vllm/distributed/eplb/rebalance_algo.py
View file @
9a8966bc
...
...
@@ -102,14 +102,14 @@ def rebalance_experts_hierarchical(
num_groups
:
int
,
num_nodes
:
int
,
num_gpus
:
int
,
):
)
->
tuple
[
torch
.
Tensor
,
torch
.
Tensor
,
torch
.
Tensor
]
:
"""
Parameters:
weight: [num_moe_layers, num_logical_experts]
num_physical_experts: number of physical experts after replication
num_groups: number of expert groups
num_nodes: number of server nodes, where the intra-node network
(e.g., NVLink) is faster
num_nodes: number of server nodes, where the intra-node network
(e.g., NVLink) is faster
num_gpus: number of GPUs, must be a multiple of `num_nodes`
Returns:
...
...
vllm/distributed/kv_transfer/kv_connector/v1/base.py
View file @
9a8966bc
...
...
@@ -149,7 +149,7 @@ class KVConnectorBase_V1(ABC):
@
abstractmethod
def
start_load_kv
(
self
,
forward_context
:
"ForwardContext"
,
**
kwargs
)
->
None
:
**
kwargs
:
Any
)
->
None
:
"""
Start loading the KV cache from the connector to vLLM's paged
KV buffer. This is called from the forward context before the
...
...
@@ -182,7 +182,8 @@ class KVConnectorBase_V1(ABC):
@
abstractmethod
def
save_kv_layer
(
self
,
layer_name
:
str
,
kv_layer
:
torch
.
Tensor
,
attn_metadata
:
"AttentionMetadata"
,
**
kwargs
)
->
None
:
attn_metadata
:
"AttentionMetadata"
,
**
kwargs
:
Any
)
->
None
:
"""
Start saving a layer of KV cache from vLLM's paged buffer
to the connector. This is called from within attention layer to
...
...
vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py
View file @
9a8966bc
...
...
@@ -30,7 +30,7 @@ class LMCacheConnectorV1(KVConnectorBase_V1):
# Worker-side methods
# ==============================
def
start_load_kv
(
self
,
forward_context
:
"ForwardContext"
,
**
kwargs
)
->
None
:
**
kwargs
:
Any
)
->
None
:
"""
Start loading the KV cache from the connector to vLLM's paged
KV buffer. This is called from the forward context before the
...
...
@@ -61,7 +61,8 @@ class LMCacheConnectorV1(KVConnectorBase_V1):
self
.
_lmcache_engine
.
wait_for_layer_load
(
layer_name
)
def
save_kv_layer
(
self
,
layer_name
:
str
,
kv_layer
:
torch
.
Tensor
,
attn_metadata
:
"AttentionMetadata"
,
**
kwargs
)
->
None
:
attn_metadata
:
"AttentionMetadata"
,
**
kwargs
:
Any
)
->
None
:
"""
Start saving a layer of KV cache from vLLM's paged buffer
to the connector. This is called from within attention layer to
...
...
vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py
View file @
9a8966bc
...
...
@@ -91,7 +91,7 @@ class P2pNcclConnector(KVConnectorBase_V1):
# ==============================
def
start_load_kv
(
self
,
forward_context
:
"ForwardContext"
,
**
kwargs
)
->
None
:
**
kwargs
:
Any
)
->
None
:
"""Start loading the KV cache from the connector buffer to vLLM's
paged KV buffer.
...
...
@@ -212,7 +212,8 @@ class P2pNcclConnector(KVConnectorBase_V1):
return
def
save_kv_layer
(
self
,
layer_name
:
str
,
kv_layer
:
torch
.
Tensor
,
attn_metadata
:
"AttentionMetadata"
,
**
kwargs
)
->
None
:
attn_metadata
:
"AttentionMetadata"
,
**
kwargs
:
Any
)
->
None
:
"""Start saving the KV cache of the layer from vLLM's paged buffer
to the connector.
...
...
@@ -278,7 +279,7 @@ class P2pNcclConnector(KVConnectorBase_V1):
def
get_finished
(
self
,
finished_req_ids
:
set
[
str
],
**
kwargs
)
->
tuple
[
Optional
[
set
[
str
]],
Optional
[
set
[
str
]]]:
**
kwargs
:
Any
)
->
tuple
[
Optional
[
set
[
str
]],
Optional
[
set
[
str
]]]:
"""
Notifies worker-side connector ids of requests that have
finished generating tokens.
...
...
vllm/distributed/kv_transfer/kv_connector/v1/p2p/tensor_memory_pool.py
View file @
9a8966bc
...
...
@@ -218,8 +218,9 @@ class TensorMemoryPool:
return
addr
def
load_tensor
(
self
,
addr
:
int
,
dtype
:
torch
.
dtype
,
shape
:
tuple
[
int
,
...],
device
)
->
torch
.
Tensor
:
def
load_tensor
(
self
,
addr
:
int
,
dtype
:
torch
.
dtype
,
shape
:
tuple
[
int
,
...],
device
:
torch
.
device
)
->
torch
.
Tensor
:
"""Loads a tensor from pinned host memory to the specified device.
Args:
...
...
vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py
View file @
9a8966bc
...
...
@@ -3,7 +3,7 @@
import
hashlib
import
os
from
dataclasses
import
dataclass
from
typing
import
TYPE_CHECKING
,
Optional
from
typing
import
TYPE_CHECKING
,
Any
,
Optional
import
safetensors
import
torch
...
...
@@ -90,7 +90,7 @@ class SharedStorageConnector(KVConnectorBase_V1):
logger
.
info
(
"Shared storage path is %s"
,
self
.
_storage_path
)
def
start_load_kv
(
self
,
forward_context
:
"ForwardContext"
,
**
kwargs
)
->
None
:
**
kwargs
:
Any
)
->
None
:
"""Start loading the KV cache from the connector buffer to vLLM's
paged KV buffer.
...
...
@@ -191,7 +191,8 @@ class SharedStorageConnector(KVConnectorBase_V1):
return
def
save_kv_layer
(
self
,
layer_name
:
str
,
kv_layer
:
torch
.
Tensor
,
attn_metadata
:
"AttentionMetadata"
,
**
kwargs
)
->
None
:
attn_metadata
:
"AttentionMetadata"
,
**
kwargs
:
Any
)
->
None
:
"""Start saving the KV cache of the layer from vLLM's paged buffer
to the connector.
...
...
vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py
View file @
9a8966bc
...
...
@@ -251,8 +251,8 @@ class PyNcclPipe(KVPipeBase):
"""
Receives a tensor and its metadata from the source rank. Blocking call.
Args:
tensor: The received tensor, or `None` if no tensor is received.
Returns:
The received tensor, or `None` if no tensor is received.
"""
if
self
.
transport_thread
is
None
:
self
.
transport_thread
=
ThreadPoolExecutor
(
max_workers
=
1
)
...
...
vllm/model_executor/models/interfaces.py
View file @
9a8966bc
...
...
@@ -823,7 +823,7 @@ class SupportsEagle3(Protocol):
Args:
layers: Tuple of layer indices that should output auxiliary
hidden states.
hidden states.
"""
...
...
...
vllm/model_executor/models/keye.py
View file @
9a8966bc
...
...
@@ -1520,15 +1520,9 @@ class BaseKeyeModule(nn.Module):
batch.
**NOTE**: If mrope is enabled (default setting for Qwen2-VL
opensource models), the shape will be `(3, seq_len)`,
otherwise it will be `(seq_len,).
pixel_values: Pixel values to be fed to a model.
`None` if no images are passed.
image_grid_thw: Tensor `(n_images, 3)` of image 3D grid in LLM.
`None` if no images are passed.
pixel_values_videos: Pixel values of videos to be fed to a model.
`None` if no videos are passed.
video_grid_thw: Tensor `(n_videos, 3)` of video 3D grid in LLM.
`None` if no videos are passed.
otherwise it will be `(seq_len,)`.
intermediate_tensors: Intermediate tensors from prior forward pass.
inputs_embeds: Optional tensor of input embeddings.
"""
if
intermediate_tensors
is
not
None
:
inputs_embeds
=
None
...
...
vllm/model_executor/models/keye_vl1_5.py
View file @
9a8966bc
...
...
@@ -58,17 +58,18 @@ def split_thw(grid_thw: torch.Tensor) -> torch.Tensor:
return
torch
.
cat
([
ones
,
h_w
],
dim
=
1
).
repeat_interleave
(
t
,
dim
=
0
)
def
get_num_patches
(
grid_thw
:
torch
.
Tensor
,
num_frames
:
Union
[
list
[
int
],
torch
.
Tensor
]):
def
get_num_patches
(
grid_thw
:
torch
.
Tensor
,
num_frames
:
Union
[
list
[
int
],
torch
.
Tensor
])
->
list
[
int
]
:
"""
Return num_patches per video.
Args:
t: tensor with shape [N, ...] where each item is a list/tensor
cu_seqlens: list indicating the boundaries of groups
grid_thw: Tensor with shape [N, 3] containing temporal, height, width
dimensions
num_frames: List or tensor indicating the number of frames per video
Returns:
list of ints representing the sum of products for each group
List of ints representing the number of patches for each video
Examples:
>>> # Suppose there are 2 videos with a total of 3 grids
...
...
vllm/model_executor/models/llava.py
View file @
9a8966bc
...
...
@@ -732,7 +732,9 @@ class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
Args:
input_ids: Flattened (concatenated) input_ids corresponding to a
batch.
pixel_values: The pixels in each input image.
positions: Position indices for the input tokens.
intermediate_tensors: Intermediate tensors from prior forward pass.
inputs_embeds: Optional tensor of input embeddings.
Info:
[LlavaImageInputs][]
...
...
vllm/model_executor/models/llava_next.py
View file @
9a8966bc
...
...
@@ -535,8 +535,9 @@ class LlavaNextForConditionalGeneration(nn.Module, SupportsMultiModal,
Args:
input_ids: Flattened (concatenated) input_ids corresponding to a
batch.
pixel_values: The pixels in each grid patch for each input image.
image_sizes: The original `(height, width)` for each input image.
positions: Position indices for the input tokens.
intermediate_tensors: Intermediate tensors from prior forward pass.
inputs_embeds: Optional tensor of input embeddings.
Info:
[LlavaNextImageInputs][]
...
...
vllm/model_executor/models/mistral3.py
View file @
9a8966bc
...
...
@@ -578,7 +578,9 @@ class Mistral3ForConditionalGeneration(nn.Module, SupportsLoRA,
Args:
input_ids: Flattened (concatenated) input_ids corresponding to a
batch.
pixel_values: The pixels in each input image.
positions: Position indices for the input tokens.
intermediate_tensors: Intermediate tensors from prior forward pass.
inputs_embeds: Optional tensor of input embeddings.
Info:
[Mistral3ImagePixelInputs][]
...
...
vllm/model_executor/models/mllama4.py
View file @
9a8966bc
...
...
@@ -387,11 +387,10 @@ class Llama4VisionEncoder(nn.Module):
)
->
torch
.
Tensor
:
r
"""
Args:
inputs_embeds (`torch.FloatTensor` of shape
`(batch_size, sequence_length, hidden_size)`):
Optionally, instead of passing `input_ids` you can choose to
directly pass an embedded representation. This is useful if you
want more control over how to convert `input_ids` indices into
hidden_states: Input tensor of shape
(batch_size, sequence_length, hidden_size).
Hidden states from the model embeddings, representing
the input tokens.
associated vectors than the model's internal embedding
lookup matrix.
"""
...
...
vllm/model_executor/models/moonvit.py
View file @
9a8966bc
...
...
@@ -70,11 +70,15 @@ def multihead_attention(
v
:
torch
.
Tensor
,
q_cu_seqlens
:
Optional
[
torch
.
Tensor
]
=
None
,
k_cu_seqlens
:
Optional
[
torch
.
Tensor
]
=
None
,
):
)
->
torch
.
Tensor
:
"""Multi-head attention using flash attention 2.
Args:
q, k, v: tensor of shape (batch_size, seqlen, num_heads, head_dim),
q: Query tensor of shape (batch_size, seqlen, num_heads, head_dim),
or (tot_seqlens, num_heads, head_dim) if packing.
k: Key tensor of shape (batch_size, seqlen, num_heads, head_dim),
or (tot_seqlens, num_heads, head_dim) if packing.
v: Value tensor of shape (batch_size, seqlen, num_heads, head_dim),
or (tot_seqlens, num_heads, head_dim) if packing.
q_cu_seqlens (torch.Tensor): cumulative sequence lengths of q.
The first element should be 0 and the last element should be q.shape[0].
...
...
@@ -123,8 +127,14 @@ def sdpa_attention(
"""SDPA attention.
Args:
q, k, v: tensor of shape (batch_size, seqlen, num_heads, head_dim),
q: Query tensor of shape (batch_size, seqlen, num_heads, head_dim),
or (tot_seqlens, num_heads, head_dim) if packing.
k: Key tensor of shape (batch_size, seqlen, num_heads, head_dim),
or (tot_seqlens, num_heads, head_dim) if packing.
v: Value tensor of shape (batch_size, seqlen, num_heads, head_dim),
or (tot_seqlens, num_heads, head_dim) if packing.
q_cu_seqlens: Optional cumulative sequence lengths of q.
k_cu_seqlens: Optional cumulative sequence lengths of k.
"""
seq_length
=
q
.
shape
[
0
]
attention_mask
=
torch
.
zeros
([
1
,
seq_length
,
seq_length
],
...
...
@@ -387,7 +397,7 @@ class MLP2(nn.Module):
def
__init__
(
self
,
dims
:
list
[
int
],
activation
,
bias
=
True
,
bias
:
bool
=
True
,
prefix
:
str
=
""
,
use_data_parallel
:
bool
=
False
):
super
().
__init__
()
...
...
vllm/model_executor/models/phi4_multimodal.py
View file @
9a8966bc
...
...
@@ -374,8 +374,8 @@ class Phi4MMAudioMeanVarianceNormLayer(nn.Module):
Typically used as a very first layer in a model.
Args:
input_size: int
layer input size.
config: [Phi4MultimodalAudioConfig](https://huggingface.co/docs/transformers/model_doc/phi4_multimodal#transformers.Phi4MultimodalAudioConfig)
object containing model parameters.
"""
def
__init__
(
self
,
config
:
Phi4MultimodalAudioConfig
):
...
...
vllm/model_executor/models/qwen2_vl.py
View file @
9a8966bc
...
...
@@ -1372,15 +1372,9 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal,
batch.
**NOTE**: If mrope is enabled (default setting for Qwen2-VL
opensource models), the shape will be `(3, seq_len)`,
otherwise it will be `(seq_len,).
pixel_values: Pixel values to be fed to a model.
`None` if no images are passed.
image_grid_thw: Tensor `(n_images, 3)` of image 3D grid in LLM.
`None` if no images are passed.
pixel_values_videos: Pixel values of videos to be fed to a model.
`None` if no videos are passed.
video_grid_thw: Tensor `(n_videos, 3)` of video 3D grid in LLM.
`None` if no videos are passed.
otherwise it will be `(seq_len,)`.
intermediate_tensors: Intermediate tensors from prior forward pass.
inputs_embeds: Optional tensor of input embeddings.
"""
if
intermediate_tensors
is
not
None
:
...
...
vllm/model_executor/models/siglip2navit.py
View file @
9a8966bc
...
...
@@ -390,12 +390,9 @@ class Siglip2EncoderLayer(nn.Module):
position_embeddings
:
torch
.
Tensor
)
->
tuple
[
torch
.
FloatTensor
]:
"""
Args:
hidden_states (`torch.FloatTensor`):
Input to the layer of shape `(batch, seq_len, embed_dim)`.
output_attentions (`bool`, *optional*, defaults to `False`):
Whether or not to return the attentions tensors of all
attention layers. See `attentions` under
returned tensors for more detail.
hidden_states: Input tensor of shape (batch, seq_len, embed_dim).
cu_seqlens: Cumulative sequence lengths tensor.
position_embeddings: Position embeddings tensor.
"""
residual
=
hidden_states
...
...
@@ -534,19 +531,11 @@ class Siglip2Encoder(nn.Module):
)
->
torch
.
Tensor
:
r
"""
Args:
inputs_embeds (`torch.FloatTensor` of shape
`(batch_size, sequence_length, hidden_size)`):
Optionally, instead of passing `input_ids` you can choose to
directly pass an embedded representation. This is useful if
you want more control over how to convert `input_ids` indices
into associated vectors than the model's internal embedding
lookup matrix.
grid_thws (`torch.LongTensor`):
grid shape (num_patches, 3)
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See
`hidden_states` under returned tensors for more detail.
return_dict (`bool`, *optional*):
inputs_embeds: Input tensor of shape
(batch_size, sequence_length, hidden_size).
Embedded representation of the input tokens.
grid_thws: Grid tensor of shape (num_patches, 3)
containing grid dimensions.
Whether or not to return a [`~utils.ModelOutput`] instead of
a plain tuple.
"""
...
...
vllm/model_executor/models/ultravox.py
View file @
9a8966bc
...
...
@@ -597,10 +597,11 @@ class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA):
with the `input_ids`.
Args:
audio_features: A batch of audio input chunks [B, N, 80, M].
audio_lens: Length of audio frames for each audio chunk [B].
audio_token_len: Length of audio tokens for each audio chunk [B'].
Note: batch dim is different from batch dim in audio chunks.
input_ids: Flattened (concatenated) input_ids corresponding to a
batch.
positions: Position indices for the input tokens.
intermediate_tensors: Intermediate tensors from prior forward pass.
inputs_embeds: Optional tensor of input embeddings.
"""
...
...
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment