Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
de533ab2
Unverified
Commit
de533ab2
authored
Aug 29, 2025
by
Lukas Geiger
Committed by
GitHub
Aug 29, 2025
Browse files
[Models] Improve iteration over layers (#19497)
Signed-off-by:
Lukas Geiger
<
lukas.geiger94@gmail.com
>
parent
235c9db8
Changes
65
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
39 additions
and
23 deletions
+39
-23
vllm/model_executor/models/minimax_text_01.py
vllm/model_executor/models/minimax_text_01.py
+2
-2
vllm/model_executor/models/mixtral.py
vllm/model_executor/models/mixtral.py
+2
-1
vllm/model_executor/models/mixtral_quant.py
vllm/model_executor/models/mixtral_quant.py
+2
-1
vllm/model_executor/models/molmo.py
vllm/model_executor/models/molmo.py
+2
-1
vllm/model_executor/models/mpt.py
vllm/model_executor/models/mpt.py
+2
-1
vllm/model_executor/models/nemotron.py
vllm/model_executor/models/nemotron.py
+2
-1
vllm/model_executor/models/nemotron_h.py
vllm/model_executor/models/nemotron_h.py
+1
-2
vllm/model_executor/models/nemotron_nas.py
vllm/model_executor/models/nemotron_nas.py
+2
-2
vllm/model_executor/models/olmo.py
vllm/model_executor/models/olmo.py
+2
-1
vllm/model_executor/models/olmo2.py
vllm/model_executor/models/olmo2.py
+2
-1
vllm/model_executor/models/olmoe.py
vllm/model_executor/models/olmoe.py
+2
-1
vllm/model_executor/models/opt.py
vllm/model_executor/models/opt.py
+2
-1
vllm/model_executor/models/orion.py
vllm/model_executor/models/orion.py
+2
-1
vllm/model_executor/models/persimmon.py
vllm/model_executor/models/persimmon.py
+2
-1
vllm/model_executor/models/phi.py
vllm/model_executor/models/phi.py
+2
-1
vllm/model_executor/models/phimoe.py
vllm/model_executor/models/phimoe.py
+2
-1
vllm/model_executor/models/plamo2.py
vllm/model_executor/models/plamo2.py
+2
-1
vllm/model_executor/models/qwen.py
vllm/model_executor/models/qwen.py
+2
-1
vllm/model_executor/models/qwen2.py
vllm/model_executor/models/qwen2.py
+2
-1
vllm/model_executor/models/qwen2_moe.py
vllm/model_executor/models/qwen2_moe.py
+2
-1
No files found.
vllm/model_executor/models/minimax_text_01.py
View file @
de533ab2
...
...
@@ -3,6 +3,7 @@
"""Inference-only MiniMaxText01 model."""
import
math
from
collections.abc
import
Iterable
from
itertools
import
islice
from
typing
import
TYPE_CHECKING
,
Optional
,
Union
if
TYPE_CHECKING
:
...
...
@@ -1019,8 +1020,7 @@ class MiniMaxText01Model(nn.Module):
minimax_cache_index
=
0
for
i
in
range
(
self
.
start_layer
,
self
.
end_layer
):
layer
=
self
.
layers
[
i
]
for
layer
in
islice
(
self
.
layers
,
self
.
start_layer
,
self
.
end_layer
):
_caches
=
None
if
not
envs
.
VLLM_USE_V1
and
isinstance
(
layer
.
self_attn
,
MiniMaxText01LinearAttention
):
...
...
vllm/model_executor/models/mixtral.py
View file @
de533ab2
...
...
@@ -24,6 +24,7 @@
# limitations under the License.
"""Inference-only Mixtral model."""
from
collections.abc
import
Iterable
from
itertools
import
islice
from
typing
import
Optional
,
Union
import
torch
...
...
@@ -307,7 +308,7 @@ class MixtralModel(nn.Module):
assert
intermediate_tensors
is
not
None
hidden_states
=
intermediate_tensors
[
"hidden_states"
]
residual
=
intermediate_tensors
[
"residual"
]
for
layer
in
self
.
layers
[
self
.
start_layer
:
self
.
end_layer
]
:
for
layer
in
islice
(
self
.
layers
,
self
.
start_layer
,
self
.
end_layer
)
:
hidden_states
,
residual
=
layer
(
positions
,
hidden_states
,
residual
)
if
not
get_pp_group
().
is_last_rank
:
return
IntermediateTensors
({
...
...
vllm/model_executor/models/mixtral_quant.py
View file @
de533ab2
...
...
@@ -24,6 +24,7 @@
# limitations under the License.
"""Inference-only Mixtral model."""
from
collections.abc
import
Iterable
from
itertools
import
islice
from
typing
import
Optional
,
Union
import
numpy
as
np
...
...
@@ -346,7 +347,7 @@ class MixtralModel(nn.Module):
assert
intermediate_tensors
is
not
None
hidden_states
=
intermediate_tensors
[
"hidden_states"
]
residual
=
intermediate_tensors
[
"residual"
]
for
layer
in
self
.
layers
[
self
.
start_layer
:
self
.
end_layer
]
:
for
layer
in
islice
(
self
.
layers
,
self
.
start_layer
,
self
.
end_layer
)
:
hidden_states
,
residual
=
layer
(
positions
,
hidden_states
,
residual
)
if
not
get_pp_group
().
is_last_rank
:
return
IntermediateTensors
({
...
...
vllm/model_executor/models/molmo.py
View file @
de533ab2
...
...
@@ -5,6 +5,7 @@ import math
from
collections.abc
import
Iterable
,
Mapping
,
Sequence
from
dataclasses
import
dataclass
from
functools
import
cached_property
,
partial
from
itertools
import
islice
from
typing
import
Annotated
,
Optional
,
Union
import
numpy
as
np
...
...
@@ -842,7 +843,7 @@ class MolmoModel(nn.Module, SupportsQuant):
residual
=
intermediate_tensors
[
"residual"
]
# Apply blocks one-by-one.
for
layer
in
self
.
layers
[
self
.
start_layer
:
self
.
end_layer
]
:
for
layer
in
islice
(
self
.
layers
,
self
.
start_layer
,
self
.
end_layer
)
:
hidden_states
,
residual
=
layer
(
positions
,
hidden_states
,
...
...
vllm/model_executor/models/mpt.py
View file @
de533ab2
...
...
@@ -4,6 +4,7 @@
# Adapted from https://huggingface.co/mosaicml/mpt-7b/tree/main
import
math
from
collections.abc
import
Iterable
from
itertools
import
islice
from
typing
import
Optional
,
Union
import
torch
...
...
@@ -260,7 +261,7 @@ class MPTModel(nn.Module):
assert
intermediate_tensors
is
not
None
hidden_states
=
intermediate_tensors
[
"hidden_states"
]
for
block
in
self
.
blocks
[
self
.
start_layer
:
self
.
end_layer
]
:
for
block
in
islice
(
self
.
blocks
,
self
.
start_layer
,
self
.
end_layer
)
:
hidden_states
=
block
(
position_ids
,
hidden_states
)
if
not
get_pp_group
().
is_last_rank
:
return
IntermediateTensors
({
"hidden_states"
:
hidden_states
})
...
...
vllm/model_executor/models/nemotron.py
View file @
de533ab2
...
...
@@ -24,6 +24,7 @@
# limitations under the License.
"""Inference-only Nemotron model compatible with HuggingFace weights."""
from
collections.abc
import
Iterable
from
itertools
import
islice
from
typing
import
Any
,
Optional
,
Union
import
torch
...
...
@@ -353,7 +354,7 @@ class NemotronModel(nn.Module):
hidden_states
=
intermediate_tensors
[
"hidden_states"
]
residual
=
intermediate_tensors
[
"residual"
]
for
layer
in
self
.
layers
[
self
.
start_layer
:
self
.
end_layer
]
:
for
layer
in
islice
(
self
.
layers
,
self
.
start_layer
,
self
.
end_layer
)
:
hidden_states
,
residual
=
layer
(
positions
,
hidden_states
,
residual
)
if
not
get_pp_group
().
is_last_rank
:
...
...
vllm/model_executor/models/nemotron_h.py
View file @
de533ab2
...
...
@@ -399,8 +399,7 @@ class NemotronHModel(nn.Module):
residual
=
None
num_non_mamba_layers
=
0
for
i
in
range
(
len
(
self
.
layers
)):
layer
=
self
.
layers
[
i
]
for
i
,
layer
in
enumerate
(
self
.
layers
):
layer_mamba_cache_params
=
None
if
isinstance
(
layer
,
NemotronHMambaDecoderLayer
)
and
mamba_cache_params
:
...
...
vllm/model_executor/models/nemotron_nas.py
View file @
de533ab2
...
...
@@ -24,6 +24,7 @@
# limitations under the License.
"""Inference-only deci model compatible with HuggingFace weights."""
from
collections.abc
import
Iterable
from
itertools
import
islice
from
typing
import
Any
,
Optional
,
Union
import
torch
...
...
@@ -287,8 +288,7 @@ class DeciModel(nn.Module):
residual
=
intermediate_tensors
[
"residual"
]
kv_cache_index
=
0
for
i
in
range
(
self
.
start_layer
,
self
.
end_layer
):
layer
=
self
.
layers
[
i
]
for
layer
in
islice
(
self
.
layers
,
self
.
start_layer
,
self
.
end_layer
):
if
not
layer
.
_is_no_op_attention
:
hidden_states
,
residual
=
layer
(
positions
,
hidden_states
,
residual
)
...
...
vllm/model_executor/models/olmo.py
View file @
de533ab2
...
...
@@ -24,6 +24,7 @@
# limitations under the License.
"""Inference-only OLMo model compatible with HuggingFace weights."""
from
collections.abc
import
Iterable
from
itertools
import
islice
from
typing
import
Optional
,
Union
import
torch
...
...
@@ -280,7 +281,7 @@ class OlmoModel(nn.Module):
hidden_states
=
intermediate_tensors
[
"hidden_states"
]
# Apply blocks one-by-one.
for
layer
in
self
.
layers
[
self
.
start_layer
:
self
.
end_layer
]
:
for
layer
in
islice
(
self
.
layers
,
self
.
start_layer
,
self
.
end_layer
)
:
# shape: (batch_size, seq_len, d_model)
hidden_states
=
layer
(
positions
,
hidden_states
)
...
...
vllm/model_executor/models/olmo2.py
View file @
de533ab2
...
...
@@ -26,6 +26,7 @@
from
collections.abc
import
Iterable
from
functools
import
partial
from
itertools
import
islice
from
typing
import
Optional
,
Union
import
torch
...
...
@@ -305,7 +306,7 @@ class Olmo2Model(nn.Module):
assert
isinstance
(
hidden_states
,
torch
.
Tensor
)
# Apply blocks one-by-one.
for
layer
in
self
.
layers
[
self
.
start_layer
:
self
.
end_layer
]
:
for
layer
in
islice
(
self
.
layers
,
self
.
start_layer
,
self
.
end_layer
)
:
# shape: (batch_size, seq_len, d_model)
hidden_states
=
layer
(
positions
,
hidden_states
)
...
...
vllm/model_executor/models/olmoe.py
View file @
de533ab2
...
...
@@ -15,6 +15,7 @@
"""Inference-only OLMoE model compatible with HuggingFace weights."""
from
collections.abc
import
Iterable
from
functools
import
partial
from
itertools
import
islice
from
typing
import
Any
,
Optional
,
Union
import
torch
...
...
@@ -314,7 +315,7 @@ class OlmoeModel(nn.Module):
hidden_states
=
intermediate_tensors
[
"hidden_states"
]
residual
=
intermediate_tensors
[
"residual"
]
for
layer
in
self
.
layers
[
self
.
start_layer
:
self
.
end_layer
]
:
for
layer
in
islice
(
self
.
layers
,
self
.
start_layer
,
self
.
end_layer
)
:
hidden_states
,
residual
=
layer
(
positions
,
hidden_states
,
...
...
vllm/model_executor/models/opt.py
View file @
de533ab2
...
...
@@ -20,6 +20,7 @@
# limitations under the License.
"""Inference-only OPT model compatible with HuggingFace weights."""
from
collections.abc
import
Iterable
from
itertools
import
islice
from
typing
import
Optional
,
Union
import
torch
...
...
@@ -269,7 +270,7 @@ class OPTDecoder(nn.Module):
assert
intermediate_tensors
is
not
None
hidden_states
=
intermediate_tensors
[
"hidden_states"
]
for
layer
in
self
.
layers
[
self
.
start_layer
:
self
.
end_layer
]
:
for
layer
in
islice
(
self
.
layers
,
self
.
start_layer
,
self
.
end_layer
)
:
hidden_states
=
layer
(
hidden_states
)
if
not
get_pp_group
().
is_last_rank
:
...
...
vllm/model_executor/models/orion.py
View file @
de533ab2
...
...
@@ -7,6 +7,7 @@
# LICENSE: https://huggingface.co/OrionStarAI/Orion-14B-Base/blob/main/LICENSE
"""Inference-only Orion-14B model compatible with HuggingFace weights."""
from
collections.abc
import
Iterable
from
itertools
import
islice
from
typing
import
Any
,
Optional
,
Union
import
torch
...
...
@@ -252,7 +253,7 @@ class OrionModel(nn.Module):
else
:
assert
intermediate_tensors
is
not
None
hidden_states
=
intermediate_tensors
[
"hidden_states"
]
for
layer
in
self
.
layers
[
self
.
start_layer
:
self
.
end_layer
]
:
for
layer
in
islice
(
self
.
layers
,
self
.
start_layer
,
self
.
end_layer
)
:
hidden_states
=
layer
(
positions
,
hidden_states
)
if
not
get_pp_group
().
is_last_rank
:
return
IntermediateTensors
({
...
...
vllm/model_executor/models/persimmon.py
View file @
de533ab2
...
...
@@ -23,6 +23,7 @@
# limitations under the License.
"""Inference-only persimmon model compatible with HuggingFace weights."""
from
collections.abc
import
Iterable
from
itertools
import
islice
from
typing
import
Optional
,
Union
import
torch
...
...
@@ -255,7 +256,7 @@ class PersimmonModel(nn.Module):
else
:
assert
intermediate_tensors
is
not
None
hidden_states
=
intermediate_tensors
[
"hidden_states"
]
for
layer
in
self
.
layers
[
self
.
start_layer
:
self
.
end_layer
]
:
for
layer
in
islice
(
self
.
layers
,
self
.
start_layer
,
self
.
end_layer
)
:
hidden_states
=
layer
(
positions
,
hidden_states
)
if
not
get_pp_group
().
is_last_rank
:
return
IntermediateTensors
({
"hidden_states"
:
hidden_states
})
...
...
vllm/model_executor/models/phi.py
View file @
de533ab2
...
...
@@ -38,6 +38,7 @@
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"""Inference-only Phi-1.5 model compatible with HuggingFace weights."""
from
collections.abc
import
Iterable
from
itertools
import
islice
from
typing
import
Optional
,
Union
import
torch
...
...
@@ -240,7 +241,7 @@ class PhiModel(nn.Module):
else
:
assert
intermediate_tensors
is
not
None
hidden_states
=
intermediate_tensors
[
"hidden_states"
]
for
layer
in
self
.
layers
[
self
.
start_layer
:
self
.
end_layer
]
:
for
layer
in
islice
(
self
.
layers
,
self
.
start_layer
,
self
.
end_layer
)
:
hidden_states
=
layer
(
positions
,
hidden_states
)
if
not
get_pp_group
().
is_last_rank
:
...
...
vllm/model_executor/models/phimoe.py
View file @
de533ab2
...
...
@@ -24,6 +24,7 @@
# limitations under the License.
"""Inference-only PhiMoE model."""
from
collections.abc
import
Iterable
from
itertools
import
islice
from
typing
import
Optional
,
Union
import
torch
...
...
@@ -500,7 +501,7 @@ class PhiMoEModel(nn.Module):
hidden_states
=
intermediate_tensors
[
"hidden_states"
]
residual
=
intermediate_tensors
[
"residual"
]
for
layer
in
self
.
layers
[
self
.
start_layer
:
self
.
end_layer
]
:
for
layer
in
islice
(
self
.
layers
,
self
.
start_layer
,
self
.
end_layer
)
:
hidden_states
,
residual
=
layer
(
positions
,
hidden_states
,
...
...
vllm/model_executor/models/plamo2.py
View file @
de533ab2
...
...
@@ -2,6 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Inference-only PLaMo2 model."""
from
collections.abc
import
Iterable
from
itertools
import
islice
from
typing
import
Optional
import
torch
...
...
@@ -614,7 +615,7 @@ class Plamo2Decoder(torch.nn.Module):
mamba2_metadata
:
Mamba2Metadata
,
)
->
torch
.
Tensor
:
mamba_cache_index
=
0
for
layer
in
self
.
layers
[
self
.
start_layer
:
self
.
end_layer
]
:
for
layer
in
islice
(
self
.
layers
,
self
.
start_layer
,
self
.
end_layer
)
:
layer_mamba_cache_params
=
None
if
layer
.
is_mamba
:
layer_mamba_cache_params
=
mamba_cache_params
.
at_layer_idx
(
...
...
vllm/model_executor/models/qwen.py
View file @
de533ab2
...
...
@@ -8,6 +8,7 @@
"""Inference-only QWen model compatible with HuggingFace weights."""
import
json
from
collections.abc
import
Iterable
from
itertools
import
islice
from
typing
import
Any
,
Optional
,
Union
import
torch
...
...
@@ -234,7 +235,7 @@ class QWenModel(nn.Module):
hidden_states
=
intermediate_tensors
[
"hidden_states"
]
residual
=
intermediate_tensors
[
"residual"
]
for
layer
in
self
.
h
[
self
.
start_layer
:
self
.
end_layer
]
:
for
layer
in
islice
(
self
.
h
,
self
.
start_layer
,
self
.
end_layer
)
:
hidden_states
,
residual
=
layer
(
positions
,
hidden_states
,
...
...
vllm/model_executor/models/qwen2.py
View file @
de533ab2
...
...
@@ -25,6 +25,7 @@
# limitations under the License.
"""Inference-only Qwen2 model compatible with HuggingFace weights."""
from
collections.abc
import
Iterable
from
itertools
import
islice
from
typing
import
Any
,
Optional
,
Union
import
torch
...
...
@@ -358,7 +359,7 @@ class Qwen2Model(nn.Module):
aux_hidden_states
=
[]
for
idx
,
layer
in
enumerate
(
self
.
layers
[
self
.
start_layer
:
self
.
end_layer
]
):
islice
(
self
.
layers
,
self
.
start_layer
,
self
.
end_layer
)
):
if
idx
in
self
.
aux_hidden_state_layers
:
aux_hidden_states
.
append
(
hidden_states
+
residual
)
hidden_states
,
residual
=
layer
(
positions
,
hidden_states
,
residual
)
...
...
vllm/model_executor/models/qwen2_moe.py
View file @
de533ab2
...
...
@@ -25,6 +25,7 @@
# limitations under the License.
"""Inference-only Qwen2MoE model compatible with HuggingFace weights."""
from
collections.abc
import
Iterable
from
itertools
import
islice
from
typing
import
Any
,
Optional
,
Union
import
torch
...
...
@@ -381,7 +382,7 @@ class Qwen2MoeModel(nn.Module):
assert
intermediate_tensors
is
not
None
hidden_states
=
intermediate_tensors
[
"hidden_states"
]
residual
=
intermediate_tensors
[
"residual"
]
for
layer
in
self
.
layers
[
self
.
start_layer
:
self
.
end_layer
]
:
for
layer
in
islice
(
self
.
layers
,
self
.
start_layer
,
self
.
end_layer
)
:
hidden_states
,
residual
=
layer
(
positions
,
hidden_states
,
residual
)
if
not
get_pp_group
().
is_last_rank
:
return
IntermediateTensors
({
...
...
Prev
1
2
3
4
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment