Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
4a9375fe
Unverified
Commit
4a9375fe
authored
Sep 17, 2025
by
whx
Committed by
GitHub
Sep 17, 2025
Browse files
[Model] Pass param prefix to LLMHead (#24862)
Signed-off-by:
whx-sjtu
<
2952154980@qq.com
>
parent
03191cd8
Changes
58
Hide whitespace changes
Inline
Side-by-side
Showing
18 changed files
with
30 additions
and
11 deletions
+30
-11
vllm/model_executor/models/nemotron_h.py
vllm/model_executor/models/nemotron_h.py
+1
-0
vllm/model_executor/models/olmo.py
vllm/model_executor/models/olmo.py
+1
-0
vllm/model_executor/models/olmoe.py
vllm/model_executor/models/olmoe.py
+2
-1
vllm/model_executor/models/opt.py
vllm/model_executor/models/opt.py
+3
-1
vllm/model_executor/models/orion.py
vllm/model_executor/models/orion.py
+2
-1
vllm/model_executor/models/persimmon.py
vllm/model_executor/models/persimmon.py
+2
-1
vllm/model_executor/models/phi.py
vllm/model_executor/models/phi.py
+2
-1
vllm/model_executor/models/phi4flash.py
vllm/model_executor/models/phi4flash.py
+1
-0
vllm/model_executor/models/phi4mm.py
vllm/model_executor/models/phi4mm.py
+1
-0
vllm/model_executor/models/phimoe.py
vllm/model_executor/models/phimoe.py
+1
-0
vllm/model_executor/models/qwen.py
vllm/model_executor/models/qwen.py
+2
-1
vllm/model_executor/models/qwen2_moe.py
vllm/model_executor/models/qwen2_moe.py
+2
-1
vllm/model_executor/models/qwen3_moe.py
vllm/model_executor/models/qwen3_moe.py
+2
-1
vllm/model_executor/models/qwen3_next.py
vllm/model_executor/models/qwen3_next.py
+1
-1
vllm/model_executor/models/qwen3_next_mtp.py
vllm/model_executor/models/qwen3_next_mtp.py
+2
-1
vllm/model_executor/models/solar.py
vllm/model_executor/models/solar.py
+1
-0
vllm/model_executor/models/step3_text.py
vllm/model_executor/models/step3_text.py
+3
-1
vllm/model_executor/models/zamba2.py
vllm/model_executor/models/zamba2.py
+1
-0
No files found.
vllm/model_executor/models/nemotron_h.py
View file @
4a9375fe
...
@@ -565,6 +565,7 @@ class NemotronHForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP,
...
@@ -565,6 +565,7 @@ class NemotronHForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP,
# We need bigger padding if using lora for kernel
# We need bigger padding if using lora for kernel
# compatibility
# compatibility
if
not
lora_config
else
lora_config
.
lora_vocab_padding_size
,
if
not
lora_config
else
lora_config
.
lora_vocab_padding_size
,
prefix
=
maybe_prefix
(
prefix
,
"lm_head"
),
)
)
# Used to track and store by the Mamba cache between steps.
# Used to track and store by the Mamba cache between steps.
self
.
mamba_cache
:
Optional
[
MambaCacheManager
]
=
None
self
.
mamba_cache
:
Optional
[
MambaCacheManager
]
=
None
...
...
vllm/model_executor/models/olmo.py
View file @
4a9375fe
...
@@ -364,6 +364,7 @@ class OlmoForCausalLM(nn.Module, SupportsPP, SupportsLoRA):
...
@@ -364,6 +364,7 @@ class OlmoForCausalLM(nn.Module, SupportsPP, SupportsLoRA):
config
.
hidden_size
,
config
.
hidden_size
,
org_num_embeddings
=
config
.
vocab_size
,
org_num_embeddings
=
config
.
vocab_size
,
quant_config
=
quant_config
,
quant_config
=
quant_config
,
prefix
=
maybe_prefix
(
prefix
,
"lm_head"
),
)
)
self
.
logits_processor
=
LogitsProcessor
(
config
.
vocab_size
)
self
.
logits_processor
=
LogitsProcessor
(
config
.
vocab_size
)
self
.
make_empty_intermediate_tensors
=
(
self
.
make_empty_intermediate_tensors
=
(
...
...
vllm/model_executor/models/olmoe.py
View file @
4a9375fe
...
@@ -450,7 +450,8 @@ class OlmoeForCausalLM(nn.Module, SupportsPP):
...
@@ -450,7 +450,8 @@ class OlmoeForCausalLM(nn.Module, SupportsPP):
prefix
=
maybe_prefix
(
prefix
,
"model"
))
prefix
=
maybe_prefix
(
prefix
,
"model"
))
self
.
lm_head
=
ParallelLMHead
(
config
.
vocab_size
,
self
.
lm_head
=
ParallelLMHead
(
config
.
vocab_size
,
config
.
hidden_size
,
config
.
hidden_size
,
quant_config
=
quant_config
)
quant_config
=
quant_config
,
prefix
=
maybe_prefix
(
prefix
,
"lm_head"
))
self
.
logits_processor
=
LogitsProcessor
(
config
.
vocab_size
)
self
.
logits_processor
=
LogitsProcessor
(
config
.
vocab_size
)
self
.
make_empty_intermediate_tensors
=
(
self
.
make_empty_intermediate_tensors
=
(
...
...
vllm/model_executor/models/opt.py
View file @
4a9375fe
...
@@ -375,7 +375,9 @@ class OPTForCausalLM(nn.Module, SupportsPP):
...
@@ -375,7 +375,9 @@ class OPTForCausalLM(nn.Module, SupportsPP):
self
.
lm_head
=
self
.
model
.
decoder
.
embed_tokens
self
.
lm_head
=
self
.
model
.
decoder
.
embed_tokens
else
:
else
:
self
.
lm_head
=
ParallelLMHead
(
config
.
vocab_size
,
self
.
lm_head
=
ParallelLMHead
(
config
.
vocab_size
,
config
.
word_embed_proj_dim
)
config
.
word_embed_proj_dim
,
prefix
=
maybe_prefix
(
prefix
,
"lm_head"
))
self
.
logits_processor
=
LogitsProcessor
(
config
.
vocab_size
)
self
.
logits_processor
=
LogitsProcessor
(
config
.
vocab_size
)
self
.
make_empty_intermediate_tensors
=
(
self
.
make_empty_intermediate_tensors
=
(
self
.
model
.
make_empty_intermediate_tensors
)
self
.
model
.
make_empty_intermediate_tensors
)
...
...
vllm/model_executor/models/orion.py
View file @
4a9375fe
...
@@ -314,7 +314,8 @@ class OrionForCausalLM(nn.Module, SupportsPP):
...
@@ -314,7 +314,8 @@ class OrionForCausalLM(nn.Module, SupportsPP):
prefix
=
maybe_prefix
(
prefix
,
"model"
))
prefix
=
maybe_prefix
(
prefix
,
"model"
))
self
.
lm_head
=
ParallelLMHead
(
config
.
vocab_size
,
self
.
lm_head
=
ParallelLMHead
(
config
.
vocab_size
,
config
.
hidden_size
,
config
.
hidden_size
,
quant_config
=
quant_config
)
quant_config
=
quant_config
,
prefix
=
maybe_prefix
(
prefix
,
"lm_head"
))
if
self
.
config
.
tie_word_embeddings
:
if
self
.
config
.
tie_word_embeddings
:
self
.
lm_head
.
weight
=
self
.
model
.
embed_tokens
.
weight
self
.
lm_head
.
weight
=
self
.
model
.
embed_tokens
.
weight
self
.
logits_processor
=
LogitsProcessor
(
config
.
vocab_size
)
self
.
logits_processor
=
LogitsProcessor
(
config
.
vocab_size
)
...
...
vllm/model_executor/models/persimmon.py
View file @
4a9375fe
...
@@ -307,7 +307,8 @@ class PersimmonForCausalLM(nn.Module, SupportsPP):
...
@@ -307,7 +307,8 @@ class PersimmonForCausalLM(nn.Module, SupportsPP):
prefix
=
maybe_prefix
(
prefix
,
"model"
))
prefix
=
maybe_prefix
(
prefix
,
"model"
))
self
.
lm_head
=
ParallelLMHead
(
config
.
vocab_size
,
self
.
lm_head
=
ParallelLMHead
(
config
.
vocab_size
,
config
.
hidden_size
,
config
.
hidden_size
,
bias
=
False
)
bias
=
False
,
prefix
=
maybe_prefix
(
prefix
,
"lm_head"
))
self
.
logits_processor
=
LogitsProcessor
(
config
.
vocab_size
)
self
.
logits_processor
=
LogitsProcessor
(
config
.
vocab_size
)
self
.
make_empty_intermediate_tensors
=
(
self
.
make_empty_intermediate_tensors
=
(
self
.
model
.
make_empty_intermediate_tensors
)
self
.
model
.
make_empty_intermediate_tensors
)
...
...
vllm/model_executor/models/phi.py
View file @
4a9375fe
...
@@ -322,7 +322,8 @@ class PhiForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
...
@@ -322,7 +322,8 @@ class PhiForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
self
.
lm_head
=
ParallelLMHead
(
config
.
vocab_size
,
self
.
lm_head
=
ParallelLMHead
(
config
.
vocab_size
,
config
.
hidden_size
,
config
.
hidden_size
,
bias
=
True
,
bias
=
True
,
quant_config
=
quant_config
)
quant_config
=
quant_config
,
prefix
=
maybe_prefix
(
prefix
,
"lm_head"
))
self
.
logits_processor
=
LogitsProcessor
(
config
.
vocab_size
)
self
.
logits_processor
=
LogitsProcessor
(
config
.
vocab_size
)
self
.
make_empty_intermediate_tensors
=
(
self
.
make_empty_intermediate_tensors
=
(
self
.
model
.
make_empty_intermediate_tensors
)
self
.
model
.
make_empty_intermediate_tensors
)
...
...
vllm/model_executor/models/phi4flash.py
View file @
4a9375fe
...
@@ -630,6 +630,7 @@ class Phi4FlashForCausalLM(nn.Module, HasInnerState, IsHybrid, SupportsV0Only):
...
@@ -630,6 +630,7 @@ class Phi4FlashForCausalLM(nn.Module, HasInnerState, IsHybrid, SupportsV0Only):
# compatibility
# compatibility
if
not
lora_config
else
lora_config
.
lora_vocab_padding_size
),
if
not
lora_config
else
lora_config
.
lora_vocab_padding_size
),
quant_config
=
quant_config
,
quant_config
=
quant_config
,
prefix
=
maybe_prefix
(
prefix
,
"lm_head"
),
)
)
self
.
embedding_bias
=
None
self
.
embedding_bias
=
None
# Used to track and store by the Mamba cache between steps.
# Used to track and store by the Mamba cache between steps.
...
...
vllm/model_executor/models/phi4mm.py
View file @
4a9375fe
...
@@ -989,6 +989,7 @@ class Phi4MMForCausalLM(nn.Module, SupportsLoRA, SupportsMultiModal):
...
@@ -989,6 +989,7 @@ class Phi4MMForCausalLM(nn.Module, SupportsLoRA, SupportsMultiModal):
org_num_embeddings
=
config
.
vocab_size
,
org_num_embeddings
=
config
.
vocab_size
,
padding_size
=
DEFAULT_VOCAB_PADDING_SIZE
,
padding_size
=
DEFAULT_VOCAB_PADDING_SIZE
,
quant_config
=
quant_config
,
quant_config
=
quant_config
,
prefix
=
maybe_prefix
(
prefix
,
"lm_head"
),
)
)
if
config
.
tie_word_embeddings
:
if
config
.
tie_word_embeddings
:
self
.
lm_head
=
self
.
lm_head
.
tie_weights
(
self
.
model
.
embed_tokens
)
self
.
lm_head
=
self
.
lm_head
.
tie_weights
(
self
.
model
.
embed_tokens
)
...
...
vllm/model_executor/models/phimoe.py
View file @
4a9375fe
...
@@ -645,6 +645,7 @@ class PhiMoEForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
...
@@ -645,6 +645,7 @@ class PhiMoEForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
if
not
lora_config
else
lora_config
.
lora_vocab_padding_size
),
if
not
lora_config
else
lora_config
.
lora_vocab_padding_size
),
quant_config
=
None
,
quant_config
=
None
,
bias
=
True
,
bias
=
True
,
prefix
=
maybe_prefix
(
prefix
,
"lm_head"
),
)
)
self
.
logits_processor
=
LogitsProcessor
(
self
.
unpadded_vocab_size
,
self
.
logits_processor
=
LogitsProcessor
(
self
.
unpadded_vocab_size
,
config
.
vocab_size
)
config
.
vocab_size
)
...
...
vllm/model_executor/models/qwen.py
View file @
4a9375fe
...
@@ -271,7 +271,8 @@ class QWenBaseModel(nn.Module):
...
@@ -271,7 +271,8 @@ class QWenBaseModel(nn.Module):
prefix
,
"transformer"
))
prefix
,
"transformer"
))
self
.
lm_head
=
ParallelLMHead
(
config
.
vocab_size
,
self
.
lm_head
=
ParallelLMHead
(
config
.
vocab_size
,
config
.
hidden_size
,
config
.
hidden_size
,
quant_config
=
quant_config
)
quant_config
=
quant_config
,
prefix
=
maybe_prefix
(
prefix
,
"lm_head"
))
if
self
.
config
.
tie_word_embeddings
:
if
self
.
config
.
tie_word_embeddings
:
self
.
lm_head
.
weight
=
self
.
transformer
.
wte
.
weight
self
.
lm_head
.
weight
=
self
.
transformer
.
wte
.
weight
self
.
logits_processor
=
LogitsProcessor
(
config
.
vocab_size
)
self
.
logits_processor
=
LogitsProcessor
(
config
.
vocab_size
)
...
...
vllm/model_executor/models/qwen2_moe.py
View file @
4a9375fe
...
@@ -519,7 +519,8 @@ class Qwen2MoeForCausalLM(nn.Module, SupportsPP, SupportsLoRA):
...
@@ -519,7 +519,8 @@ class Qwen2MoeForCausalLM(nn.Module, SupportsPP, SupportsLoRA):
prefix
=
maybe_prefix
(
prefix
,
"model"
))
prefix
=
maybe_prefix
(
prefix
,
"model"
))
self
.
lm_head
=
ParallelLMHead
(
config
.
vocab_size
,
self
.
lm_head
=
ParallelLMHead
(
config
.
vocab_size
,
config
.
hidden_size
,
config
.
hidden_size
,
quant_config
=
quant_config
)
quant_config
=
quant_config
,
prefix
=
maybe_prefix
(
prefix
,
"lm_head"
))
if
self
.
config
.
tie_word_embeddings
:
if
self
.
config
.
tie_word_embeddings
:
self
.
lm_head
.
weight
=
self
.
model
.
embed_tokens
.
weight
self
.
lm_head
.
weight
=
self
.
model
.
embed_tokens
.
weight
self
.
logits_processor
=
LogitsProcessor
(
config
.
vocab_size
)
self
.
logits_processor
=
LogitsProcessor
(
config
.
vocab_size
)
...
...
vllm/model_executor/models/qwen3_moe.py
View file @
4a9375fe
...
@@ -605,7 +605,8 @@ class Qwen3MoeForCausalLM(nn.Module, SupportsPP, SupportsLoRA,
...
@@ -605,7 +605,8 @@ class Qwen3MoeForCausalLM(nn.Module, SupportsPP, SupportsLoRA,
prefix
=
maybe_prefix
(
prefix
,
"model"
))
prefix
=
maybe_prefix
(
prefix
,
"model"
))
self
.
lm_head
=
ParallelLMHead
(
config
.
vocab_size
,
self
.
lm_head
=
ParallelLMHead
(
config
.
vocab_size
,
config
.
hidden_size
,
config
.
hidden_size
,
quant_config
=
quant_config
)
quant_config
=
quant_config
,
prefix
=
maybe_prefix
(
prefix
,
"lm_head"
))
if
self
.
config
.
tie_word_embeddings
:
if
self
.
config
.
tie_word_embeddings
:
self
.
lm_head
.
weight
=
self
.
model
.
embed_tokens
.
weight
self
.
lm_head
.
weight
=
self
.
model
.
embed_tokens
.
weight
self
.
logits_processor
=
LogitsProcessor
(
config
.
vocab_size
)
self
.
logits_processor
=
LogitsProcessor
(
config
.
vocab_size
)
...
...
vllm/model_executor/models/qwen3_next.py
View file @
4a9375fe
...
@@ -1089,7 +1089,7 @@ class Qwen3NextForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP,
...
@@ -1089,7 +1089,7 @@ class Qwen3NextForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP,
# We need bigger padding if using lora for kernel
# We need bigger padding if using lora for kernel
# compatibility
# compatibility
if
not
lora_config
else
lora_config
.
lora_vocab_padding_size
,
if
not
lora_config
else
lora_config
.
lora_vocab_padding_size
,
)
prefix
=
maybe_prefix
(
prefix
,
"lm_head"
)
)
self
.
logits_processor
=
LogitsProcessor
(
self
.
unpadded_vocab_size
,
self
.
logits_processor
=
LogitsProcessor
(
self
.
unpadded_vocab_size
,
config
.
vocab_size
)
config
.
vocab_size
)
self
.
make_empty_intermediate_tensors
=
(
self
.
make_empty_intermediate_tensors
=
(
...
...
vllm/model_executor/models/qwen3_next_mtp.py
View file @
4a9375fe
...
@@ -238,7 +238,8 @@ class Qwen3NextMTP(nn.Module, SupportsPP):
...
@@ -238,7 +238,8 @@ class Qwen3NextMTP(nn.Module, SupportsPP):
self
.
lm_head
=
ParallelLMHead
(
self
.
unpadded_vocab_size
,
self
.
lm_head
=
ParallelLMHead
(
self
.
unpadded_vocab_size
,
config
.
hidden_size
,
config
.
hidden_size
,
org_num_embeddings
=
config
.
vocab_size
,
org_num_embeddings
=
config
.
vocab_size
,
padding_size
=
DEFAULT_VOCAB_PADDING_SIZE
)
padding_size
=
DEFAULT_VOCAB_PADDING_SIZE
,
prefix
=
maybe_prefix
(
prefix
,
"lm_head"
))
self
.
logits_processor
=
LogitsProcessor
(
self
.
unpadded_vocab_size
,
self
.
logits_processor
=
LogitsProcessor
(
self
.
unpadded_vocab_size
,
config
.
vocab_size
)
config
.
vocab_size
)
self
.
make_empty_intermediate_tensors
=
(
self
.
make_empty_intermediate_tensors
=
(
...
...
vllm/model_executor/models/solar.py
View file @
4a9375fe
...
@@ -469,6 +469,7 @@ class SolarForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
...
@@ -469,6 +469,7 @@ class SolarForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
# compatibility
# compatibility
if
not
lora_config
else
lora_config
.
lora_vocab_padding_size
,
if
not
lora_config
else
lora_config
.
lora_vocab_padding_size
,
quant_config
=
quant_config
,
quant_config
=
quant_config
,
prefix
=
maybe_prefix
(
prefix
,
"lm_head"
),
)
)
if
config
.
tie_word_embeddings
:
if
config
.
tie_word_embeddings
:
self
.
lm_head
.
weight
=
self
.
model
.
embed_tokens
.
weight
self
.
lm_head
.
weight
=
self
.
model
.
embed_tokens
.
weight
...
...
vllm/model_executor/models/step3_text.py
View file @
4a9375fe
...
@@ -35,7 +35,8 @@ from vllm.sequence import IntermediateTensors
...
@@ -35,7 +35,8 @@ from vllm.sequence import IntermediateTensors
from
.interfaces
import
SupportsPP
from
.interfaces
import
SupportsPP
from
.utils
import
(
PPMissingLayer
,
is_pp_missing_parameter
,
from
.utils
import
(
PPMissingLayer
,
is_pp_missing_parameter
,
make_empty_intermediate_tensors_factory
,
make_layers
)
make_empty_intermediate_tensors_factory
,
make_layers
,
maybe_prefix
)
logger
=
init_logger
(
__name__
)
logger
=
init_logger
(
__name__
)
...
@@ -386,6 +387,7 @@ class Step3TextForCausalLM(nn.Module, SupportsPP):
...
@@ -386,6 +387,7 @@ class Step3TextForCausalLM(nn.Module, SupportsPP):
org_num_embeddings
=
config
.
vocab_size
,
org_num_embeddings
=
config
.
vocab_size
,
padding_size
=
DEFAULT_VOCAB_PADDING_SIZE
padding_size
=
DEFAULT_VOCAB_PADDING_SIZE
if
not
lora_config
else
lora_config
.
lora_vocab_padding_size
,
if
not
lora_config
else
lora_config
.
lora_vocab_padding_size
,
prefix
=
maybe_prefix
(
prefix
,
"lm_head"
),
)
)
self
.
logits_processor
=
LogitsProcessor
(
self
.
unpadded_vocab_size
,
self
.
logits_processor
=
LogitsProcessor
(
self
.
unpadded_vocab_size
,
config
.
vocab_size
)
config
.
vocab_size
)
...
...
vllm/model_executor/models/zamba2.py
View file @
4a9375fe
...
@@ -941,6 +941,7 @@ class Zamba2ForCausalLM(nn.Module, HasInnerState, IsHybrid):
...
@@ -941,6 +941,7 @@ class Zamba2ForCausalLM(nn.Module, HasInnerState, IsHybrid):
# We need bigger padding if using lora for kernel
# We need bigger padding if using lora for kernel
# compatibility
# compatibility
if
not
lora_config
else
lora_config
.
lora_vocab_padding_size
,
if
not
lora_config
else
lora_config
.
lora_vocab_padding_size
,
prefix
=
maybe_prefix
(
prefix
,
"lm_head"
),
)
)
# Tie weights with input embeddings if using same dimensions
# Tie weights with input embeddings if using same dimensions
self
.
lm_head
=
self
.
lm_head
.
tie_weights
(
self
.
model
.
embed_tokens
)
self
.
lm_head
=
self
.
lm_head
.
tie_weights
(
self
.
model
.
embed_tokens
)
...
...
Prev
1
2
3
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment