Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
9d1c4747
Unverified
Commit
9d1c4747
authored
Nov 12, 2025
by
Jee Jee Li
Committed by
GitHub
Nov 11, 2025
Browse files
[LoRA][1/N]Remove LoRA extra vocab (#28382)
Signed-off-by:
Jee Jee Li
<
pandaleefree@gmail.com
>
parent
8c32c6e4
Changes
65
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
54 additions
and
192 deletions
+54
-192
vllm/model_executor/models/apertus.py
vllm/model_executor/models/apertus.py
+5
-25
vllm/model_executor/models/arcee.py
vllm/model_executor/models/arcee.py
+2
-8
vllm/model_executor/models/arctic.py
vllm/model_executor/models/arctic.py
+2
-4
vllm/model_executor/models/aria.py
vllm/model_executor/models/aria.py
+2
-6
vllm/model_executor/models/baichuan.py
vllm/model_executor/models/baichuan.py
+2
-2
vllm/model_executor/models/bailing_moe.py
vllm/model_executor/models/bailing_moe.py
+0
-2
vllm/model_executor/models/bamba.py
vllm/model_executor/models/bamba.py
+6
-24
vllm/model_executor/models/chameleon.py
vllm/model_executor/models/chameleon.py
+3
-5
vllm/model_executor/models/chatglm.py
vllm/model_executor/models/chatglm.py
+1
-2
vllm/model_executor/models/commandr.py
vllm/model_executor/models/commandr.py
+6
-13
vllm/model_executor/models/dbrx.py
vllm/model_executor/models/dbrx.py
+2
-7
vllm/model_executor/models/exaone.py
vllm/model_executor/models/exaone.py
+5
-22
vllm/model_executor/models/exaone4.py
vllm/model_executor/models/exaone4.py
+4
-22
vllm/model_executor/models/falcon_h1.py
vllm/model_executor/models/falcon_h1.py
+7
-24
vllm/model_executor/models/gemma.py
vllm/model_executor/models/gemma.py
+0
-2
vllm/model_executor/models/gemma2.py
vllm/model_executor/models/gemma2.py
+1
-2
vllm/model_executor/models/gemma3.py
vllm/model_executor/models/gemma3.py
+1
-2
vllm/model_executor/models/gemma3n.py
vllm/model_executor/models/gemma3n.py
+1
-2
vllm/model_executor/models/glm4.py
vllm/model_executor/models/glm4.py
+0
-2
vllm/model_executor/models/gpt_bigcode.py
vllm/model_executor/models/gpt_bigcode.py
+4
-16
No files found.
vllm/model_executor/models/apertus.py
View file @
9d1c4747
...
...
@@ -49,7 +49,6 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor
from
vllm.model_executor.layers.quantization
import
QuantizationConfig
from
vllm.model_executor.layers.rotary_embedding
import
get_rope
from
vllm.model_executor.layers.vocab_parallel_embedding
import
(
DEFAULT_VOCAB_PADDING_SIZE
,
ParallelLMHead
,
VocabParallelEmbedding
,
)
...
...
@@ -346,24 +345,18 @@ class ApertusModel(nn.Module):
config
=
vllm_config
.
model_config
.
hf_config
cache_config
=
vllm_config
.
cache_config
quant_config
=
vllm_config
.
quant_config
lora_config
=
vllm_config
.
lora_config
self
.
config
=
config
self
.
quant_config
=
quant_config
lora_vocab
=
(
(
lora_config
.
lora_extra_vocab_size
*
(
lora_config
.
max_loras
or
1
))
if
lora_config
else
0
)
self
.
vocab_size
=
config
.
vocab_size
+
lora_vocab
self
.
org_vocab_size
=
config
.
vocab_size
self
.
vocab_size
=
config
.
vocab_size
if
get_pp_group
().
is_first_rank
or
(
config
.
tie_word_embeddings
and
get_pp_group
().
is_last_rank
):
self
.
embed_tokens
=
VocabParallelEmbedding
(
self
.
vocab_size
,
config
.
hidden_size
,
org_num_embeddings
=
config
.
vocab_size
,
quant_config
=
quant_config
,
)
else
:
...
...
@@ -518,9 +511,7 @@ class ApertusForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
super
().
__init__
()
config
=
vllm_config
.
model_config
.
hf_config
quant_config
=
vllm_config
.
quant_config
lora_config
=
vllm_config
.
lora_config
self
.
config
=
config
self
.
lora_config
=
lora_config
self
.
model
=
self
.
_init_model
(
vllm_config
=
vllm_config
,
...
...
@@ -529,20 +520,9 @@ class ApertusForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
)
if
get_pp_group
().
is_last_rank
:
self
.
unpadded_vocab_size
=
config
.
vocab_size
if
lora_config
:
self
.
unpadded_vocab_size
+=
lora_config
.
lora_extra_vocab_size
self
.
lm_head
=
ParallelLMHead
(
self
.
unpadded_
vocab_size
,
config
.
vocab_size
,
config
.
hidden_size
,
org_num_embeddings
=
config
.
vocab_size
,
padding_size
=
(
DEFAULT_VOCAB_PADDING_SIZE
# We need bigger padding if using lora for kernel
# compatibility
if
not
lora_config
else
lora_config
.
lora_vocab_padding_size
),
quant_config
=
quant_config
,
prefix
=
maybe_prefix
(
prefix
,
"lm_head"
),
)
...
...
@@ -551,7 +531,7 @@ class ApertusForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
logit_scale
=
getattr
(
config
,
"logit_scale"
,
1.0
)
self
.
logits_processor
=
LogitsProcessor
(
self
.
unpadded_vocab_size
,
config
.
vocab_size
,
logit_scale
config
.
vocab_size
,
scale
=
logit_scale
)
else
:
self
.
lm_head
=
PPMissingLayer
()
...
...
vllm/model_executor/models/arcee.py
View file @
9d1c4747
...
...
@@ -23,7 +23,6 @@ from vllm.model_executor.layers.layernorm import RMSNorm
from
vllm.model_executor.layers.linear
import
ColumnParallelLinear
,
RowParallelLinear
from
vllm.model_executor.layers.logits_processor
import
LogitsProcessor
from
vllm.model_executor.layers.vocab_parallel_embedding
import
(
DEFAULT_VOCAB_PADDING_SIZE
,
ParallelLMHead
,
VocabParallelEmbedding
,
)
...
...
@@ -200,7 +199,6 @@ class ArceeModel(nn.Module):
self
.
quant_config
=
quant_config
self
.
config
=
config
self
.
vocab_size
=
config
.
vocab_size
self
.
org_vocab_size
=
config
.
vocab_size
# Word embeddings (parallelized if using pipeline parallel)
if
get_pp_group
().
is_first_rank
or
(
...
...
@@ -209,7 +207,6 @@ class ArceeModel(nn.Module):
self
.
embed_tokens
=
VocabParallelEmbedding
(
self
.
vocab_size
,
config
.
hidden_size
,
org_num_embeddings
=
config
.
vocab_size
,
quant_config
=
quant_config
,
)
else
:
...
...
@@ -383,13 +380,10 @@ class ArceeForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
if
get_pp_group
().
is_last_rank
:
# Determine vocabulary size (including any LoRA extra tokens
# for padded LM head)
self
.
unpadded_vocab_size
=
config
.
vocab_size
self
.
lm_head
=
ParallelLMHead
(
self
.
unpadded_
vocab_size
,
config
.
vocab_size
,
config
.
hidden_size
,
org_num_embeddings
=
config
.
vocab_size
,
padding_size
=
DEFAULT_VOCAB_PADDING_SIZE
,
quant_config
=
vllm_config
.
quant_config
,
bias
=
getattr
(
config
,
"lm_head_bias"
,
False
),
prefix
=
f
"
{
prefix
}
.lm_head"
,
...
...
@@ -399,7 +393,7 @@ class ArceeForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
self
.
lm_head
=
self
.
lm_head
.
tie_weights
(
self
.
model
.
embed_tokens
)
logit_scale
=
getattr
(
config
,
"logit_scale"
,
1.0
)
self
.
logits_processor
=
LogitsProcessor
(
self
.
unpadded_vocab_size
,
config
.
vocab_size
,
logit_scale
config
.
vocab_size
,
scale
=
logit_scale
)
else
:
# Placeholder for lm_head on non-last ranks
...
...
vllm/model_executor/models/arctic.py
View file @
9d1c4747
...
...
@@ -490,10 +490,8 @@ class ArcticForCausalLM(nn.Module, SupportsPP, SupportsQuant):
self
.
lm_head
.
weight
=
self
.
model
.
embed_tokens
.
weight
self
.
num_experts
=
config
.
num_local_experts
self
.
num_experts_per_tok
=
config
.
num_experts_per_tok
self
.
unpadded_vocab_size
=
config
.
vocab_size
self
.
logits_processor
=
LogitsProcessor
(
self
.
unpadded_vocab_size
,
config
.
vocab_size
)
self
.
logits_processor
=
LogitsProcessor
(
config
.
vocab_size
)
self
.
make_empty_intermediate_tensors
=
(
self
.
model
.
make_empty_intermediate_tensors
)
...
...
vllm/model_executor/models/aria.py
View file @
9d1c4747
...
...
@@ -547,18 +547,14 @@ class AriaForConditionalGeneration(nn.Module, SupportsMultiModal):
self
.
pad_token_id
=
(
self
.
config
.
pad_token_id
if
self
.
config
.
pad_token_id
is
not
None
else
-
1
)
self
.
unpadded_vocab_size
=
config
.
text_config
.
vocab_size
self
.
lm_head
=
ParallelLMHead
(
self
.
unpadded_
vocab_size
,
self
.
vocab_size
,
config
.
text_config
.
hidden_size
,
org_num_embeddings
=
self
.
language_model
.
org_vocab_size
,
quant_config
=
quant_config
,
prefix
=
maybe_prefix
(
prefix
,
"lm_head"
),
)
logit_scale
=
getattr
(
config
,
"logit_scale"
,
1.0
)
self
.
logits_processor
=
LogitsProcessor
(
self
.
unpadded_vocab_size
,
self
.
vocab_size
,
logit_scale
)
self
.
logits_processor
=
LogitsProcessor
(
self
.
vocab_size
,
scale
=
logit_scale
)
def
_parse_and_validate_image_input
(
self
,
**
kwargs
:
object
...
...
vllm/model_executor/models/baichuan.py
View file @
9d1c4747
...
...
@@ -402,9 +402,9 @@ class BaiChuanBaseForCausalLM(nn.Module, SupportsLoRA, SupportsPP, SupportsQuant
super
().
__init__
()
config
=
vllm_config
.
model_config
.
hf_config
quant_config
=
vllm_config
.
quant_config
lora_config
=
vllm_config
.
lora_config
self
.
config
=
config
self
.
lora_config
=
lora_config
self
.
tp_size
=
get_tensor_model_parallel_world_size
()
self
.
quant_config
=
quant_config
self
.
model
=
BaiChuanModel
(
...
...
vllm/model_executor/models/bailing_moe.py
View file @
9d1c4747
...
...
@@ -581,10 +581,8 @@ class BailingMoeForCausalLM(nn.Module, SupportsPP, SupportsLoRA):
config
=
vllm_config
.
model_config
.
hf_config
.
get_text_config
()
vllm_config
.
model_config
.
hf_config
=
config
quant_config
=
vllm_config
.
quant_config
lora_config
=
vllm_config
.
lora_config
self
.
config
=
config
self
.
lora_config
=
lora_config
self
.
quant_config
=
quant_config
self
.
max_position_embeddings
=
config
.
max_position_embeddings
self
.
model
=
BailingMoeModel
(
...
...
vllm/model_executor/models/bamba.py
View file @
9d1c4747
...
...
@@ -30,7 +30,6 @@ from vllm.model_executor.layers.mamba.mamba_utils import (
from
vllm.model_executor.layers.quantization
import
QuantizationConfig
from
vllm.model_executor.layers.rotary_embedding
import
get_rope
from
vllm.model_executor.layers.vocab_parallel_embedding
import
(
DEFAULT_VOCAB_PADDING_SIZE
,
ParallelLMHead
,
VocabParallelEmbedding
,
)
...
...
@@ -284,21 +283,14 @@ class BambaModel(nn.Module):
model_config
=
vllm_config
.
model_config
cache_config
=
vllm_config
.
cache_config
quant_config
=
vllm_config
.
quant_config
lora_config
=
vllm_config
.
lora_config
self
.
config
=
config
lora_vocab
=
(
(
lora_config
.
lora_extra_vocab_size
*
(
lora_config
.
max_loras
or
1
))
if
lora_config
else
0
)
self
.
vocab_size
=
config
.
vocab_size
+
lora_vocab
self
.
org_vocab_size
=
config
.
vocab_size
self
.
vocab_size
=
config
.
vocab_size
self
.
embed_tokens
=
VocabParallelEmbedding
(
self
.
vocab_size
,
config
.
hidden_size
,
org_num_embeddings
=
config
.
vocab_size
,
)
def
get_layer
(
prefix
:
str
):
...
...
@@ -478,7 +470,7 @@ class BambaForCausalLM(
config
=
vllm_config
.
model_config
.
hf_config
self
.
vllm_config
=
vllm_config
self
.
model_config
=
vllm_config
.
model_config
lora_config
=
vllm_config
.
lora_config
scheduler_config
=
vllm_config
.
scheduler_config
self
.
quant_config
=
vllm_config
.
quant_config
...
...
@@ -488,24 +480,14 @@ class BambaForCausalLM(
self
.
model
=
BambaModel
(
vllm_config
=
vllm_config
,
prefix
=
maybe_prefix
(
prefix
,
"model"
)
)
self
.
unpadded_vocab_size
=
config
.
vocab_size
if
lora_config
:
self
.
unpadded_vocab_size
+=
lora_config
.
lora_extra_vocab_size
self
.
lm_head
=
ParallelLMHead
(
self
.
unpadded_
vocab_size
,
config
.
vocab_size
,
config
.
hidden_size
,
org_num_embeddings
=
config
.
vocab_size
,
padding_size
=
DEFAULT_VOCAB_PADDING_SIZE
# We need bigger padding if using lora for kernel
# compatibility
if
not
lora_config
else
lora_config
.
lora_vocab_padding_size
,
prefix
=
maybe_prefix
(
prefix
,
"lm_head"
),
)
self
.
logits_processor
=
LogitsProcessor
(
self
.
unpadded_vocab_size
,
config
.
vocab_size
)
self
.
logits_processor
=
LogitsProcessor
(
config
.
vocab_size
)
self
.
make_empty_intermediate_tensors
=
(
self
.
model
.
make_empty_intermediate_tensors
...
...
vllm/model_executor/models/chameleon.py
View file @
9d1c4747
...
...
@@ -963,9 +963,9 @@ class ChameleonForConditionalGeneration(
self
.
model
=
ChameleonModel
(
vllm_config
=
vllm_config
,
prefix
=
maybe_prefix
(
prefix
,
"model"
)
)
self
.
unpadded_vocab_size
=
config
.
vocab_size
self
.
lm_head
=
ParallelLMHead
(
self
.
unpadded_
vocab_size
,
config
.
vocab_size
,
config
.
hidden_size
,
prefix
=
maybe_prefix
(
prefix
,
"lm_head"
),
)
...
...
@@ -973,9 +973,7 @@ class ChameleonForConditionalGeneration(
self
.
lm_head
.
weight
=
self
.
model
.
embed_tokens
.
weight
logit_scale
=
getattr
(
config
,
"logit_scale"
,
1.0
)
self
.
logits_processor
=
LogitsProcessor
(
self
.
unpadded_vocab_size
,
config
.
vocab_size
,
logit_scale
)
self
.
logits_processor
=
LogitsProcessor
(
config
.
vocab_size
,
scale
=
logit_scale
)
self
.
make_empty_intermediate_tensors
=
(
self
.
model
.
make_empty_intermediate_tensors
)
...
...
vllm/model_executor/models/chatglm.py
View file @
9d1c4747
...
...
@@ -433,10 +433,9 @@ class ChatGLMBaseModel(nn.Module):
super
().
__init__
()
config
=
vllm_config
.
model_config
.
hf_config
quant_config
=
vllm_config
.
quant_config
lora_config
=
vllm_config
.
lora_config
multimodal_config
=
vllm_config
.
model_config
.
multimodal_config
self
.
config
=
config
self
.
lora_config
=
lora_config
self
.
multimodal_config
=
multimodal_config
self
.
quant_config
=
quant_config
...
...
vllm/model_executor/models/commandr.py
View file @
9d1c4747
...
...
@@ -288,17 +288,12 @@ class CohereModel(nn.Module):
config
=
vllm_config
.
model_config
.
hf_config
cache_config
=
vllm_config
.
cache_config
quant_config
=
vllm_config
.
quant_config
lora_config
=
vllm_config
.
lora_config
self
.
quant_config
=
quant_config
self
.
config
=
config
lora_vocab
=
(
(
lora_config
.
lora_extra_vocab_size
*
(
lora_config
.
max_loras
or
1
))
if
lora_config
else
0
)
self
.
vocab_size
=
config
.
vocab_size
+
lora_vocab
self
.
org_vocab_size
=
config
.
vocab_size
self
.
vocab_size
=
config
.
vocab_size
self
.
embed_tokens
=
VocabParallelEmbedding
(
config
.
vocab_size
,
config
.
hidden_size
)
...
...
@@ -424,17 +419,15 @@ class CohereForCausalLM(nn.Module, SupportsLoRA, SupportsPP, SupportsQuant):
super
().
__init__
()
config
=
vllm_config
.
model_config
.
hf_config
quant_config
=
vllm_config
.
quant_config
lora_config
=
vllm_config
.
lora_config
self
.
config
=
config
# currently all existing command R models have `tie_word_embeddings`
# enabled
assert
config
.
tie_word_embeddings
self
.
unpadded_vocab_size
=
config
.
vocab_size
if
lora_config
:
self
.
unpadded_vocab_size
+=
lora_config
.
lora_extra_vocab_size
self
.
quant_config
=
quant_config
self
.
logits_processor
=
LogitsProcessor
(
self
.
unpadded_vocab_size
,
config
.
vocab_size
,
scale
=
config
.
logit_scale
config
.
vocab_size
,
scale
=
config
.
logit_scale
)
self
.
model
=
CohereModel
(
vllm_config
=
vllm_config
,
prefix
=
maybe_prefix
(
prefix
,
"model"
)
...
...
vllm/model_executor/models/dbrx.py
View file @
9d1c4747
...
...
@@ -25,7 +25,6 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor
from
vllm.model_executor.layers.quantization
import
QuantizationConfig
from
vllm.model_executor.layers.rotary_embedding
import
get_rope
from
vllm.model_executor.layers.vocab_parallel_embedding
import
(
DEFAULT_VOCAB_PADDING_SIZE
,
ParallelLMHead
,
VocabParallelEmbedding
,
)
...
...
@@ -441,21 +440,17 @@ class DbrxForCausalLM(nn.Module, SupportsPP):
if
config
.
tie_word_embeddings
:
raise
ValueError
(
"tie_word_embeddings is not supported for Dbrx models."
)
self
.
quant_config
=
quant_config
self
.
unpadded_vocab_size
=
config
.
vocab_size
self
.
transformer
=
DbrxModel
(
vllm_config
=
vllm_config
,
prefix
=
maybe_prefix
(
prefix
,
"transformer"
)
)
self
.
lm_head
=
ParallelLMHead
(
config
.
vocab_size
,
config
.
d_model
,
org_num_embeddings
=
config
.
vocab_size
,
padding_size
=
DEFAULT_VOCAB_PADDING_SIZE
,
quant_config
=
quant_config
,
prefix
=
maybe_prefix
(
prefix
,
"lm_head"
),
)
self
.
logits_processor
=
LogitsProcessor
(
self
.
unpadded_vocab_size
,
config
.
vocab_size
)
self
.
logits_processor
=
LogitsProcessor
(
config
.
vocab_size
)
self
.
make_empty_intermediate_tensors
=
(
self
.
transformer
.
make_empty_intermediate_tensors
)
...
...
vllm/model_executor/models/exaone.py
View file @
9d1c4747
...
...
@@ -48,7 +48,6 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor
from
vllm.model_executor.layers.quantization
import
QuantizationConfig
from
vllm.model_executor.layers.rotary_embedding
import
get_rope
from
vllm.model_executor.layers.vocab_parallel_embedding
import
(
DEFAULT_VOCAB_PADDING_SIZE
,
ParallelLMHead
,
VocabParallelEmbedding
,
)
...
...
@@ -323,16 +322,11 @@ class ExaoneModel(nn.Module):
config
=
vllm_config
.
model_config
.
hf_config
cache_config
=
vllm_config
.
cache_config
quant_config
=
vllm_config
.
quant_config
lora_config
=
vllm_config
.
lora_config
self
.
config
=
config
self
.
quant_config
=
quant_config
lora_vocab
=
(
(
lora_config
.
lora_extra_vocab_size
*
(
lora_config
.
max_loras
or
1
))
if
lora_config
else
0
)
self
.
vocab_size
=
config
.
vocab_size
+
lora_vocab
self
.
vocab_size
=
config
.
vocab_size
self
.
wte
=
config
.
vocab_size
if
get_pp_group
().
is_first_rank
or
(
config
.
tie_word_embeddings
and
get_pp_group
().
is_last_rank
...
...
@@ -340,7 +334,6 @@ class ExaoneModel(nn.Module):
self
.
wte
=
VocabParallelEmbedding
(
self
.
vocab_size
,
config
.
hidden_size
,
org_num_embeddings
=
config
.
vocab_size
,
quant_config
=
quant_config
,
)
else
:
...
...
@@ -489,10 +482,9 @@ class ExaoneForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
super
().
__init__
()
config
=
vllm_config
.
model_config
.
hf_config
quant_config
=
vllm_config
.
quant_config
lora_config
=
vllm_config
.
lora_config
self
.
config
=
config
self
.
lora_config
=
lora_config
self
.
quant_config
=
quant_config
self
.
transformer
=
ExaoneModel
(
...
...
@@ -500,18 +492,9 @@ class ExaoneForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
prefix
=
maybe_prefix
(
prefix
,
"model"
),
)
if
get_pp_group
().
is_last_rank
:
self
.
unpadded_vocab_size
=
config
.
vocab_size
if
lora_config
:
self
.
unpadded_vocab_size
+=
lora_config
.
lora_extra_vocab_size
self
.
lm_head
=
ParallelLMHead
(
self
.
unpadded_
vocab_size
,
config
.
vocab_size
,
config
.
hidden_size
,
org_num_embeddings
=
config
.
vocab_size
,
padding_size
=
DEFAULT_VOCAB_PADDING_SIZE
# We need bigger padding if using lora for kernel
# compatibility
if
not
lora_config
else
lora_config
.
lora_vocab_padding_size
,
quant_config
=
quant_config
,
prefix
=
maybe_prefix
(
prefix
,
"lm_head"
),
)
...
...
@@ -520,7 +503,7 @@ class ExaoneForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
logit_scale
=
getattr
(
config
,
"logit_scale"
,
1.0
)
self
.
logits_processor
=
LogitsProcessor
(
self
.
unpadded_vocab_size
,
config
.
vocab_size
,
logit_scale
config
.
vocab_size
,
scale
=
logit_scale
)
else
:
self
.
lm_head
=
PPMissingLayer
()
...
...
vllm/model_executor/models/exaone4.py
View file @
9d1c4747
...
...
@@ -44,7 +44,6 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor
from
vllm.model_executor.layers.quantization
import
QuantizationConfig
from
vllm.model_executor.layers.rotary_embedding
import
get_rope
from
vllm.model_executor.layers.vocab_parallel_embedding
import
(
DEFAULT_VOCAB_PADDING_SIZE
,
ParallelLMHead
,
VocabParallelEmbedding
,
)
...
...
@@ -311,23 +310,17 @@ class Exaone4Model(nn.Module):
config
=
vllm_config
.
model_config
.
hf_config
cache_config
=
vllm_config
.
cache_config
quant_config
=
vllm_config
.
quant_config
lora_config
=
vllm_config
.
lora_config
self
.
config
=
config
self
.
quant_config
=
quant_config
lora_vocab
=
(
(
lora_config
.
lora_extra_vocab_size
*
(
lora_config
.
max_loras
or
1
))
if
lora_config
else
0
)
self
.
vocab_size
=
config
.
vocab_size
+
lora_vocab
self
.
vocab_size
=
config
.
vocab_size
if
get_pp_group
().
is_first_rank
or
(
config
.
tie_word_embeddings
and
get_pp_group
().
is_last_rank
):
self
.
embed_tokens
=
VocabParallelEmbedding
(
self
.
vocab_size
,
config
.
hidden_size
,
org_num_embeddings
=
config
.
vocab_size
,
quant_config
=
quant_config
,
)
else
:
...
...
@@ -476,10 +469,8 @@ class Exaone4ForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
super
().
__init__
()
config
=
vllm_config
.
model_config
.
hf_config
quant_config
=
vllm_config
.
quant_config
lora_config
=
vllm_config
.
lora_config
self
.
config
=
config
self
.
lora_config
=
lora_config
self
.
quant_config
=
quant_config
self
.
model
=
Exaone4Model
(
...
...
@@ -487,18 +478,9 @@ class Exaone4ForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
prefix
=
maybe_prefix
(
prefix
,
"model"
),
)
if
get_pp_group
().
is_last_rank
:
self
.
unpadded_vocab_size
=
config
.
vocab_size
if
lora_config
:
self
.
unpadded_vocab_size
+=
lora_config
.
lora_extra_vocab_size
self
.
lm_head
=
ParallelLMHead
(
self
.
unpadded_
vocab_size
,
config
.
vocab_size
,
config
.
hidden_size
,
org_num_embeddings
=
config
.
vocab_size
,
padding_size
=
DEFAULT_VOCAB_PADDING_SIZE
# We need bigger padding if using lora for kernel
# compatibility
if
not
lora_config
else
lora_config
.
lora_vocab_padding_size
,
quant_config
=
quant_config
,
prefix
=
maybe_prefix
(
prefix
,
"lm_head"
),
)
...
...
@@ -507,7 +489,7 @@ class Exaone4ForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
logit_scale
=
getattr
(
config
,
"logit_scale"
,
1.0
)
self
.
logits_processor
=
LogitsProcessor
(
self
.
unpadded_vocab_size
,
config
.
vocab_size
,
logit_scale
config
.
vocab_size
,
scale
=
logit_scale
)
else
:
self
.
lm_head
=
PPMissingLayer
()
...
...
vllm/model_executor/models/falcon_h1.py
View file @
9d1c4747
...
...
@@ -30,7 +30,6 @@ from vllm.model_executor.layers.mamba.mamba_utils import (
from
vllm.model_executor.layers.quantization
import
QuantizationConfig
from
vllm.model_executor.layers.rotary_embedding
import
get_rope
from
vllm.model_executor.layers.vocab_parallel_embedding
import
(
DEFAULT_VOCAB_PADDING_SIZE
,
ParallelLMHead
,
VocabParallelEmbedding
,
)
...
...
@@ -424,21 +423,15 @@ class FalconH1Model(nn.Module):
model_config
=
vllm_config
.
model_config
cache_config
=
vllm_config
.
cache_config
quant_config
=
vllm_config
.
quant_config
lora_config
=
vllm_config
.
lora_config
self
.
config
=
config
lora_vocab
=
(
(
lora_config
.
lora_extra_vocab_size
*
(
lora_config
.
max_loras
or
1
))
if
lora_config
else
0
)
self
.
vocab_size
=
config
.
vocab_size
+
lora_vocab
self
.
org_vocab_size
=
config
.
vocab_size
self
.
vocab_size
=
config
.
vocab_size
if
get_pp_group
().
is_first_rank
:
self
.
embed_tokens
=
VocabParallelEmbedding
(
self
.
vocab_size
,
config
.
hidden_size
,
org_num_embeddings
=
config
.
vocab_size
,
)
self
.
embedding_multiplier
=
config
.
embedding_multiplier
else
:
...
...
@@ -572,7 +565,7 @@ class FalconH1ForCausalLM(
config
=
vllm_config
.
model_config
.
hf_config
self
.
vllm_config
=
vllm_config
self
.
model_config
=
vllm_config
.
model_config
lora_config
=
vllm_config
.
lora_config
scheduler_config
=
vllm_config
.
scheduler_config
self
.
quant_config
=
vllm_config
.
quant_config
...
...
@@ -584,21 +577,11 @@ class FalconH1ForCausalLM(
vllm_config
=
vllm_config
,
prefix
=
maybe_prefix
(
prefix
,
"model"
)
)
self
.
tie_word_embeddings
=
config
.
tie_word_embeddings
self
.
unpadded_vocab_size
=
config
.
vocab_size
if
lora_config
:
self
.
unpadded_vocab_size
+=
lora_config
.
lora_extra_vocab_size
if
get_pp_group
().
is_last_rank
:
self
.
lm_head
=
ParallelLMHead
(
self
.
unpadded_
vocab_size
,
config
.
vocab_size
,
config
.
hidden_size
,
org_num_embeddings
=
config
.
vocab_size
,
padding_size
=
(
DEFAULT_VOCAB_PADDING_SIZE
# We need bigger padding if using lora for kernel
# compatibility
if
not
lora_config
else
lora_config
.
lora_vocab_padding_size
),
prefix
=
maybe_prefix
(
prefix
,
"lm_head"
),
)
self
.
lm_head_multiplier
=
config
.
lm_head_multiplier
...
...
@@ -607,7 +590,7 @@ class FalconH1ForCausalLM(
# Used to track and store by the Mamba cache between steps.
self
.
logits_processor
=
LogitsProcessor
(
self
.
unpadded_
vocab_size
,
config
.
vocab_size
,
config
.
vocab_size
,
scale
=
config
.
lm_head_multiplier
,
)
...
...
vllm/model_executor/models/gemma.py
View file @
9d1c4747
...
...
@@ -382,12 +382,10 @@ class GemmaForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
super
().
__init__
()
config
=
vllm_config
.
model_config
.
hf_config
quant_config
=
vllm_config
.
quant_config
lora_config
=
vllm_config
.
lora_config
self
.
config
=
config
# currently all existing Gemma models have `tie_word_embeddings` enabled
assert
config
.
tie_word_embeddings
self
.
lora_config
=
lora_config
self
.
quant_config
=
quant_config
self
.
model
=
GemmaModel
(
...
...
vllm/model_executor/models/gemma2.py
View file @
9d1c4747
...
...
@@ -393,8 +393,7 @@ class Gemma2ForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
def
__init__
(
self
,
*
,
vllm_config
:
VllmConfig
,
prefix
:
str
=
""
):
config
=
vllm_config
.
model_config
.
hf_config
quant_config
=
vllm_config
.
quant_config
lora_config
=
vllm_config
.
lora_config
del
lora_config
# Unused.
super
().
__init__
()
self
.
config
=
config
# currently all existing Gemma models have `tie_word_embeddings` enabled
...
...
vllm/model_executor/models/gemma3.py
View file @
9d1c4747
...
...
@@ -524,8 +524,7 @@ class Gemma3ForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
def
__init__
(
self
,
*
,
vllm_config
:
VllmConfig
,
prefix
:
str
=
""
):
config
=
vllm_config
.
model_config
.
hf_config
quant_config
=
vllm_config
.
quant_config
lora_config
=
vllm_config
.
lora_config
del
lora_config
# Unused.
super
().
__init__
()
self
.
config
=
config
# currently all existing Gemma models have `tie_word_embeddings` enabled
...
...
vllm/model_executor/models/gemma3n.py
View file @
9d1c4747
...
...
@@ -1114,8 +1114,7 @@ class Gemma3nForCausalLM(nn.Module):
def
__init__
(
self
,
*
,
vllm_config
:
VllmConfig
,
prefix
:
str
=
""
):
config
=
vllm_config
.
model_config
.
hf_config
lora_config
=
vllm_config
.
lora_config
del
lora_config
# Unused.
super
().
__init__
()
self
.
config
=
config
self
.
cache_config
=
vllm_config
.
cache_config
...
...
vllm/model_executor/models/glm4.py
View file @
9d1c4747
...
...
@@ -248,10 +248,8 @@ class Glm4ForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
super
().
__init__
()
config
=
vllm_config
.
model_config
.
hf_config
quant_config
=
vllm_config
.
quant_config
lora_config
=
vllm_config
.
lora_config
self
.
config
=
config
self
.
lora_config
=
lora_config
self
.
quant_config
=
quant_config
self
.
model
=
Glm4Model
(
...
...
vllm/model_executor/models/gpt_bigcode.py
View file @
9d1c4747
...
...
@@ -207,18 +207,13 @@ class GPTBigCodeModel(nn.Module):
config
=
vllm_config
.
model_config
.
hf_config
cache_config
=
vllm_config
.
cache_config
quant_config
=
vllm_config
.
quant_config
lora_config
=
vllm_config
.
lora_config
self
.
config
=
config
assert
not
config
.
add_cross_attention
self
.
embed_dim
=
config
.
hidden_size
lora_vocab
=
(
(
lora_config
.
lora_extra_vocab_size
*
(
lora_config
.
max_loras
or
1
))
if
lora_config
else
0
)
self
.
vocab_size
=
config
.
vocab_size
+
lora_vocab
self
.
vocab_size
=
config
.
vocab_size
self
.
wte
=
VocabParallelEmbedding
(
self
.
vocab_size
,
self
.
embed_dim
,
org_num_embeddings
=
config
.
vocab_size
)
...
...
@@ -290,10 +285,8 @@ class GPTBigCodeForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
super
().
__init__
()
config
=
vllm_config
.
model_config
.
hf_config
quant_config
=
vllm_config
.
quant_config
lora_config
=
vllm_config
.
lora_config
self
.
config
=
config
self
.
lora_config
=
lora_config
self
.
quant_config
=
quant_config
self
.
transformer
=
GPTBigCodeModel
(
...
...
@@ -305,15 +298,10 @@ class GPTBigCodeForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
self
.
lm_head
=
ParallelLMHead
(
self
.
transformer
.
vocab_size
,
self
.
transformer
.
embed_dim
,
org_num_embeddings
=
self
.
config
.
vocab_size
,
prefix
=
maybe_prefix
(
prefix
,
"lm_head"
),
)
self
.
unpadded_vocab_size
=
config
.
vocab_size
if
lora_config
:
self
.
unpadded_vocab_size
+=
lora_config
.
lora_extra_vocab_size
self
.
logits_processor
=
LogitsProcessor
(
self
.
unpadded_vocab_size
,
config
.
vocab_size
)
self
.
logits_processor
=
LogitsProcessor
(
config
.
vocab_size
)
self
.
make_empty_intermediate_tensors
=
(
self
.
transformer
.
make_empty_intermediate_tensors
)
...
...
Prev
1
2
3
4
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment