Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
9d1c4747
Unverified
Commit
9d1c4747
authored
Nov 12, 2025
by
Jee Jee Li
Committed by
GitHub
Nov 11, 2025
Browse files
[LoRA][1/N]Remove LoRA extra vocab (#28382)
Signed-off-by:
Jee Jee Li
<
pandaleefree@gmail.com
>
parent
8c32c6e4
Changes
65
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
76 additions
and
306 deletions
+76
-306
vllm/model_executor/models/granitemoe.py
vllm/model_executor/models/granitemoe.py
+4
-23
vllm/model_executor/models/granitemoehybrid.py
vllm/model_executor/models/granitemoehybrid.py
+5
-22
vllm/model_executor/models/granitemoeshared.py
vllm/model_executor/models/granitemoeshared.py
+5
-23
vllm/model_executor/models/grok1.py
vllm/model_executor/models/grok1.py
+6
-20
vllm/model_executor/models/hunyuan_v1.py
vllm/model_executor/models/hunyuan_v1.py
+6
-15
vllm/model_executor/models/internlm2.py
vllm/model_executor/models/internlm2.py
+0
-2
vllm/model_executor/models/jamba.py
vllm/model_executor/models/jamba.py
+6
-24
vllm/model_executor/models/kimi_vl.py
vllm/model_executor/models/kimi_vl.py
+2
-8
vllm/model_executor/models/lfm2.py
vllm/model_executor/models/lfm2.py
+5
-26
vllm/model_executor/models/lfm2_moe.py
vllm/model_executor/models/lfm2_moe.py
+6
-26
vllm/model_executor/models/llama_eagle3.py
vllm/model_executor/models/llama_eagle3.py
+0
-3
vllm/model_executor/models/longcat_flash.py
vllm/model_executor/models/longcat_flash.py
+1
-2
vllm/model_executor/models/mamba.py
vllm/model_executor/models/mamba.py
+6
-23
vllm/model_executor/models/mamba2.py
vllm/model_executor/models/mamba2.py
+5
-23
vllm/model_executor/models/medusa.py
vllm/model_executor/models/medusa.py
+3
-9
vllm/model_executor/models/mimo.py
vllm/model_executor/models/mimo.py
+0
-2
vllm/model_executor/models/minicpm.py
vllm/model_executor/models/minicpm.py
+7
-23
vllm/model_executor/models/minicpm_eagle.py
vllm/model_executor/models/minicpm_eagle.py
+6
-23
vllm/model_executor/models/minimax_text_01.py
vllm/model_executor/models/minimax_text_01.py
+3
-8
vllm/model_executor/models/mlp_speculator.py
vllm/model_executor/models/mlp_speculator.py
+0
-1
No files found.
vllm/model_executor/models/granitemoe.py
View file @
9d1c4747
...
...
@@ -50,7 +50,6 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor
from
vllm.model_executor.layers.quantization
import
QuantizationConfig
from
vllm.model_executor.layers.rotary_embedding
import
get_rope
from
vllm.model_executor.layers.vocab_parallel_embedding
import
(
DEFAULT_VOCAB_PADDING_SIZE
,
ParallelLMHead
,
VocabParallelEmbedding
,
)
...
...
@@ -296,22 +295,15 @@ class GraniteMoeModel(nn.Module):
config
=
vllm_config
.
model_config
.
hf_config
quant_config
=
vllm_config
.
quant_config
lora_config
=
vllm_config
.
lora_config
self
.
config
=
config
self
.
quant_config
=
quant_config
# Required by MixtralModel
lora_vocab
=
(
(
lora_config
.
lora_extra_vocab_size
*
(
lora_config
.
max_loras
or
1
))
if
lora_config
else
0
)
self
.
vocab_size
=
config
.
vocab_size
+
lora_vocab
self
.
org_vocab_size
=
config
.
vocab_size
self
.
vocab_size
=
config
.
vocab_size
self
.
embed_tokens
=
VocabParallelEmbedding
(
self
.
vocab_size
,
config
.
hidden_size
,
org_num_embeddings
=
config
.
vocab_size
,
)
self
.
embedding_multiplier
=
config
.
embedding_multiplier
...
...
@@ -518,26 +510,16 @@ class GraniteMoeForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
super
().
__init__
()
config
=
vllm_config
.
model_config
.
hf_config
quant_config
=
vllm_config
.
quant_config
lora_config
=
vllm_config
.
lora_config
self
.
config
=
config
self
.
lora_config
=
lora_config
self
.
model
=
GraniteMoeModel
(
vllm_config
=
vllm_config
,
prefix
=
maybe_prefix
(
prefix
,
"model"
)
)
self
.
unpadded_vocab_size
=
config
.
vocab_size
if
lora_config
:
self
.
unpadded_vocab_size
+=
lora_config
.
lora_extra_vocab_size
self
.
lm_head
=
ParallelLMHead
(
self
.
unpadded_
vocab_size
,
config
.
vocab_size
,
config
.
hidden_size
,
org_num_embeddings
=
config
.
vocab_size
,
padding_size
=
DEFAULT_VOCAB_PADDING_SIZE
# We need bigger padding if using lora for kernel
# compatibility
if
not
lora_config
else
lora_config
.
lora_vocab_padding_size
,
quant_config
=
quant_config
,
prefix
=
maybe_prefix
(
prefix
,
"lm_head"
),
)
...
...
@@ -545,7 +527,6 @@ class GraniteMoeForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
self
.
lm_head
.
weight
=
self
.
model
.
embed_tokens
.
weight
self
.
logits_processor
=
LogitsProcessor
(
self
.
unpadded_vocab_size
,
config
.
vocab_size
,
scale
=
1
/
self
.
config
.
logits_scaling
,
)
...
...
vllm/model_executor/models/granitemoehybrid.py
View file @
9d1c4747
...
...
@@ -25,7 +25,6 @@ from vllm.model_executor.layers.mamba.mamba_utils import (
from
vllm.model_executor.layers.quantization
import
QuantizationConfig
from
vllm.model_executor.layers.rotary_embedding
import
get_rope
from
vllm.model_executor.layers.vocab_parallel_embedding
import
(
DEFAULT_VOCAB_PADDING_SIZE
,
ParallelLMHead
,
VocabParallelEmbedding
,
)
...
...
@@ -334,22 +333,15 @@ class GraniteMoeHybridModel(nn.Module):
model_config
=
vllm_config
.
model_config
cache_config
=
vllm_config
.
cache_config
quant_config
=
vllm_config
.
quant_config
lora_config
=
vllm_config
.
lora_config
self
.
config
=
config
self
.
quant_config
=
quant_config
lora_vocab
=
(
(
lora_config
.
lora_extra_vocab_size
*
(
lora_config
.
max_loras
or
1
))
if
lora_config
else
0
)
self
.
vocab_size
=
config
.
vocab_size
+
lora_vocab
self
.
org_vocab_size
=
config
.
vocab_size
self
.
vocab_size
=
config
.
vocab_size
self
.
embed_tokens
=
VocabParallelEmbedding
(
self
.
vocab_size
,
config
.
hidden_size
,
org_num_embeddings
=
config
.
vocab_size
,
)
self
.
embedding_multiplier
=
config
.
embedding_multiplier
...
...
@@ -658,7 +650,7 @@ class GraniteMoeHybridForCausalLM(
config
=
vllm_config
.
model_config
.
hf_config
self
.
vllm_config
=
vllm_config
self
.
model_config
=
vllm_config
.
model_config
lora_config
=
vllm_config
.
lora_config
scheduler_config
=
vllm_config
.
scheduler_config
self
.
quant_config
=
vllm_config
.
quant_config
self
.
config
=
config
...
...
@@ -666,26 +658,17 @@ class GraniteMoeHybridForCausalLM(
self
.
model
=
GraniteMoeHybridModel
(
vllm_config
=
vllm_config
,
prefix
=
maybe_prefix
(
prefix
,
"model"
)
)
self
.
unpadded_vocab_size
=
config
.
vocab_size
if
lora_config
:
self
.
unpadded_vocab_size
+=
lora_config
.
lora_extra_vocab_size
self
.
lm_head
=
ParallelLMHead
(
self
.
unpadded_
vocab_size
,
config
.
vocab_size
,
config
.
hidden_size
,
org_num_embeddings
=
config
.
vocab_size
,
padding_size
=
DEFAULT_VOCAB_PADDING_SIZE
# We need bigger padding if using lora for kernel
# compatibility
if
not
lora_config
else
lora_config
.
lora_vocab_padding_size
,
quant_config
=
self
.
quant_config
,
prefix
=
maybe_prefix
(
prefix
,
"lm_head"
),
)
if
config
.
tie_word_embeddings
:
self
.
lm_head
.
weight
=
self
.
model
.
embed_tokens
.
weight
self
.
logits_processor
=
LogitsProcessor
(
self
.
unpadded_
vocab_size
,
config
.
vocab_size
,
config
.
vocab_size
,
scale
=
1
/
self
.
config
.
logits_scaling
,
)
...
...
vllm/model_executor/models/granitemoeshared.py
View file @
9d1c4747
...
...
@@ -25,7 +25,6 @@ from vllm.model_executor.layers.linear import (
from
vllm.model_executor.layers.logits_processor
import
LogitsProcessor
from
vllm.model_executor.layers.quantization
import
QuantizationConfig
from
vllm.model_executor.layers.vocab_parallel_embedding
import
(
DEFAULT_VOCAB_PADDING_SIZE
,
ParallelLMHead
,
VocabParallelEmbedding
,
)
...
...
@@ -159,23 +158,16 @@ class GraniteMoeSharedModel(nn.Module):
config
=
vllm_config
.
model_config
.
hf_config
cache_config
=
vllm_config
.
cache_config
quant_config
=
vllm_config
.
quant_config
lora_config
=
vllm_config
.
lora_config
self
.
config
=
config
self
.
quant_config
=
quant_config
# Required by MixtralModel
self
.
padding_idx
=
config
.
pad_token_id
lora_vocab
=
(
(
lora_config
.
lora_extra_vocab_size
*
(
lora_config
.
max_loras
or
1
))
if
lora_config
else
0
)
self
.
vocab_size
=
config
.
vocab_size
+
lora_vocab
self
.
org_vocab_size
=
config
.
vocab_size
self
.
vocab_size
=
config
.
vocab_size
self
.
embed_tokens
=
VocabParallelEmbedding
(
self
.
vocab_size
,
config
.
hidden_size
,
org_num_embeddings
=
config
.
vocab_size
,
quant_config
=
quant_config
,
)
self
.
embedding_multiplier
=
config
.
embedding_multiplier
...
...
@@ -281,26 +273,16 @@ class GraniteMoeSharedForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
super
().
__init__
()
config
=
vllm_config
.
model_config
.
hf_config
quant_config
=
vllm_config
.
quant_config
lora_config
=
vllm_config
.
lora_config
self
.
config
=
config
self
.
lora_config
=
lora_config
self
.
model
=
GraniteMoeSharedModel
(
vllm_config
=
vllm_config
,
prefix
=
maybe_prefix
(
prefix
,
"model"
)
)
self
.
unpadded_vocab_size
=
config
.
vocab_size
if
lora_config
:
self
.
unpadded_vocab_size
+=
lora_config
.
lora_extra_vocab_size
self
.
lm_head
=
ParallelLMHead
(
self
.
unpadded_
vocab_size
,
config
.
vocab_size
,
config
.
hidden_size
,
org_num_embeddings
=
config
.
vocab_size
,
padding_size
=
DEFAULT_VOCAB_PADDING_SIZE
# We need bigger padding if using lora for kernel
# compatibility
if
not
lora_config
else
lora_config
.
lora_vocab_padding_size
,
quant_config
=
quant_config
,
prefix
=
maybe_prefix
(
prefix
,
"lm_head"
),
)
...
...
@@ -308,7 +290,7 @@ class GraniteMoeSharedForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
self
.
lm_head
.
weight
=
self
.
model
.
embed_tokens
.
weight
self
.
logits_processor
=
LogitsProcessor
(
self
.
unpadded_
vocab_size
,
config
.
vocab_size
,
config
.
vocab_size
,
scale
=
1
/
self
.
config
.
logits_scaling
,
)
...
...
vllm/model_executor/models/grok1.py
View file @
9d1c4747
...
...
@@ -45,7 +45,6 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor
from
vllm.model_executor.layers.quantization
import
QuantizationConfig
from
vllm.model_executor.layers.rotary_embedding
import
get_rope
from
vllm.model_executor.layers.vocab_parallel_embedding
import
(
DEFAULT_VOCAB_PADDING_SIZE
,
ParallelLMHead
,
VocabParallelEmbedding
,
)
...
...
@@ -305,18 +304,13 @@ class Grok1Model(nn.Module):
config
=
vllm_config
.
model_config
.
hf_config
cache_config
=
vllm_config
.
cache_config
quant_config
=
vllm_config
.
quant_config
lora_config
=
vllm_config
.
lora_config
self
.
config
=
config
self
.
quant_config
=
quant_config
self
.
padding_idx
=
config
.
pad_token_id
lora_vocab
=
(
(
lora_config
.
lora_extra_vocab_size
*
(
lora_config
.
max_loras
or
1
))
if
lora_config
else
0
)
self
.
vocab_size
=
config
.
vocab_size
+
lora_vocab
self
.
org_vocab_size
=
config
.
vocab_size
self
.
vocab_size
=
config
.
vocab_size
self
.
embedding_multiplier_scale
=
getattr
(
config
,
"embedding_multiplier_scale"
,
DEFAULT_EMBEDDING_MULTIPLIER_SCALE
)
...
...
@@ -324,7 +318,6 @@ class Grok1Model(nn.Module):
self
.
embed_tokens
=
VocabParallelEmbedding
(
self
.
vocab_size
,
config
.
hidden_size
,
org_num_embeddings
=
config
.
vocab_size
,
quant_config
=
quant_config
,
)
...
...
@@ -499,25 +492,18 @@ class Grok1ForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
config
=
vllm_config
.
model_config
.
hf_config
quant_config
=
vllm_config
.
quant_config
lora_config
=
vllm_config
.
lora_config
self
.
config
=
config
self
.
lora_config
=
lora_config
self
.
quant_config
=
quant_config
self
.
model
=
Grok1Model
(
vllm_config
=
vllm_config
,
prefix
=
maybe_prefix
(
prefix
,
"model"
)
)
self
.
unpadded_vocab_size
=
config
.
vocab_size
if
lora_config
:
self
.
unpadded_vocab_size
+=
lora_config
.
lora_extra_vocab_size
self
.
lm_head
=
ParallelLMHead
(
self
.
unpadded_
vocab_size
,
config
.
vocab_size
,
config
.
hidden_size
,
org_num_embeddings
=
config
.
vocab_size
,
padding_size
=
DEFAULT_VOCAB_PADDING_SIZE
,
quant_config
=
quant_config
,
prefix
=
maybe_prefix
(
prefix
,
"lm_head"
),
)
...
...
@@ -529,7 +515,7 @@ class Grok1ForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
config
,
"output_multiplier_scale"
,
DEFAULT_OUTPUT_MULTIPLIER_SCALE
)
self
.
logits_processor
=
LogitsProcessor
(
self
.
unpadded_vocab_size
,
config
.
vocab_size
,
self
.
output_multiplier_scale
config
.
vocab_size
,
scale
=
self
.
output_multiplier_scale
)
self
.
make_empty_intermediate_tensors
=
(
...
...
vllm/model_executor/models/hunyuan_v1.py
View file @
9d1c4747
...
...
@@ -57,7 +57,6 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor
from
vllm.model_executor.layers.quantization
import
QuantizationConfig
from
vllm.model_executor.layers.rotary_embedding
import
get_rope
from
vllm.model_executor.layers.vocab_parallel_embedding
import
(
DEFAULT_VOCAB_PADDING_SIZE
,
ParallelLMHead
,
VocabParallelEmbedding
,
)
...
...
@@ -606,7 +605,7 @@ class HunYuanModel(nn.Module):
config
=
vllm_config
.
model_config
.
hf_config
cache_config
=
vllm_config
.
cache_config
quant_config
=
vllm_config
.
quant_config
lora_config
=
vllm_config
.
lora_config
eplb_config
=
vllm_config
.
parallel_config
.
eplb_config
enable_eplb
=
vllm_config
.
parallel_config
.
enable_eplb
self
.
num_redundant_experts
=
eplb_config
.
num_redundant_experts
...
...
@@ -614,20 +613,15 @@ class HunYuanModel(nn.Module):
self
.
config
=
config
self
.
quant_config
=
quant_config
self
.
padding_idx
=
config
.
pad_token_id
lora_vocab
=
(
(
lora_config
.
lora_extra_vocab_size
*
(
lora_config
.
max_loras
or
1
))
if
lora_config
else
0
)
self
.
vocab_size
=
config
.
vocab_size
+
lora_vocab
self
.
org_vocab_size
=
config
.
vocab_size
self
.
vocab_size
=
config
.
vocab_size
if
get_pp_group
().
is_first_rank
or
(
config
.
tie_word_embeddings
and
get_pp_group
().
is_last_rank
):
self
.
embed_tokens
=
VocabParallelEmbedding
(
self
.
vocab_size
,
config
.
hidden_size
,
org_num_embeddings
=
config
.
vocab_size
,
quant_config
=
quant_config
,
)
else
:
...
...
@@ -937,12 +931,9 @@ class HunyuanV1ModelBase(nn.Module, SupportsLoRA, SupportsPP):
self
.
model
=
HunYuanModel
(
vllm_config
=
vllm_config
,
prefix
=
"model"
)
if
get_pp_group
().
is_last_rank
:
self
.
unpadded_vocab_size
=
config
.
vocab_size
self
.
lm_head
=
ParallelLMHead
(
self
.
unpadded_
vocab_size
,
config
.
vocab_size
,
config
.
hidden_size
,
org_num_embeddings
=
config
.
vocab_size
,
padding_size
=
DEFAULT_VOCAB_PADDING_SIZE
,
quant_config
=
quant_config
,
prefix
=
maybe_prefix
(
prefix
,
"lm_head"
),
)
...
...
@@ -951,7 +942,7 @@ class HunyuanV1ModelBase(nn.Module, SupportsLoRA, SupportsPP):
logit_scale
=
getattr
(
config
,
"logit_scale"
,
1.0
)
self
.
logits_processor
=
LogitsProcessor
(
self
.
unpadded_vocab_size
,
config
.
vocab_size
,
logit_scale
config
.
vocab_size
,
scale
=
logit_scale
)
else
:
self
.
lm_head
=
PPMissingLayer
()
...
...
vllm/model_executor/models/internlm2.py
View file @
9d1c4747
...
...
@@ -330,11 +330,9 @@ class InternLM2ForCausalLM(nn.Module, SupportsPP, SupportsLoRA):
super
().
__init__
()
config
=
vllm_config
.
model_config
.
hf_config
quant_config
=
vllm_config
.
quant_config
lora_config
=
vllm_config
.
lora_config
self
.
config
=
config
self
.
quant_config
=
quant_config
self
.
lora_config
=
lora_config
self
.
model
=
model_type
(
vllm_config
=
vllm_config
,
prefix
=
maybe_prefix
(
prefix
,
"model"
)
...
...
vllm/model_executor/models/jamba.py
View file @
9d1c4747
...
...
@@ -30,7 +30,6 @@ from vllm.model_executor.layers.mamba.mamba_utils import (
from
vllm.model_executor.layers.pooler
import
DispatchPooler
,
Pooler
from
vllm.model_executor.layers.quantization
import
QuantizationConfig
from
vllm.model_executor.layers.vocab_parallel_embedding
import
(
DEFAULT_VOCAB_PADDING_SIZE
,
ParallelLMHead
,
VocabParallelEmbedding
,
)
...
...
@@ -307,21 +306,14 @@ class JambaModel(nn.Module):
model_config
=
vllm_config
.
model_config
cache_config
=
vllm_config
.
cache_config
quant_config
=
vllm_config
.
quant_config
lora_config
=
vllm_config
.
lora_config
self
.
config
=
config
lora_vocab
=
(
(
lora_config
.
lora_extra_vocab_size
*
(
lora_config
.
max_loras
or
1
))
if
lora_config
else
0
)
self
.
vocab_size
=
config
.
vocab_size
+
lora_vocab
self
.
org_vocab_size
=
config
.
vocab_size
self
.
vocab_size
=
config
.
vocab_size
self
.
embed_tokens
=
VocabParallelEmbedding
(
self
.
vocab_size
,
config
.
hidden_size
,
org_num_embeddings
=
config
.
vocab_size
,
)
extra_kwargs
=
{
"is_lora_enabled"
:
bool
(
vllm_config
.
lora_config
)}
...
...
@@ -492,7 +484,7 @@ class JambaForCausalLM(
def
__init__
(
self
,
*
,
vllm_config
:
VllmConfig
,
prefix
:
str
=
""
):
config
=
vllm_config
.
model_config
.
hf_config
lora_config
=
vllm_config
.
lora_config
scheduler_config
=
vllm_config
.
scheduler_config
super
().
__init__
()
...
...
@@ -503,24 +495,14 @@ class JambaForCausalLM(
self
.
model
=
JambaModel
(
vllm_config
=
vllm_config
,
prefix
=
maybe_prefix
(
prefix
,
"model"
)
)
self
.
unpadded_vocab_size
=
config
.
vocab_size
if
lora_config
:
self
.
unpadded_vocab_size
+=
lora_config
.
lora_extra_vocab_size
self
.
lm_head
=
ParallelLMHead
(
self
.
unpadded_
vocab_size
,
config
.
vocab_size
,
config
.
hidden_size
,
org_num_embeddings
=
config
.
vocab_size
,
padding_size
=
DEFAULT_VOCAB_PADDING_SIZE
# We need bigger padding if using lora for kernel
# compatibility
if
not
lora_config
else
lora_config
.
lora_vocab_padding_size
,
prefix
=
maybe_prefix
(
prefix
,
"lm_head"
),
)
self
.
logits_processor
=
LogitsProcessor
(
self
.
unpadded_vocab_size
,
config
.
vocab_size
)
self
.
logits_processor
=
LogitsProcessor
(
config
.
vocab_size
)
self
.
make_empty_intermediate_tensors
=
(
self
.
model
.
make_empty_intermediate_tensors
...
...
vllm/model_executor/models/kimi_vl.py
View file @
9d1c4747
...
...
@@ -60,7 +60,6 @@ from vllm.model_executor.layers.fused_moe import FusedMoE
from
vllm.model_executor.layers.linear
import
ReplicatedLinear
from
vllm.model_executor.layers.logits_processor
import
LogitsProcessor
from
vllm.model_executor.layers.vocab_parallel_embedding
import
(
DEFAULT_VOCAB_PADDING_SIZE
,
ParallelLMHead
,
)
from
vllm.model_executor.model_loader.weight_utils
import
(
...
...
@@ -347,13 +346,10 @@ class KimiVLForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
vllm_config
=
sub_vllm_config
,
prefix
=
maybe_prefix
(
prefix
,
"language_model"
),
)
self
.
unpadded_vocab_size
=
config
.
text_config
.
vocab_size
if
get_pp_group
().
is_last_rank
:
self
.
lm_head
=
ParallelLMHead
(
self
.
unpadded_
vocab_size
,
config
.
vocab_size
,
config
.
text_config
.
hidden_size
,
org_num_embeddings
=
self
.
config
.
text_config
.
vocab_size
,
padding_size
=
DEFAULT_VOCAB_PADDING_SIZE
,
prefix
=
maybe_prefix
(
prefix
,
"lm_head"
),
)
else
:
...
...
@@ -362,9 +358,7 @@ class KimiVLForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
self
.
language_model
.
make_empty_intermediate_tensors
)
logit_scale
=
getattr
(
config
,
"logit_scale"
,
1.0
)
self
.
logits_processor
=
LogitsProcessor
(
self
.
unpadded_vocab_size
,
config
.
vocab_size
,
logit_scale
)
self
.
logits_processor
=
LogitsProcessor
(
config
.
vocab_size
,
scale
=
logit_scale
)
self
.
media_placeholder
:
int
=
self
.
config
.
media_placeholder_token_id
def
_parse_and_validate_image_input
(
...
...
vllm/model_executor/models/lfm2.py
View file @
9d1c4747
...
...
@@ -28,7 +28,6 @@ from vllm.model_executor.layers.mamba.short_conv import ShortConv
from
vllm.model_executor.layers.quantization
import
QuantizationConfig
from
vllm.model_executor.layers.rotary_embedding
import
get_rope
from
vllm.model_executor.layers.vocab_parallel_embedding
import
(
DEFAULT_VOCAB_PADDING_SIZE
,
ParallelLMHead
,
VocabParallelEmbedding
,
)
...
...
@@ -316,16 +315,10 @@ class Lfm2Model(nn.Module):
model_config
=
vllm_config
.
model_config
cache_config
=
vllm_config
.
cache_config
quant_config
=
vllm_config
.
quant_config
lora_config
=
vllm_config
.
lora_config
self
.
config
=
config
lora_vocab
=
(
(
lora_config
.
lora_extra_vocab_size
*
(
lora_config
.
max_loras
or
1
))
if
lora_config
else
0
)
self
.
vocab_size
=
config
.
vocab_size
+
lora_vocab
self
.
org_vocab_size
=
config
.
vocab_size
self
.
vocab_size
=
config
.
vocab_size
self
.
embed_tokens
=
VocabParallelEmbedding
(
self
.
vocab_size
,
config
.
hidden_size
,
org_num_embeddings
=
config
.
vocab_size
...
...
@@ -483,7 +476,7 @@ class Lfm2ForCausalLM(
config
=
vllm_config
.
model_config
.
hf_config
quant_config
=
vllm_config
.
quant_config
cache_config
=
vllm_config
.
cache_config
lora_config
=
vllm_config
.
lora_config
assert
not
cache_config
.
enable_prefix_caching
,
(
"Lfm2 currently does not support prefix caching"
)
...
...
@@ -495,21 +488,9 @@ class Lfm2ForCausalLM(
)
if
get_pp_group
().
is_last_rank
:
self
.
unpadded_vocab_size
=
self
.
config
.
vocab_size
if
lora_config
:
self
.
unpadded_vocab_size
+=
lora_config
.
lora_extra_vocab_size
self
.
lm_head
=
ParallelLMHead
(
self
.
unpadded_
vocab_size
,
config
.
vocab_size
,
config
.
hidden_size
,
org_num_embeddings
=
config
.
vocab_size
,
padding_size
=
(
DEFAULT_VOCAB_PADDING_SIZE
# We need bigger padding if using lora for kernel
# compatibility
if
not
lora_config
else
lora_config
.
lora_vocab_padding_size
),
quant_config
=
quant_config
,
prefix
=
maybe_prefix
(
prefix
,
"lm_head"
),
)
...
...
@@ -517,9 +498,7 @@ class Lfm2ForCausalLM(
else
:
self
.
lm_head
=
PPMissingLayer
()
self
.
logits_processor
=
LogitsProcessor
(
self
.
unpadded_vocab_size
,
config
.
vocab_size
)
self
.
logits_processor
=
LogitsProcessor
(
config
.
vocab_size
)
self
.
make_empty_intermediate_tensors
=
(
self
.
model
.
make_empty_intermediate_tensors
...
...
vllm/model_executor/models/lfm2_moe.py
View file @
9d1c4747
...
...
@@ -33,7 +33,6 @@ from vllm.model_executor.layers.mamba.short_conv import ShortConv
from
vllm.model_executor.layers.quantization
import
QuantizationConfig
from
vllm.model_executor.layers.rotary_embedding
import
get_rope
from
vllm.model_executor.layers.vocab_parallel_embedding
import
(
DEFAULT_VOCAB_PADDING_SIZE
,
ParallelLMHead
,
VocabParallelEmbedding
,
)
...
...
@@ -423,20 +422,15 @@ class Lfm2MoeModel(nn.Module):
model_config
=
vllm_config
.
model_config
cache_config
=
vllm_config
.
cache_config
quant_config
=
vllm_config
.
quant_config
lora_config
=
vllm_config
.
lora_config
parallel_config
=
vllm_config
.
parallel_config
enable_eplb
=
parallel_config
.
enable_eplb
eplb_config
=
parallel_config
.
eplb_config
self
.
num_redundant_experts
=
eplb_config
.
num_redundant_experts
self
.
config
=
config
lora_vocab
=
(
(
lora_config
.
lora_extra_vocab_size
*
(
lora_config
.
max_loras
or
1
))
if
lora_config
else
0
)
self
.
vocab_size
=
config
.
vocab_size
+
lora_vocab
self
.
org_vocab_size
=
config
.
vocab_size
self
.
vocab_size
=
config
.
vocab_size
self
.
embed_tokens
=
VocabParallelEmbedding
(
self
.
vocab_size
,
config
.
hidden_size
,
org_num_embeddings
=
config
.
vocab_size
...
...
@@ -662,7 +656,7 @@ class Lfm2MoeForCausalLM(
config
=
vllm_config
.
model_config
.
hf_config
quant_config
=
vllm_config
.
quant_config
cache_config
=
vllm_config
.
cache_config
lora_config
=
vllm_config
.
lora_config
assert
not
cache_config
.
enable_prefix_caching
,
(
"Lfm2Moe currently does not support prefix caching"
)
...
...
@@ -674,21 +668,9 @@ class Lfm2MoeForCausalLM(
)
if
get_pp_group
().
is_last_rank
:
self
.
unpadded_vocab_size
=
self
.
config
.
vocab_size
if
lora_config
:
self
.
unpadded_vocab_size
+=
lora_config
.
lora_extra_vocab_size
self
.
lm_head
=
ParallelLMHead
(
self
.
unpadded_
vocab_size
,
config
.
vocab_size
,
config
.
hidden_size
,
org_num_embeddings
=
config
.
vocab_size
,
padding_size
=
(
DEFAULT_VOCAB_PADDING_SIZE
# We need bigger padding if using lora for kernel
# compatibility
if
not
lora_config
else
lora_config
.
lora_vocab_padding_size
),
quant_config
=
quant_config
,
prefix
=
maybe_prefix
(
prefix
,
"lm_head"
),
)
...
...
@@ -696,9 +678,7 @@ class Lfm2MoeForCausalLM(
else
:
self
.
lm_head
=
PPMissingLayer
()
self
.
logits_processor
=
LogitsProcessor
(
self
.
unpadded_vocab_size
,
config
.
vocab_size
)
self
.
logits_processor
=
LogitsProcessor
(
config
.
vocab_size
)
self
.
make_empty_intermediate_tensors
=
(
self
.
model
.
make_empty_intermediate_tensors
...
...
vllm/model_executor/models/llama_eagle3.py
View file @
9d1c4747
...
...
@@ -15,7 +15,6 @@ from vllm.model_executor.layers.linear import QKVParallelLinear
from
vllm.model_executor.layers.logits_processor
import
LogitsProcessor
from
vllm.model_executor.layers.quantization.base_config
import
QuantizationConfig
from
vllm.model_executor.layers.vocab_parallel_embedding
import
(
DEFAULT_VOCAB_PADDING_SIZE
,
ParallelLMHead
,
VocabParallelEmbedding
,
)
...
...
@@ -252,8 +251,6 @@ class Eagle3LlamaForCausalLM(LlamaForCausalLM):
self
.
lm_head
=
ParallelLMHead
(
self
.
config
.
draft_vocab_size
,
self
.
config
.
hidden_size
,
org_num_embeddings
=
self
.
config
.
draft_vocab_size
,
padding_size
=
(
DEFAULT_VOCAB_PADDING_SIZE
),
prefix
=
maybe_prefix
(
prefix
,
"lm_head"
),
)
self
.
logits_processor
=
LogitsProcessor
(
...
...
vllm/model_executor/models/longcat_flash.py
View file @
9d1c4747
...
...
@@ -554,7 +554,6 @@ class LongcatFlashForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
super
().
__init__
()
config
=
FlashConfig
(
**
vllm_config
.
model_config
.
hf_config
.
__dict__
)
quant_config
=
vllm_config
.
quant_config
lora_config
=
vllm_config
.
lora_config
self
.
config
=
config
config
.
intermediate_size
=
(
...
...
@@ -562,7 +561,7 @@ class LongcatFlashForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
if
hasattr
(
config
,
"ffn_hidden_size"
)
else
config
.
intermediate_size
)
self
.
lora_config
=
lora_config
self
.
quant_config
=
quant_config
self
.
model
=
FlashModel
(
...
...
vllm/model_executor/models/mamba.py
View file @
9d1c4747
...
...
@@ -21,7 +21,6 @@ from vllm.model_executor.layers.mamba.mamba_utils import (
)
from
vllm.model_executor.layers.quantization
import
QuantizationConfig
from
vllm.model_executor.layers.vocab_parallel_embedding
import
(
DEFAULT_VOCAB_PADDING_SIZE
,
ParallelLMHead
,
VocabParallelEmbedding
,
)
...
...
@@ -110,18 +109,12 @@ class MambaModel(nn.Module):
is_lora_enabled
=
bool
(
lora_config
)
self
.
config
=
config
lora_vocab
=
(
(
lora_config
.
lora_extra_vocab_size
*
(
lora_config
.
max_loras
or
1
))
if
lora_config
else
0
)
self
.
vocab_size
=
config
.
vocab_size
+
lora_vocab
self
.
org_vocab_size
=
config
.
vocab_size
self
.
vocab_size
=
config
.
vocab_size
self
.
embeddings
=
VocabParallelEmbedding
(
self
.
vocab_size
,
config
.
hidden_size
,
org_num_embeddings
=
config
.
vocab_size
,
)
self
.
start_layer
,
self
.
end_layer
,
self
.
layers
=
make_layers
(
...
...
@@ -199,7 +192,7 @@ class MambaForCausalLM(
):
def
__init__
(
self
,
*
,
vllm_config
:
VllmConfig
,
prefix
:
str
=
""
):
config
=
vllm_config
.
model_config
.
hf_config
lora_config
=
vllm_config
.
lora_config
self
.
scheduler_config
=
vllm_config
.
scheduler_config
super
().
__init__
()
...
...
@@ -209,27 +202,17 @@ class MambaForCausalLM(
self
.
backbone
=
MambaModel
(
vllm_config
=
vllm_config
,
prefix
=
maybe_prefix
(
prefix
,
"backbone"
)
)
self
.
unpadded_vocab_size
=
config
.
vocab_size
if
lora_config
:
self
.
unpadded_vocab_size
+=
lora_config
.
lora_extra_vocab_size
if
config
.
tie_word_embeddings
:
self
.
lm_head
=
self
.
backbone
.
embeddings
else
:
self
.
lm_head
=
ParallelLMHead
(
self
.
unpadded_
vocab_size
,
config
.
vocab_size
,
config
.
hidden_size
,
org_num_embeddings
=
config
.
vocab_size
,
padding_size
=
DEFAULT_VOCAB_PADDING_SIZE
# We need bigger padding if using lora for kernel
# compatibility
if
not
lora_config
else
lora_config
.
lora_vocab_padding_size
,
prefix
=
maybe_prefix
(
prefix
,
"lm_head"
),
)
self
.
logits_processor
=
LogitsProcessor
(
self
.
unpadded_vocab_size
,
config
.
vocab_size
)
self
.
logits_processor
=
LogitsProcessor
(
config
.
vocab_size
)
self
.
make_empty_intermediate_tensors
=
(
self
.
backbone
.
make_empty_intermediate_tensors
...
...
vllm/model_executor/models/mamba2.py
View file @
9d1c4747
...
...
@@ -20,7 +20,6 @@ from vllm.model_executor.layers.mamba.mamba_utils import (
)
from
vllm.model_executor.layers.quantization
import
QuantizationConfig
from
vllm.model_executor.layers.vocab_parallel_embedding
import
(
DEFAULT_VOCAB_PADDING_SIZE
,
ParallelLMHead
,
VocabParallelEmbedding
,
)
...
...
@@ -107,18 +106,12 @@ class Mamba2Model(nn.Module):
assert
not
is_lora_enabled
self
.
config
=
config
lora_vocab
=
(
(
lora_config
.
lora_extra_vocab_size
*
(
lora_config
.
max_loras
or
1
))
if
lora_config
else
0
)
self
.
vocab_size
=
config
.
vocab_size
+
lora_vocab
self
.
org_vocab_size
=
config
.
vocab_size
self
.
vocab_size
=
config
.
vocab_size
self
.
embeddings
=
VocabParallelEmbedding
(
self
.
vocab_size
,
config
.
hidden_size
,
org_num_embeddings
=
config
.
vocab_size
,
)
self
.
start_layer
,
self
.
end_layer
,
self
.
layers
=
make_layers
(
...
...
@@ -238,7 +231,7 @@ class Mamba2ForCausalLM(
def
__init__
(
self
,
*
,
vllm_config
:
VllmConfig
,
prefix
:
str
=
""
):
config
=
vllm_config
.
model_config
.
hf_config
lora_config
=
vllm_config
.
lora_config
scheduler_config
=
vllm_config
.
scheduler_config
super
().
__init__
()
...
...
@@ -249,27 +242,16 @@ class Mamba2ForCausalLM(
self
.
backbone
=
Mamba2Model
(
vllm_config
=
vllm_config
,
prefix
=
maybe_prefix
(
prefix
,
"backbone"
)
)
self
.
unpadded_vocab_size
=
config
.
vocab_size
if
lora_config
:
self
.
unpadded_vocab_size
+=
lora_config
.
lora_extra_vocab_size
self
.
lm_head
=
ParallelLMHead
(
self
.
unpadded_
vocab_size
,
config
.
vocab_size
,
config
.
hidden_size
,
org_num_embeddings
=
config
.
vocab_size
,
padding_size
=
DEFAULT_VOCAB_PADDING_SIZE
# We need bigger padding if using lora for kernel
# compatibility
if
not
lora_config
else
lora_config
.
lora_vocab_padding_size
,
prefix
=
maybe_prefix
(
prefix
,
"lm_head"
),
)
if
config
.
tie_word_embeddings
:
self
.
lm_head
=
self
.
lm_head
.
tie_weights
(
self
.
backbone
.
embeddings
)
self
.
logits_processor
=
LogitsProcessor
(
self
.
unpadded_vocab_size
,
config
.
vocab_size
)
self
.
logits_processor
=
LogitsProcessor
(
config
.
vocab_size
)
self
.
make_empty_intermediate_tensors
=
(
self
.
backbone
.
make_empty_intermediate_tensors
...
...
vllm/model_executor/models/medusa.py
View file @
9d1c4747
...
...
@@ -9,7 +9,6 @@ import torch.nn as nn
from
vllm.config
import
VllmConfig
from
vllm.model_executor.layers.logits_processor
import
LogitsProcessor
from
vllm.model_executor.layers.vocab_parallel_embedding
import
(
DEFAULT_VOCAB_PADDING_SIZE
,
ParallelLMHead
,
)
from
vllm.model_executor.model_loader.weight_utils
import
default_weight_loader
...
...
@@ -70,14 +69,11 @@ class Medusa(nn.Module):
)
self
.
orig_vocab_size
=
config
.
vocab_size
self
.
truncated_vocab_size
=
config
.
truncated_vocab_size
self
.
unpadded_vocab_size
=
self
.
truncated_vocab_size
if
getattr
(
config
,
"original_lm_head"
,
False
):
self
.
lm_head
=
ParallelLMHead
(
self
.
unpadd
ed_vocab_size
,
self
.
truncat
ed_vocab_size
,
config
.
hidden_size
,
org_num_embeddings
=
self
.
truncated_vocab_size
,
padding_size
=
DEFAULT_VOCAB_PADDING_SIZE
,
prefix
=
maybe_prefix
(
prefix
,
"lm_head"
),
)
self
.
lm_heads
=
[
self
.
lm_head
for
_
in
range
(
self
.
config
.
num_heads
)]
...
...
@@ -85,10 +81,8 @@ class Medusa(nn.Module):
self
.
lm_heads
=
nn
.
ModuleList
(
[
ParallelLMHead
(
self
.
unpadded_
vocab_size
,
config
.
vocab_size
,
config
.
hidden_size
,
org_num_embeddings
=
self
.
truncated_vocab_size
,
padding_size
=
DEFAULT_VOCAB_PADDING_SIZE
,
prefix
=
maybe_prefix
(
prefix
,
f
"lm_heads.
{
i
}
"
),
)
for
i
in
range
(
self
.
config
.
num_heads
)
...
...
@@ -97,7 +91,7 @@ class Medusa(nn.Module):
logit_scale
=
getattr
(
config
,
"logit_scale"
,
1.0
)
self
.
logits_processor
=
LogitsProcessor
(
self
.
unpadded_
vocab_size
,
self
.
truncated_vocab_size
,
logit_scale
config
.
vocab_size
,
self
.
truncated_vocab_size
,
logit_scale
)
# Token map is a idx to token mapping to reduce the vocab size for
...
...
vllm/model_executor/models/mimo.py
View file @
9d1c4747
...
...
@@ -151,10 +151,8 @@ class MiMoForCausalLM(Qwen2ForCausalLM, nn.Module):
nn
.
Module
.
__init__
(
self
)
config
=
vllm_config
.
model_config
.
hf_config
quant_config
=
vllm_config
.
quant_config
lora_config
=
vllm_config
.
lora_config
self
.
config
=
config
self
.
lora_config
=
lora_config
self
.
quant_config
=
quant_config
...
...
vllm/model_executor/models/minicpm.py
View file @
9d1c4747
...
...
@@ -55,7 +55,6 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor
from
vllm.model_executor.layers.quantization
import
QuantizationConfig
from
vllm.model_executor.layers.rotary_embedding
import
get_rope
from
vllm.model_executor.layers.vocab_parallel_embedding
import
(
DEFAULT_VOCAB_PADDING_SIZE
,
ParallelLMHead
,
VocabParallelEmbedding
,
)
...
...
@@ -405,22 +404,16 @@ class MiniCPMModel(nn.Module):
config
=
vllm_config
.
model_config
.
hf_config
cache_config
=
vllm_config
.
cache_config
quant_config
=
vllm_config
.
quant_config
lora_config
=
vllm_config
.
lora_config
self
.
config
=
config
self
.
cache_config
=
cache_config
self
.
quant_config
=
quant_config
lora_vocab
=
(
(
lora_config
.
lora_extra_vocab_size
*
(
lora_config
.
max_loras
or
1
))
if
lora_config
else
0
)
self
.
vocab_size
=
config
.
vocab_size
+
lora_vocab
self
.
org_vocab_size
=
config
.
vocab_size
self
.
vocab_size
=
config
.
vocab_size
self
.
embed_tokens
=
VocabParallelEmbedding
(
self
.
vocab_size
,
config
.
hidden_size
,
org_num_embeddings
=
config
.
vocab_size
,
)
self
.
num_experts
=
getattr
(
self
.
config
,
"num_experts"
,
0
)
self
.
_init_layers
(
prefix
,
config
,
cache_config
,
quant_config
)
...
...
@@ -588,13 +581,13 @@ class MiniCPMForCausalLM(nn.Module, SupportsLoRA, SupportsPP, SupportsEagle3):
config
=
vllm_config
.
model_config
.
hf_config
cache_config
=
vllm_config
.
cache_config
quant_config
=
vllm_config
.
quant_config
lora_config
=
vllm_config
.
lora_config
parallel_config
=
vllm_config
.
parallel_config
self
.
prefix
=
prefix
self
.
vllm_config
=
vllm_config
self
.
config
=
config
self
.
lora_config
=
lora_config
self
.
cache_config
=
cache_config
self
.
quant_config
=
quant_config
...
...
@@ -602,18 +595,9 @@ class MiniCPMForCausalLM(nn.Module, SupportsLoRA, SupportsPP, SupportsEagle3):
vllm_config
=
vllm_config
,
prefix
=
maybe_prefix
(
prefix
,
"model"
)
)
unpadded_vocab_size
=
config
.
vocab_size
if
lora_config
:
unpadded_vocab_size
+=
lora_config
.
lora_extra_vocab_size
self
.
lm_head
=
ParallelLMHead
(
unpadded_
vocab_size
,
config
.
vocab_size
,
config
.
hidden_size
,
org_num_embeddings
=
config
.
vocab_size
,
padding_size
=
DEFAULT_VOCAB_PADDING_SIZE
# We need bigger padding if using lora for kernel
# compatibility
if
not
lora_config
else
lora_config
.
lora_vocab_padding_size
,
quant_config
=
quant_config
,
prefix
=
maybe_prefix
(
prefix
,
"lm_head"
),
)
...
...
@@ -621,7 +605,7 @@ class MiniCPMForCausalLM(nn.Module, SupportsLoRA, SupportsPP, SupportsEagle3):
self
.
lm_head
=
self
.
lm_head
.
tie_weights
(
self
.
model
.
embed_tokens
)
self
.
scale_width
=
self
.
config
.
hidden_size
/
self
.
config
.
dim_model_base
self
.
logits_processor
=
LogitsProcessor
(
unpadded_vocab_size
,
config
.
vocab_size
)
self
.
logits_processor
=
LogitsProcessor
(
config
.
vocab_size
)
self
.
make_empty_intermediate_tensors
=
(
self
.
model
.
make_empty_intermediate_tensors
)
...
...
vllm/model_executor/models/minicpm_eagle.py
View file @
9d1c4747
...
...
@@ -37,7 +37,6 @@ from vllm.model_executor.layers.layernorm import RMSNorm
from
vllm.model_executor.layers.logits_processor
import
LogitsProcessor
from
vllm.model_executor.layers.quantization
import
QuantizationConfig
from
vllm.model_executor.layers.vocab_parallel_embedding
import
(
DEFAULT_VOCAB_PADDING_SIZE
,
ParallelLMHead
,
VocabParallelEmbedding
,
)
...
...
@@ -151,18 +150,13 @@ class EagleMiniCPMModel(nn.Module):
config
=
vllm_config
.
speculative_config
.
draft_model_config
.
hf_config
cache_config
=
vllm_config
.
cache_config
quant_config
=
vllm_config
.
quant_config
lora_config
=
vllm_config
.
lora_config
self
.
config
=
config
self
.
cache_config
=
cache_config
self
.
quant_config
=
quant_config
lora_vocab
=
(
(
lora_config
.
lora_extra_vocab_size
*
(
lora_config
.
max_loras
or
1
))
if
lora_config
else
0
)
self
.
vocab_size
=
config
.
vocab_size
+
lora_vocab
self
.
org_vocab_size
=
config
.
vocab_size
self
.
vocab_size
=
config
.
vocab_size
self
.
fc
=
torch
.
nn
.
Linear
(
self
.
config
.
hidden_size
*
2
,
self
.
config
.
hidden_size
,
bias
=
False
)
...
...
@@ -171,7 +165,6 @@ class EagleMiniCPMModel(nn.Module):
self
.
embed_tokens
=
VocabParallelEmbedding
(
self
.
vocab_size
,
config
.
hidden_size
,
org_num_embeddings
=
config
.
vocab_size
,
)
self
.
num_experts
=
getattr
(
self
.
config
,
"num_experts"
,
0
)
self
.
_init_layers
(
prefix
,
config
,
cache_config
,
quant_config
,
start_layer
)
...
...
@@ -321,12 +314,11 @@ class EagleMiniCPMForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
config
=
vllm_config
.
speculative_config
.
draft_model_config
.
hf_config
cache_config
=
vllm_config
.
cache_config
quant_config
=
vllm_config
.
quant_config
lora_config
=
vllm_config
.
lora_config
self
.
prefix
=
prefix
self
.
vllm_config
=
vllm_config
self
.
config
=
config
self
.
lora_config
=
lora_config
self
.
cache_config
=
cache_config
self
.
quant_config
=
quant_config
...
...
@@ -340,18 +332,9 @@ class EagleMiniCPMForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
start_layer
=
target_layer_num
,
)
unpadded_vocab_size
=
config
.
vocab_size
if
lora_config
:
unpadded_vocab_size
+=
lora_config
.
lora_extra_vocab_size
self
.
lm_head
=
ParallelLMHead
(
unpadded_
vocab_size
,
config
.
vocab_size
,
config
.
hidden_size
,
org_num_embeddings
=
config
.
vocab_size
,
padding_size
=
DEFAULT_VOCAB_PADDING_SIZE
# We need bigger padding if using lora for kernel
# compatibility
if
not
lora_config
else
lora_config
.
lora_vocab_padding_size
,
quant_config
=
quant_config
,
prefix
=
maybe_prefix
(
prefix
,
"lm_head"
),
)
...
...
@@ -359,7 +342,7 @@ class EagleMiniCPMForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
self
.
lm_head
=
self
.
lm_head
.
tie_weights
(
self
.
model
.
embed_tokens
)
self
.
scale_width
=
self
.
config
.
hidden_size
/
self
.
config
.
dim_model_base
self
.
logits_processor
=
LogitsProcessor
(
unpadded_vocab_size
,
config
.
vocab_size
)
self
.
logits_processor
=
LogitsProcessor
(
config
.
vocab_size
)
self
.
make_empty_intermediate_tensors
=
(
self
.
model
.
make_empty_intermediate_tensors
)
...
...
vllm/model_executor/models/minimax_text_01.py
View file @
9d1c4747
...
...
@@ -41,7 +41,6 @@ from vllm.model_executor.layers.mamba.mamba_utils import (
from
vllm.model_executor.layers.quantization
import
QuantizationConfig
from
vllm.model_executor.layers.rotary_embedding
import
get_rope
from
vllm.model_executor.layers.vocab_parallel_embedding
import
(
DEFAULT_VOCAB_PADDING_SIZE
,
ParallelLMHead
,
VocabParallelEmbedding
,
)
...
...
@@ -669,16 +668,14 @@ class MiniMaxText01ForCausalLM(nn.Module, HasInnerState, IsHybrid):
def
__init__
(
self
,
*
,
vllm_config
:
VllmConfig
,
prefix
:
str
=
""
)
->
None
:
super
().
__init__
()
config
=
vllm_config
.
model_config
.
hf_config
lora_config
=
vllm_config
.
lora_config
self
.
config
=
config
self
.
lora_config
=
lora_config
if
not
hasattr
(
config
,
"sliding_window"
):
config
.
sliding_window
=
None
self
.
CONCAT_FFN
=
True
self
.
unpadded_vocab_size
=
self
.
config
.
vocab_size
if
hasattr
(
vllm_config
.
model_config
,
"max_model_len"
):
self
.
config
.
max_model_len
=
vllm_config
.
model_config
.
max_model_len
self
.
model
=
MiniMaxText01Model
(
...
...
@@ -686,15 +683,13 @@ class MiniMaxText01ForCausalLM(nn.Module, HasInnerState, IsHybrid):
)
if
get_pp_group
().
is_last_rank
:
self
.
lm_head
=
ParallelLMHead
(
self
.
unpadded_
vocab_size
,
config
.
vocab_size
,
self
.
config
.
hidden_size
,
org_num_embeddings
=
self
.
config
.
vocab_size
,
padding_size
=
DEFAULT_VOCAB_PADDING_SIZE
,
prefix
=
maybe_prefix
(
prefix
,
"lm_head"
),
)
self
.
logits_processor
=
LogitsProcessor
(
self
.
unpadded_
vocab_size
,
self
.
config
.
vocab_size
config
.
vocab_size
,
self
.
config
.
vocab_size
)
else
:
...
...
vllm/model_executor/models/mlp_speculator.py
View file @
9d1c4747
...
...
@@ -123,7 +123,6 @@ class MLPSpeculator(nn.Module):
VocabParallelEmbedding
(
config
.
vocab_size
,
self
.
inner_dim
,
org_num_embeddings
=
config
.
vocab_size
,
)
for
_
in
range
(
self
.
max_speculative_tokens
)
]
...
...
Prev
1
2
3
4
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment