"vllm/vscode:/vscode.git/clone" did not exist on "4e8af7e899ed286c77922d6002b9759ae859aa1f"
Unverified commit 9d1c4747, authored by Jee Jee Li, committed by GitHub
Browse files

[LoRA][1/N]Remove LoRA extra vocab (#28382)


Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
parent 8c32c6e4
...@@ -50,7 +50,6 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor ...@@ -50,7 +50,6 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor
from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.rotary_embedding import get_rope
from vllm.model_executor.layers.vocab_parallel_embedding import ( from vllm.model_executor.layers.vocab_parallel_embedding import (
DEFAULT_VOCAB_PADDING_SIZE,
ParallelLMHead, ParallelLMHead,
VocabParallelEmbedding, VocabParallelEmbedding,
) )
...@@ -296,22 +295,15 @@ class GraniteMoeModel(nn.Module): ...@@ -296,22 +295,15 @@ class GraniteMoeModel(nn.Module):
config = vllm_config.model_config.hf_config config = vllm_config.model_config.hf_config
quant_config = vllm_config.quant_config quant_config = vllm_config.quant_config
lora_config = vllm_config.lora_config
self.config = config self.config = config
self.quant_config = quant_config # Required by MixtralModel self.quant_config = quant_config # Required by MixtralModel
lora_vocab = (
(lora_config.lora_extra_vocab_size * (lora_config.max_loras or 1)) self.vocab_size = config.vocab_size
if lora_config
else 0
)
self.vocab_size = config.vocab_size + lora_vocab
self.org_vocab_size = config.vocab_size
self.embed_tokens = VocabParallelEmbedding( self.embed_tokens = VocabParallelEmbedding(
self.vocab_size, self.vocab_size,
config.hidden_size, config.hidden_size,
org_num_embeddings=config.vocab_size,
) )
self.embedding_multiplier = config.embedding_multiplier self.embedding_multiplier = config.embedding_multiplier
...@@ -518,26 +510,16 @@ class GraniteMoeForCausalLM(nn.Module, SupportsLoRA, SupportsPP): ...@@ -518,26 +510,16 @@ class GraniteMoeForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
super().__init__() super().__init__()
config = vllm_config.model_config.hf_config config = vllm_config.model_config.hf_config
quant_config = vllm_config.quant_config quant_config = vllm_config.quant_config
lora_config = vllm_config.lora_config
self.config = config self.config = config
self.lora_config = lora_config
self.model = GraniteMoeModel( self.model = GraniteMoeModel(
vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model") vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")
) )
self.unpadded_vocab_size = config.vocab_size
if lora_config:
self.unpadded_vocab_size += lora_config.lora_extra_vocab_size
self.lm_head = ParallelLMHead( self.lm_head = ParallelLMHead(
self.unpadded_vocab_size, config.vocab_size,
config.hidden_size, config.hidden_size,
org_num_embeddings=config.vocab_size,
padding_size=DEFAULT_VOCAB_PADDING_SIZE
# We need bigger padding if using lora for kernel
# compatibility
if not lora_config
else lora_config.lora_vocab_padding_size,
quant_config=quant_config, quant_config=quant_config,
prefix=maybe_prefix(prefix, "lm_head"), prefix=maybe_prefix(prefix, "lm_head"),
) )
...@@ -545,7 +527,6 @@ class GraniteMoeForCausalLM(nn.Module, SupportsLoRA, SupportsPP): ...@@ -545,7 +527,6 @@ class GraniteMoeForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
self.lm_head.weight = self.model.embed_tokens.weight self.lm_head.weight = self.model.embed_tokens.weight
self.logits_processor = LogitsProcessor( self.logits_processor = LogitsProcessor(
self.unpadded_vocab_size,
config.vocab_size, config.vocab_size,
scale=1 / self.config.logits_scaling, scale=1 / self.config.logits_scaling,
) )
......
...@@ -25,7 +25,6 @@ from vllm.model_executor.layers.mamba.mamba_utils import ( ...@@ -25,7 +25,6 @@ from vllm.model_executor.layers.mamba.mamba_utils import (
from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.rotary_embedding import get_rope
from vllm.model_executor.layers.vocab_parallel_embedding import ( from vllm.model_executor.layers.vocab_parallel_embedding import (
DEFAULT_VOCAB_PADDING_SIZE,
ParallelLMHead, ParallelLMHead,
VocabParallelEmbedding, VocabParallelEmbedding,
) )
...@@ -334,22 +333,15 @@ class GraniteMoeHybridModel(nn.Module): ...@@ -334,22 +333,15 @@ class GraniteMoeHybridModel(nn.Module):
model_config = vllm_config.model_config model_config = vllm_config.model_config
cache_config = vllm_config.cache_config cache_config = vllm_config.cache_config
quant_config = vllm_config.quant_config quant_config = vllm_config.quant_config
lora_config = vllm_config.lora_config
self.config = config self.config = config
self.quant_config = quant_config self.quant_config = quant_config
lora_vocab = (
(lora_config.lora_extra_vocab_size * (lora_config.max_loras or 1)) self.vocab_size = config.vocab_size
if lora_config
else 0
)
self.vocab_size = config.vocab_size + lora_vocab
self.org_vocab_size = config.vocab_size
self.embed_tokens = VocabParallelEmbedding( self.embed_tokens = VocabParallelEmbedding(
self.vocab_size, self.vocab_size,
config.hidden_size, config.hidden_size,
org_num_embeddings=config.vocab_size,
) )
self.embedding_multiplier = config.embedding_multiplier self.embedding_multiplier = config.embedding_multiplier
...@@ -658,7 +650,7 @@ class GraniteMoeHybridForCausalLM( ...@@ -658,7 +650,7 @@ class GraniteMoeHybridForCausalLM(
config = vllm_config.model_config.hf_config config = vllm_config.model_config.hf_config
self.vllm_config = vllm_config self.vllm_config = vllm_config
self.model_config = vllm_config.model_config self.model_config = vllm_config.model_config
lora_config = vllm_config.lora_config
scheduler_config = vllm_config.scheduler_config scheduler_config = vllm_config.scheduler_config
self.quant_config = vllm_config.quant_config self.quant_config = vllm_config.quant_config
self.config = config self.config = config
...@@ -666,26 +658,17 @@ class GraniteMoeHybridForCausalLM( ...@@ -666,26 +658,17 @@ class GraniteMoeHybridForCausalLM(
self.model = GraniteMoeHybridModel( self.model = GraniteMoeHybridModel(
vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model") vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")
) )
self.unpadded_vocab_size = config.vocab_size
if lora_config:
self.unpadded_vocab_size += lora_config.lora_extra_vocab_size
self.lm_head = ParallelLMHead( self.lm_head = ParallelLMHead(
self.unpadded_vocab_size, config.vocab_size,
config.hidden_size, config.hidden_size,
org_num_embeddings=config.vocab_size,
padding_size=DEFAULT_VOCAB_PADDING_SIZE
# We need bigger padding if using lora for kernel
# compatibility
if not lora_config
else lora_config.lora_vocab_padding_size,
quant_config=self.quant_config, quant_config=self.quant_config,
prefix=maybe_prefix(prefix, "lm_head"), prefix=maybe_prefix(prefix, "lm_head"),
) )
if config.tie_word_embeddings: if config.tie_word_embeddings:
self.lm_head.weight = self.model.embed_tokens.weight self.lm_head.weight = self.model.embed_tokens.weight
self.logits_processor = LogitsProcessor( self.logits_processor = LogitsProcessor(
self.unpadded_vocab_size, config.vocab_size,
config.vocab_size, config.vocab_size,
scale=1 / self.config.logits_scaling, scale=1 / self.config.logits_scaling,
) )
......
...@@ -25,7 +25,6 @@ from vllm.model_executor.layers.linear import ( ...@@ -25,7 +25,6 @@ from vllm.model_executor.layers.linear import (
from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.logits_processor import LogitsProcessor
from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.model_executor.layers.vocab_parallel_embedding import ( from vllm.model_executor.layers.vocab_parallel_embedding import (
DEFAULT_VOCAB_PADDING_SIZE,
ParallelLMHead, ParallelLMHead,
VocabParallelEmbedding, VocabParallelEmbedding,
) )
...@@ -159,23 +158,16 @@ class GraniteMoeSharedModel(nn.Module): ...@@ -159,23 +158,16 @@ class GraniteMoeSharedModel(nn.Module):
config = vllm_config.model_config.hf_config config = vllm_config.model_config.hf_config
cache_config = vllm_config.cache_config cache_config = vllm_config.cache_config
quant_config = vllm_config.quant_config quant_config = vllm_config.quant_config
lora_config = vllm_config.lora_config
self.config = config self.config = config
self.quant_config = quant_config # Required by MixtralModel self.quant_config = quant_config # Required by MixtralModel
self.padding_idx = config.pad_token_id self.padding_idx = config.pad_token_id
lora_vocab = (
(lora_config.lora_extra_vocab_size * (lora_config.max_loras or 1)) self.vocab_size = config.vocab_size
if lora_config
else 0
)
self.vocab_size = config.vocab_size + lora_vocab
self.org_vocab_size = config.vocab_size
self.embed_tokens = VocabParallelEmbedding( self.embed_tokens = VocabParallelEmbedding(
self.vocab_size, self.vocab_size,
config.hidden_size, config.hidden_size,
org_num_embeddings=config.vocab_size,
quant_config=quant_config, quant_config=quant_config,
) )
self.embedding_multiplier = config.embedding_multiplier self.embedding_multiplier = config.embedding_multiplier
...@@ -281,26 +273,16 @@ class GraniteMoeSharedForCausalLM(nn.Module, SupportsLoRA, SupportsPP): ...@@ -281,26 +273,16 @@ class GraniteMoeSharedForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
super().__init__() super().__init__()
config = vllm_config.model_config.hf_config config = vllm_config.model_config.hf_config
quant_config = vllm_config.quant_config quant_config = vllm_config.quant_config
lora_config = vllm_config.lora_config
self.config = config self.config = config
self.lora_config = lora_config
self.model = GraniteMoeSharedModel( self.model = GraniteMoeSharedModel(
vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model") vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")
) )
self.unpadded_vocab_size = config.vocab_size
if lora_config:
self.unpadded_vocab_size += lora_config.lora_extra_vocab_size
self.lm_head = ParallelLMHead( self.lm_head = ParallelLMHead(
self.unpadded_vocab_size, config.vocab_size,
config.hidden_size, config.hidden_size,
org_num_embeddings=config.vocab_size,
padding_size=DEFAULT_VOCAB_PADDING_SIZE
# We need bigger padding if using lora for kernel
# compatibility
if not lora_config
else lora_config.lora_vocab_padding_size,
quant_config=quant_config, quant_config=quant_config,
prefix=maybe_prefix(prefix, "lm_head"), prefix=maybe_prefix(prefix, "lm_head"),
) )
...@@ -308,7 +290,7 @@ class GraniteMoeSharedForCausalLM(nn.Module, SupportsLoRA, SupportsPP): ...@@ -308,7 +290,7 @@ class GraniteMoeSharedForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
self.lm_head.weight = self.model.embed_tokens.weight self.lm_head.weight = self.model.embed_tokens.weight
self.logits_processor = LogitsProcessor( self.logits_processor = LogitsProcessor(
self.unpadded_vocab_size, config.vocab_size,
config.vocab_size, config.vocab_size,
scale=1 / self.config.logits_scaling, scale=1 / self.config.logits_scaling,
) )
......
...@@ -45,7 +45,6 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor ...@@ -45,7 +45,6 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor
from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.rotary_embedding import get_rope
from vllm.model_executor.layers.vocab_parallel_embedding import ( from vllm.model_executor.layers.vocab_parallel_embedding import (
DEFAULT_VOCAB_PADDING_SIZE,
ParallelLMHead, ParallelLMHead,
VocabParallelEmbedding, VocabParallelEmbedding,
) )
...@@ -305,18 +304,13 @@ class Grok1Model(nn.Module): ...@@ -305,18 +304,13 @@ class Grok1Model(nn.Module):
config = vllm_config.model_config.hf_config config = vllm_config.model_config.hf_config
cache_config = vllm_config.cache_config cache_config = vllm_config.cache_config
quant_config = vllm_config.quant_config quant_config = vllm_config.quant_config
lora_config = vllm_config.lora_config
self.config = config self.config = config
self.quant_config = quant_config self.quant_config = quant_config
self.padding_idx = config.pad_token_id self.padding_idx = config.pad_token_id
lora_vocab = (
(lora_config.lora_extra_vocab_size * (lora_config.max_loras or 1)) self.vocab_size = config.vocab_size
if lora_config
else 0
)
self.vocab_size = config.vocab_size + lora_vocab
self.org_vocab_size = config.vocab_size
self.embedding_multiplier_scale = getattr( self.embedding_multiplier_scale = getattr(
config, "embedding_multiplier_scale", DEFAULT_EMBEDDING_MULTIPLIER_SCALE config, "embedding_multiplier_scale", DEFAULT_EMBEDDING_MULTIPLIER_SCALE
) )
...@@ -324,7 +318,6 @@ class Grok1Model(nn.Module): ...@@ -324,7 +318,6 @@ class Grok1Model(nn.Module):
self.embed_tokens = VocabParallelEmbedding( self.embed_tokens = VocabParallelEmbedding(
self.vocab_size, self.vocab_size,
config.hidden_size, config.hidden_size,
org_num_embeddings=config.vocab_size,
quant_config=quant_config, quant_config=quant_config,
) )
...@@ -499,25 +492,18 @@ class Grok1ForCausalLM(nn.Module, SupportsLoRA, SupportsPP): ...@@ -499,25 +492,18 @@ class Grok1ForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
config = vllm_config.model_config.hf_config config = vllm_config.model_config.hf_config
quant_config = vllm_config.quant_config quant_config = vllm_config.quant_config
lora_config = vllm_config.lora_config
self.config = config self.config = config
self.lora_config = lora_config
self.quant_config = quant_config self.quant_config = quant_config
self.model = Grok1Model( self.model = Grok1Model(
vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model") vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")
) )
self.unpadded_vocab_size = config.vocab_size
if lora_config:
self.unpadded_vocab_size += lora_config.lora_extra_vocab_size
self.lm_head = ParallelLMHead( self.lm_head = ParallelLMHead(
self.unpadded_vocab_size, config.vocab_size,
config.hidden_size, config.hidden_size,
org_num_embeddings=config.vocab_size,
padding_size=DEFAULT_VOCAB_PADDING_SIZE,
quant_config=quant_config, quant_config=quant_config,
prefix=maybe_prefix(prefix, "lm_head"), prefix=maybe_prefix(prefix, "lm_head"),
) )
...@@ -529,7 +515,7 @@ class Grok1ForCausalLM(nn.Module, SupportsLoRA, SupportsPP): ...@@ -529,7 +515,7 @@ class Grok1ForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
config, "output_multiplier_scale", DEFAULT_OUTPUT_MULTIPLIER_SCALE config, "output_multiplier_scale", DEFAULT_OUTPUT_MULTIPLIER_SCALE
) )
self.logits_processor = LogitsProcessor( self.logits_processor = LogitsProcessor(
self.unpadded_vocab_size, config.vocab_size, self.output_multiplier_scale config.vocab_size, scale=self.output_multiplier_scale
) )
self.make_empty_intermediate_tensors = ( self.make_empty_intermediate_tensors = (
......
...@@ -57,7 +57,6 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor ...@@ -57,7 +57,6 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor
from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.rotary_embedding import get_rope
from vllm.model_executor.layers.vocab_parallel_embedding import ( from vllm.model_executor.layers.vocab_parallel_embedding import (
DEFAULT_VOCAB_PADDING_SIZE,
ParallelLMHead, ParallelLMHead,
VocabParallelEmbedding, VocabParallelEmbedding,
) )
...@@ -606,7 +605,7 @@ class HunYuanModel(nn.Module): ...@@ -606,7 +605,7 @@ class HunYuanModel(nn.Module):
config = vllm_config.model_config.hf_config config = vllm_config.model_config.hf_config
cache_config = vllm_config.cache_config cache_config = vllm_config.cache_config
quant_config = vllm_config.quant_config quant_config = vllm_config.quant_config
lora_config = vllm_config.lora_config
eplb_config = vllm_config.parallel_config.eplb_config eplb_config = vllm_config.parallel_config.eplb_config
enable_eplb = vllm_config.parallel_config.enable_eplb enable_eplb = vllm_config.parallel_config.enable_eplb
self.num_redundant_experts = eplb_config.num_redundant_experts self.num_redundant_experts = eplb_config.num_redundant_experts
...@@ -614,20 +613,15 @@ class HunYuanModel(nn.Module): ...@@ -614,20 +613,15 @@ class HunYuanModel(nn.Module):
self.config = config self.config = config
self.quant_config = quant_config self.quant_config = quant_config
self.padding_idx = config.pad_token_id self.padding_idx = config.pad_token_id
lora_vocab = (
(lora_config.lora_extra_vocab_size * (lora_config.max_loras or 1)) self.vocab_size = config.vocab_size
if lora_config
else 0
)
self.vocab_size = config.vocab_size + lora_vocab
self.org_vocab_size = config.vocab_size
if get_pp_group().is_first_rank or ( if get_pp_group().is_first_rank or (
config.tie_word_embeddings and get_pp_group().is_last_rank config.tie_word_embeddings and get_pp_group().is_last_rank
): ):
self.embed_tokens = VocabParallelEmbedding( self.embed_tokens = VocabParallelEmbedding(
self.vocab_size, self.vocab_size,
config.hidden_size, config.hidden_size,
org_num_embeddings=config.vocab_size,
quant_config=quant_config, quant_config=quant_config,
) )
else: else:
...@@ -937,12 +931,9 @@ class HunyuanV1ModelBase(nn.Module, SupportsLoRA, SupportsPP): ...@@ -937,12 +931,9 @@ class HunyuanV1ModelBase(nn.Module, SupportsLoRA, SupportsPP):
self.model = HunYuanModel(vllm_config=vllm_config, prefix="model") self.model = HunYuanModel(vllm_config=vllm_config, prefix="model")
if get_pp_group().is_last_rank: if get_pp_group().is_last_rank:
self.unpadded_vocab_size = config.vocab_size
self.lm_head = ParallelLMHead( self.lm_head = ParallelLMHead(
self.unpadded_vocab_size, config.vocab_size,
config.hidden_size, config.hidden_size,
org_num_embeddings=config.vocab_size,
padding_size=DEFAULT_VOCAB_PADDING_SIZE,
quant_config=quant_config, quant_config=quant_config,
prefix=maybe_prefix(prefix, "lm_head"), prefix=maybe_prefix(prefix, "lm_head"),
) )
...@@ -951,7 +942,7 @@ class HunyuanV1ModelBase(nn.Module, SupportsLoRA, SupportsPP): ...@@ -951,7 +942,7 @@ class HunyuanV1ModelBase(nn.Module, SupportsLoRA, SupportsPP):
logit_scale = getattr(config, "logit_scale", 1.0) logit_scale = getattr(config, "logit_scale", 1.0)
self.logits_processor = LogitsProcessor( self.logits_processor = LogitsProcessor(
self.unpadded_vocab_size, config.vocab_size, logit_scale config.vocab_size, scale=logit_scale
) )
else: else:
self.lm_head = PPMissingLayer() self.lm_head = PPMissingLayer()
......
...@@ -330,11 +330,9 @@ class InternLM2ForCausalLM(nn.Module, SupportsPP, SupportsLoRA): ...@@ -330,11 +330,9 @@ class InternLM2ForCausalLM(nn.Module, SupportsPP, SupportsLoRA):
super().__init__() super().__init__()
config = vllm_config.model_config.hf_config config = vllm_config.model_config.hf_config
quant_config = vllm_config.quant_config quant_config = vllm_config.quant_config
lora_config = vllm_config.lora_config
self.config = config self.config = config
self.quant_config = quant_config self.quant_config = quant_config
self.lora_config = lora_config
self.model = model_type( self.model = model_type(
vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model") vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")
......
...@@ -30,7 +30,6 @@ from vllm.model_executor.layers.mamba.mamba_utils import ( ...@@ -30,7 +30,6 @@ from vllm.model_executor.layers.mamba.mamba_utils import (
from vllm.model_executor.layers.pooler import DispatchPooler, Pooler from vllm.model_executor.layers.pooler import DispatchPooler, Pooler
from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.model_executor.layers.vocab_parallel_embedding import ( from vllm.model_executor.layers.vocab_parallel_embedding import (
DEFAULT_VOCAB_PADDING_SIZE,
ParallelLMHead, ParallelLMHead,
VocabParallelEmbedding, VocabParallelEmbedding,
) )
...@@ -307,21 +306,14 @@ class JambaModel(nn.Module): ...@@ -307,21 +306,14 @@ class JambaModel(nn.Module):
model_config = vllm_config.model_config model_config = vllm_config.model_config
cache_config = vllm_config.cache_config cache_config = vllm_config.cache_config
quant_config = vllm_config.quant_config quant_config = vllm_config.quant_config
lora_config = vllm_config.lora_config
self.config = config self.config = config
lora_vocab = (
(lora_config.lora_extra_vocab_size * (lora_config.max_loras or 1)) self.vocab_size = config.vocab_size
if lora_config
else 0
)
self.vocab_size = config.vocab_size + lora_vocab
self.org_vocab_size = config.vocab_size
self.embed_tokens = VocabParallelEmbedding( self.embed_tokens = VocabParallelEmbedding(
self.vocab_size, self.vocab_size,
config.hidden_size, config.hidden_size,
org_num_embeddings=config.vocab_size,
) )
extra_kwargs = {"is_lora_enabled": bool(vllm_config.lora_config)} extra_kwargs = {"is_lora_enabled": bool(vllm_config.lora_config)}
...@@ -492,7 +484,7 @@ class JambaForCausalLM( ...@@ -492,7 +484,7 @@ class JambaForCausalLM(
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
config = vllm_config.model_config.hf_config config = vllm_config.model_config.hf_config
lora_config = vllm_config.lora_config
scheduler_config = vllm_config.scheduler_config scheduler_config = vllm_config.scheduler_config
super().__init__() super().__init__()
...@@ -503,24 +495,14 @@ class JambaForCausalLM( ...@@ -503,24 +495,14 @@ class JambaForCausalLM(
self.model = JambaModel( self.model = JambaModel(
vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model") vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")
) )
self.unpadded_vocab_size = config.vocab_size
if lora_config:
self.unpadded_vocab_size += lora_config.lora_extra_vocab_size
self.lm_head = ParallelLMHead( self.lm_head = ParallelLMHead(
self.unpadded_vocab_size, config.vocab_size,
config.hidden_size, config.hidden_size,
org_num_embeddings=config.vocab_size,
padding_size=DEFAULT_VOCAB_PADDING_SIZE
# We need bigger padding if using lora for kernel
# compatibility
if not lora_config
else lora_config.lora_vocab_padding_size,
prefix=maybe_prefix(prefix, "lm_head"), prefix=maybe_prefix(prefix, "lm_head"),
) )
self.logits_processor = LogitsProcessor( self.logits_processor = LogitsProcessor(config.vocab_size)
self.unpadded_vocab_size, config.vocab_size
)
self.make_empty_intermediate_tensors = ( self.make_empty_intermediate_tensors = (
self.model.make_empty_intermediate_tensors self.model.make_empty_intermediate_tensors
......
...@@ -60,7 +60,6 @@ from vllm.model_executor.layers.fused_moe import FusedMoE ...@@ -60,7 +60,6 @@ from vllm.model_executor.layers.fused_moe import FusedMoE
from vllm.model_executor.layers.linear import ReplicatedLinear from vllm.model_executor.layers.linear import ReplicatedLinear
from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.logits_processor import LogitsProcessor
from vllm.model_executor.layers.vocab_parallel_embedding import ( from vllm.model_executor.layers.vocab_parallel_embedding import (
DEFAULT_VOCAB_PADDING_SIZE,
ParallelLMHead, ParallelLMHead,
) )
from vllm.model_executor.model_loader.weight_utils import ( from vllm.model_executor.model_loader.weight_utils import (
...@@ -347,13 +346,10 @@ class KimiVLForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): ...@@ -347,13 +346,10 @@ class KimiVLForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
vllm_config=sub_vllm_config, vllm_config=sub_vllm_config,
prefix=maybe_prefix(prefix, "language_model"), prefix=maybe_prefix(prefix, "language_model"),
) )
self.unpadded_vocab_size = config.text_config.vocab_size
if get_pp_group().is_last_rank: if get_pp_group().is_last_rank:
self.lm_head = ParallelLMHead( self.lm_head = ParallelLMHead(
self.unpadded_vocab_size, config.vocab_size,
config.text_config.hidden_size, config.text_config.hidden_size,
org_num_embeddings=self.config.text_config.vocab_size,
padding_size=DEFAULT_VOCAB_PADDING_SIZE,
prefix=maybe_prefix(prefix, "lm_head"), prefix=maybe_prefix(prefix, "lm_head"),
) )
else: else:
...@@ -362,9 +358,7 @@ class KimiVLForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): ...@@ -362,9 +358,7 @@ class KimiVLForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
self.language_model.make_empty_intermediate_tensors self.language_model.make_empty_intermediate_tensors
) )
logit_scale = getattr(config, "logit_scale", 1.0) logit_scale = getattr(config, "logit_scale", 1.0)
self.logits_processor = LogitsProcessor( self.logits_processor = LogitsProcessor(config.vocab_size, scale=logit_scale)
self.unpadded_vocab_size, config.vocab_size, logit_scale
)
self.media_placeholder: int = self.config.media_placeholder_token_id self.media_placeholder: int = self.config.media_placeholder_token_id
def _parse_and_validate_image_input( def _parse_and_validate_image_input(
......
...@@ -28,7 +28,6 @@ from vllm.model_executor.layers.mamba.short_conv import ShortConv ...@@ -28,7 +28,6 @@ from vllm.model_executor.layers.mamba.short_conv import ShortConv
from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.rotary_embedding import get_rope
from vllm.model_executor.layers.vocab_parallel_embedding import ( from vllm.model_executor.layers.vocab_parallel_embedding import (
DEFAULT_VOCAB_PADDING_SIZE,
ParallelLMHead, ParallelLMHead,
VocabParallelEmbedding, VocabParallelEmbedding,
) )
...@@ -316,16 +315,10 @@ class Lfm2Model(nn.Module): ...@@ -316,16 +315,10 @@ class Lfm2Model(nn.Module):
model_config = vllm_config.model_config model_config = vllm_config.model_config
cache_config = vllm_config.cache_config cache_config = vllm_config.cache_config
quant_config = vllm_config.quant_config quant_config = vllm_config.quant_config
lora_config = vllm_config.lora_config
self.config = config self.config = config
lora_vocab = (
(lora_config.lora_extra_vocab_size * (lora_config.max_loras or 1)) self.vocab_size = config.vocab_size
if lora_config
else 0
)
self.vocab_size = config.vocab_size + lora_vocab
self.org_vocab_size = config.vocab_size
self.embed_tokens = VocabParallelEmbedding( self.embed_tokens = VocabParallelEmbedding(
self.vocab_size, config.hidden_size, org_num_embeddings=config.vocab_size self.vocab_size, config.hidden_size, org_num_embeddings=config.vocab_size
...@@ -483,7 +476,7 @@ class Lfm2ForCausalLM( ...@@ -483,7 +476,7 @@ class Lfm2ForCausalLM(
config = vllm_config.model_config.hf_config config = vllm_config.model_config.hf_config
quant_config = vllm_config.quant_config quant_config = vllm_config.quant_config
cache_config = vllm_config.cache_config cache_config = vllm_config.cache_config
lora_config = vllm_config.lora_config
assert not cache_config.enable_prefix_caching, ( assert not cache_config.enable_prefix_caching, (
"Lfm2 currently does not support prefix caching" "Lfm2 currently does not support prefix caching"
) )
...@@ -495,21 +488,9 @@ class Lfm2ForCausalLM( ...@@ -495,21 +488,9 @@ class Lfm2ForCausalLM(
) )
if get_pp_group().is_last_rank: if get_pp_group().is_last_rank:
self.unpadded_vocab_size = self.config.vocab_size
if lora_config:
self.unpadded_vocab_size += lora_config.lora_extra_vocab_size
self.lm_head = ParallelLMHead( self.lm_head = ParallelLMHead(
self.unpadded_vocab_size, config.vocab_size,
config.hidden_size, config.hidden_size,
org_num_embeddings=config.vocab_size,
padding_size=(
DEFAULT_VOCAB_PADDING_SIZE
# We need bigger padding if using lora for kernel
# compatibility
if not lora_config
else lora_config.lora_vocab_padding_size
),
quant_config=quant_config, quant_config=quant_config,
prefix=maybe_prefix(prefix, "lm_head"), prefix=maybe_prefix(prefix, "lm_head"),
) )
...@@ -517,9 +498,7 @@ class Lfm2ForCausalLM( ...@@ -517,9 +498,7 @@ class Lfm2ForCausalLM(
else: else:
self.lm_head = PPMissingLayer() self.lm_head = PPMissingLayer()
self.logits_processor = LogitsProcessor( self.logits_processor = LogitsProcessor(config.vocab_size)
self.unpadded_vocab_size, config.vocab_size
)
self.make_empty_intermediate_tensors = ( self.make_empty_intermediate_tensors = (
self.model.make_empty_intermediate_tensors self.model.make_empty_intermediate_tensors
......
...@@ -33,7 +33,6 @@ from vllm.model_executor.layers.mamba.short_conv import ShortConv ...@@ -33,7 +33,6 @@ from vllm.model_executor.layers.mamba.short_conv import ShortConv
from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.rotary_embedding import get_rope
from vllm.model_executor.layers.vocab_parallel_embedding import ( from vllm.model_executor.layers.vocab_parallel_embedding import (
DEFAULT_VOCAB_PADDING_SIZE,
ParallelLMHead, ParallelLMHead,
VocabParallelEmbedding, VocabParallelEmbedding,
) )
...@@ -423,20 +422,15 @@ class Lfm2MoeModel(nn.Module): ...@@ -423,20 +422,15 @@ class Lfm2MoeModel(nn.Module):
model_config = vllm_config.model_config model_config = vllm_config.model_config
cache_config = vllm_config.cache_config cache_config = vllm_config.cache_config
quant_config = vllm_config.quant_config quant_config = vllm_config.quant_config
lora_config = vllm_config.lora_config
parallel_config = vllm_config.parallel_config parallel_config = vllm_config.parallel_config
enable_eplb = parallel_config.enable_eplb enable_eplb = parallel_config.enable_eplb
eplb_config = parallel_config.eplb_config eplb_config = parallel_config.eplb_config
self.num_redundant_experts = eplb_config.num_redundant_experts self.num_redundant_experts = eplb_config.num_redundant_experts
self.config = config self.config = config
lora_vocab = (
(lora_config.lora_extra_vocab_size * (lora_config.max_loras or 1)) self.vocab_size = config.vocab_size
if lora_config
else 0
)
self.vocab_size = config.vocab_size + lora_vocab
self.org_vocab_size = config.vocab_size
self.embed_tokens = VocabParallelEmbedding( self.embed_tokens = VocabParallelEmbedding(
self.vocab_size, config.hidden_size, org_num_embeddings=config.vocab_size self.vocab_size, config.hidden_size, org_num_embeddings=config.vocab_size
...@@ -662,7 +656,7 @@ class Lfm2MoeForCausalLM( ...@@ -662,7 +656,7 @@ class Lfm2MoeForCausalLM(
config = vllm_config.model_config.hf_config config = vllm_config.model_config.hf_config
quant_config = vllm_config.quant_config quant_config = vllm_config.quant_config
cache_config = vllm_config.cache_config cache_config = vllm_config.cache_config
lora_config = vllm_config.lora_config
assert not cache_config.enable_prefix_caching, ( assert not cache_config.enable_prefix_caching, (
"Lfm2Moe currently does not support prefix caching" "Lfm2Moe currently does not support prefix caching"
) )
...@@ -674,21 +668,9 @@ class Lfm2MoeForCausalLM( ...@@ -674,21 +668,9 @@ class Lfm2MoeForCausalLM(
) )
if get_pp_group().is_last_rank: if get_pp_group().is_last_rank:
self.unpadded_vocab_size = self.config.vocab_size
if lora_config:
self.unpadded_vocab_size += lora_config.lora_extra_vocab_size
self.lm_head = ParallelLMHead( self.lm_head = ParallelLMHead(
self.unpadded_vocab_size, config.vocab_size,
config.hidden_size, config.hidden_size,
org_num_embeddings=config.vocab_size,
padding_size=(
DEFAULT_VOCAB_PADDING_SIZE
# We need bigger padding if using lora for kernel
# compatibility
if not lora_config
else lora_config.lora_vocab_padding_size
),
quant_config=quant_config, quant_config=quant_config,
prefix=maybe_prefix(prefix, "lm_head"), prefix=maybe_prefix(prefix, "lm_head"),
) )
...@@ -696,9 +678,7 @@ class Lfm2MoeForCausalLM( ...@@ -696,9 +678,7 @@ class Lfm2MoeForCausalLM(
else: else:
self.lm_head = PPMissingLayer() self.lm_head = PPMissingLayer()
self.logits_processor = LogitsProcessor( self.logits_processor = LogitsProcessor(config.vocab_size)
self.unpadded_vocab_size, config.vocab_size
)
self.make_empty_intermediate_tensors = ( self.make_empty_intermediate_tensors = (
self.model.make_empty_intermediate_tensors self.model.make_empty_intermediate_tensors
......
...@@ -15,7 +15,6 @@ from vllm.model_executor.layers.linear import QKVParallelLinear ...@@ -15,7 +15,6 @@ from vllm.model_executor.layers.linear import QKVParallelLinear
from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.logits_processor import LogitsProcessor
from vllm.model_executor.layers.quantization.base_config import QuantizationConfig from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
from vllm.model_executor.layers.vocab_parallel_embedding import ( from vllm.model_executor.layers.vocab_parallel_embedding import (
DEFAULT_VOCAB_PADDING_SIZE,
ParallelLMHead, ParallelLMHead,
VocabParallelEmbedding, VocabParallelEmbedding,
) )
...@@ -252,8 +251,6 @@ class Eagle3LlamaForCausalLM(LlamaForCausalLM): ...@@ -252,8 +251,6 @@ class Eagle3LlamaForCausalLM(LlamaForCausalLM):
self.lm_head = ParallelLMHead( self.lm_head = ParallelLMHead(
self.config.draft_vocab_size, self.config.draft_vocab_size,
self.config.hidden_size, self.config.hidden_size,
org_num_embeddings=self.config.draft_vocab_size,
padding_size=(DEFAULT_VOCAB_PADDING_SIZE),
prefix=maybe_prefix(prefix, "lm_head"), prefix=maybe_prefix(prefix, "lm_head"),
) )
self.logits_processor = LogitsProcessor( self.logits_processor = LogitsProcessor(
......
...@@ -554,7 +554,6 @@ class LongcatFlashForCausalLM(nn.Module, SupportsLoRA, SupportsPP): ...@@ -554,7 +554,6 @@ class LongcatFlashForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
super().__init__() super().__init__()
config = FlashConfig(**vllm_config.model_config.hf_config.__dict__) config = FlashConfig(**vllm_config.model_config.hf_config.__dict__)
quant_config = vllm_config.quant_config quant_config = vllm_config.quant_config
lora_config = vllm_config.lora_config
self.config = config self.config = config
config.intermediate_size = ( config.intermediate_size = (
...@@ -562,7 +561,7 @@ class LongcatFlashForCausalLM(nn.Module, SupportsLoRA, SupportsPP): ...@@ -562,7 +561,7 @@ class LongcatFlashForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
if hasattr(config, "ffn_hidden_size") if hasattr(config, "ffn_hidden_size")
else config.intermediate_size else config.intermediate_size
) )
self.lora_config = lora_config
self.quant_config = quant_config self.quant_config = quant_config
self.model = FlashModel( self.model = FlashModel(
......
...@@ -21,7 +21,6 @@ from vllm.model_executor.layers.mamba.mamba_utils import ( ...@@ -21,7 +21,6 @@ from vllm.model_executor.layers.mamba.mamba_utils import (
) )
from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.model_executor.layers.vocab_parallel_embedding import ( from vllm.model_executor.layers.vocab_parallel_embedding import (
DEFAULT_VOCAB_PADDING_SIZE,
ParallelLMHead, ParallelLMHead,
VocabParallelEmbedding, VocabParallelEmbedding,
) )
...@@ -110,18 +109,12 @@ class MambaModel(nn.Module): ...@@ -110,18 +109,12 @@ class MambaModel(nn.Module):
is_lora_enabled = bool(lora_config) is_lora_enabled = bool(lora_config)
self.config = config self.config = config
lora_vocab = (
(lora_config.lora_extra_vocab_size * (lora_config.max_loras or 1)) self.vocab_size = config.vocab_size
if lora_config
else 0
)
self.vocab_size = config.vocab_size + lora_vocab
self.org_vocab_size = config.vocab_size
self.embeddings = VocabParallelEmbedding( self.embeddings = VocabParallelEmbedding(
self.vocab_size, self.vocab_size,
config.hidden_size, config.hidden_size,
org_num_embeddings=config.vocab_size,
) )
self.start_layer, self.end_layer, self.layers = make_layers( self.start_layer, self.end_layer, self.layers = make_layers(
...@@ -199,7 +192,7 @@ class MambaForCausalLM( ...@@ -199,7 +192,7 @@ class MambaForCausalLM(
): ):
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
config = vllm_config.model_config.hf_config config = vllm_config.model_config.hf_config
lora_config = vllm_config.lora_config
self.scheduler_config = vllm_config.scheduler_config self.scheduler_config = vllm_config.scheduler_config
super().__init__() super().__init__()
...@@ -209,27 +202,17 @@ class MambaForCausalLM( ...@@ -209,27 +202,17 @@ class MambaForCausalLM(
self.backbone = MambaModel( self.backbone = MambaModel(
vllm_config=vllm_config, prefix=maybe_prefix(prefix, "backbone") vllm_config=vllm_config, prefix=maybe_prefix(prefix, "backbone")
) )
self.unpadded_vocab_size = config.vocab_size
if lora_config:
self.unpadded_vocab_size += lora_config.lora_extra_vocab_size
if config.tie_word_embeddings: if config.tie_word_embeddings:
self.lm_head = self.backbone.embeddings self.lm_head = self.backbone.embeddings
else: else:
self.lm_head = ParallelLMHead( self.lm_head = ParallelLMHead(
self.unpadded_vocab_size, config.vocab_size,
config.hidden_size, config.hidden_size,
org_num_embeddings=config.vocab_size,
padding_size=DEFAULT_VOCAB_PADDING_SIZE
# We need bigger padding if using lora for kernel
# compatibility
if not lora_config
else lora_config.lora_vocab_padding_size,
prefix=maybe_prefix(prefix, "lm_head"), prefix=maybe_prefix(prefix, "lm_head"),
) )
self.logits_processor = LogitsProcessor( self.logits_processor = LogitsProcessor(config.vocab_size)
self.unpadded_vocab_size, config.vocab_size
)
self.make_empty_intermediate_tensors = ( self.make_empty_intermediate_tensors = (
self.backbone.make_empty_intermediate_tensors self.backbone.make_empty_intermediate_tensors
......
...@@ -20,7 +20,6 @@ from vllm.model_executor.layers.mamba.mamba_utils import ( ...@@ -20,7 +20,6 @@ from vllm.model_executor.layers.mamba.mamba_utils import (
) )
from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.model_executor.layers.vocab_parallel_embedding import ( from vllm.model_executor.layers.vocab_parallel_embedding import (
DEFAULT_VOCAB_PADDING_SIZE,
ParallelLMHead, ParallelLMHead,
VocabParallelEmbedding, VocabParallelEmbedding,
) )
...@@ -107,18 +106,12 @@ class Mamba2Model(nn.Module): ...@@ -107,18 +106,12 @@ class Mamba2Model(nn.Module):
assert not is_lora_enabled assert not is_lora_enabled
self.config = config self.config = config
lora_vocab = (
(lora_config.lora_extra_vocab_size * (lora_config.max_loras or 1)) self.vocab_size = config.vocab_size
if lora_config
else 0
)
self.vocab_size = config.vocab_size + lora_vocab
self.org_vocab_size = config.vocab_size
self.embeddings = VocabParallelEmbedding( self.embeddings = VocabParallelEmbedding(
self.vocab_size, self.vocab_size,
config.hidden_size, config.hidden_size,
org_num_embeddings=config.vocab_size,
) )
self.start_layer, self.end_layer, self.layers = make_layers( self.start_layer, self.end_layer, self.layers = make_layers(
...@@ -238,7 +231,7 @@ class Mamba2ForCausalLM( ...@@ -238,7 +231,7 @@ class Mamba2ForCausalLM(
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
config = vllm_config.model_config.hf_config config = vllm_config.model_config.hf_config
lora_config = vllm_config.lora_config
scheduler_config = vllm_config.scheduler_config scheduler_config = vllm_config.scheduler_config
super().__init__() super().__init__()
...@@ -249,27 +242,16 @@ class Mamba2ForCausalLM( ...@@ -249,27 +242,16 @@ class Mamba2ForCausalLM(
self.backbone = Mamba2Model( self.backbone = Mamba2Model(
vllm_config=vllm_config, prefix=maybe_prefix(prefix, "backbone") vllm_config=vllm_config, prefix=maybe_prefix(prefix, "backbone")
) )
self.unpadded_vocab_size = config.vocab_size
if lora_config:
self.unpadded_vocab_size += lora_config.lora_extra_vocab_size
self.lm_head = ParallelLMHead( self.lm_head = ParallelLMHead(
self.unpadded_vocab_size, config.vocab_size,
config.hidden_size, config.hidden_size,
org_num_embeddings=config.vocab_size,
padding_size=DEFAULT_VOCAB_PADDING_SIZE
# We need bigger padding if using lora for kernel
# compatibility
if not lora_config
else lora_config.lora_vocab_padding_size,
prefix=maybe_prefix(prefix, "lm_head"), prefix=maybe_prefix(prefix, "lm_head"),
) )
if config.tie_word_embeddings: if config.tie_word_embeddings:
self.lm_head = self.lm_head.tie_weights(self.backbone.embeddings) self.lm_head = self.lm_head.tie_weights(self.backbone.embeddings)
self.logits_processor = LogitsProcessor( self.logits_processor = LogitsProcessor(config.vocab_size)
self.unpadded_vocab_size, config.vocab_size
)
self.make_empty_intermediate_tensors = ( self.make_empty_intermediate_tensors = (
self.backbone.make_empty_intermediate_tensors self.backbone.make_empty_intermediate_tensors
......
...@@ -9,7 +9,6 @@ import torch.nn as nn ...@@ -9,7 +9,6 @@ import torch.nn as nn
from vllm.config import VllmConfig from vllm.config import VllmConfig
from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.logits_processor import LogitsProcessor
from vllm.model_executor.layers.vocab_parallel_embedding import ( from vllm.model_executor.layers.vocab_parallel_embedding import (
DEFAULT_VOCAB_PADDING_SIZE,
ParallelLMHead, ParallelLMHead,
) )
from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.model_loader.weight_utils import default_weight_loader
...@@ -70,14 +69,11 @@ class Medusa(nn.Module): ...@@ -70,14 +69,11 @@ class Medusa(nn.Module):
) )
self.orig_vocab_size = config.vocab_size self.orig_vocab_size = config.vocab_size
self.truncated_vocab_size = config.truncated_vocab_size self.truncated_vocab_size = config.truncated_vocab_size
self.unpadded_vocab_size = self.truncated_vocab_size
if getattr(config, "original_lm_head", False): if getattr(config, "original_lm_head", False):
self.lm_head = ParallelLMHead( self.lm_head = ParallelLMHead(
self.unpadded_vocab_size, self.truncated_vocab_size,
config.hidden_size, config.hidden_size,
org_num_embeddings=self.truncated_vocab_size,
padding_size=DEFAULT_VOCAB_PADDING_SIZE,
prefix=maybe_prefix(prefix, "lm_head"), prefix=maybe_prefix(prefix, "lm_head"),
) )
self.lm_heads = [self.lm_head for _ in range(self.config.num_heads)] self.lm_heads = [self.lm_head for _ in range(self.config.num_heads)]
...@@ -85,10 +81,8 @@ class Medusa(nn.Module): ...@@ -85,10 +81,8 @@ class Medusa(nn.Module):
self.lm_heads = nn.ModuleList( self.lm_heads = nn.ModuleList(
[ [
ParallelLMHead( ParallelLMHead(
self.unpadded_vocab_size, config.vocab_size,
config.hidden_size, config.hidden_size,
org_num_embeddings=self.truncated_vocab_size,
padding_size=DEFAULT_VOCAB_PADDING_SIZE,
prefix=maybe_prefix(prefix, f"lm_heads.{i}"), prefix=maybe_prefix(prefix, f"lm_heads.{i}"),
) )
for i in range(self.config.num_heads) for i in range(self.config.num_heads)
...@@ -97,7 +91,7 @@ class Medusa(nn.Module): ...@@ -97,7 +91,7 @@ class Medusa(nn.Module):
logit_scale = getattr(config, "logit_scale", 1.0) logit_scale = getattr(config, "logit_scale", 1.0)
self.logits_processor = LogitsProcessor( self.logits_processor = LogitsProcessor(
self.unpadded_vocab_size, self.truncated_vocab_size, logit_scale config.vocab_size, self.truncated_vocab_size, logit_scale
) )
# Token map is a idx to token mapping to reduce the vocab size for # Token map is a idx to token mapping to reduce the vocab size for
......
...@@ -151,10 +151,8 @@ class MiMoForCausalLM(Qwen2ForCausalLM, nn.Module): ...@@ -151,10 +151,8 @@ class MiMoForCausalLM(Qwen2ForCausalLM, nn.Module):
nn.Module.__init__(self) nn.Module.__init__(self)
config = vllm_config.model_config.hf_config config = vllm_config.model_config.hf_config
quant_config = vllm_config.quant_config quant_config = vllm_config.quant_config
lora_config = vllm_config.lora_config
self.config = config self.config = config
self.lora_config = lora_config
self.quant_config = quant_config self.quant_config = quant_config
......
...@@ -55,7 +55,6 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor ...@@ -55,7 +55,6 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor
from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.rotary_embedding import get_rope
from vllm.model_executor.layers.vocab_parallel_embedding import ( from vllm.model_executor.layers.vocab_parallel_embedding import (
DEFAULT_VOCAB_PADDING_SIZE,
ParallelLMHead, ParallelLMHead,
VocabParallelEmbedding, VocabParallelEmbedding,
) )
...@@ -405,22 +404,16 @@ class MiniCPMModel(nn.Module): ...@@ -405,22 +404,16 @@ class MiniCPMModel(nn.Module):
config = vllm_config.model_config.hf_config config = vllm_config.model_config.hf_config
cache_config = vllm_config.cache_config cache_config = vllm_config.cache_config
quant_config = vllm_config.quant_config quant_config = vllm_config.quant_config
lora_config = vllm_config.lora_config
self.config = config self.config = config
self.cache_config = cache_config self.cache_config = cache_config
self.quant_config = quant_config self.quant_config = quant_config
lora_vocab = (
(lora_config.lora_extra_vocab_size * (lora_config.max_loras or 1)) self.vocab_size = config.vocab_size
if lora_config
else 0
)
self.vocab_size = config.vocab_size + lora_vocab
self.org_vocab_size = config.vocab_size
self.embed_tokens = VocabParallelEmbedding( self.embed_tokens = VocabParallelEmbedding(
self.vocab_size, self.vocab_size,
config.hidden_size, config.hidden_size,
org_num_embeddings=config.vocab_size,
) )
self.num_experts = getattr(self.config, "num_experts", 0) self.num_experts = getattr(self.config, "num_experts", 0)
self._init_layers(prefix, config, cache_config, quant_config) self._init_layers(prefix, config, cache_config, quant_config)
...@@ -588,13 +581,13 @@ class MiniCPMForCausalLM(nn.Module, SupportsLoRA, SupportsPP, SupportsEagle3): ...@@ -588,13 +581,13 @@ class MiniCPMForCausalLM(nn.Module, SupportsLoRA, SupportsPP, SupportsEagle3):
config = vllm_config.model_config.hf_config config = vllm_config.model_config.hf_config
cache_config = vllm_config.cache_config cache_config = vllm_config.cache_config
quant_config = vllm_config.quant_config quant_config = vllm_config.quant_config
lora_config = vllm_config.lora_config
parallel_config = vllm_config.parallel_config parallel_config = vllm_config.parallel_config
self.prefix = prefix self.prefix = prefix
self.vllm_config = vllm_config self.vllm_config = vllm_config
self.config = config self.config = config
self.lora_config = lora_config
self.cache_config = cache_config self.cache_config = cache_config
self.quant_config = quant_config self.quant_config = quant_config
...@@ -602,18 +595,9 @@ class MiniCPMForCausalLM(nn.Module, SupportsLoRA, SupportsPP, SupportsEagle3): ...@@ -602,18 +595,9 @@ class MiniCPMForCausalLM(nn.Module, SupportsLoRA, SupportsPP, SupportsEagle3):
vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model") vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")
) )
unpadded_vocab_size = config.vocab_size
if lora_config:
unpadded_vocab_size += lora_config.lora_extra_vocab_size
self.lm_head = ParallelLMHead( self.lm_head = ParallelLMHead(
unpadded_vocab_size, config.vocab_size,
config.hidden_size, config.hidden_size,
org_num_embeddings=config.vocab_size,
padding_size=DEFAULT_VOCAB_PADDING_SIZE
# We need bigger padding if using lora for kernel
# compatibility
if not lora_config
else lora_config.lora_vocab_padding_size,
quant_config=quant_config, quant_config=quant_config,
prefix=maybe_prefix(prefix, "lm_head"), prefix=maybe_prefix(prefix, "lm_head"),
) )
...@@ -621,7 +605,7 @@ class MiniCPMForCausalLM(nn.Module, SupportsLoRA, SupportsPP, SupportsEagle3): ...@@ -621,7 +605,7 @@ class MiniCPMForCausalLM(nn.Module, SupportsLoRA, SupportsPP, SupportsEagle3):
self.lm_head = self.lm_head.tie_weights(self.model.embed_tokens) self.lm_head = self.lm_head.tie_weights(self.model.embed_tokens)
self.scale_width = self.config.hidden_size / self.config.dim_model_base self.scale_width = self.config.hidden_size / self.config.dim_model_base
self.logits_processor = LogitsProcessor(unpadded_vocab_size, config.vocab_size) self.logits_processor = LogitsProcessor(config.vocab_size)
self.make_empty_intermediate_tensors = ( self.make_empty_intermediate_tensors = (
self.model.make_empty_intermediate_tensors self.model.make_empty_intermediate_tensors
) )
......
...@@ -37,7 +37,6 @@ from vllm.model_executor.layers.layernorm import RMSNorm ...@@ -37,7 +37,6 @@ from vllm.model_executor.layers.layernorm import RMSNorm
from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.logits_processor import LogitsProcessor
from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.model_executor.layers.vocab_parallel_embedding import ( from vllm.model_executor.layers.vocab_parallel_embedding import (
DEFAULT_VOCAB_PADDING_SIZE,
ParallelLMHead, ParallelLMHead,
VocabParallelEmbedding, VocabParallelEmbedding,
) )
...@@ -151,18 +150,13 @@ class EagleMiniCPMModel(nn.Module): ...@@ -151,18 +150,13 @@ class EagleMiniCPMModel(nn.Module):
config = vllm_config.speculative_config.draft_model_config.hf_config config = vllm_config.speculative_config.draft_model_config.hf_config
cache_config = vllm_config.cache_config cache_config = vllm_config.cache_config
quant_config = vllm_config.quant_config quant_config = vllm_config.quant_config
lora_config = vllm_config.lora_config
self.config = config self.config = config
self.cache_config = cache_config self.cache_config = cache_config
self.quant_config = quant_config self.quant_config = quant_config
lora_vocab = (
(lora_config.lora_extra_vocab_size * (lora_config.max_loras or 1)) self.vocab_size = config.vocab_size
if lora_config
else 0
)
self.vocab_size = config.vocab_size + lora_vocab
self.org_vocab_size = config.vocab_size
self.fc = torch.nn.Linear( self.fc = torch.nn.Linear(
self.config.hidden_size * 2, self.config.hidden_size, bias=False self.config.hidden_size * 2, self.config.hidden_size, bias=False
) )
...@@ -171,7 +165,6 @@ class EagleMiniCPMModel(nn.Module): ...@@ -171,7 +165,6 @@ class EagleMiniCPMModel(nn.Module):
self.embed_tokens = VocabParallelEmbedding( self.embed_tokens = VocabParallelEmbedding(
self.vocab_size, self.vocab_size,
config.hidden_size, config.hidden_size,
org_num_embeddings=config.vocab_size,
) )
self.num_experts = getattr(self.config, "num_experts", 0) self.num_experts = getattr(self.config, "num_experts", 0)
self._init_layers(prefix, config, cache_config, quant_config, start_layer) self._init_layers(prefix, config, cache_config, quant_config, start_layer)
...@@ -321,12 +314,11 @@ class EagleMiniCPMForCausalLM(nn.Module, SupportsLoRA, SupportsPP): ...@@ -321,12 +314,11 @@ class EagleMiniCPMForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
config = vllm_config.speculative_config.draft_model_config.hf_config config = vllm_config.speculative_config.draft_model_config.hf_config
cache_config = vllm_config.cache_config cache_config = vllm_config.cache_config
quant_config = vllm_config.quant_config quant_config = vllm_config.quant_config
lora_config = vllm_config.lora_config
self.prefix = prefix self.prefix = prefix
self.vllm_config = vllm_config self.vllm_config = vllm_config
self.config = config self.config = config
self.lora_config = lora_config
self.cache_config = cache_config self.cache_config = cache_config
self.quant_config = quant_config self.quant_config = quant_config
...@@ -340,18 +332,9 @@ class EagleMiniCPMForCausalLM(nn.Module, SupportsLoRA, SupportsPP): ...@@ -340,18 +332,9 @@ class EagleMiniCPMForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
start_layer=target_layer_num, start_layer=target_layer_num,
) )
unpadded_vocab_size = config.vocab_size
if lora_config:
unpadded_vocab_size += lora_config.lora_extra_vocab_size
self.lm_head = ParallelLMHead( self.lm_head = ParallelLMHead(
unpadded_vocab_size, config.vocab_size,
config.hidden_size, config.hidden_size,
org_num_embeddings=config.vocab_size,
padding_size=DEFAULT_VOCAB_PADDING_SIZE
# We need bigger padding if using lora for kernel
# compatibility
if not lora_config
else lora_config.lora_vocab_padding_size,
quant_config=quant_config, quant_config=quant_config,
prefix=maybe_prefix(prefix, "lm_head"), prefix=maybe_prefix(prefix, "lm_head"),
) )
...@@ -359,7 +342,7 @@ class EagleMiniCPMForCausalLM(nn.Module, SupportsLoRA, SupportsPP): ...@@ -359,7 +342,7 @@ class EagleMiniCPMForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
self.lm_head = self.lm_head.tie_weights(self.model.embed_tokens) self.lm_head = self.lm_head.tie_weights(self.model.embed_tokens)
self.scale_width = self.config.hidden_size / self.config.dim_model_base self.scale_width = self.config.hidden_size / self.config.dim_model_base
self.logits_processor = LogitsProcessor(unpadded_vocab_size, config.vocab_size) self.logits_processor = LogitsProcessor(config.vocab_size)
self.make_empty_intermediate_tensors = ( self.make_empty_intermediate_tensors = (
self.model.make_empty_intermediate_tensors self.model.make_empty_intermediate_tensors
) )
......
...@@ -41,7 +41,6 @@ from vllm.model_executor.layers.mamba.mamba_utils import ( ...@@ -41,7 +41,6 @@ from vllm.model_executor.layers.mamba.mamba_utils import (
from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.rotary_embedding import get_rope
from vllm.model_executor.layers.vocab_parallel_embedding import ( from vllm.model_executor.layers.vocab_parallel_embedding import (
DEFAULT_VOCAB_PADDING_SIZE,
ParallelLMHead, ParallelLMHead,
VocabParallelEmbedding, VocabParallelEmbedding,
) )
...@@ -669,16 +668,14 @@ class MiniMaxText01ForCausalLM(nn.Module, HasInnerState, IsHybrid): ...@@ -669,16 +668,14 @@ class MiniMaxText01ForCausalLM(nn.Module, HasInnerState, IsHybrid):
def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None:
super().__init__() super().__init__()
config = vllm_config.model_config.hf_config config = vllm_config.model_config.hf_config
lora_config = vllm_config.lora_config
self.config = config self.config = config
self.lora_config = lora_config
if not hasattr(config, "sliding_window"): if not hasattr(config, "sliding_window"):
config.sliding_window = None config.sliding_window = None
self.CONCAT_FFN = True self.CONCAT_FFN = True
self.unpadded_vocab_size = self.config.vocab_size
if hasattr(vllm_config.model_config, "max_model_len"): if hasattr(vllm_config.model_config, "max_model_len"):
self.config.max_model_len = vllm_config.model_config.max_model_len self.config.max_model_len = vllm_config.model_config.max_model_len
self.model = MiniMaxText01Model( self.model = MiniMaxText01Model(
...@@ -686,15 +683,13 @@ class MiniMaxText01ForCausalLM(nn.Module, HasInnerState, IsHybrid): ...@@ -686,15 +683,13 @@ class MiniMaxText01ForCausalLM(nn.Module, HasInnerState, IsHybrid):
) )
if get_pp_group().is_last_rank: if get_pp_group().is_last_rank:
self.lm_head = ParallelLMHead( self.lm_head = ParallelLMHead(
self.unpadded_vocab_size, config.vocab_size,
self.config.hidden_size, self.config.hidden_size,
org_num_embeddings=self.config.vocab_size,
padding_size=DEFAULT_VOCAB_PADDING_SIZE,
prefix=maybe_prefix(prefix, "lm_head"), prefix=maybe_prefix(prefix, "lm_head"),
) )
self.logits_processor = LogitsProcessor( self.logits_processor = LogitsProcessor(
self.unpadded_vocab_size, self.config.vocab_size config.vocab_size, self.config.vocab_size
) )
else: else:
......
...@@ -123,7 +123,6 @@ class MLPSpeculator(nn.Module): ...@@ -123,7 +123,6 @@ class MLPSpeculator(nn.Module):
VocabParallelEmbedding( VocabParallelEmbedding(
config.vocab_size, config.vocab_size,
self.inner_dim, self.inner_dim,
org_num_embeddings=config.vocab_size,
) )
for _ in range(self.max_speculative_tokens) for _ in range(self.max_speculative_tokens)
] ]
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment