Commit c80f5968 authored by 王敏
Browse files

Merge remote-tracking branch 'origin/v0.15.1-dev' into v0.15.1-dev

# Conflicts:
#	vllm/model_executor/layers/fused_moe/config.py
#	vllm/model_executor/layers/fused_moe/layer.py
#	vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_marlin.py
parents 74306deb 530e785f
...@@ -1650,7 +1650,7 @@ class Ernie4_5_VLMoeForConditionalGeneration( ...@@ -1650,7 +1650,7 @@ class Ernie4_5_VLMoeForConditionalGeneration(
def forward( def forward(
self, self,
input_ids: torch.Tensor | None, input_ids: torch.Tensor,
positions: torch.Tensor, positions: torch.Tensor,
intermediate_tensors: IntermediateTensors | None = None, intermediate_tensors: IntermediateTensors | None = None,
inputs_embeds: torch.Tensor | None = None, inputs_embeds: torch.Tensor | None = None,
...@@ -1686,4 +1686,4 @@ class Ernie4_5_VLMoeForConditionalGeneration( ...@@ -1686,4 +1686,4 @@ class Ernie4_5_VLMoeForConditionalGeneration(
def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
loader = AutoWeightsLoader(self) loader = AutoWeightsLoader(self)
return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)
\ No newline at end of file
...@@ -565,7 +565,7 @@ class Ernie4_5_VLMoeModel(nn.Module): ...@@ -565,7 +565,7 @@ class Ernie4_5_VLMoeModel(nn.Module):
def forward( def forward(
self, self,
input_ids: torch.Tensor | None, input_ids: torch.Tensor,
positions: torch.Tensor, positions: torch.Tensor,
intermediate_tensors: IntermediateTensors | None = None, intermediate_tensors: IntermediateTensors | None = None,
inputs_embeds: torch.Tensor | None = None, inputs_embeds: torch.Tensor | None = None,
...@@ -646,7 +646,7 @@ class Ernie4_5_VLMoeForCausalLM(nn.Module, SupportsPP): ...@@ -646,7 +646,7 @@ class Ernie4_5_VLMoeForCausalLM(nn.Module, SupportsPP):
def forward( def forward(
self, self,
input_ids: torch.Tensor | None, input_ids: torch.Tensor,
positions: torch.Tensor, positions: torch.Tensor,
intermediate_tensors: IntermediateTensors | None = None, intermediate_tensors: IntermediateTensors | None = None,
inputs_embeds: torch.Tensor | None = None, inputs_embeds: torch.Tensor | None = None,
...@@ -800,4 +800,4 @@ class Ernie4_5_VLMoeForCausalLM(nn.Module, SupportsPP): ...@@ -800,4 +800,4 @@ class Ernie4_5_VLMoeForCausalLM(nn.Module, SupportsPP):
) )
weight_loader(param, loaded_weight) weight_loader(param, loaded_weight)
loaded_params.add(name) loaded_params.add(name)
return loaded_params return loaded_params
\ No newline at end of file
...@@ -164,7 +164,7 @@ class ErnieMTP(nn.Module): ...@@ -164,7 +164,7 @@ class ErnieMTP(nn.Module):
def forward( def forward(
self, self,
input_ids: torch.Tensor | None, input_ids: torch.Tensor,
positions: torch.Tensor, positions: torch.Tensor,
hidden_states: torch.Tensor, hidden_states: torch.Tensor,
intermediate_tensors: IntermediateTensors | None = None, intermediate_tensors: IntermediateTensors | None = None,
...@@ -275,4 +275,4 @@ class ErnieMTP(nn.Module): ...@@ -275,4 +275,4 @@ class ErnieMTP(nn.Module):
name = name.replace( name = name.replace(
"model.mtp_block.0.", f"model.layers.{layer_idx}.mtp_block." "model.mtp_block.0.", f"model.layers.{layer_idx}.mtp_block."
) )
return name return name
\ No newline at end of file
...@@ -496,7 +496,7 @@ class ExaoneForCausalLM(nn.Module, SupportsLoRA, SupportsPP): ...@@ -496,7 +496,7 @@ class ExaoneForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
def forward( def forward(
self, self,
input_ids: torch.Tensor | None, input_ids: torch.Tensor,
positions: torch.Tensor, positions: torch.Tensor,
intermediate_tensors: IntermediateTensors | None = None, intermediate_tensors: IntermediateTensors | None = None,
inputs_embeds: torch.Tensor | None = None, inputs_embeds: torch.Tensor | None = None,
...@@ -521,4 +521,4 @@ class ExaoneForCausalLM(nn.Module, SupportsLoRA, SupportsPP): ...@@ -521,4 +521,4 @@ class ExaoneForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
# processed with quantization, LoRA, fine-tuning, etc. # processed with quantization, LoRA, fine-tuning, etc.
skip_prefixes=(["lm_head."] if self.config.tie_word_embeddings else None), skip_prefixes=(["lm_head."] if self.config.tie_word_embeddings else None),
) )
return loader.load_weights(weights) return loader.load_weights(weights)
\ No newline at end of file
...@@ -490,7 +490,7 @@ class Exaone4ForCausalLM(nn.Module, SupportsLoRA, SupportsPP): ...@@ -490,7 +490,7 @@ class Exaone4ForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
def forward( def forward(
self, self,
input_ids: torch.Tensor | None, input_ids: torch.Tensor,
positions: torch.Tensor, positions: torch.Tensor,
intermediate_tensors: IntermediateTensors | None = None, intermediate_tensors: IntermediateTensors | None = None,
inputs_embeds: torch.Tensor | None = None, inputs_embeds: torch.Tensor | None = None,
...@@ -515,4 +515,4 @@ class Exaone4ForCausalLM(nn.Module, SupportsLoRA, SupportsPP): ...@@ -515,4 +515,4 @@ class Exaone4ForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
# processed with quantization, LoRA, fine-tuning, etc. # processed with quantization, LoRA, fine-tuning, etc.
skip_prefixes=(["lm_head."] if self.config.tie_word_embeddings else None), skip_prefixes=(["lm_head."] if self.config.tie_word_embeddings else None),
) )
return loader.load_weights(weights) return loader.load_weights(weights)
\ No newline at end of file
...@@ -549,7 +549,7 @@ class ExaoneMoeForCausalLM(nn.Module, SupportsLoRA, SupportsPP): ...@@ -549,7 +549,7 @@ class ExaoneMoeForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
def forward( def forward(
self, self,
input_ids: torch.Tensor | None, input_ids: torch.Tensor,
positions: torch.Tensor, positions: torch.Tensor,
intermediate_tensors: IntermediateTensors | None = None, intermediate_tensors: IntermediateTensors | None = None,
inputs_embeds: torch.Tensor | None = None, inputs_embeds: torch.Tensor | None = None,
...@@ -576,4 +576,4 @@ class ExaoneMoeForCausalLM(nn.Module, SupportsLoRA, SupportsPP): ...@@ -576,4 +576,4 @@ class ExaoneMoeForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
["lm_head.", "mtp."] if self.config.tie_word_embeddings else ["mtp."] ["lm_head.", "mtp."] if self.config.tie_word_embeddings else ["mtp."]
), ),
) )
return loader.load_weights(weights) return loader.load_weights(weights)
\ No newline at end of file
...@@ -210,7 +210,7 @@ class FalconAttention(nn.Module): ...@@ -210,7 +210,7 @@ class FalconAttention(nn.Module):
quant_config=quant_config, quant_config=quant_config,
prefix=f"{prefix}.attn", prefix=f"{prefix}.attn",
) )
self.quant_method = None self.quant_method = None
if quant_config is not None: if quant_config is not None:
self.quant_method=quant_config.get_name() self.quant_method=quant_config.get_name()
self.quant_config=quant_config self.quant_config=quant_config
...@@ -423,7 +423,7 @@ class FalconModel(nn.Module): ...@@ -423,7 +423,7 @@ class FalconModel(nn.Module):
def forward( def forward(
self, self,
input_ids: torch.Tensor | None, input_ids: torch.Tensor,
positions: torch.Tensor, positions: torch.Tensor,
intermediate_tensors: IntermediateTensors | None, intermediate_tensors: IntermediateTensors | None,
inputs_embeds: torch.Tensor | None = None, inputs_embeds: torch.Tensor | None = None,
......
...@@ -459,7 +459,7 @@ class FalconH1Model(nn.Module): ...@@ -459,7 +459,7 @@ class FalconH1Model(nn.Module):
def forward( def forward(
self, self,
input_ids: torch.Tensor | None, input_ids: torch.Tensor,
positions: torch.Tensor, positions: torch.Tensor,
intermediate_tensors: IntermediateTensors | None = None, intermediate_tensors: IntermediateTensors | None = None,
inputs_embeds: torch.Tensor | None = None, inputs_embeds: torch.Tensor | None = None,
...@@ -602,7 +602,7 @@ class FalconH1ForCausalLM( ...@@ -602,7 +602,7 @@ class FalconH1ForCausalLM(
def forward( def forward(
self, self,
input_ids: torch.Tensor | None, input_ids: torch.Tensor,
positions: torch.Tensor, positions: torch.Tensor,
intermediate_tensors: IntermediateTensors | None = None, intermediate_tensors: IntermediateTensors | None = None,
inputs_embeds: torch.Tensor | None = None, inputs_embeds: torch.Tensor | None = None,
...@@ -678,4 +678,4 @@ class FalconH1ForCausalLM( ...@@ -678,4 +678,4 @@ class FalconH1ForCausalLM(
if self.tie_word_embeddings: if self.tie_word_embeddings:
loaded_params.add("lm_head.weight") loaded_params.add("lm_head.weight")
return loaded_params return loaded_params
\ No newline at end of file
...@@ -340,7 +340,7 @@ class FuyuForCausalLM(nn.Module, SupportsMultiModal, SupportsPP): ...@@ -340,7 +340,7 @@ class FuyuForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
def forward( def forward(
self, self,
input_ids: torch.Tensor | None, input_ids: torch.Tensor,
positions: torch.Tensor, positions: torch.Tensor,
intermediate_tensors: IntermediateTensors | None = None, intermediate_tensors: IntermediateTensors | None = None,
inputs_embeds: torch.Tensor | None = None, inputs_embeds: torch.Tensor | None = None,
...@@ -365,4 +365,4 @@ class FuyuForCausalLM(nn.Module, SupportsMultiModal, SupportsPP): ...@@ -365,4 +365,4 @@ class FuyuForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
loader = AutoWeightsLoader(self) loader = AutoWeightsLoader(self)
return loader.load_weights(weights) return loader.load_weights(weights)
\ No newline at end of file
...@@ -297,7 +297,7 @@ class GemmaModel(nn.Module): ...@@ -297,7 +297,7 @@ class GemmaModel(nn.Module):
def forward( def forward(
self, self,
input_ids: torch.Tensor | None, input_ids: torch.Tensor,
positions: torch.Tensor, positions: torch.Tensor,
intermediate_tensors: IntermediateTensors | None, intermediate_tensors: IntermediateTensors | None,
inputs_embeds: torch.Tensor | None = None, inputs_embeds: torch.Tensor | None = None,
...@@ -400,7 +400,7 @@ class GemmaForCausalLM(nn.Module, SupportsLoRA, SupportsPP): ...@@ -400,7 +400,7 @@ class GemmaForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
def forward( def forward(
self, self,
input_ids: torch.Tensor | None, input_ids: torch.Tensor,
positions: torch.Tensor, positions: torch.Tensor,
intermediate_tensors: IntermediateTensors | None = None, intermediate_tensors: IntermediateTensors | None = None,
inputs_embeds: torch.Tensor | None = None, inputs_embeds: torch.Tensor | None = None,
...@@ -422,4 +422,4 @@ class GemmaForCausalLM(nn.Module, SupportsLoRA, SupportsPP): ...@@ -422,4 +422,4 @@ class GemmaForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
self, self,
skip_prefixes=(["lm_head."] if self.config.tie_word_embeddings else None), skip_prefixes=(["lm_head."] if self.config.tie_word_embeddings else None),
) )
return loader.load_weights(weights) return loader.load_weights(weights)
\ No newline at end of file
...@@ -410,7 +410,7 @@ class Gemma2ForCausalLM(nn.Module, SupportsLoRA, SupportsPP): ...@@ -410,7 +410,7 @@ class Gemma2ForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
def forward( def forward(
self, self,
input_ids: torch.Tensor | None, input_ids: torch.Tensor,
positions: torch.Tensor, positions: torch.Tensor,
intermediate_tensors: IntermediateTensors | None = None, intermediate_tensors: IntermediateTensors | None = None,
inputs_embeds: torch.Tensor | None = None, inputs_embeds: torch.Tensor | None = None,
...@@ -432,4 +432,4 @@ class Gemma2ForCausalLM(nn.Module, SupportsLoRA, SupportsPP): ...@@ -432,4 +432,4 @@ class Gemma2ForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
self, self,
skip_prefixes=(["lm_head."] if self.config.tie_word_embeddings else None), skip_prefixes=(["lm_head."] if self.config.tie_word_embeddings else None),
) )
return loader.load_weights(weights) return loader.load_weights(weights)
\ No newline at end of file
...@@ -494,7 +494,7 @@ class Gemma3ForCausalLM(nn.Module, SupportsLoRA, SupportsPP): ...@@ -494,7 +494,7 @@ class Gemma3ForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
def forward( def forward(
self, self,
input_ids: torch.Tensor | None, input_ids: torch.Tensor,
positions: torch.Tensor, positions: torch.Tensor,
intermediate_tensors: IntermediateTensors | None = None, intermediate_tensors: IntermediateTensors | None = None,
inputs_embeds: torch.Tensor | None = None, inputs_embeds: torch.Tensor | None = None,
...@@ -517,4 +517,4 @@ class Gemma3ForCausalLM(nn.Module, SupportsLoRA, SupportsPP): ...@@ -517,4 +517,4 @@ class Gemma3ForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
self, self,
skip_prefixes=(["lm_head."] if self.config.tie_word_embeddings else None), skip_prefixes=(["lm_head."] if self.config.tie_word_embeddings else None),
) )
return loader.load_weights(weights) return loader.load_weights(weights)
\ No newline at end of file
...@@ -618,7 +618,7 @@ class Gemma3ForConditionalGeneration( ...@@ -618,7 +618,7 @@ class Gemma3ForConditionalGeneration(
def forward( def forward(
self, self,
input_ids: torch.Tensor | None, input_ids: torch.Tensor,
positions: torch.Tensor, positions: torch.Tensor,
intermediate_tensors: IntermediateTensors | None = None, intermediate_tensors: IntermediateTensors | None = None,
inputs_embeds: torch.Tensor | None = None, inputs_embeds: torch.Tensor | None = None,
...@@ -655,42 +655,4 @@ class Gemma3ForConditionalGeneration( ...@@ -655,42 +655,4 @@ class Gemma3ForConditionalGeneration(
language_model="language_model", language_model="language_model",
connector="multi_modal_projector", connector="multi_modal_projector",
tower_model="vision_tower", tower_model="vision_tower",
) )
\ No newline at end of file
def get_num_mm_encoder_tokens(self, num_image_tokens: int) -> int:
    """Return the vision-encoder token count for a given placeholder count.

    Args:
        num_image_tokens: Number of image placeholder tokens in the prompt
            (typically mm_tokens_per_image per image).

    Returns:
        ``num_image_tokens`` scaled by a fixed factor of 16.
    """
    # Fixed expansion between connector-output (placeholder) tokens and
    # vision-encoder tokens. Per the original author's note, this value was
    # settled on through empirical testing (mm_tokens_per_image = 256,
    # tokens_per_side = sqrt(256) = 16).
    encoder_tokens_per_placeholder = 16
    return num_image_tokens * encoder_tokens_per_placeholder
def get_num_mm_connector_tokens(self, num_vision_tokens: int) -> int:
    """Return the token count produced by the multimodal connector.

    Args:
        num_vision_tokens: Number of tokens coming from the vision encoder.

    Returns:
        The same count, unchanged.
    """
    # One token out per token in: the count passes through as-is.
    connector_tokens = num_vision_tokens
    return connector_tokens
...@@ -704,7 +704,7 @@ class Gemma3nSelfDecoder(nn.Module): ...@@ -704,7 +704,7 @@ class Gemma3nSelfDecoder(nn.Module):
def forward( def forward(
self, self,
input_ids: torch.Tensor | None, input_ids: torch.Tensor,
positions: torch.Tensor, positions: torch.Tensor,
inputs_embeds: torch.Tensor | None = None, inputs_embeds: torch.Tensor | None = None,
per_layer_inputs: torch.Tensor | None = None, per_layer_inputs: torch.Tensor | None = None,
...@@ -887,7 +887,7 @@ class Gemma3nTextModel(nn.Module, SupportsQuant): ...@@ -887,7 +887,7 @@ class Gemma3nTextModel(nn.Module, SupportsQuant):
def fast_prefill_forward( def fast_prefill_forward(
self, self,
input_ids: torch.Tensor | None, input_ids: torch.Tensor,
positions: torch.Tensor, positions: torch.Tensor,
inputs_embeds: torch.Tensor | None = None, inputs_embeds: torch.Tensor | None = None,
per_layer_inputs: torch.Tensor | None = None, per_layer_inputs: torch.Tensor | None = None,
...@@ -964,7 +964,7 @@ class Gemma3nTextModel(nn.Module, SupportsQuant): ...@@ -964,7 +964,7 @@ class Gemma3nTextModel(nn.Module, SupportsQuant):
def normal_forward( def normal_forward(
self, self,
input_ids: torch.Tensor | None, input_ids: torch.Tensor,
positions: torch.Tensor, positions: torch.Tensor,
inputs_embeds: torch.Tensor | None = None, inputs_embeds: torch.Tensor | None = None,
per_layer_inputs: torch.Tensor | None = None, per_layer_inputs: torch.Tensor | None = None,
...@@ -1131,7 +1131,7 @@ class Gemma3nForCausalLM(nn.Module): ...@@ -1131,7 +1131,7 @@ class Gemma3nForCausalLM(nn.Module):
def forward( def forward(
self, self,
input_ids: torch.Tensor | None, input_ids: torch.Tensor,
positions: torch.Tensor, positions: torch.Tensor,
*, *,
per_layer_inputs: torch.Tensor | None = None, per_layer_inputs: torch.Tensor | None = None,
...@@ -1163,4 +1163,4 @@ class Gemma3nForCausalLM(nn.Module): ...@@ -1163,4 +1163,4 @@ class Gemma3nForCausalLM(nn.Module):
["embed_audio.", "embed_vision.", "audio_tower.", "vision_tower."] ["embed_audio.", "embed_vision.", "audio_tower.", "vision_tower."]
), ),
) )
return loader.load_weights(weights) return loader.load_weights(weights)
\ No newline at end of file
...@@ -707,7 +707,7 @@ class Gemma3nForConditionalGeneration( ...@@ -707,7 +707,7 @@ class Gemma3nForConditionalGeneration(
def forward( def forward(
self, self,
input_ids: torch.Tensor | None, input_ids: torch.Tensor,
positions: torch.Tensor, positions: torch.Tensor,
intermediate_tensors: IntermediateTensors | None = None, intermediate_tensors: IntermediateTensors | None = None,
inputs_embeds: torch.Tensor | None = None, inputs_embeds: torch.Tensor | None = None,
...@@ -815,4 +815,4 @@ class Gemma3nForConditionalGeneration( ...@@ -815,4 +815,4 @@ class Gemma3nForConditionalGeneration(
sample_rate=16000, sample_rate=16000,
# TODO enable chunking after more thorough testing. # TODO enable chunking after more thorough testing.
min_energy_split_window_size=None, min_energy_split_window_size=None,
) )
\ No newline at end of file
...@@ -41,22 +41,13 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor ...@@ -41,22 +41,13 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor
from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.rotary_embedding import get_rope
from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
from vllm.model_executor.model_loader.weight_utils import (
default_weight_loader,
maybe_remap_kv_scale_name,
)
from vllm.sequence import IntermediateTensors from vllm.sequence import IntermediateTensors
from vllm.v1.attention.backend import AttentionType from vllm.v1.attention.backend import AttentionType
from .interfaces import SupportsLoRA, SupportsPP from .interfaces import SupportsLoRA, SupportsPP
from .llama import LlamaMLP as Glm4MLP from .llama import LlamaMLP as Glm4MLP
from .llama import LlamaModel from .llama import LlamaModel
from .utils import ( from .utils import AutoWeightsLoader, PPMissingLayer, maybe_prefix
AutoWeightsLoader,
PPMissingLayer,
is_pp_missing_parameter,
maybe_prefix,
)
from vllm.utils import W8a8GetCacheJSON from vllm.utils import W8a8GetCacheJSON
from vllm import _custom_ops as ops from vllm import _custom_ops as ops
...@@ -94,12 +85,7 @@ class Glm4Attention(nn.Module): ...@@ -94,12 +85,7 @@ class Glm4Attention(nn.Module):
assert tp_size % self.total_num_kv_heads == 0 assert tp_size % self.total_num_kv_heads == 0
rope_params = getattr(config, "rope_parameters", None) rope_params = getattr(config, "rope_parameters", None)
if isinstance(rope_params, dict) and "partial_rotary_factor" in rope_params: config.rope_parameters.setdefault("partial_rotary_factor", 0.5)
config.rope_parameters.setdefault(
"partial_rotary_factor", rope_params["partial_rotary_factor"]
)
else:
config.rope_parameters.setdefault("partial_rotary_factor", 0.5)
self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size) self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
self.head_dim = head_dim or hidden_size // self.total_num_heads self.head_dim = head_dim or hidden_size // self.total_num_heads
...@@ -242,73 +228,6 @@ class Glm4Model(LlamaModel): ...@@ -242,73 +228,6 @@ class Glm4Model(LlamaModel):
vllm_config=vllm_config, prefix=prefix, layer_type=Glm4DecoderLayer vllm_config=vllm_config, prefix=prefix, layer_type=Glm4DecoderLayer
) )
def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
# Copy checkpoint tensors from `weights` (an iterable of (name, tensor)
# pairs) into this module's parameters and return the set of parameter
# names that were loaded.
# NOTE(review): indentation is flattened in this view of the file, so the
# nesting described in the comments below is inferred from statement order
# -- confirm against the properly indented source.
stacked_params_mapping = [
# (param_name, shard_name, shard_id)
# Each entry rewrites a per-shard checkpoint name (e.g. ".q_proj") to the
# fused parameter name (e.g. ".qkv_proj") and records which shard it is.
(".qkv_proj", ".q_proj", "q"),
(".qkv_proj", ".k_proj", "k"),
(".qkv_proj", ".v_proj", "v"),
(".gate_up_proj", ".gate_proj", 0),
(".gate_up_proj", ".up_proj", 1),
]
params_dict = dict(self.named_parameters())
loaded_params: set[str] = set()
for name, loaded_weight in weights:
# Weights for which a spec-layer index is reported are skipped here.
spec_layer = get_spec_layer_idx_from_weight_name(self.config, name)
if spec_layer is not None:
continue
# Rotary inverse-frequency buffers in the checkpoint are not loaded.
if "rotary_emb.inv_freq" in name:
continue
if "rotary_emb.cos_cached" in name or "rotary_emb.sin_cached" in name:
# Models trained using ColossalAI may include these tensors in
# the checkpoint. Skip them.
continue
if self.quant_config is not None and (
scale_name := self.quant_config.get_cache_scale(name)
):
# Loading kv cache quantization scales
param = params_dict[scale_name]
weight_loader = getattr(param, "weight_loader", default_weight_loader)
# Use the tensor itself when it is 0-dim, otherwise its first element.
loaded_weight = (
loaded_weight if loaded_weight.dim() == 0 else loaded_weight[0]
)
weight_loader(param, loaded_weight)
loaded_params.add(scale_name)
continue
if "scale" in name or "zero_point" in name:
# Remapping the name of FP8 kv-scale or zero point.
name = maybe_remap_kv_scale_name(name, params_dict)
# A None result means there is nothing to load for this entry.
if name is None:
continue
# Try the fused-parameter mappings first; the first entry whose shard name
# occurs in `name` handles the weight and ends the search via `break`.
for param_name, weight_name, shard_id in stacked_params_mapping:
if weight_name not in name:
continue
name = name.replace(weight_name, param_name)
# Skip loading extra bias for GPTQ models.
if name.endswith(".bias") and name not in params_dict:
continue
if is_pp_missing_parameter(name, self):
continue
param = params_dict[name]
weight_loader = param.weight_loader
weight_loader(param, loaded_weight, shard_id)
break
else:
# Presumably the `else` of the `for` above (runs when no mapping hit
# `break`): load the tensor as an ordinary, non-fused parameter.
# Skip loading extra bias for GPTQ models.
if name.endswith(".bias") and name not in params_dict:
continue
if is_pp_missing_parameter(name, self):
continue
param = params_dict[name]
weight_loader = getattr(param, "weight_loader", default_weight_loader)
weight_loader(param, loaded_weight)
# Record the (possibly remapped) name as loaded.
loaded_params.add(name)
return loaded_params
class Glm4ForCausalLM(nn.Module, SupportsLoRA, SupportsPP): class Glm4ForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
packed_modules_mapping = { packed_modules_mapping = {
...@@ -371,7 +290,7 @@ class Glm4ForCausalLM(nn.Module, SupportsLoRA, SupportsPP): ...@@ -371,7 +290,7 @@ class Glm4ForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
def forward( def forward(
self, self,
input_ids: torch.Tensor | None, input_ids: torch.Tensor,
positions: torch.Tensor, positions: torch.Tensor,
intermediate_tensors: IntermediateTensors | None = None, intermediate_tensors: IntermediateTensors | None = None,
inputs_embeds: torch.Tensor | None = None, inputs_embeds: torch.Tensor | None = None,
...@@ -394,16 +313,3 @@ class Glm4ForCausalLM(nn.Module, SupportsLoRA, SupportsPP): ...@@ -394,16 +313,3 @@ class Glm4ForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
skip_prefixes=(["lm_head."] if self.config.tie_word_embeddings else None), skip_prefixes=(["lm_head."] if self.config.tie_word_embeddings else None),
) )
return loader.load_weights(weights) return loader.load_weights(weights)
def get_spec_layer_idx_from_weight_name(
    config: Glm4Config, weight_name: str
) -> int | None:
    """Return the index of the next-N-predict layer that owns ``weight_name``.

    The candidate layers are numbered from ``config.num_hidden_layers`` up
    to, but not including, ``config.num_hidden_layers +
    config.num_nextn_predict_layers``. A weight belongs to a candidate when
    its name contains ``"layers.<idx>."``.

    Args:
        config: Model config; ``num_nextn_predict_layers`` may be absent.
        weight_name: Checkpoint weight name to inspect.

    Returns:
        The matching layer index, or ``None`` when the config declares no
        such layers or the name matches none of them.
    """
    num_spec_layers = getattr(config, "num_nextn_predict_layers", 0)
    if not num_spec_layers > 0:
        return None

    first_spec_idx = config.num_hidden_layers
    for spec_idx in range(first_spec_idx, first_spec_idx + num_spec_layers):
        if f"layers.{spec_idx}." in weight_name:
            return spec_idx
    return None
...@@ -24,8 +24,7 @@ ...@@ -24,8 +24,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
"""Inference-only GLM-4.1V & GLM-4.6V-Flash, AutoGLM-Phone-9B model """Inference-only GLM-4V model compatible with HuggingFace weights."""
compatible with HuggingFace weights."""
import os import os
import itertools import itertools
...@@ -1739,7 +1738,7 @@ class Glm4vForConditionalGeneration( ...@@ -1739,7 +1738,7 @@ class Glm4vForConditionalGeneration(
def forward( def forward(
self, self,
input_ids: torch.Tensor | None, input_ids: torch.Tensor,
positions: torch.Tensor, positions: torch.Tensor,
intermediate_tensors: IntermediateTensors | None = None, intermediate_tensors: IntermediateTensors | None = None,
inputs_embeds: torch.Tensor | None = None, inputs_embeds: torch.Tensor | None = None,
......
...@@ -451,7 +451,7 @@ class Glm4MoeModel(nn.Module): ...@@ -451,7 +451,7 @@ class Glm4MoeModel(nn.Module):
def forward( def forward(
self, self,
input_ids: torch.Tensor | None, input_ids: torch.Tensor,
positions: torch.Tensor, positions: torch.Tensor,
intermediate_tensors: IntermediateTensors | None = None, intermediate_tensors: IntermediateTensors | None = None,
inputs_embeds: torch.Tensor | None = None, inputs_embeds: torch.Tensor | None = None,
...@@ -687,7 +687,7 @@ class Glm4MoeForCausalLM(nn.Module, SupportsPP, SupportsLoRA, Glm4MixtureOfExper ...@@ -687,7 +687,7 @@ class Glm4MoeForCausalLM(nn.Module, SupportsPP, SupportsLoRA, Glm4MixtureOfExper
def forward( def forward(
self, self,
input_ids: torch.Tensor | None, input_ids: torch.Tensor,
positions: torch.Tensor, positions: torch.Tensor,
intermediate_tensors: IntermediateTensors | None = None, intermediate_tensors: IntermediateTensors | None = None,
inputs_embeds: torch.Tensor | None = None, inputs_embeds: torch.Tensor | None = None,
...@@ -722,4 +722,4 @@ def get_spec_layer_idx_from_weight_name( ...@@ -722,4 +722,4 @@ def get_spec_layer_idx_from_weight_name(
for i in range(config.num_nextn_predict_layers): for i in range(config.num_nextn_predict_layers):
if f"layers.{layer_idx + i}." in weight_name: if f"layers.{layer_idx + i}." in weight_name:
return layer_idx + i return layer_idx + i
return None return None
\ No newline at end of file
...@@ -264,7 +264,7 @@ class Glm4MoeLiteModel(nn.Module): ...@@ -264,7 +264,7 @@ class Glm4MoeLiteModel(nn.Module):
def forward( def forward(
self, self,
input_ids: torch.Tensor | None, input_ids: torch.Tensor,
positions: torch.Tensor, positions: torch.Tensor,
intermediate_tensors: IntermediateTensors | None = None, intermediate_tensors: IntermediateTensors | None = None,
inputs_embeds: torch.Tensor | None = None, inputs_embeds: torch.Tensor | None = None,
...@@ -596,7 +596,7 @@ class Glm4MoeLiteForCausalLM( ...@@ -596,7 +596,7 @@ class Glm4MoeLiteForCausalLM(
def forward( def forward(
self, self,
input_ids: torch.Tensor | None, input_ids: torch.Tensor,
positions: torch.Tensor, positions: torch.Tensor,
intermediate_tensors: IntermediateTensors | None = None, intermediate_tensors: IntermediateTensors | None = None,
inputs_embeds: torch.Tensor | None = None, inputs_embeds: torch.Tensor | None = None,
...@@ -639,4 +639,4 @@ def get_spec_layer_idx_from_weight_name( ...@@ -639,4 +639,4 @@ def get_spec_layer_idx_from_weight_name(
for i in range(config.num_nextn_predict_layers): for i in range(config.num_nextn_predict_layers):
if f"layers.{layer_idx + i}." in weight_name: if f"layers.{layer_idx + i}." in weight_name:
return layer_idx + i return layer_idx + i
return None return None
\ No newline at end of file
...@@ -230,7 +230,7 @@ class Glm4MoeLiteMTP(nn.Module, SupportsPP, Glm4MixtureOfExperts): ...@@ -230,7 +230,7 @@ class Glm4MoeLiteMTP(nn.Module, SupportsPP, Glm4MixtureOfExperts):
def forward( def forward(
self, self,
input_ids: torch.Tensor | None, input_ids: torch.Tensor,
positions: torch.Tensor, positions: torch.Tensor,
hidden_states: torch.Tensor, hidden_states: torch.Tensor,
intermediate_tensors: IntermediateTensors | None = None, intermediate_tensors: IntermediateTensors | None = None,
...@@ -461,4 +461,4 @@ class Glm4MoeLiteMTP(nn.Module, SupportsPP, Glm4MixtureOfExperts): ...@@ -461,4 +461,4 @@ class Glm4MoeLiteMTP(nn.Module, SupportsPP, Glm4MixtureOfExperts):
elif shared_weight: elif shared_weight:
# treat shared weights as top level weights # treat shared weights as top level weights
name = name.replace(f"model.layers.{spec_layer}.", "model.") name = name.replace(f"model.layers.{spec_layer}.", "model.")
return name return name
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment