Unverified commit 0dd63639, authored by Andy Liu and committed by GitHub
Browse files

[MTP][GLM][Bugfix] Fixed .weight_scale loading logic that dropped MTP...


[MTP][GLM][Bugfix] Fixed .weight_scale loading logic that dropped MTP prediction accuracy with fp8+mtp (#32101)
Signed-off-by: Andy Liu <andyliu@roblox.com>
parent ef96fa3f
...@@ -268,11 +268,6 @@ class Glm4MoeMTP(nn.Module, SupportsPP, Glm4MixtureOfExperts): ...@@ -268,11 +268,6 @@ class Glm4MoeMTP(nn.Module, SupportsPP, Glm4MixtureOfExperts):
if spec_layer is None: if spec_layer is None:
continue continue
name = self._rewrite_spec_layer_name(spec_layer, name) name = self._rewrite_spec_layer_name(spec_layer, name)
# Some checkpoints include weight scale tensors for the LM head even
# when the quantized head isn't built. Skip them if the model does
# not expose a matching parameter to avoid KeyError during load.
if name.endswith(".weight_scale") and name not in params_dict:
continue
for param_name, weight_name, shard_id in stacked_params_mapping: for param_name, weight_name, shard_id in stacked_params_mapping:
# Skip non-stacked layers and experts (experts handled below). # Skip non-stacked layers and experts (experts handled below).
if weight_name not in name: if weight_name not in name:
...@@ -315,6 +310,12 @@ class Glm4MoeMTP(nn.Module, SupportsPP, Glm4MixtureOfExperts): ...@@ -315,6 +310,12 @@ class Glm4MoeMTP(nn.Module, SupportsPP, Glm4MixtureOfExperts):
# Skip loading extra bias for GPTQ models. # Skip loading extra bias for GPTQ models.
if name.endswith(".bias") and name not in params_dict: if name.endswith(".bias") and name not in params_dict:
continue continue
# Some checkpoints include weight scale tensors for the
# LM head even when the quantized head isn't built. Skip
# them if the model does not expose a matching parameter
# to avoid KeyError during load.
if name.endswith(".weight_scale") and name not in params_dict:
continue
# According to DeepSeek-V3 Technical Report, MTP modules # According to DeepSeek-V3 Technical Report, MTP modules
# shares embedding layer. We only load the first weights. # shares embedding layer. We only load the first weights.
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment