[Bugfix][MTP] Fix GLM4 MoE fp8 loading with MTP on (#31757)

Signed-off-by: Andy Liu <andyliu@roblox.com>

[Bugfix][MTP] Fix GLM4 MoE fp8 loading with MTP on (#31757)
Signed-off-by: Andy Liu <andyliu@roblox.com>
d111bc53 · Andy Liu · GitHub · 0790f076 · d111bc53
Unverified commit d111bc53, authored Jan 07, 2026 by Andy Liu; committed by GitHub on Jan 07, 2026
Show whitespace changes
Inline Side-by-side

Showing 1 changed file with 6 additions and 1 deletion

vllm/model_executor/models/glm4_moe_mtp.py (+6 -1)

No files found.
--- a/vllm/model_executor/models/glm4_moe_mtp.py
+++ b/vllm/model_executor/models/glm4_moe_mtp.py
@@ -106,7 +106,7 @@ class Glm4MoeMultiTokenPredictorLayer(nn.Module):
    ) -> torch.Tensor:
        assert inputs_embeds is not None
        # masking inputs at position 0, as not needed by MTP
-        inputs_embeds[positions == 0] = 0
+        inputs_embeds = torch.where(positions.unsqueeze(-1) == 0, 0, inputs_embeds)
        inputs_embeds = self.enorm(inputs_embeds)
        previous_hidden_states = self.hnorm(previous_hidden_states)
@@ -268,6 +268,11 @@ class Glm4MoeMTP(nn.Module, SupportsPP, Glm4MixtureOfExperts):
                if spec_layer is None:
                    continue
                name = self._rewrite_spec_layer_name(spec_layer, name)
+            # Some checkpoints include weight scale tensors for the LM head even
+            # when the quantized head isn't built. Skip them if the model does
+            # not expose a matching parameter to avoid KeyError during load.
+            if name.endswith(".weight_scale") and name not in params_dict:
+                continue
            for param_name, weight_name, shard_id in stacked_params_mapping:
                # Skip non-stacked layers and experts (experts handled below).
                if weight_name not in name: