[Model] Granite-4 support loading quantized checkpoint (#22925)

Signed-off-by: Chih-Chieh-Yang <7364402+cyang49@users.noreply.github.com>

[Model] Granite-4 support loading quantized checkpoint (#22925)
Signed-off-by: Chih-Chieh-Yang <7364402+cyang49@users.noreply.github.com>
6cd69f51 · Chih-Chieh Yang · GitHub · 8ad7285e · 6cd69f51
Unverified Commit 6cd69f51 authored Aug 15, 2025 by Chih-Chieh Yang Committed by GitHub Aug 15, 2025
Hide whitespace changes
Inline Side-by-side

Showing with 6 additions and 2 deletions

vllm/model_executor/models/granitemoehybrid.py vllm/model_executor/models/granitemoehybrid.py +6 -2

No files found.
--- a/vllm/model_executor/models/granitemoehybrid.py
+++ b/vllm/model_executor/models/granitemoehybrid.py
@@ -471,7 +471,10 @@ class GraniteMoeHybridModel(nn.Module):
            # Mapping different experts' layout:
            #  from HF (input_linear, output_linear, router)
            #  to vLLM (experts_w13({e}.w1, {e}.w3), experts_w2({e}.w2), gate)
-            if n.endswith('.block_sparse_moe.input_linear.weight'):
+            # The renaming and parameter loading logic is the same for weight
+            # and weight_scale tensors, so the same code path handles both.
+            if (n.endswith('.block_sparse_moe.input_linear.weight') or
+                    n.endswith('.block_sparse_moe.input_linear.weight_scale')):
                for e in range(p.size(0)):
                    w1_name = n.replace(
                        '.block_sparse_moe.input_linear.weight',
@@ -490,7 +493,8 @@ class GraniteMoeHybridModel(nn.Module):
                                 w3_name,
                                 shard_id='w3',
                                 expert_id=e)
-            elif n.endswith('.block_sparse_moe.output_linear.weight'):
+            elif (n.endswith('.block_sparse_moe.output_linear.weight') or
+                  n.endswith('.block_sparse_moe.output_linear.weight_scale')):
                for e in range(p.size(0)):
                    w2_name = n.replace(
                        '.block_sparse_moe.output_linear.weight',