Fix CompressedTensorsWNA16MoE with grouped scales (#13769)

4d251ad0 · Michael Goin · GitHub · 18e50593 · 4d251ad0
Unverified commit 4d251ad0, authored Feb 25, 2025 by Michael Goin; committed by GitHub on Feb 25, 2025.
Show whitespace changes
Inline Side-by-side

Showing 1 changed file with 2 additions and 1 deletion

vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py +2 -1

No files found.
--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
@@ -527,7 +527,8 @@ class CompressedTensorsWNA16MoEMethod(CompressedTensorsMoEMethod):
        replace_tensor("w13_weight_scale", marlin_w13_scales)
        marlin_w2_scales = marlin_moe_permute_scales(
            layer.w2_weight_scale,
-            layer.w2_weight_scale.shape[1] * self.packed_factor,
+            layer.w2_weight_scale.shape[1] *
+            (self.group_size if self.group_size != -1 else self.packed_factor),
            size_k2,
            self.group_size,
            self.num_bits,