[Bugfix] Properly initialize `PerTensorScaleParameter` for fused-on-disk checkpoints (#39765)

Signed-off-by: Hemmi Shinichi <shemmi@preferred.jp>
Signed-off-by: Shinichi Hemmi <50256998+Alnusjaponica@users.noreply.github.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>

[Bugfix] Properly initialize `PerTensorScaleParameter` for fused-on-disk checkpoints (#39765)
Signed-off-by: Hemmi Shinichi <shemmi@preferred.jp>
Signed-off-by: Shinichi Hemmi <50256998+Alnusjaponica@users.noreply.github.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
fcb31c1a · Shinichi Hemmi · GitHub · d886c26d · fcb31c1a
Unverified Commit fcb31c1a authored Apr 20, 2026 by Shinichi Hemmi Committed by GitHub Apr 20, 2026
Hide whitespace changes
Inline Side-by-side

Showing with 18 additions and 6 deletions

vllm/model_executor/layers/linear.py vllm/model_executor/layers/linear.py +18 -6

No files found.
--- a/vllm/model_executor/layers/linear.py
+++ b/vllm/model_executor/layers/linear.py
@@ -916,9 +916,15 @@ class MergedColumnParallelLinear(ColumnParallelLinear):
                            loaded_weight=loaded_weight, shard_id=idx
                        )
                else:
-                    param.load_merged_column_weight(
+                    # When weights are already fused on disk (e.g. Phi-3's
-                        loaded_weight=loaded_weight, shard_id=0
+                    # gate_up_proj), there is only a single scale for the
-                    )
+                    # entire fused matrix. Fill all slots with this scale
+                    # to ensure that any subsequent reduction (like .max())
+                    # works correctly while preserving the parameter shape.
+                    for idx in range(param.data.shape[0]):
+                        param.load_merged_column_weight(
+                            loaded_weight=loaded_weight, shard_id=idx
+                        )
                return
            elif type(param) in (RowvLLMParameter, BasevLLMParameter):
                param.load_merged_column_weight(loaded_weight=loaded_weight)
@@ -1130,9 +1136,15 @@ class QKVParallelLinear(ColumnParallelLinear):
        self.validate_shard_id(loaded_shard_id)
        if loaded_shard_id is None:  # special case for certain models
            if isinstance(param, PerTensorScaleParameter):
-                param.load_qkv_weight(
+                # When weights are already fused on disk (e.g. Phi-3's
-                    loaded_weight=loaded_weight, shard_id=0, tp_rank=self.tp_rank
+                # qkv_proj), there is only a single scale for the entire
-                )
+                # fused matrix. Fill all slots (q, k, v) with this scale
+                # to ensure that any subsequent reduction (like .max())
+                # works correctly while preserving the parameter shape.
+                for idx in range(param.data.shape[0]):
+                    param.load_qkv_weight(
+                        loaded_weight=loaded_weight, shard_id=idx, tp_rank=self.tp_rank
+                    )
                return
            elif type(param) in (RowvLLMParameter, BasevLLMParameter):
                param.load_qkv_weight(loaded_weight=loaded_weight, tp_rank=self.tp_rank)