update MLACommonBaseImpl get_and_maybe_dequant_weights

b8412df6 · zhuwenwen · 24962bed · b8412df6
Commit b8412df6 authored Dec 04, 2025 by zhuwenwen
Hide whitespace changes
Inline Side-by-side

Showing with 5 additions and 2 deletions

vllm/v1/attention/backends/mla/common.py vllm/v1/attention/backends/mla/common.py +5 -2

No files found.
--- a/vllm/v1/attention/backends/mla/common.py
+++ b/vllm/v1/attention/backends/mla/common.py
@@ -1097,12 +1097,15 @@ class MLACommonBaseImpl(MLAAttentionImpl[A], Generic[A]):
                del eye
                # standardize to (output, input)
                return dequant_weights.T
-            return layer.weight
+            return layer.weight if not envs.VLLM_USE_NN else layer.weight.T
        # we currently do not have quantized bmm's which are needed for
        # `W_UV` and `W_UK_T`, we just store fp16/bf16 copies and perform
        # the bmm's in 16-bit, the extra memory overhead of this is fairly low
-        kv_b_proj_weight = get_and_maybe_dequant_weights(self.kv_b_proj).T
+        if self.use_llama_nn and isinstance(self.kv_b_proj.quant_method, UnquantizedLinearMethod):
+            kv_b_proj_weight = get_and_maybe_dequant_weights(self.kv_b_proj)
+        else:
+            kv_b_proj_weight = get_and_maybe_dequant_weights(self.kv_b_proj).T
        assert kv_b_proj_weight.shape == (
            self.kv_lora_rank,
            self.num_heads * (self.qk_nope_head_dim + self.v_head_dim)), (