Merge branch 'wanglong3-v0.15.1-dev-patch-76439' into 'v0.15.1-dev'

fix: fix bug http://hpczentao.sugon.com/bug-view-118388.html

See merge request dcutoolkit/deeplearing/vllm!491

Merge branch 'wanglong3-v0.15.1-dev-patch-76439' into 'v0.15.1-dev'
fix: fix bug http://hpczentao.sugon.com/bug-view-118388.html — see merge request dcutoolkit/deeplearing/vllm!491
a56e3da7 · wangmin6 · d2c4f48b · 8bf99b0b · a56e3da7
Commit a56e3da7 authored Mar 12, 2026 by wangmin6
Hide whitespace changes
Inline Side-by-side

Showing with 1 addition and 1 deletion

vllm/model_executor/layers/attention/mla_attention.py vllm/model_executor/layers/attention/mla_attention.py +1 -1

No files found.
--- a/vllm/model_executor/layers/attention/mla_attention.py
+++ b/vllm/model_executor/layers/attention/mla_attention.py
@@ -1249,7 +1249,7 @@ class MLACommonBaseImpl(MLAAttentionImpl[A], Generic[A]):
        # `W_UV` and `W_UK_T`, we just store fp16/bf16 copies and perform
        # the bmm's in 16-bit, the extra memory overhead of this is fairly low
        from vllm.model_executor.layers.linear import UnquantizedLinearMethod
-        if (envs.VLLM_USE_NN or self.use_llama_nn) and isinstance(self.kv_b_proj.quant_method, UnquantizedLinearMethod):
+        if (envs.VLLM_USE_NN or self.use_llama_nn): #  and isinstance(self.kv_b_proj.quant_method, UnquantizedLinearMethod):
            kv_b_proj_weight = get_and_maybe_dequant_weights(
                self.kv_b_proj, out_dtype=act_dtype
            )