Commit a56e3da7 authored by wangmin6
Browse files

Merge branch 'wanglong3-v0.15.1-dev-patch-76439' into 'v0.15.1-dev'

fix: fix bug http://hpczentao.sugon.com/bug-view-118388.html

See merge request dcutoolkit/deeplearing/vllm!491
parents d2c4f48b 8bf99b0b
......@@ -1249,7 +1249,7 @@ class MLACommonBaseImpl(MLAAttentionImpl[A], Generic[A]):
# `W_UV` and `W_UK_T`, we just store fp16/bf16 copies and perform
# the bmm's in 16-bit, the extra memory overhead of this is fairly low
from vllm.model_executor.layers.linear import UnquantizedLinearMethod
if (envs.VLLM_USE_NN or self.use_llama_nn) and isinstance(self.kv_b_proj.quant_method, UnquantizedLinearMethod):
if (envs.VLLM_USE_NN or self.use_llama_nn): # and isinstance(self.kv_b_proj.quant_method, UnquantizedLinearMethod):
kv_b_proj_weight = get_and_maybe_dequant_weights(
self.kv_b_proj, out_dtype=act_dtype
)
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment