Unverified commit 62830211, authored by Pavani Majety and committed by GitHub
Browse files

[Bugfix] Fix KV Scale loading for MLA Models (#35430)


Signed-off-by: Pavani Majety <pmajety@nvidia.com>
parent 01923eec
......@@ -12,7 +12,7 @@ from vllm.logger import init_logger
from vllm.model_executor.kernels.linear import (
init_fp8_linear_kernel,
)
-from vllm.model_executor.layers.attention import Attention
+from vllm.model_executor.layers.attention import Attention, MLAAttention
from vllm.model_executor.layers.fused_moe.activation import MoEActivation
from vllm.model_executor.layers.fused_moe.config import (
FusedMoEConfig,
......@@ -183,7 +183,7 @@ class ModelOptQuantConfigBase(QuantizationConfig):
self, layer: torch.nn.Module, prefix: str
) -> "QuantizeMethodBase | None":
# handle kv-cache first so we can focus only on weight quantization thereafter
-if isinstance(layer, Attention):
+if isinstance(layer, (Attention, MLAAttention)):
return self.KVCacheMethodCls(self)
# handle exclusion
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment