Unverified commit 62830211, authored by Pavani Majety, committed by GitHub
Browse files

[Bugfix] Fix KV Scale loading for MLA Models (#35430)


Signed-off-by: Pavani Majety <pmajety@nvidia.com>
parent 01923eec
...@@ -12,7 +12,7 @@ from vllm.logger import init_logger ...@@ -12,7 +12,7 @@ from vllm.logger import init_logger
from vllm.model_executor.kernels.linear import ( from vllm.model_executor.kernels.linear import (
init_fp8_linear_kernel, init_fp8_linear_kernel,
) )
from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.attention import Attention, MLAAttention
from vllm.model_executor.layers.fused_moe.activation import MoEActivation from vllm.model_executor.layers.fused_moe.activation import MoEActivation
from vllm.model_executor.layers.fused_moe.config import ( from vllm.model_executor.layers.fused_moe.config import (
FusedMoEConfig, FusedMoEConfig,
...@@ -183,7 +183,7 @@ class ModelOptQuantConfigBase(QuantizationConfig): ...@@ -183,7 +183,7 @@ class ModelOptQuantConfigBase(QuantizationConfig):
self, layer: torch.nn.Module, prefix: str self, layer: torch.nn.Module, prefix: str
) -> "QuantizeMethodBase | None": ) -> "QuantizeMethodBase | None":
# handle kv-cache first so we can focus only on weight quantization thereafter # handle kv-cache first so we can focus only on weight quantization thereafter
if isinstance(layer, Attention): if isinstance(layer, (Attention, MLAAttention)):
return self.KVCacheMethodCls(self) return self.KVCacheMethodCls(self)
# handle exclusion # handle exclusion
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment