Squelch MLA warning for Compressed-Tensors Models (#12704)

Signed-off-by: Kyle Sayers <kylesayrs@gmail.com>

Squelch MLA warning for Compressed-Tensors Models (#12704)
Signed-off-by: Kyle Sayers <kylesayrs@gmail.com>
6dd5e528 · Kyle Sayers · GitHub · c11de33d · 6dd5e528
Unverified commit 6dd5e528, authored Feb 03, 2025 by Kyle Sayers; committed by GitHub Feb 03, 2025
Show whitespace changes
Inline Side-by-side

Showing 1 changed file with 4 additions and 2 deletions

vllm/config.py +4 -2

No files found.
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -986,6 +986,9 @@ class ModelConfig:

    @property
    def use_mla(self) -> bool:
+        if not self.is_deepseek_mla or envs.VLLM_MLA_DISABLE:
+            return False
+
        if self.quantization is not None and self.quantization not in [\
            "fp8", "compressed-tensors"]:
            logger.warning(
@@ -1012,8 +1015,7 @@ class ModelConfig:
                        quant_config)
                    return False

-        use_mla = (self.is_deepseek_mla and not envs.VLLM_MLA_DISABLE)
-        return use_mla
+        return True

    @property
    def supported_runner_types(self) -> Set[RunnerType]: