Update modelopt KV cache quantization resolution to new scheme (#31895)

Signed-off-by: Roi Koren <roik@nvidia.com>

Update modelopt KV cache quantization resolution to new scheme (#31895)
Signed-off-by: Roi Koren <roik@nvidia.com>
0c961487 · roikoren755 · GitHub · 583a90e0 · 0c961487
Unverified Commit 0c961487 authored Jan 10, 2026 by roikoren755 Committed by GitHub Jan 10, 2026
Show whitespace changes
Inline Side-by-side

Showing with 21 additions and 2 deletions

vllm/utils/torch_utils.py vllm/utils/torch_utils.py +21 -2

No files found.
--- a/vllm/utils/torch_utils.py
+++ b/vllm/utils/torch_utils.py
@@ -219,9 +219,28 @@ def get_kv_cache_quant_algo_string(quant_cfg: dict[str, Any]) -> str | None:
    if quant_method.startswith("modelopt"):
        quantization_inner = quant_cfg.get("quantization", quant_cfg)
        # Check if quant config is specified and use kv cache quant algo
-        kv_algo = quantization_inner.get("kv_cache_quant_algo") or quant_cfg.get(
-            "kv_cache_quant_algo"
+        kv_algo = (
+            quantization_inner.get("kv_cache_scheme")
+            or quant_cfg.get("kv_cache_scheme")
+            or quantization_inner.get("kv_cache_quant_algo")
+            or quant_cfg.get("kv_cache_quant_algo")
        )
+        if isinstance(kv_algo, dict):
+            if (
+                kv_algo.get("dynamic") is False
+                and kv_algo.get("num_bits") == 8
+                and kv_algo.get("type") == "float"
+            ):
+                kv_algo = "fp8"
+            else:
+                # Unknown/unsupported format - return "auto" as safe fallback
+                logger.warning(
+                    "WARNING: Unknown kv_cache_quant_algo '%s' in model "
+                    "config. Supported values: %s. Falling back to 'auto'.",
+                    f"{kv_algo}",
+                    list(MODELOPT_TO_VLLM_KV_CACHE_DTYPE_MAP.keys()),
+                )
+                return "auto"
        if isinstance(kv_algo, str):
            kv_algo_lower = kv_algo.lower()