[Fix] Load kv-cache dtype from hf_quant_config.json automatically (#29980)

Signed-off-by: Daniel Afrimi <dafrimi@nvidia.com>

[Fix] Load kv-cache dtype from hf_quant_config.json automatically (#29980)
Signed-off-by: Daniel Afrimi <dafrimi@nvidia.com>
6ec0d8db · danielafrimi · GitHub · 9693dd0f · 6ec0d8db
Unverified commit 6ec0d8db, authored Dec 12, 2025 by danielafrimi; committed by GitHub on Dec 12, 2025.
Show whitespace changes
Inline Side-by-side

Showing 1 changed file with 23 additions and 2 deletions

vllm/utils/torch_utils.py vllm/utils/torch_utils.py +23 -2

No files found.
--- a/vllm/utils/torch_utils.py
+++ b/vllm/utils/torch_utils.py
@@ -194,12 +194,33 @@ def get_kv_cache_torch_dtype(
    return torch_dtype
+def get_kv_cache_quant_algo_dtype(quant_cfg: dict[str, Any]) -> torch.dtype | None:
+    quant_method = quant_cfg.get("quant_method", "")
+    if quant_method.startswith("modelopt"):
+        quantization_inner = quant_cfg.get("quantization", quant_cfg)
+        # Check if quant config is specified and use kv cache quant algo
+        kv_algo = quantization_inner.get("kv_cache_quant_algo") or quant_cfg.get(
+            "kv_cache_quant_algo"
+        )
+        if isinstance(kv_algo, str):
+            return STR_DTYPE_TO_TORCH_DTYPE[kv_algo.lower()]
+    return None
 def kv_cache_dtype_str_to_dtype(
    kv_cache_dtype: str, model_config: ModelConfig
 ) -> torch.dtype:
-    if kv_cache_dtype == "auto":
    # Model config may not be specified for unit tests, default to float16
-        return model_config.dtype if model_config else torch.half
+    dtype = model_config.dtype if model_config else torch.half
+    if kv_cache_dtype == "auto":
+        hf_cfg = getattr(model_config, "hf_config", None)
+        if hf_cfg is not None:
+            quant_cfg = getattr(hf_cfg, "quantization_config", None)
+            if quant_cfg is not None:
+                kv_algo_dtype = get_kv_cache_quant_algo_dtype(quant_cfg)
+                return kv_algo_dtype if kv_algo_dtype is not None else dtype
+        return dtype
    return STR_DTYPE_TO_TORCH_DTYPE[kv_cache_dtype]