[Bugfix] Disable --calculate-kv-scales for hybrid GDN/Mamba+Attention models (#37565)

Signed-off-by: Young-Leo <562593859@qq.com>

[Bugfix] Disable --calculate-kv-scales for hybrid GDN/Mamba+Attention models (#37565)
Signed-off-by: Young-Leo <562593859@qq.com>
d7d2b5e4 · Le Yang · GitHub · 6ec5e9fd · d7d2b5e4
Unverified commit d7d2b5e4, authored Mar 21, 2026 by Le Yang; committed by GitHub Mar 20, 2026.
Hide whitespace changes
Inline Side-by-side

Showing with 17 additions and 1 deletion

vllm/model_executor/models/config.py vllm/model_executor/models/config.py +17 -1

No files found.
--- a/vllm/model_executor/models/config.py
+++ b/vllm/model_executor/models/config.py
@@ -113,8 +113,24 @@ class HybridAttentionMambaModelConfig(VerifyAndUpdateConfig):
        Args:
            vllm_config: vLLM Config
        """
+        cache_config = vllm_config.cache_config
+
+        # Disable calculate_kv_scales for hybrid models: uninitialized
+        # recurrent state corrupts scales during the calibration pass.
+        # See issue: https://github.com/vllm-project/vllm/issues/37554
+        if cache_config.calculate_kv_scales:
+            logger.warning(
+                "Disabling calculate_kv_scales for hybrid model '%s'. "
+                "Hybrid models with recurrent layers (GDN, Mamba, SSM) "
+                "produce unreliable KV cache scales during the "
+                "calibration pass because recurrent state is "
+                "uninitialized. Using default scale of 1.0 instead.",
+                vllm_config.model_config.model,
+            )
+            cache_config.calculate_kv_scales = False
+
        # Save the user input before it gets modified by MambaModelConfig
-        mamba_block_size = vllm_config.cache_config.mamba_block_size
+        mamba_block_size = cache_config.mamba_block_size
        # Enable FULL_AND_PIECEWISE by default
        MambaModelConfig.verify_and_update_config(vllm_config)