[Bugfix] Fix extract_hidden_states crash with quantized KV cache dtype (#39160)

Signed-off-by: Yubo Wang <yubowang2019@gmail.com>

[Bugfix] Fix extract_hidden_states crash with quantized KV cache dtype (#39160)
Signed-off-by: Yubo Wang <yubowang2019@gmail.com>
08bfedc1 · Yubo Wang · GitHub · 0102bd2f · 08bfedc1
Unverified Commit 08bfedc1 authored Apr 07, 2026 by Yubo Wang Committed by GitHub Apr 07, 2026
Show whitespace changes
Inline Side-by-side

Showing with 5 additions and 0 deletions

vllm/model_executor/models/extract_hidden_states.py vllm/model_executor/models/extract_hidden_states.py +5 -0

No files found.
--- a/vllm/model_executor/models/extract_hidden_states.py
+++ b/vllm/model_executor/models/extract_hidden_states.py
@@ -9,6 +9,7 @@ extract_hidden_states speculative decoding method.
 """

 from collections.abc import Iterable
+from dataclasses import replace
 from typing import ClassVar

 import torch
@@ -352,6 +353,10 @@ class ExtractHiddenStatesModel(nn.Module):

        cache_config = vllm_config.cache_config

+        # Hidden states dtype should be independent of KV cache dtype.
+        if cache_config is not None and is_quantized_kv_cache(cache_config.cache_dtype):
+            cache_config = replace(cache_config, cache_dtype="auto")
+
        # Create a single cache-only attention layer
        # Note: We set num_heads <- self.num_hidden_states
        # and head_size <- hidden_size so that we can insert