update DeepseekV32IndexerCache

2cbda743 · zhuwenwen · 734f52d8 · 2cbda743
Commit 2cbda743 authored Jan 05, 2026 by zhuwenwen
Show whitespace changes
Inline Side-by-side

Showing with 3 additions and 3 deletions

vllm/model_executor/models/deepseek_v2.py vllm/model_executor/models/deepseek_v2.py +3 -3

No files found.
--- a/vllm/model_executor/models/deepseek_v2.py
+++ b/vllm/model_executor/models/deepseek_v2.py
@@ -702,7 +702,7 @@ def sparse_attn_indexer(
        else:
            logits = gemmopt.paged_mqa_logits(
                padded_q_fp8_decode_tokens, 
-                kv_cache if torch.cuda.get_device_properties("cuda").gcnArchName.split(':')[0] == "gfx938" else kv_cache.to(torch.bfloat16), 
+                kv_cache, 
                weights[:num_padded_tokens] if torch.cuda.get_device_properties("cuda").gcnArchName.split(':')[0] == "gfx938" else weights[:num_padded_tokens].to(torch.float32), 
                decode_metadata.seq_lens, 
                decode_metadata.block_table, 
@@ -829,8 +829,8 @@ class Indexer(nn.Module):
        #       per self.quant_block_size element
        self.k_cache = DeepseekV32IndexerCache(
            head_dim=self.head_dim +
-            self.head_dim // self.quant_block_size * 4,
-            dtype=torch.uint8,
+            self.head_dim // self.quant_block_size * 4 if torch.cuda.get_device_properties("cuda").gcnArchName.split(':')[0] == "gfx938" else self.head_dim,
+            dtype=torch.uint8 if torch.cuda.get_device_properties("cuda").gcnArchName.split(':')[0] == "gfx938" else torch.bfloat16,
            prefix=f"{prefix}.k_cache",
            cache_config=cache_config)
        self.max_model_len = vllm_config.model_config.max_model_len