Update k_cache: reinterpret the KV cache as FP8 with .view() instead of converting it with .to()

31a3beb5 · zhuwenwen · ee93cb70 · 31a3beb5 · 31a3beb5
Commit 31a3beb5 authored Dec 19, 2025 by zhuwenwen
Hide whitespace changes
Inline Side-by-side

Showing 2 changed files with 2 additions and 2 deletions

vllm/attention/backends/flashmla.py vllm/attention/backends/flashmla.py +1 -1

vllm/v1/attention/backends/mla/flashmla.py vllm/v1/attention/backends/mla/flashmla.py +1 -1

No files found.
--- a/vllm/attention/backends/flashmla.py
+++ b/vllm/attention/backends/flashmla.py
@@ -266,7 +266,7 @@ class FlashMLAImpl(MLACommonImpl[FlashMLAMetadata]):
        if torch.cuda.get_device_properties("cuda").gcnArchName.split(':')[0] == "gfx938" and kv_cache_dtype == "fp8_e4m3" and envs.VLLM_USE_FLASH_MLA_FP8:
            o, _ = flash_mla_with_kvcache_fp8(
                q=q.to(torch.float8_e4m3fn),
-                k_cache=kv_c_and_k_pe_cache.unsqueeze(-2).to(torch.float8_e4m3fn),  # Add head dim of 1
+                k_cache=kv_c_and_k_pe_cache.unsqueeze(-2).view(torch.float8_e4m3fn),  # Add head dim of 1
                block_table=decode_meta.block_tables,
                cache_seqlens=decode_meta.seq_lens_tensor,
                head_dim_v=self.kv_lora_rank,

--- a/vllm/v1/attention/backends/mla/flashmla.py
+++ b/vllm/v1/attention/backends/mla/flashmla.py
@@ -194,7 +194,7 @@ class FlashMLAImpl(MLACommonImpl[FlashMLAMetadata]):
                    .unsqueeze(1) # Add seqlen dim of 1 (decode)
            o, _ = flash_mla_with_kvcache_fp8(
                q=q.to(torch.float8_e4m3fn),
-                k_cache=kv_c_and_k_pe_cache.unsqueeze(-2).to(torch.float8_e4m3fn),  # Add head dim of 1
+                k_cache=kv_c_and_k_pe_cache.unsqueeze(-2).view(torch.float8_e4m3fn),  # Add head dim of 1
                block_table=attn_metadata.decode.block_table,
                cache_seqlens=attn_metadata.decode.seq_lens,
                head_dim_v=self.kv_lora_rank,