update kv_cache_dtype

a9c755ac · zhuwenwen · c0697921 · a9c755ac
Commit a9c755ac authored Jan 28, 2026 by zhuwenwen
Show whitespace changes
Inline Side-by-side

Showing with 1 addition and 1 deletion

vllm/v1/attention/backends/mla/common.py vllm/v1/attention/backends/mla/common.py +1 -1

No files found.
--- a/vllm/v1/attention/backends/mla/common.py
+++ b/vllm/v1/attention/backends/mla/common.py
@@ -1300,7 +1300,7 @@ class MLACommonImpl(MLAAttentionImpl[M], Generic[M]):
        if has_decode:
            assert attn_metadata.decode is not None
-            if torch.cuda.get_device_properties("cuda").gcnArchName.split(':')[0] == "gfx938" and kv_cache_dtype_str=="fp8_e4m3" and envs.VLLM_USE_FUSED_CACHE_QUANT_BMM_MLA:
+            if torch.cuda.get_device_properties("cuda").gcnArchName.split(':')[0] == "gfx938" and self.kv_cache_dtype=="fp8_e4m3" and envs.VLLM_USE_FUSED_CACHE_QUANT_BMM_MLA:
                decode_q = q_quant[:num_decode_tokens]
            decode_q_nope, decode_q_pe = decode_q.split(
                [self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1)