Update q_quant dtype: pass torch.float8_e4m3fn instead of kv_cache_dtype_str to torch.empty_like

8f30468c · zhuwenwen · c7b0d0d4 · 8f30468c
Commit 8f30468c authored Jan 17, 2026 by zhuwenwen
Hide whitespace changes
Inline Side-by-side

Showing with 1 addition and 1 deletion

vllm/v1/attention/backends/mla/common.py +1 -1

No files found.
--- a/vllm/v1/attention/backends/mla/common.py
+++ b/vllm/v1/attention/backends/mla/common.py
@@ -1253,7 +1253,7 @@ class MLACommonImpl(MLAAttentionImpl[M], Generic[M]):
                        ) 
                    else:
                        q_tensor = torch.randn(q.shape[0], num_local_heads, self.qk_nope_head_dim + self.qk_rope_head_dim, dtype=q.dtype, device=q.device)
-                        q_quant = torch.empty_like(q_tensor, dtype=kv_cache_dtype_str, device=q.device)
+                        q_quant = torch.empty_like(q_tensor, dtype=torch.float8_e4m3fn, device=q.device)
                        q_scale = torch.empty(q.shape[0], dtype=torch.float32, device=q.device)
                        fuse_rmsnorm_rope_quant_qkv(
                            positions[:num_actual_toks, ...],