feat:修改fp8 mqa接口&&跳过VLLM_USE_FUSED_FILL_RMS_CAT&&跳过load_error

3a306316 · liuchy5 · 8d371e97 · 3a306316
Commit 3a306316 authored Mar 11, 2026 by liuchy5
Show whitespace changes
Inline Side-by-side

Showing with 4 additions and 4 deletions

vllm/model_executor/layers/sparse_attn_indexer.py vllm/model_executor/layers/sparse_attn_indexer.py +4 -4

No files found.
--- a/vllm/model_executor/layers/sparse_attn_indexer.py
+++ b/vllm/model_executor/layers/sparse_attn_indexer.py
@@ -124,15 +124,15 @@ def sparse_attn_indexer(
                )
                logits = op.mqa_logits(
                    q_fp8[chunk.token_start:chunk.token_end],  
-                    k, 
+                    k_fp8, 
-                    weights[chunk.token_start:chunk.token_end].to(torch.float32), 
+                    weights[chunk.token_start:chunk.token_end], 
                    chunk.cu_seqlen_ks, 
                    chunk.cu_seqlen_ke,
                    q_fp8[chunk.token_start:chunk.token_end].shape[0],
-                    k.shape[0],
+                    k_fp8.shape[0],
                    q_fp8.shape[1],
                    q_fp8.shape[2],
-                    k_scale,
+                    k_scale.view(torch.float32).flatten(),
                    True
                    )
            else: