[FIX]减少mqa_logits显存占用

c462f3a0 · wanghl6 · 3c0e74be · c462f3a0
Commit c462f3a0 authored Apr 17, 2026 by wanghl6
Show whitespace changes
Inline Side-by-side

Showing with 1 addition and 7 deletions

vllm/model_executor/layers/sparse_attn_indexer.py vllm/model_executor/layers/sparse_attn_indexer.py +1 -7

No files found.
--- a/vllm/model_executor/layers/sparse_attn_indexer.py
+++ b/vllm/model_executor/layers/sparse_attn_indexer.py
@@ -51,13 +51,7 @@ def sparse_attn_indexer(
    # careful! this will be None in dummy run
    attn_metadata = get_forward_context().attn_metadata
    fp8_dtype = current_platform.fp8_dtype()
-    if q_fp8.dtype == fp8_dtype:
+    MAX_ELEMENTS = 16384 * 16384 
-        MAX_ELEMENTS = 65536 * 65536
-    elif q_fp8.dtype in (torch.bfloat16, torch.float16):
-        MAX_ELEMENTS = 16384 * 32768
-    else:
-        MAX_ELEMENTS = 16384 * 32768 
    device = q_fp8.device
    if device not in _GLOBAL_LOGITS_BUFFERS or _GLOBAL_LOGITS_BUFFERS[device].numel() < MAX_ELEMENTS:
        _GLOBAL_LOGITS_BUFFERS[device] = torch.empty(