[Bugfix] Remove `tile_size=64` for mm_prefix triton attention (#30973)

Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
(cherry picked from commit d2dc5dfc)

[Bugfix] Remove `tile_size=64` for mm_prefix triton attention (#30973)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
(cherry picked from commit d2dc5dfc)
b2eb84de · Isotr0py · Kevin H. Luu · ac43367c · b2eb84de
Commit b2eb84de, authored Dec 19, 2025 by Isotr0py; committed by Kevin H. Luu on Dec 18, 2025.
Show whitespace changes
Inline Side-by-side

Showing 1 changed file with 0 additions and 7 deletions

vllm/attention/ops/triton_unified_attention.py vllm/attention/ops/triton_unified_attention.py +0 -7

No files found.
--- a/vllm/attention/ops/triton_unified_attention.py
+++ b/vllm/attention/ops/triton_unified_attention.py
@@ -800,7 +800,6 @@ def _get_tile_size(
    head_size: int,
    sliding_window: int,
    element_size: int,
-    is_mm_prefix: bool,
    is_prefill: bool,
 ) -> int:
    """Select tile size with Gemma3-specific optimization.
@@ -809,10 +808,6 @@ def _get_tile_size(
    the larger head dimension (128/256). For other models, use
    the default vLLM behavior.
    """
-    if is_mm_prefix:
-        # Multimodal bidirectional attention needs a larger tile size
-        return 64
-
    if _is_gemma3_attention(head_size, sliding_window):
        # Gemma3: use 32 for decode (default is 16)
        return 32
@@ -903,14 +898,12 @@ def unified_attention(
        head_size,
        sliding_window_val,
        q.element_size(),
-        is_mm_prefix=use_mm_prefix,
        is_prefill=True,
    )
    TILE_SIZE_DECODE = _get_tile_size(
        head_size,
        sliding_window_val,
        q.element_size(),
-        is_mm_prefix=use_mm_prefix,
        is_prefill=False,
    )