[Kernel] [V1] Fix performance regression for triton unified attention (#18161)

Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com> Co-authored-by: Lucas Wilkinson <lwilkinson@neuralmagic.com>

[Kernel] [V1] Fix performance regression for triton unified attention (#18161)
Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com> Co-authored-by: Lucas Wilkinson <lwilkinson@neuralmagic.com>
01c22335 · Thomas Parnell · GitHub · 451da4bc · 01c22335 · 01c22335
Unverified Commit 01c22335 authored May 15, 2025 by Thomas Parnell Committed by GitHub May 15, 2025
Showing with 18 additions and 5 deletions

vllm/attention/ops/triton_unified_attention.py vllm/attention/ops/triton_unified_attention.py +2 -2

vllm/v1/attention/backends/triton_attn.py vllm/v1/attention/backends/triton_attn.py +16 -3

No files found.
--- a/vllm/attention/ops/triton_unified_attention.py
+++ b/vllm/attention/ops/triton_unified_attention.py
@@ -56,11 +56,11 @@ def kernel_unified_attention_2d(
    stride_k_cache_0: tl.int64,  # int
    stride_k_cache_1: tl.int64,  # int
    stride_k_cache_2: tl.int64,  # int
-    stride_k_cache_3: tl.int64,  # int
+    stride_k_cache_3: tl.constexpr,  # int
    stride_v_cache_0: tl.int64,  # int
    stride_v_cache_1: tl.int64,  # int
    stride_v_cache_2: tl.int64,  # int
-    stride_v_cache_3: tl.int64,  # int
+    stride_v_cache_3: tl.constexpr,  # int
    query_start_len_ptr,  # [num_seqs+1]
    BLOCK_Q: tl.constexpr,  # int
    num_seqs: tl.int32,

--- a/vllm/v1/attention/backends/triton_attn.py
+++ b/vllm/v1/attention/backends/triton_attn.py
 # SPDX-License-Identifier: Apache-2.0
 """Attention layer with PagedAttention and Triton prefix prefill."""
-from typing import Any, Optional
+from typing import TYPE_CHECKING, Any, Optional
 import torch
@@ -12,10 +12,23 @@ from vllm.logger import init_logger
 from vllm.platforms import current_platform
 from vllm.v1.attention.backends.flash_attn import (
    FlashAttentionMetadata, FlashAttentionMetadataBuilder)
+from vllm.v1.kv_cache_interface import AttentionSpec
+from vllm.v1.worker.block_table import BlockTable
+if TYPE_CHECKING:
+    from vllm.v1.worker.gpu_model_runner import GPUModelRunner
 logger = init_logger(__name__)
+class TritonAttentionMetadataBuilder(FlashAttentionMetadataBuilder):
+    def __init__(self, runner: "GPUModelRunner", kv_cache_spec: AttentionSpec,
+                 block_table: BlockTable):
+        super().__init__(runner, kv_cache_spec, block_table)
+        self.aot_schedule = False
 class TritonAttentionBackend(AttentionBackend):
    accept_output_buffer: bool = True
@@ -52,8 +65,8 @@ class TritonAttentionBackend(AttentionBackend):
        return False
    @staticmethod
-    def get_builder_cls() -> type["FlashAttentionMetadataBuilder"]:
+    def get_builder_cls() -> type["TritonAttentionMetadataBuilder"]:
-        return FlashAttentionMetadataBuilder
+        return TritonAttentionMetadataBuilder
 class TritonAttentionImpl(AttentionImpl):