[Attention][V0 Deprecation] Deprecate accept output buffer (#39125)

Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>

[Attention][V0 Deprecation] Deprecate accept output buffer (#39125)
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
70406eb1 · Lucas Wilkinson · GitHub · 08bfedc1 · 70406eb1 · 70406eb1
Commit 70406eb1 (unverified), authored Apr 07, 2026 by Lucas Wilkinson; committed by GitHub on Apr 07, 2026.
Hide whitespace changes
Inline Side-by-side

Showing 2 changed files with 2 additions and 8 deletions

vllm/v1/attention/backends/tree_attn.py (+1 -4)

vllm/v1/attention/backends/triton_attn.py (+1 -4)

No files found.
--- a/vllm/v1/attention/backends/tree_attn.py
+++ b/vllm/v1/attention/backends/tree_attn.py
@@ -30,7 +30,6 @@ logger = init_logger(__name__)


 class TreeAttentionBackend(AttentionBackend):
-    accept_output_buffer: bool = True
    supported_dtypes: ClassVar[list[torch.dtype]] = [torch.float16, torch.bfloat16]
    supported_kv_cache_dtypes: ClassVar[list[CacheDType]] = [
        "auto",
@@ -368,7 +367,7 @@ class TreeAttentionImpl(AttentionImpl):
        value: torch.Tensor,
        kv_cache: torch.Tensor,
        attn_metadata: TreeAttentionMetadata,
-        output: torch.Tensor | None = None,
+        output: torch.Tensor,
        output_scale: torch.Tensor | None = None,
        output_block_scale: torch.Tensor | None = None,
    ) -> torch.Tensor:
@@ -384,8 +383,6 @@ class TreeAttentionImpl(AttentionImpl):
        Returns:
            shape = [num_tokens, num_heads * head_size]
        """
-        assert output is not None, "Output tensor must be provided."
-
        if output_scale is not None or output_block_scale is not None:
            raise NotImplementedError(
                "fused output quantization is not yet supported for TreeAttentionImpl"

--- a/vllm/v1/attention/backends/triton_attn.py
+++ b/vllm/v1/attention/backends/triton_attn.py
@@ -262,7 +262,6 @@ class TritonAttentionMetadataBuilder(AttentionMetadataBuilder[TritonAttentionMet


 class TritonAttentionBackend(AttentionBackend):
-    accept_output_buffer: bool = True
    supported_dtypes: ClassVar[list[torch.dtype]] = [
        torch.float16,
        torch.bfloat16,
@@ -504,7 +503,7 @@ class TritonAttentionImpl(AttentionImpl):
        value: torch.Tensor,
        kv_cache: torch.Tensor,
        attn_metadata: TritonAttentionMetadata,
-        output: torch.Tensor | None = None,
+        output: torch.Tensor,
        output_scale: torch.Tensor | None = None,
        output_block_scale: torch.Tensor | None = None,
    ) -> torch.Tensor:
@@ -520,8 +519,6 @@ class TritonAttentionImpl(AttentionImpl):
        Returns:
            shape = [num_tokens, num_heads * head_size]
        """
-        assert output is not None, "Output tensor must be provided."
-
        if output_block_scale is not None:
            raise NotImplementedError(
                "fused block_scale output quantization is not yet supported"