"docs/pages/components/vscode:/vscode.git/clone" did not exist on "5c64ffc3c8679c6d3a4065dfb71bb6b16f39496f"
Unverified commit 70406eb1, authored by Lucas Wilkinson and committed by GitHub
Browse files

[Attention][V0 Deprecation] Deprecate accept output buffer (#39125)


Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
parent 08bfedc1
...@@ -30,7 +30,6 @@ logger = init_logger(__name__) ...@@ -30,7 +30,6 @@ logger = init_logger(__name__)
class TreeAttentionBackend(AttentionBackend): class TreeAttentionBackend(AttentionBackend):
accept_output_buffer: bool = True
supported_dtypes: ClassVar[list[torch.dtype]] = [torch.float16, torch.bfloat16] supported_dtypes: ClassVar[list[torch.dtype]] = [torch.float16, torch.bfloat16]
supported_kv_cache_dtypes: ClassVar[list[CacheDType]] = [ supported_kv_cache_dtypes: ClassVar[list[CacheDType]] = [
"auto", "auto",
...@@ -368,7 +367,7 @@ class TreeAttentionImpl(AttentionImpl): ...@@ -368,7 +367,7 @@ class TreeAttentionImpl(AttentionImpl):
value: torch.Tensor, value: torch.Tensor,
kv_cache: torch.Tensor, kv_cache: torch.Tensor,
attn_metadata: TreeAttentionMetadata, attn_metadata: TreeAttentionMetadata,
output: torch.Tensor | None = None, output: torch.Tensor,
output_scale: torch.Tensor | None = None, output_scale: torch.Tensor | None = None,
output_block_scale: torch.Tensor | None = None, output_block_scale: torch.Tensor | None = None,
) -> torch.Tensor: ) -> torch.Tensor:
...@@ -384,8 +383,6 @@ class TreeAttentionImpl(AttentionImpl): ...@@ -384,8 +383,6 @@ class TreeAttentionImpl(AttentionImpl):
Returns: Returns:
shape = [num_tokens, num_heads * head_size] shape = [num_tokens, num_heads * head_size]
""" """
assert output is not None, "Output tensor must be provided."
if output_scale is not None or output_block_scale is not None: if output_scale is not None or output_block_scale is not None:
raise NotImplementedError( raise NotImplementedError(
"fused output quantization is not yet supported for TreeAttentionImpl" "fused output quantization is not yet supported for TreeAttentionImpl"
......
...@@ -262,7 +262,6 @@ class TritonAttentionMetadataBuilder(AttentionMetadataBuilder[TritonAttentionMet ...@@ -262,7 +262,6 @@ class TritonAttentionMetadataBuilder(AttentionMetadataBuilder[TritonAttentionMet
class TritonAttentionBackend(AttentionBackend): class TritonAttentionBackend(AttentionBackend):
accept_output_buffer: bool = True
supported_dtypes: ClassVar[list[torch.dtype]] = [ supported_dtypes: ClassVar[list[torch.dtype]] = [
torch.float16, torch.float16,
torch.bfloat16, torch.bfloat16,
...@@ -504,7 +503,7 @@ class TritonAttentionImpl(AttentionImpl): ...@@ -504,7 +503,7 @@ class TritonAttentionImpl(AttentionImpl):
value: torch.Tensor, value: torch.Tensor,
kv_cache: torch.Tensor, kv_cache: torch.Tensor,
attn_metadata: TritonAttentionMetadata, attn_metadata: TritonAttentionMetadata,
output: torch.Tensor | None = None, output: torch.Tensor,
output_scale: torch.Tensor | None = None, output_scale: torch.Tensor | None = None,
output_block_scale: torch.Tensor | None = None, output_block_scale: torch.Tensor | None = None,
) -> torch.Tensor: ) -> torch.Tensor:
...@@ -520,8 +519,6 @@ class TritonAttentionImpl(AttentionImpl): ...@@ -520,8 +519,6 @@ class TritonAttentionImpl(AttentionImpl):
Returns: Returns:
shape = [num_tokens, num_heads * head_size] shape = [num_tokens, num_heads * head_size]
""" """
assert output is not None, "Output tensor must be provided."
if output_block_scale is not None: if output_block_scale is not None:
raise NotImplementedError( raise NotImplementedError(
"fused block_scale output quantization is not yet supported" "fused block_scale output quantization is not yet supported"
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment