Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
2e0e0176
Unverified
Commit
2e0e0176
authored
Jan 14, 2025
by
wangxiyuan
Committed by
GitHub
Jan 14, 2025
Browse files
[Platform] Add output for Attention Backend (#11981)
Signed-off-by:
wangxiyuan
<
wangxiyuan1007@gmail.com
>
parent
1f18adb2
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
9 additions
and
5 deletions
+9
-5
vllm/attention/backends/abstract.py
vllm/attention/backends/abstract.py
+4
-0
vllm/attention/backends/flash_attn.py
vllm/attention/backends/flash_attn.py
+2
-0
vllm/attention/layer.py
vllm/attention/layer.py
+1
-5
vllm/v1/attention/backends/flash_attn.py
vllm/v1/attention/backends/flash_attn.py
+2
-0
No files found.
vllm/attention/backends/abstract.py
View file @
2e0e0176
...
...
@@ -31,6 +31,10 @@ class AttentionType:
class
AttentionBackend
(
ABC
):
"""Abstract class for attention backends."""
# For some attention backends, we allocate an output tensor before
# calling the custom op. When piecewise cudagraph is enabled, this
# makes sure the output tensor is allocated inside the cudagraph.
accept_output_buffer
:
bool
=
False
@
staticmethod
@
abstractmethod
...
...
vllm/attention/backends/flash_attn.py
View file @
2e0e0176
...
...
@@ -29,6 +29,8 @@ from vllm.vllm_flash_attn import (flash_attn_varlen_func,
class
FlashAttentionBackend
(
AttentionBackend
):
accept_output_buffer
:
bool
=
True
@
staticmethod
def
get_supported_head_sizes
()
->
List
[
int
]:
return
[
32
,
64
,
96
,
128
,
160
,
192
,
224
,
256
]
...
...
vllm/attention/layer.py
View file @
2e0e0176
...
...
@@ -110,11 +110,7 @@ class Attention(nn.Module):
self
.
use_direct_call
=
not
current_platform
.
is_cuda_alike
(
)
and
not
current_platform
.
is_cpu
()
# For some attention backends, we allocate an output tensor before
# calling the custom op. When piecewise cudagraph is enabled, this
# makes sure the output tensor is allocated inside the cudagraph.
self
.
use_output
=
self
.
backend
==
_Backend
.
FLASH_ATTN
or
\
self
.
backend
==
_Backend
.
FLASH_ATTN_VLLM_V1
self
.
use_output
=
attn_backend
.
accept_output_buffer
compilation_config
=
get_current_vllm_config
().
compilation_config
if
prefix
in
compilation_config
.
static_forward_context
:
raise
ValueError
(
f
"Duplicate layer name:
{
prefix
}
"
)
...
...
vllm/v1/attention/backends/flash_attn.py
View file @
2e0e0176
...
...
@@ -15,6 +15,8 @@ from vllm.vllm_flash_attn import flash_attn_varlen_func
class
FlashAttentionBackend
(
AttentionBackend
):
accept_output_buffer
:
bool
=
True
@
staticmethod
def
get_supported_head_sizes
()
->
List
[
int
]:
return
[
32
,
64
,
96
,
128
,
160
,
192
,
224
,
256
]
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment