vllm · Commit 75471386 (unverified)
Authored Aug 30, 2023 by Aman Gupta Karmani; committed via GitHub Aug 29, 2023.

use flash-attn via xformers (#877)
Parent: d2b2eed6
Showing 2 changed files with 0 additions and 5 deletions (+0 -5):
  tests/kernels/test_attention.py (+0 -2)
  vllm/model_executor/layers/attention.py (+0 -3)
tests/kernels/test_attention.py
@@ -266,7 +266,6 @@ def run_multi_query_kv_attention(
     qkv.uniform_(-1e-3, 1e-3)
     query, key, value = qkv.unbind(dim=1)
-    attn_op = xops.fmha.cutlass.FwOp()
     attn_bias = BlockDiagonalCausalMask.from_seqlens(seq_lens)
     output = xops.memory_efficient_attention_forward(
         query.unsqueeze(0),
@@ -275,7 +274,6 @@ def run_multi_query_kv_attention(
         attn_bias=attn_bias,
         p=0.0,
         scale=scale,
-        op=attn_op,
     )
     output = output.squeeze(0)
vllm/model_executor/layers/attention.py
@@ -61,7 +61,6 @@ class PagedAttention(nn.Module):
         self.num_heads = num_heads
         self.head_size = head_size
         self.scale = float(scale)
-        self.attn_op = xops.fmha.cutlass.FwOp()
         self.num_kv_heads = num_heads if num_kv_heads is None else num_kv_heads
         assert self.num_heads % self.num_kv_heads == 0
@@ -115,7 +114,6 @@ class PagedAttention(nn.Module):
             attn_bias=input_metadata.attn_bias[0],
             p=0.0,
             scale=self.scale,
-            op=self.attn_op,
         )
         # TODO(woosuk): Unnecessary copy. Optimize.
         output.copy_(out.squeeze(0))
@@ -404,7 +402,6 @@ class PagedAttentionWithALiBi(PagedAttention):
                 attn_bias=input_metadata.attn_bias[i],
                 p=0.0,
                 scale=self.scale,
-                op=self.attn_op,
             )
             # TODO(woosuk): Unnecessary copy. Optimize.
             output[start:end].copy_(out.squeeze(0))
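Taken together, the diff only removes the hard-coded op=xops.fmha.cutlass.FwOp() argument (and the self.attn_op attribute that carried it), so the xformers operator dispatcher is free to pick the fastest available forward kernel, including the flash-attention backend, for each call. Below is a minimal sketch of the resulting call pattern, assuming a CUDA device and fp16 inputs; the sizes and sequence lengths are illustrative, not taken from the commit.

import torch
import xformers.ops as xops
from xformers.ops.fmha.attn_bias import BlockDiagonalCausalMask

# Hypothetical sizes, for illustration only.
seq_lens = [7, 5]
num_heads, head_size = 4, 64
total_tokens = sum(seq_lens)
scale = head_size ** -0.5

# Packed QKV for all tokens of all sequences, as in the test.
qkv = torch.empty(total_tokens, 3, num_heads, head_size,
                  dtype=torch.float16, device="cuda")
qkv.uniform_(-1e-3, 1e-3)
query, key, value = qkv.unbind(dim=1)

# Block-diagonal causal mask keeps the packed sequences separate.
attn_bias = BlockDiagonalCausalMask.from_seqlens(seq_lens)

# No explicit op=... argument: xformers selects the backend
# (e.g. flash-attention, when available) on its own.
output = xops.memory_efficient_attention_forward(
    query.unsqueeze(0),
    key.unsqueeze(0),
    value.unsqueeze(0),
    attn_bias=attn_bias,
    p=0.0,
    scale=scale,
)
output = output.squeeze(0)  # back to (total_tokens, num_heads, head_size)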