Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
AutoAWQ
Commits
40e6952a
Commit
40e6952a
authored
Oct 05, 2023
by
Casper Hansen
Browse files
Remove past_key_value (save 2GB VRAM)
parent
eccb8f9c
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
2 additions
and
6 deletions
+2
-6
awq/modules/fused/attn.py
awq/modules/fused/attn.py
+2
-6
No files found.
awq/modules/fused/attn.py
View file @
40e6952a
...
...
@@ -206,7 +206,6 @@ class QuantAttentionFused(nn.Module):
keys
=
torch
.
repeat_interleave
(
keys
,
dim
=
2
,
repeats
=
self
.
n_kv_groups
)
values
=
torch
.
repeat_interleave
(
values
,
dim
=
2
,
repeats
=
self
.
n_kv_groups
)
past_key_value
=
(
xk
,
xv
)
if
use_cache
else
None
xq
=
xq
.
transpose
(
1
,
2
)
keys
=
keys
.
transpose
(
1
,
2
)
values
=
values
.
transpose
(
1
,
2
)
...
...
@@ -222,14 +221,10 @@ class QuantAttentionFused(nn.Module):
output
=
torch
.
matmul
(
scores
,
values
)
# (bs, n_local_heads, slen, head_dim)
attention_weight
=
output
.
transpose
(
1
,
2
).
contiguous
().
view
(
bsz
,
seqlen
,
-
1
)
else
:
# xq = xq[:, 0, :, :]
# xk = xk[:, 0, :, :]
# xv = xv[:, 0, :, :]
xq
=
xq
.
view
((
bsz
,)
+
self
.
attention_shapes
[
"single_xq_view"
])
xk
=
xk
.
view
((
bsz
,)
+
self
.
attention_shapes
[
"single_xk_view"
])
xv
=
xv
.
view
((
bsz
,)
+
self
.
attention_shapes
[
"single_xv_view"
])
past_key_value
=
(
xk
,
xv
)
if
use_cache
else
None
attention_weight
=
ft_inference_engine
.
single_query_attention
(
xq
,
# query
xk
,
# key
...
...
@@ -252,4 +247,5 @@ class QuantAttentionFused(nn.Module):
else
:
self
.
start_pos
=
0
return
attn_output
,
attention_weight
,
past_key_value
# past_key_value is replaced with cache_v, cache_k, returning None
return
attn_output
,
attention_weight
,
None
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment