OpenDAS / AutoAWQ · Commits · 66b2e233

Commit 66b2e233, authored Oct 06, 2023 by Casper Hansen
Create cache for fused modules
Parent: c9e45270
Showing 1 changed file with 45 additions and 0 deletions.

awq/modules/fused/cache.py · new file (mode 100644) · +45 −0
```python
import torch


class WindowedCache:
    def __init__(self, cache_v_shape, cache_k_shape, device, attention_sinks=4):
        """
        The window size is the same as the max_new_tokens. The window will
        automatically roll once max_new_tokens is exceeded.
        """
        self.attention_sinks = attention_sinks
        # [batch_size, n_kv_heads, max_seq_len, head_dim]
        self.v = torch.zeros(cache_v_shape).to(device).half()
        # [batch_size, n_kv_heads, head_dim // pack_factor, max_seq_len, pack_factor]
        self.k = torch.zeros(cache_k_shape).to(device).half()
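    # get_kv gathers the cached values/keys for positions [0, start_pos + seqlen),
    # unpacks the keys back to head_dim along the last axis, and returns both in
    # [batch_size, seq, n_kv_heads, head_dim] layout.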
    def get_kv(self, batch_size, start_pos, seqlen, head_dim):
        xv = self.v[:batch_size, :, : start_pos + seqlen, :].transpose(1, 2).contiguous()
        xk = self.k[:batch_size, :, :, : start_pos + seqlen, :].transpose(2, 3).contiguous()
        xk = xk.reshape(xk.shape[:-2] + (head_dim,)).transpose(1, 2).contiguous()

        return xv, xk
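    # update_kv writes freshly computed values/keys into the cache slots
    # [start_pos, start_pos + seqlen) for the first batch_size sequences.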
    def update_kv(self, values_store, keys_store, batch_size, start_pos, seqlen):
        self.v[:batch_size, :, start_pos : start_pos + seqlen, :] = values_store
        self.k[:batch_size, :, :, start_pos : start_pos + seqlen, :] = keys_store
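    # roll_kv shifts the window left by roll_len positions along the sequence axis,
    # keeping the first `attention_sinks` entries pinned in place, and returns the
    # adjusted start position.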
    def roll_kv(self, roll_len, start_pos):
        """
        For example, with roll_len=3 and [A,B,C,D,E] we get [D,E,F,G,H]
        With sink=1, roll_len=3, and [A,B,C,D,E] we get [A,E,F,G,H]
        """
        # Roll only the necessary part of the cache to the left
        self.v[:, :, self.attention_sinks : -roll_len + self.attention_sinks, :] = self.v[:, :, roll_len:, :]
        self.k[:, :, :, self.attention_sinks : -roll_len + self.attention_sinks, :] = self.k[:, :, :, roll_len:, :]

        # Zero out the new part
        self.v[:, :, -roll_len:, :] = 0
        self.k[:, :, :, -roll_len:, :] = 0

        return start_pos - roll_len
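    # to moves both cache tensors to the given device.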
    def to(self, device):
        self.k = self.k.to(device)
        self.v = self.v.to(device)
```
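The commit itself only adds the cache; the fused attention modules that drive it are not part of this diff. The following is a minimal sketch of how the three methods could fit together during generation. All sizes (`batch_size`, `n_kv_heads`, `head_dim`, `pack_factor`, `max_seq_len`) are assumed placeholder values, not values taken from this commit, and in practice they would come from the model config.

```python
import torch

from awq.modules.fused.cache import WindowedCache

# Hypothetical sizes for illustration only.
batch_size, n_kv_heads, head_dim, pack_factor, max_seq_len = 1, 8, 128, 8, 512

cache = WindowedCache(
    cache_v_shape=(batch_size, n_kv_heads, max_seq_len, head_dim),
    cache_k_shape=(batch_size, n_kv_heads, head_dim // pack_factor, max_seq_len, pack_factor),
    device="cpu",
    attention_sinks=4,
)

start_pos, seqlen = 0, 16

# Write one step's keys/values in the packed layouts the cache expects.
values = torch.zeros(batch_size, n_kv_heads, seqlen, head_dim, dtype=torch.half)
keys = torch.zeros(
    batch_size, n_kv_heads, head_dim // pack_factor, seqlen, pack_factor, dtype=torch.half
)
cache.update_kv(values, keys, batch_size, start_pos, seqlen)

# Read back everything cached so far as [batch_size, seq, n_kv_heads, head_dim].
xv, xk = cache.get_kv(batch_size, start_pos, seqlen, head_dim)

# Once the window fills up, roll it left and continue from the adjusted position.
start_pos = cache.roll_kv(roll_len=seqlen, start_pos=start_pos + seqlen)
```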