norm / vllm · Commits · e7bee2aa
"git@developer.sourcefind.cn:OpenDAS/torch-harmonics.git" did not exist on "e4879676d187bae8452aae7aae3e54f3de1ad8e3"
Commit e7bee2aa · authored Feb 09, 2023 by Woosuk Kwon

Add cache engine

Parent: aa78aeaa
Changes: 1 changed file with 109 additions and 0 deletions (+109 −0)
cacheflow/worker/cache_engine.py (new file, mode 0 → 100644, +109 −0)
```python
from typing import List, Tuple

import torch

KVCache = Tuple[torch.Tensor, torch.Tensor]


class CacheEngine:

    def __init__(
        self,
        worker_id: int,
        gpu_id: int,
        num_layers: int,
        num_heads: int,
        head_size: int,
        num_gpu_blocks: int,
        num_cpu_blocks: int,
        block_size: int,
        dtype: torch.dtype = torch.float16,
    ) -> None:
        self.worker_id = worker_id
        self.gpu_id = gpu_id
        self.num_layers = num_layers
        self.num_heads = num_heads
        self.head_size = head_size
        self.num_gpu_blocks = num_gpu_blocks
        self.num_cpu_blocks = num_cpu_blocks
        self.block_size = block_size
        self.dtype = dtype

        # Initialize the cache.
        self.gpu_cache = self.allocate_gpu_cache()
        self.cpu_cache = self.allocate_cpu_cache()

        # Initialize the streams.
        self.copy_stream = torch.cuda.Stream(device=gpu_id)
        self.swap_stream = torch.cuda.Stream(device=gpu_id)
        assert self.copy_stream != self.swap_stream
        current_stream = torch.cuda.current_stream(device=gpu_id)
        assert self.copy_stream != current_stream
        assert self.swap_stream != current_stream

        # Initialize the events for synchronization.
```
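The constructor eagerly allocates both caches and creates two dedicated CUDA streams; the asserts guarantee that copies and swaps are issued on streams distinct from the compute (current) stream, so they can overlap model execution. A minimal illustration of that overlap pattern, separate from the commit and assuming a CUDA device:

```python
import torch

# Illustration only: a host-to-device copy issued on a side stream can
# overlap kernels running on the default stream.
side = torch.cuda.Stream(device=0)
src = torch.empty(1 << 20, dtype=torch.float16, pin_memory=True)
dst = torch.empty_like(src, device="cuda:0")
with torch.cuda.stream(side):
    dst.copy_(src, non_blocking=True)  # enqueued on `side`, not the default stream
# Make the compute stream wait before it may consume `dst`.
torch.cuda.current_stream(device=0).wait_stream(side)
```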
```python
    def allocate_gpu_cache(self) -> List[List[KVCache]]:
        gpu_cache: List[List[KVCache]] = []
        for _ in range(self.num_layers):
            layer_cache: List[KVCache] = []
            for _ in range(self.num_heads):
                key_blocks = torch.empty(
                    (self.num_gpu_blocks, self.block_size * self.head_size),
                    dtype=self.dtype,
                    device=self.gpu_id,
                )
                value_blocks = torch.empty(
                    (self.num_gpu_blocks, self.block_size * self.head_size),
                    dtype=self.dtype,
                    device=self.gpu_id,
                )
                layer_cache.append((key_blocks, value_blocks))
            gpu_cache.append(layer_cache)
        return gpu_cache

    def allocate_cpu_cache(self) -> List[List[KVCache]]:
        cpu_cache: List[List[KVCache]] = []
        for _ in range(self.num_layers):
            layer_cache: List[KVCache] = []
            for _ in range(self.num_heads):
                key_blocks = torch.empty(
                    (self.num_cpu_blocks, self.block_size * self.head_size),
                    dtype=self.dtype,
                    pin_memory=True,
                )
                value_blocks = torch.empty(
                    (self.num_cpu_blocks, self.block_size * self.head_size),
                    dtype=self.dtype,
                    pin_memory=True,
                )
                layer_cache.append((key_blocks, value_blocks))
            cpu_cache.append(layer_cache)
        return cpu_cache
```
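Each cache tensor has shape (num_blocks, block_size * head_size), with one key tensor and one value tensor per head per layer, so a cache holds 2 * num_layers * num_heads * num_blocks * block_size * head_size elements. A quick footprint check with made-up sizes (illustrative, not part of the commit):

```python
import torch

# Hypothetical model and cache sizes, chosen only for illustration.
num_layers, num_heads, head_size = 32, 32, 128
num_gpu_blocks, block_size = 1024, 8
dtype = torch.float16

# Two tensors (key and value) per head per layer, each of shape
# (num_gpu_blocks, block_size * head_size).
elems = 2 * num_layers * num_heads * num_gpu_blocks * block_size * head_size
gpu_cache_bytes = elems * torch.finfo(dtype).bits // 8
print(f"GPU cache: {gpu_cache_bytes / 2**30:.2f} GiB")  # -> 4.00 GiB
```

The rest of the file stubs out the block-level cache operations: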
```python
    def copy(
        self,
        src_block_numbers: List[int],
        dst_block_numbers: List[int],
    ) -> None:
        for layer in range(self.num_layers):
            # TODO: Call the COPY op.
            pass

    def swap_out(
        self,
        gpu_block_numbers: List[int],
        cpu_block_numbers: List[int],
    ) -> None:
        for layer in range(self.num_layers):
            # TODO: Call the SWAP_OUT op on the swap stream.
            pass

    def swap_in(
        self,
        gpu_block_numbers: List[int],
        cpu_block_numbers: List[int],
    ) -> None:
        for layer in range(self.num_layers):
            # TODO: Call the SWAP_IN op on the swap stream.
            pass
```
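The copy, swap_out, and swap_in bodies are left as TODOs in this commit. As a sketch only, under the assumption that block numbers index rows of the (num_blocks, block_size * head_size) tensors allocated above, swap_out could be implemented with asynchronous device-to-host copies on swap_stream; everything below beyond the class's own attributes is hypothetical, not the commit's eventual implementation:

```python
from typing import List

import torch

def swap_out_sketch(engine: "CacheEngine",
                    gpu_block_numbers: List[int],
                    cpu_block_numbers: List[int]) -> None:
    # Hypothetical stand-in for the TODO in CacheEngine.swap_out.
    with torch.cuda.stream(engine.swap_stream):
        for layer in range(engine.num_layers):
            for head in range(engine.num_heads):
                gpu_key, gpu_value = engine.gpu_cache[layer][head]
                cpu_key, cpu_value = engine.cpu_cache[layer][head]
                for src, dst in zip(gpu_block_numbers, cpu_block_numbers):
                    # Async GPU-row -> pinned-CPU-row copies on swap_stream.
                    cpu_key[dst].copy_(gpu_key[src], non_blocking=True)
                    cpu_value[dst].copy_(gpu_value[src], non_blocking=True)
    # Before the host reads the CPU cache it must wait for the stream:
    # engine.swap_stream.synchronize()
```

swap_in would be the mirror image (pinned CPU rows copied back into GPU rows), and copy would duplicate rows within the GPU cache, presumably via the COPY op referenced in the TODO.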