Commit aa78aeaa, authored Feb 09, 2023 by Woosuk Kwon
Add block manager
Parent: 3c2b47fc

Showing 1 changed file with 204 additions and 0 deletions.

cacheflow/master/block_manager.py (new file, mode 100644): +204, -0
from typing import Dict, Iterable, List, Optional, Set, Tuple

from cacheflow.block import PhysicalTokenBlock
from cacheflow.sequence import Sequence
from cacheflow.sequence import SequenceGroup
from cacheflow.sequence import SequenceStatus
from cacheflow.utils import Device


class BlockAllocator:

    def __init__(
        self,
        device: Device,
        block_size: int,
        num_blocks: int,
    ) -> None:
        assert block_size in [8, 16, 32]
        self.device = device
        self.block_size = block_size
        self.num_blocks = num_blocks

        # Initialize the free blocks.
        # TODO(woosuk): Make this a priority queue.
        self.free_blocks = [
            PhysicalTokenBlock(device=device, block_number=i, block_size=block_size)
            for i in range(num_blocks)
        ]

    def allocate(self) -> PhysicalTokenBlock:
        if not self.free_blocks:
            raise ValueError(
                'Out of memory! No more free blocks are available.')
        block = self.free_blocks.pop()
        block.ref_count = 1
        return block

    def free(self, block: PhysicalTokenBlock) -> None:
        if block.ref_count == 0:
            raise ValueError(
                f'Double free! The block {block} is already freed.')
        block.ref_count -= 1
        if block.ref_count == 0:
            self.free_blocks.append(block)

    def get_num_free_blocks(self) -> int:
        return len(self.free_blocks)
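
# Ref-count protocol used by BlockAllocator: allocate() hands out a block with
# ref_count set to 1; code that shares a block across sequences (see
# BlockSpaceManager.allocate() and fork() below) increments ref_count itself,
# and free() only returns a block to the free list once ref_count reaches 0.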


# Mapping: logical block number -> physical block.
BlockTable = List[PhysicalTokenBlock]


class BlockSpaceManager:

    def __init__(
        self,
        block_size: int,
        num_gpu_blocks: int,
        num_cpu_blocks: int,
    ) -> None:
        self.block_size = block_size
        self.num_total_gpu_blocks = num_gpu_blocks
        self.num_total_cpu_blocks = num_cpu_blocks
        self.gpu_allocator = BlockAllocator(Device.GPU, block_size, num_gpu_blocks)
        self.cpu_allocator = BlockAllocator(Device.CPU, block_size, num_cpu_blocks)
        # Mapping: seq_id -> BlockTable.
        self.block_tables: Dict[int, BlockTable] = {}

    def can_allocate(self, seq_group: SequenceGroup) -> bool:
        seq = seq_group.seqs[0]
        num_required_blocks = len(seq.logical_token_blocks)
        num_free_gpu_blocks = self.gpu_allocator.get_num_free_blocks()
        return num_required_blocks <= num_free_gpu_blocks

    def allocate(self, seq_group: SequenceGroup) -> None:
        # Here, we assume that all sequences in the group have the same prompt.
        seq = seq_group.seqs[0]

        # Allocate new physical token blocks that will store the prompt tokens.
        block_table: BlockTable = []
        for _ in range(len(seq.logical_token_blocks)):
            block = self.gpu_allocator.allocate()
            # Set the reference counts of the token blocks.
            block.ref_count = seq_group.num_seqs()
            block_table.append(block)

        # Assign the block table for each sequence.
        for seq in seq_group.seqs:
            self.block_tables[seq.seq_id] = block_table.copy()

    def can_append(self, seq_group: SequenceGroup) -> bool:
        # Simple heuristic: If there is at least one free block
        # for each sequence, we can append.
        num_free_gpu_blocks = self.gpu_allocator.get_num_free_blocks()
        num_seqs = seq_group.num_seqs(status=SequenceStatus.SERVING)
        return num_seqs <= num_free_gpu_blocks

    def append(self, seq: Sequence) -> Optional[Tuple[int, int]]:
        """Allocate a physical slot for the new token."""
        logical_blocks = seq.logical_token_blocks
        block_table = self.block_tables[seq.seq_id]

        if len(block_table) < len(logical_blocks):
            # The sequence has a new logical block.
            # Allocate a new physical block.
            block = self.gpu_allocator.allocate()
            block_table.append(block)
            return None

        # We want to append the token to the last physical block.
        last_block = block_table[-1]
        assert last_block.device == Device.GPU
        if last_block.ref_count == 1:
            # Append.
            return None
        else:
            # The last block is shared with other sequences.
            # Copy on Write: Allocate a new block and copy the tokens.
            block = self.gpu_allocator.allocate()
            # Replace the shared last block with the new one so that the block
            # table keeps exactly one physical block per logical block.
            block_table[-1] = block
            self.gpu_allocator.free(last_block)
            return last_block.block_number, block.block_number
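
    # How a caller is expected to use append()'s return value (a sketch; the
    # actual caller lives outside this file):
    #
    #     ret = block_manager.append(seq)
    #     if ret is not None:
    #         src, dst = ret
    #         # The last block was shared, so its contents must be copied from
    #         # physical block `src` to physical block `dst` before writing.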

    def fork(self, src_seq: Sequence, child_seq: Sequence) -> None:
        # NOTE: fork does not allocate a new physical block.
        # Thus, it is always safe from OOM.
        src_block_table = self.block_tables[src_seq.seq_id]
        self.block_tables[child_seq.seq_id] = src_block_table.copy()
        for block in src_block_table:
            block.ref_count += 1

    def _get_physical_blocks(
            self, seq_group: SequenceGroup) -> List[PhysicalTokenBlock]:
        # NOTE: Here, we assume that the physical blocks are only shared by
        # the sequences in the same group.
        blocks: Set[PhysicalTokenBlock] = set()
        for seq in seq_group.seqs:
            if seq.status == SequenceStatus.FINISHED:
                continue
            block_table = self.block_tables[seq.seq_id]
            for block in block_table:
                blocks.add(block)
        return list(blocks)

    def can_swap_in(self, seq_group: SequenceGroup) -> bool:
        blocks = self._get_physical_blocks(seq_group)
        return len(blocks) <= self.gpu_allocator.get_num_free_blocks()

    def swap_in(self, seq_group: SequenceGroup) -> Dict[int, int]:
        # src_block_number -> dst_block_number
        mapping: Dict[int, int] = {}
        for seq in seq_group.seqs:
            if seq.status == SequenceStatus.FINISHED:
                continue
            block_table = self.block_tables[seq.seq_id]
            for cpu_block in block_table:
                if cpu_block.block_number in mapping:
                    continue
                gpu_block = self.gpu_allocator.allocate()
                mapping[cpu_block.block_number] = gpu_block.block_number
                # Free the CPU block swapped in to GPU.
                self.cpu_allocator.free(cpu_block)
        return mapping

    def can_swap_out(self, seq_group: SequenceGroup) -> bool:
        blocks = self._get_physical_blocks(seq_group)
        return len(blocks) <= self.cpu_allocator.get_num_free_blocks()

    def swap_out(self, seq_group: SequenceGroup) -> Dict[int, int]:
        # src_block_number -> dst_block_number
        mapping: Dict[int, int] = {}
        for seq in seq_group.seqs:
            if seq.status == SequenceStatus.FINISHED:
                continue
            block_table = self.block_tables[seq.seq_id]
            for gpu_block in block_table:
                if gpu_block.block_number in mapping:
                    continue
                cpu_block = self.cpu_allocator.allocate()
                mapping[gpu_block.block_number] = cpu_block.block_number
                # Free the GPU block swapped out to CPU.
                self.gpu_allocator.free(gpu_block)
        return mapping
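
    # The dictionaries returned by swap_in() and swap_out() map source block
    # numbers to destination block numbers; the caller (outside this file,
    # presumably the component that owns the KV cache memory) is expected to
    # perform the corresponding block copies between CPU and GPU.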

    def _free_blocks(self, blocks: Iterable[PhysicalTokenBlock]) -> None:
        for block in blocks:
            if block.device == Device.GPU:
                self.gpu_allocator.free(block)
            else:
                self.cpu_allocator.free(block)

    def free(self, seq: Sequence) -> None:
        block_table = self.block_tables[seq.seq_id]
        self._free_blocks(block_table)
        del self.block_tables[seq.seq_id]

    def reset(self) -> None:
        for block_table in self.block_tables.values():
            self._free_blocks(block_table)
        self.block_tables.clear()
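
The snippet below is a minimal usage sketch, not part of the commit. It assumes the cacheflow package from this tree is importable and exercises only BlockAllocator, whose interface is defined entirely in the file above.

    from cacheflow.master.block_manager import BlockAllocator
    from cacheflow.utils import Device

    allocator = BlockAllocator(Device.CPU, block_size=16, num_blocks=4)
    block = allocator.allocate()    # ref_count is set to 1
    block.ref_count += 1            # a second sequence starts sharing the block
    allocator.free(block)           # ref_count drops to 1; block stays in use
    allocator.free(block)           # ref_count drops to 0; block returns to the pool
    assert allocator.get_num_free_blocks() == 4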