Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
49a3c866
Unverified
Commit
49a3c866
authored
Mar 13, 2024
by
Breno Faria
Committed by
GitHub
Mar 13, 2024
Browse files
Fixes #1556 double free (#3347)
parent
b0925b38
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
102 additions
and
2 deletions
+102
-2
tests/core/test_block_manager.py
tests/core/test_block_manager.py
+87
-0
vllm/core/block_manager.py
vllm/core/block_manager.py
+15
-2
No files found.
tests/core/test_block_manager.py
View file @
49a3c866
...
@@ -274,3 +274,90 @@ def test_reset():
...
@@ -274,3 +274,90 @@ def test_reset():
# Resetting block manager frees all allocated blocks.
# Resetting block manager frees all allocated blocks.
block_manager
.
reset
()
block_manager
.
reset
()
assert
block_manager
.
get_num_free_gpu_blocks
()
==
original_blocks
assert
block_manager
.
get_num_free_gpu_blocks
()
==
original_blocks
def
test_sliding_window_multi_seq
():
"""
Tests that memory allocation and deallocation is handled
correctly with multiple sequences that exceed the sliding
window's capacity.
"""
block_size
=
1
num_cpu_blocks
=
8
num_gpu_blocks
=
8
sliding_window
=
2
block_manager
=
BlockSpaceManager
(
block_size
,
num_cpu_blocks
,
num_gpu_blocks
,
sliding_window
=
sliding_window
,
watermark
=
0
)
assert
block_manager
.
get_num_free_gpu_blocks
()
==
num_gpu_blocks
parent
=
Sequence
(
1
,
"one two three"
,
[
0
,
1
,
2
],
block_size
)
seq_group
=
SequenceGroup
(
"1"
,
[
parent
],
SamplingParams
(),
time
.
time
(),
None
)
block_manager
.
allocate
(
seq_group
)
# assert the number of blocks allocated is correct
# the parent seq has len 3, but since sliding_window is 2,
# we will use at most 2 blocks
assert
block_manager
.
get_num_free_gpu_blocks
(
)
==
num_gpu_blocks
-
sliding_window
# Fork prompt and copy block tables.
child
=
parent
.
fork
(
2
)
block_manager
.
fork
(
parent
,
child
)
# assert the number of blocks allocated is correct
# forking does not increase memory consumption
assert
block_manager
.
get_num_free_gpu_blocks
(
)
==
num_gpu_blocks
-
sliding_window
# assert both parent and child share all blocks
assert
block_manager
.
get_block_table
(
parent
)
==
block_manager
.
get_block_table
(
child
)
token_id
=
4
# Append token to child. Block is shared so copy on write occurs.
child
.
append_token_id
(
token_id
,
{
token_id
:
Logprob
(
0.0
)})
block_manager
.
append_slot
(
child
)
# assert the number of blocks allocated is correct
# we will use now one block more. Each seq will use 2 blocks,
# but only one can be shared
assert
block_manager
.
get_num_free_gpu_blocks
(
)
==
num_gpu_blocks
-
sliding_window
-
1
token_id
=
5
parent
.
append_token_id
(
token_id
,
{
token_id
:
Logprob
(
0.0
)})
block_manager
.
append_slot
(
parent
)
# assert the number of blocks allocated is correct
# no change, because both sequences are still just sharing one block
assert
block_manager
.
get_num_free_gpu_blocks
(
)
==
num_gpu_blocks
-
sliding_window
-
1
block_table_parent
=
block_manager
.
get_block_table
(
parent
)
block_table_child
=
block_manager
.
get_block_table
(
child
)
assert
block_table_parent
!=
block_table_child
# assert both blocks are sharing the second-last block
assert
block_table_parent
[
-
2
]
==
block_table_child
[
-
2
]
# now let's clean up...
block_manager
.
free
(
parent
)
# assert the number of blocks allocated is correct
# We have freed one seq, reducing the ref count of two blocks by one.
# One of the two was only used by the parent seq, so this is now free.
# The child seq still consumes sliding_window blocks
assert
block_manager
.
get_num_free_gpu_blocks
(
)
==
num_gpu_blocks
-
sliding_window
# free all blocks
block_manager
.
free
(
child
)
# assert all blocks are free now
assert
block_manager
.
get_num_free_gpu_blocks
()
==
num_gpu_blocks
vllm/core/block_manager.py
View file @
49a3c866
...
@@ -312,7 +312,12 @@ class BlockSpaceManager:
...
@@ -312,7 +312,12 @@ class BlockSpaceManager:
# Thus, it is always safe from OOM.
# Thus, it is always safe from OOM.
src_block_table
=
self
.
block_tables
[
parent_seq
.
seq_id
]
src_block_table
=
self
.
block_tables
[
parent_seq
.
seq_id
]
self
.
block_tables
[
child_seq
.
seq_id
]
=
src_block_table
.
copy
()
self
.
block_tables
[
child_seq
.
seq_id
]
=
src_block_table
.
copy
()
for
block
in
src_block_table
:
# When using a sliding window, blocks will be eventually reused.
# In this case the block tables will contain repeated blocks.
# When forking, we must make sure that each block's `ref_count`
# is only incremented by one, so we deduplicate them by wrapping
# them in a set.
for
block
in
set
(
src_block_table
):
block
.
ref_count
+=
1
block
.
ref_count
+=
1
def
_get_physical_blocks
(
def
_get_physical_blocks
(
...
@@ -393,7 +398,15 @@ class BlockSpaceManager:
...
@@ -393,7 +398,15 @@ class BlockSpaceManager:
return
block_number_mapping
return
block_number_mapping
def
_free_block_table
(
self
,
block_table
:
BlockTable
)
->
None
:
def
_free_block_table
(
self
,
block_table
:
BlockTable
)
->
None
:
for
block
in
set
(
block_table
):
# when using a sliding window, each seq will only use up
# to `self.block_sliding_window` blocks. When freeing
# the block table, we must make sure to not free blocks more
# than once. If no sliding window is used, there is no block
# reuse in the block table, so we must free all blocks.
blocks_to_free
=
(
block_table
[
-
self
.
block_sliding_window
:]
if
self
.
block_sliding_window
is
not
None
else
block_table
)
for
block
in
set
(
blocks_to_free
):
if
block
.
device
==
Device
.
GPU
:
if
block
.
device
==
Device
.
GPU
:
self
.
gpu_allocator
.
free
(
block
)
self
.
gpu_allocator
.
free
(
block
)
else
:
else
:
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment