Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
f0ef3723
Unverified
Commit
f0ef3723
authored
Jan 22, 2025
by
Cody Yu
Committed by
GitHub
Jan 23, 2025
Browse files
[V1] Add `uncache_blocks` (#12333)
parent
7551a340
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
61 additions
and
2 deletions
+61
-2
tests/v1/core/test_prefix_caching.py
tests/v1/core/test_prefix_caching.py
+30
-0
vllm/v1/core/kv_cache_manager.py
vllm/v1/core/kv_cache_manager.py
+31
-2
No files found.
tests/v1/core/test_prefix_caching.py
View file @
f0ef3723
...
@@ -626,3 +626,33 @@ def test_reset_prefix_cache():
...
@@ -626,3 +626,33 @@ def test_reset_prefix_cache():
assert
manager
.
reset_prefix_cache
()
assert
manager
.
reset_prefix_cache
()
assert
not
manager
.
cached_block_hash_to_block
assert
not
manager
.
cached_block_hash_to_block
assert
all
([
blk
.
block_hash
is
None
for
blk
in
manager
.
block_pool
])
assert
all
([
blk
.
block_hash
is
None
for
blk
in
manager
.
block_pool
])
def
test_uncache_blocks
():
manager
=
KVCacheManager
(
block_size
=
16
,
num_gpu_blocks
=
10
,
max_model_len
=
8192
,
sliding_window
=
None
,
enable_caching
=
True
,
num_preallocate_tokens
=
0
,
)
req0
=
make_request
(
"0"
,
list
(
range
(
30
)))
blocks
=
manager
.
allocate_slots
(
req0
,
30
,
[])
assert
[
b
.
block_id
for
b
in
blocks
]
==
[
0
,
1
]
assert
len
(
manager
.
cached_block_hash_to_block
)
==
1
req0
.
num_computed_tokens
=
30
# Simulate speculative tokens.
for
_
in
range
(
5
):
req0
.
append_output_token_ids
(
8
)
manager
.
append_slots
(
req0
,
5
)
assert
len
(
manager
.
cached_block_hash_to_block
)
==
2
# After sampling, assuming only 1 token is accepted.
req0
.
num_computed_tokens
=
31
num_uncached_blocks
=
manager
.
uncache_blocks
(
req0
)
assert
num_uncached_blocks
==
1
assert
len
(
manager
.
cached_block_hash_to_block
)
==
1
vllm/v1/core/kv_cache_manager.py
View file @
f0ef3723
...
@@ -285,6 +285,29 @@ class KVCacheManager:
...
@@ -285,6 +285,29 @@ class KVCacheManager:
if
block
.
ref_cnt
==
0
:
if
block
.
ref_cnt
==
0
:
self
.
free_block_queue
.
append
(
block
)
self
.
free_block_queue
.
append
(
block
)
def
uncache_blocks
(
self
,
request
:
Request
)
->
int
:
"""Uncache the blocks that are no longer full based on the
num_computed_tokens in the given request. This happens when
the blocks were full and cached due to speculative tokens, but the
speculative tokens are not accepted.
Args:
request: The request.
Returns:
The number of uncached blocks.
"""
blocks
=
self
.
req_to_blocks
[
request
.
request_id
]
num_computed_tokens
=
request
.
num_computed_tokens
num_full_blocks
=
num_computed_tokens
//
self
.
block_size
num_uncached_blocks
=
0
for
block
in
blocks
[
num_full_blocks
:]:
# If the block is not cached, the following blocks are not cached.
if
not
self
.
_maybe_evict_cached_block
(
block
):
break
num_uncached_blocks
+=
1
return
num_uncached_blocks
def
reset_prefix_cache
(
self
)
->
bool
:
def
reset_prefix_cache
(
self
)
->
bool
:
"""Reset prefix cache. This function may be used in RLHF
"""Reset prefix cache. This function may be used in RLHF
flows to invalid prefix caching after the weights are updated,
flows to invalid prefix caching after the weights are updated,
...
@@ -386,7 +409,7 @@ class KVCacheManager:
...
@@ -386,7 +409,7 @@ class KVCacheManager:
# If the block is cached, evict it.
# If the block is cached, evict it.
if
self
.
enable_caching
:
if
self
.
enable_caching
:
self
.
_evict_cached_block
(
curr_block
)
self
.
_
maybe_
evict_cached_block
(
curr_block
)
curr_block
.
incr_ref
()
curr_block
.
incr_ref
()
ret
.
append
(
curr_block
)
ret
.
append
(
curr_block
)
...
@@ -394,13 +417,16 @@ class KVCacheManager:
...
@@ -394,13 +417,16 @@ class KVCacheManager:
return
ret
return
ret
def
_evict_cached_block
(
self
,
block
:
KVCacheBlock
)
->
None
:
def
_maybe
_evict_cached_block
(
self
,
block
:
KVCacheBlock
)
->
bool
:
"""
"""
If a block is cached in `cached_block_hash_to_block`, we reset its hash
If a block is cached in `cached_block_hash_to_block`, we reset its hash
metadata and evict it from the cache.
metadata and evict it from the cache.
Args:
Args:
block: The block to evict.
block: The block to evict.
Returns:
True if the block is evicted, False otherwise.
"""
"""
block_hash
=
block
.
block_hash
block_hash
=
block
.
block_hash
if
block_hash
and
block_hash
in
self
.
cached_block_hash_to_block
:
if
block_hash
and
block_hash
in
self
.
cached_block_hash_to_block
:
...
@@ -410,6 +436,9 @@ class KVCacheManager:
...
@@ -410,6 +436,9 @@ class KVCacheManager:
if
len
(
self
.
cached_block_hash_to_block
[
block_hash
])
==
0
:
if
len
(
self
.
cached_block_hash_to_block
[
block_hash
])
==
0
:
del
self
.
cached_block_hash_to_block
[
block_hash
]
del
self
.
cached_block_hash_to_block
[
block_hash
]
return
True
return
False
def
_get_cached_block
(
self
,
def
_get_cached_block
(
self
,
block_hash
:
BlockHashType
)
->
Optional
[
KVCacheBlock
]:
block_hash
:
BlockHashType
)
->
Optional
[
KVCacheBlock
]:
"""Get a cached block by the block hash, or None if cache miss.
"""Get a cached block by the block hash, or None if cache miss.
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment