Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
f0ef3723
Unverified
Commit
f0ef3723
authored
Jan 22, 2025
by
Cody Yu
Committed by
GitHub
Jan 23, 2025
Browse files
[V1] Add `uncache_blocks` (#12333)
parent
7551a340
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
61 additions
and
2 deletions
+61
-2
tests/v1/core/test_prefix_caching.py
tests/v1/core/test_prefix_caching.py
+30
-0
vllm/v1/core/kv_cache_manager.py
vllm/v1/core/kv_cache_manager.py
+31
-2
No files found.
tests/v1/core/test_prefix_caching.py
View file @
f0ef3723
...
...
@@ -626,3 +626,33 @@ def test_reset_prefix_cache():
assert
manager
.
reset_prefix_cache
()
assert
not
manager
.
cached_block_hash_to_block
assert
all
([
blk
.
block_hash
is
None
for
blk
in
manager
.
block_pool
])
def test_uncache_blocks():
    """Check that blocks cached for rejected speculative tokens get uncached.

    A 30-token request is allocated with a block size of 16, then 5
    speculative tokens are appended. After only one of them is accepted,
    `uncache_blocks` is expected to drop exactly one cache entry.
    """
    manager = KVCacheManager(
        block_size=16,
        num_gpu_blocks=10,
        max_model_len=8192,
        sliding_window=None,
        enable_caching=True,
        num_preallocate_tokens=0,
    )

    prompt_token_ids = list(range(30))
    req0 = make_request("0", prompt_token_ids)

    allocated = manager.allocate_slots(req0, 30, [])
    allocated_ids = [b.block_id for b in allocated]
    assert allocated_ids == [0, 1]
    assert len(manager.cached_block_hash_to_block) == 1

    req0.num_computed_tokens = 30

    # Simulate speculative tokens.
    for _ in range(5):
        req0.append_output_token_ids(8)
    manager.append_slots(req0, 5)
    assert len(manager.cached_block_hash_to_block) == 2

    # After sampling, assuming only 1 token is accepted.
    req0.num_computed_tokens = 31
    uncached = manager.uncache_blocks(req0)
    assert uncached == 1
    assert len(manager.cached_block_hash_to_block) == 1
vllm/v1/core/kv_cache_manager.py
View file @
f0ef3723
...
...
@@ -285,6 +285,29 @@ class KVCacheManager:
if
block
.
ref_cnt
==
0
:
self
.
free_block_queue
.
append
(
block
)
def uncache_blocks(self, request: Request) -> int:
    """Evict cached blocks of a request that are no longer full.

    Fullness is judged by the request's current `num_computed_tokens`.
    Blocks can end up in this state when they were filled, and therefore
    cached, by speculative tokens that were later not accepted.

    Args:
        request: The request.

    Returns:
        The number of uncached blocks.
    """
    request_blocks = self.req_to_blocks[request.request_id]
    # Index of the first block that the computed tokens do not fill.
    first_non_full = request.num_computed_tokens // self.block_size
    tail = request_blocks[first_non_full:]

    for evicted_so_far, candidate in enumerate(tail):
        # If the block is not cached, the following blocks are not cached
        # either, so the number evicted so far is the final answer.
        if not self._maybe_evict_cached_block(candidate):
            return evicted_so_far

    # Every block in the tail was cached and has been evicted.
    return len(tail)
def
reset_prefix_cache
(
self
)
->
bool
:
"""Reset prefix cache. This function may be used in RLHF
flows to invalid prefix caching after the weights are updated,
...
...
@@ -386,7 +409,7 @@ class KVCacheManager:
# If the block is cached, evict it.
if
self
.
enable_caching
:
self
.
_evict_cached_block
(
curr_block
)
self
.
_
maybe_
evict_cached_block
(
curr_block
)
curr_block
.
incr_ref
()
ret
.
append
(
curr_block
)
...
...
@@ -394,13 +417,16 @@ class KVCacheManager:
return
ret
def
_evict_cached_block
(
self
,
block
:
KVCacheBlock
)
->
None
:
def
_maybe
_evict_cached_block
(
self
,
block
:
KVCacheBlock
)
->
bool
:
"""
If a block is cached in `cached_block_hash_to_block`, we reset its hash
metadata and evict it from the cache.
Args:
block: The block to evict.
Returns:
True if the block is evicted, False otherwise.
"""
block_hash
=
block
.
block_hash
if
block_hash
and
block_hash
in
self
.
cached_block_hash_to_block
:
...
...
@@ -410,6 +436,9 @@ class KVCacheManager:
if
len
(
self
.
cached_block_hash_to_block
[
block_hash
])
==
0
:
del
self
.
cached_block_hash_to_block
[
block_hash
]
return
True
return
False
def
_get_cached_block
(
self
,
block_hash
:
BlockHashType
)
->
Optional
[
KVCacheBlock
]:
"""Get a cached block by the block hash, or None if cache miss.
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment