Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
b9f1d429
Unverified
Commit
b9f1d429
authored
Mar 01, 2025
by
Chen Zhang
Committed by
GitHub
Mar 01, 2025
Browse files
[v1][Bugfix] Only cache blocks that are not in the prefix cache (#14073)
parent
b28246f6
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
9 additions
and
22 deletions
+9
-22
vllm/v1/core/block_pool.py
vllm/v1/core/block_pool.py
+4
-18
vllm/v1/core/kv_cache_manager.py
vllm/v1/core/kv_cache_manager.py
+5
-4
No files found.
vllm/v1/core/block_pool.py
View file @
b9f1d429
...
@@ -107,34 +107,20 @@ class BlockPool:
...
@@ -107,34 +107,20 @@ class BlockPool:
assert
prev_block
.
block_hash
is
not
None
assert
prev_block
.
block_hash
is
not
None
prev_block_hash_value
=
prev_block
.
block_hash
.
hash_value
prev_block_hash_value
=
prev_block
.
block_hash
.
hash_value
# Find the first uncached block.
for
i
,
blk
in
enumerate
(
new_full_blocks
):
# FIXME: num_cached_blocks should be corrected by the caller
# so this should never happen.
offset
=
0
for
blk
in
new_full_blocks
:
if
blk
.
block_hash
is
None
:
break
else
:
prev_block_hash_value
=
blk
.
block_hash
.
hash_value
offset
+=
1
else
:
# All blocks are cached.
return
for
i
,
blk
in
enumerate
(
new_full_blocks
[
offset
:]):
blk_idx
=
num_cached_blocks
+
offset
+
i
assert
blk
.
block_hash
is
None
assert
blk
.
block_hash
is
None
if
i
+
offset
<
len
(
new_block_hashes
):
if
i
<
len
(
new_block_hashes
):
# The block hash may already be computed in
# The block hash may already be computed in
# "get_computed_blocks" if the tokens are not generated by
# "get_computed_blocks" if the tokens are not generated by
# this request (either the prompt tokens or the previously
# this request (either the prompt tokens or the previously
# generated tokens with preemption). In this case we simply
# generated tokens with preemption). In this case we simply
# reuse the block hash.
# reuse the block hash.
block_hash
=
new_block_hashes
[
i
+
offset
]
block_hash
=
new_block_hashes
[
i
]
else
:
else
:
# Otherwise compute the block hash and cache it in the request
# Otherwise compute the block hash and cache it in the request
# in case it will be preempted in the future.
# in case it will be preempted in the future.
blk_idx
=
num_cached_blocks
+
i
start_token_idx
=
blk_idx
*
block_size
start_token_idx
=
blk_idx
*
block_size
end_token_idx
=
(
blk_idx
+
1
)
*
block_size
end_token_idx
=
(
blk_idx
+
1
)
*
block_size
block_tokens
=
request
.
all_token_ids
[
block_tokens
=
request
.
all_token_ids
[
...
...
vllm/v1/core/kv_cache_manager.py
View file @
b9f1d429
...
@@ -65,7 +65,7 @@ class KVCacheManager:
...
@@ -65,7 +65,7 @@ class KVCacheManager:
# This is used to track the number of cached blocks for each request.
# This is used to track the number of cached blocks for each request.
# This is only used to track the RUNNING requests, we do not track the
# This is only used to track the RUNNING requests, we do not track the
# data for preempted ones.
# data for preempted ones.
self
.
num_cached_block
:
Dict
[
str
,
int
]
=
defaultdict
(
int
)
self
.
num_cached_block
:
Dict
[
str
,
int
]
=
{}
self
.
prefix_cache_stats
=
PrefixCacheStats
()
self
.
prefix_cache_stats
=
PrefixCacheStats
()
@
property
@
property
...
@@ -224,9 +224,10 @@ class KVCacheManager:
...
@@ -224,9 +224,10 @@ class KVCacheManager:
if
not
self
.
enable_caching
:
if
not
self
.
enable_caching
:
return
new_blocks
return
new_blocks
# FIXME: `num_cached_blocks` is not correct when the prefix cache
# Use `new_computed_blocks` for a new request, and `num_cached_block`
# of a new request is hit.
# for a running request.
num_cached_blocks
=
self
.
num_cached_block
[
request
.
request_id
]
num_cached_blocks
=
self
.
num_cached_block
.
get
(
request
.
request_id
,
len
(
new_computed_blocks
))
# Speculated tokens might be rejected in the future, so we do
# Speculated tokens might be rejected in the future, so we do
# not cache any speculated tokens. We only cache blocks with
# not cache any speculated tokens. We only cache blocks with
# generated (accepted) tokens.
# generated (accepted) tokens.
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment