Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
f2ae883b
Unverified
Commit
f2ae883b
authored
May 14, 2025
by
Chen Zhang
Committed by
GitHub
May 13, 2025
Browse files
[v1][KVCacheManager] pass num_new_computed_tokens to kv cache manager (#18001)
Signed-off-by:
Chen Zhang
<
zhangch99@outlook.com
>
parent
40de1ef4
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
119 additions
and
53 deletions
+119
-53
tests/v1/core/test_prefix_caching.py
tests/v1/core/test_prefix_caching.py
+93
-33
vllm/v1/core/kv_cache_manager.py
vllm/v1/core/kv_cache_manager.py
+6
-10
vllm/v1/core/sched/scheduler.py
vllm/v1/core/sched/scheduler.py
+20
-10
No files found.
tests/v1/core/test_prefix_caching.py
View file @
f2ae883b
...
@@ -81,7 +81,9 @@ def test_prefill(hash_algo):
...
@@ -81,7 +81,9 @@ def test_prefill(hash_algo):
assert
len
(
manager
.
req_to_block_hashes
[
req0
.
request_id
])
==
3
assert
len
(
manager
.
req_to_block_hashes
[
req0
.
request_id
])
==
3
assert
not
computed_blocks
.
blocks
assert
not
computed_blocks
.
blocks
assert
num_computed_tokens
==
0
assert
num_computed_tokens
==
0
blocks
=
manager
.
allocate_slots
(
req0
,
55
,
computed_blocks
)
blocks
=
manager
.
allocate_slots
(
req0
,
55
,
len
(
computed_blocks
.
blocks
)
*
16
,
computed_blocks
)
assert
blocks
.
get_block_ids
()
==
[
1
,
2
,
3
,
4
]
assert
blocks
.
get_block_ids
()
==
[
1
,
2
,
3
,
4
]
# Check full block metadata
# Check full block metadata
...
@@ -108,7 +110,9 @@ def test_prefill(hash_algo):
...
@@ -108,7 +110,9 @@ def test_prefill(hash_algo):
assert
computed_blocks
.
get_block_ids
()
==
[
1
,
2
,
3
]
assert
computed_blocks
.
get_block_ids
()
==
[
1
,
2
,
3
]
assert
num_computed_tokens
==
3
*
16
assert
num_computed_tokens
==
3
*
16
num_new_tokens
=
53
-
3
*
16
num_new_tokens
=
53
-
3
*
16
blocks
=
manager
.
allocate_slots
(
req1
,
num_new_tokens
,
computed_blocks
)
blocks
=
manager
.
allocate_slots
(
req1
,
num_new_tokens
,
len
(
computed_blocks
.
blocks
)
*
16
,
computed_blocks
)
assert
blocks
.
get_block_ids
()
==
[
5
]
assert
blocks
.
get_block_ids
()
==
[
5
]
for
block
in
computed_blocks
.
blocks
:
for
block
in
computed_blocks
.
blocks
:
assert
block
.
ref_cnt
==
2
assert
block
.
ref_cnt
==
2
...
@@ -140,7 +144,9 @@ def test_prefill(hash_algo):
...
@@ -140,7 +144,9 @@ def test_prefill(hash_algo):
assert
computed_blocks
.
get_block_ids
()
==
[
1
,
2
,
3
]
assert
computed_blocks
.
get_block_ids
()
==
[
1
,
2
,
3
]
assert
num_computed_tokens
==
3
*
16
assert
num_computed_tokens
==
3
*
16
num_new_tokens
=
53
-
3
*
16
num_new_tokens
=
53
-
3
*
16
blocks
=
manager
.
allocate_slots
(
req2
,
num_new_tokens
,
computed_blocks
)
blocks
=
manager
.
allocate_slots
(
req2
,
num_new_tokens
,
len
(
computed_blocks
.
blocks
)
*
16
,
computed_blocks
)
assert
blocks
.
get_block_ids
()
==
[
6
]
assert
blocks
.
get_block_ids
()
==
[
6
]
# Although we only have 6 free blocks, we have 8 blocks in
# Although we only have 6 free blocks, we have 8 blocks in
...
@@ -161,7 +167,9 @@ def test_prefill(hash_algo):
...
@@ -161,7 +167,9 @@ def test_prefill(hash_algo):
computed_blocks
,
num_computed_tokens
=
manager
.
get_computed_blocks
(
req3
)
computed_blocks
,
num_computed_tokens
=
manager
.
get_computed_blocks
(
req3
)
assert
not
computed_blocks
.
blocks
assert
not
computed_blocks
.
blocks
assert
num_computed_tokens
==
0
assert
num_computed_tokens
==
0
blocks
=
manager
.
allocate_slots
(
req3
,
16
*
10
,
computed_blocks
)
blocks
=
manager
.
allocate_slots
(
req3
,
16
*
10
,
len
(
computed_blocks
.
blocks
)
*
16
,
computed_blocks
)
# This block ID order also checks the eviction order.
# This block ID order also checks the eviction order.
assert
blocks
.
get_block_ids
()
==
[
7
,
8
,
9
,
10
,
4
,
5
,
6
,
3
,
2
,
1
]
assert
blocks
.
get_block_ids
()
==
[
7
,
8
,
9
,
10
,
4
,
5
,
6
,
3
,
2
,
1
]
assert
manager
.
block_pool
.
free_block_queue
.
num_free_blocks
==
0
assert
manager
.
block_pool
.
free_block_queue
.
num_free_blocks
==
0
...
@@ -197,7 +205,9 @@ def test_prefill_plp():
...
@@ -197,7 +205,9 @@ def test_prefill_plp():
assert
len
(
manager
.
req_to_block_hashes
[
req0
.
request_id
])
==
0
assert
len
(
manager
.
req_to_block_hashes
[
req0
.
request_id
])
==
0
assert
not
computed_blocks
.
blocks
assert
not
computed_blocks
.
blocks
assert
num_computed_tokens
==
0
assert
num_computed_tokens
==
0
blocks
=
manager
.
allocate_slots
(
req0
,
55
,
computed_blocks
)
blocks
=
manager
.
allocate_slots
(
req0
,
55
,
len
(
computed_blocks
.
blocks
)
*
16
,
computed_blocks
)
assert
blocks
.
get_block_ids
()
==
[
1
,
2
,
3
,
4
]
assert
blocks
.
get_block_ids
()
==
[
1
,
2
,
3
,
4
]
req0_block_hashes
=
[
b
.
block_hash
for
b
in
blocks
.
blocks
]
req0_block_hashes
=
[
b
.
block_hash
for
b
in
blocks
.
blocks
]
...
@@ -226,7 +236,9 @@ def test_prefill_plp():
...
@@ -226,7 +236,9 @@ def test_prefill_plp():
assert
computed_blocks
.
get_block_ids
()
==
[
1
,
2
,
3
]
assert
computed_blocks
.
get_block_ids
()
==
[
1
,
2
,
3
]
assert
num_computed_tokens
==
3
*
16
assert
num_computed_tokens
==
3
*
16
num_new_tokens
=
53
-
3
*
16
num_new_tokens
=
53
-
3
*
16
blocks
=
manager
.
allocate_slots
(
req1
,
num_new_tokens
,
computed_blocks
)
blocks
=
manager
.
allocate_slots
(
req1
,
num_new_tokens
,
len
(
computed_blocks
.
blocks
)
*
16
,
computed_blocks
)
assert
blocks
.
get_block_ids
()
==
[
5
]
assert
blocks
.
get_block_ids
()
==
[
5
]
for
block
in
computed_blocks
.
blocks
:
for
block
in
computed_blocks
.
blocks
:
assert
block
.
ref_cnt
==
2
assert
block
.
ref_cnt
==
2
...
@@ -259,7 +271,9 @@ def test_prefill_plp():
...
@@ -259,7 +271,9 @@ def test_prefill_plp():
assert
len
(
manager
.
req_to_block_hashes
[
req2
.
request_id
])
==
0
assert
len
(
manager
.
req_to_block_hashes
[
req2
.
request_id
])
==
0
assert
not
computed_blocks
.
blocks
assert
not
computed_blocks
.
blocks
assert
num_computed_tokens
==
0
assert
num_computed_tokens
==
0
blocks
=
manager
.
allocate_slots
(
req2
,
55
,
computed_blocks
)
blocks
=
manager
.
allocate_slots
(
req2
,
55
,
len
(
computed_blocks
.
blocks
)
*
16
,
computed_blocks
)
block_ids
=
blocks
.
get_block_ids
()
block_ids
=
blocks
.
get_block_ids
()
# Duplicate cached blocks have different ids but same hashes vs request #0
# Duplicate cached blocks have different ids but same hashes vs request #0
assert
[
b
.
block_hash
for
b
in
blocks
.
blocks
]
==
req0_block_hashes
assert
[
b
.
block_hash
for
b
in
blocks
.
blocks
]
==
req0_block_hashes
...
@@ -290,14 +304,18 @@ def test_decode():
...
@@ -290,14 +304,18 @@ def test_decode():
computed_blocks
,
num_computed_tokens
=
manager
.
get_computed_blocks
(
req0
)
computed_blocks
,
num_computed_tokens
=
manager
.
get_computed_blocks
(
req0
)
assert
not
computed_blocks
.
blocks
assert
not
computed_blocks
.
blocks
assert
num_computed_tokens
==
0
assert
num_computed_tokens
==
0
blocks
=
manager
.
allocate_slots
(
req0
,
55
,
computed_blocks
)
blocks
=
manager
.
allocate_slots
(
req0
,
55
,
len
(
computed_blocks
.
blocks
)
*
16
,
computed_blocks
)
assert
blocks
.
get_block_ids
()
==
[
1
,
2
,
3
,
4
]
assert
blocks
.
get_block_ids
()
==
[
1
,
2
,
3
,
4
]
# Append slots without allocating a new block.
# Append slots without allocating a new block.
req0
.
num_computed_tokens
=
55
req0
.
num_computed_tokens
=
55
for
_
in
range
(
4
):
for
_
in
range
(
4
):
req0
.
append_output_token_ids
(
8
)
req0
.
append_output_token_ids
(
8
)
new_blocks
=
manager
.
allocate_slots
(
req0
,
4
)
new_blocks
=
manager
.
allocate_slots
(
req0
,
4
,
len
(
computed_blocks
.
blocks
)
*
16
,
computed_blocks
)
assert
new_blocks
is
not
None
and
len
(
new_blocks
.
blocks
)
==
0
assert
new_blocks
is
not
None
and
len
(
new_blocks
.
blocks
)
==
0
assert
manager
.
single_type_manager
.
req_to_blocks
[
assert
manager
.
single_type_manager
.
req_to_blocks
[
req0
.
request_id
][
-
1
].
block_hash
is
None
req0
.
request_id
][
-
1
].
block_hash
is
None
...
@@ -308,7 +326,9 @@ def test_decode():
...
@@ -308,7 +326,9 @@ def test_decode():
# the preallocated block.
# the preallocated block.
for
_
in
range
(
9
+
10
):
for
_
in
range
(
9
+
10
):
req0
.
append_output_token_ids
(
7
)
req0
.
append_output_token_ids
(
7
)
new_blocks
=
manager
.
allocate_slots
(
req0
,
19
)
new_blocks
=
manager
.
allocate_slots
(
req0
,
19
,
len
(
computed_blocks
.
blocks
)
*
16
,
computed_blocks
)
assert
new_blocks
is
not
None
and
len
(
new_blocks
.
blocks
)
==
1
assert
new_blocks
is
not
None
and
len
(
new_blocks
.
blocks
)
==
1
assert
manager
.
single_type_manager
.
req_to_blocks
[
assert
manager
.
single_type_manager
.
req_to_blocks
[
req0
.
request_id
][
-
2
].
block_hash
is
not
None
req0
.
request_id
][
-
2
].
block_hash
is
not
None
...
@@ -328,7 +348,9 @@ def test_evict():
...
@@ -328,7 +348,9 @@ def test_evict():
computed_blocks
,
num_computed_tokens
=
manager
.
get_computed_blocks
(
req0
)
computed_blocks
,
num_computed_tokens
=
manager
.
get_computed_blocks
(
req0
)
assert
not
computed_blocks
.
blocks
assert
not
computed_blocks
.
blocks
assert
num_computed_tokens
==
0
assert
num_computed_tokens
==
0
blocks
=
manager
.
allocate_slots
(
req0
,
5
*
16
+
7
,
computed_blocks
)
blocks
=
manager
.
allocate_slots
(
req0
,
5
*
16
+
7
,
len
(
computed_blocks
.
blocks
)
*
16
,
computed_blocks
)
assert
len
(
blocks
.
blocks
)
==
6
# 5 full + 1 partial
assert
len
(
blocks
.
blocks
)
==
6
# 5 full + 1 partial
# 3 blocks.
# 3 blocks.
...
@@ -337,7 +359,9 @@ def test_evict():
...
@@ -337,7 +359,9 @@ def test_evict():
computed_blocks
,
num_computed_tokens
=
manager
.
get_computed_blocks
(
req1
)
computed_blocks
,
num_computed_tokens
=
manager
.
get_computed_blocks
(
req1
)
assert
not
computed_blocks
.
blocks
assert
not
computed_blocks
.
blocks
assert
num_computed_tokens
==
0
assert
num_computed_tokens
==
0
blocks
=
manager
.
allocate_slots
(
req1
,
3
*
16
,
computed_blocks
)
blocks
=
manager
.
allocate_slots
(
req1
,
3
*
16
,
len
(
computed_blocks
.
blocks
)
*
16
,
computed_blocks
)
assert
len
(
blocks
.
blocks
)
==
3
# 3 full blocks
assert
len
(
blocks
.
blocks
)
==
3
# 3 full blocks
last_token_id
+=
3
*
16
last_token_id
+=
3
*
16
...
@@ -357,7 +381,9 @@ def test_evict():
...
@@ -357,7 +381,9 @@ def test_evict():
computed_blocks
,
num_computed_tokens
=
manager
.
get_computed_blocks
(
req2
)
computed_blocks
,
num_computed_tokens
=
manager
.
get_computed_blocks
(
req2
)
assert
computed_blocks
.
get_block_ids
()
==
[
1
,
2
]
assert
computed_blocks
.
get_block_ids
()
==
[
1
,
2
]
assert
num_computed_tokens
==
2
*
16
assert
num_computed_tokens
==
2
*
16
blocks
=
manager
.
allocate_slots
(
req2
,
3
,
computed_blocks
)
blocks
=
manager
.
allocate_slots
(
req2
,
3
,
len
(
computed_blocks
.
blocks
)
*
16
,
computed_blocks
)
assert
blocks
.
get_block_ids
()
==
[
10
]
assert
blocks
.
get_block_ids
()
==
[
10
]
assert
manager
.
block_pool
.
free_block_queue
.
num_free_blocks
==
7
assert
manager
.
block_pool
.
free_block_queue
.
num_free_blocks
==
7
...
@@ -380,7 +406,9 @@ def test_hash_block_correct_reuse():
...
@@ -380,7 +406,9 @@ def test_hash_block_correct_reuse():
computed_blocks
,
num_computed_tokens
=
manager
.
get_computed_blocks
(
req
)
computed_blocks
,
num_computed_tokens
=
manager
.
get_computed_blocks
(
req
)
assert
not
computed_blocks
.
blocks
assert
not
computed_blocks
.
blocks
assert
num_computed_tokens
==
0
assert
num_computed_tokens
==
0
blocks
=
manager
.
allocate_slots
(
req
,
num_tokens
,
computed_blocks
)
blocks
=
manager
.
allocate_slots
(
req
,
num_tokens
,
len
(
computed_blocks
.
blocks
)
*
16
,
computed_blocks
)
assert
len
(
blocks
.
blocks
)
==
1
assert
len
(
blocks
.
blocks
)
==
1
# Deallocate the block.
# Deallocate the block.
...
@@ -392,7 +420,9 @@ def test_hash_block_correct_reuse():
...
@@ -392,7 +420,9 @@ def test_hash_block_correct_reuse():
computed_blocks
,
num_computed_tokens
=
manager
.
get_computed_blocks
(
req
)
computed_blocks
,
num_computed_tokens
=
manager
.
get_computed_blocks
(
req
)
assert
not
computed_blocks
.
blocks
assert
not
computed_blocks
.
blocks
assert
num_computed_tokens
==
0
assert
num_computed_tokens
==
0
blocks
=
manager
.
allocate_slots
(
req
,
num_tokens
-
1
,
computed_blocks
)
blocks
=
manager
.
allocate_slots
(
req
,
num_tokens
-
1
,
len
(
computed_blocks
.
blocks
)
*
16
,
computed_blocks
)
assert
len
(
blocks
.
blocks
)
==
1
assert
len
(
blocks
.
blocks
)
==
1
assert
manager
.
block_pool
.
blocks
[
assert
manager
.
block_pool
.
blocks
[
...
@@ -417,7 +447,9 @@ def test_computed_blocks_not_evicted():
...
@@ -417,7 +447,9 @@ def test_computed_blocks_not_evicted():
computed_blocks
,
num_computed_tokens
=
manager
.
get_computed_blocks
(
req0
)
computed_blocks
,
num_computed_tokens
=
manager
.
get_computed_blocks
(
req0
)
assert
not
computed_blocks
.
blocks
assert
not
computed_blocks
.
blocks
assert
num_computed_tokens
==
0
assert
num_computed_tokens
==
0
blocks
=
manager
.
allocate_slots
(
req0
,
num_tokens
,
computed_blocks
)
blocks
=
manager
.
allocate_slots
(
req0
,
num_tokens
,
len
(
computed_blocks
.
blocks
)
*
16
,
computed_blocks
)
assert
len
(
blocks
.
blocks
)
==
1
assert
len
(
blocks
.
blocks
)
==
1
assert
blocks
.
blocks
[
0
].
block_id
==
1
assert
blocks
.
blocks
[
0
].
block_id
==
1
...
@@ -426,7 +458,9 @@ def test_computed_blocks_not_evicted():
...
@@ -426,7 +458,9 @@ def test_computed_blocks_not_evicted():
computed_blocks
,
num_computed_tokens
=
manager
.
get_computed_blocks
(
req1
)
computed_blocks
,
num_computed_tokens
=
manager
.
get_computed_blocks
(
req1
)
assert
not
computed_blocks
.
blocks
assert
not
computed_blocks
.
blocks
assert
num_computed_tokens
==
0
assert
num_computed_tokens
==
0
blocks
=
manager
.
allocate_slots
(
req1
,
num_tokens
,
computed_blocks
)
blocks
=
manager
.
allocate_slots
(
req1
,
num_tokens
,
len
(
computed_blocks
.
blocks
)
*
16
,
computed_blocks
)
assert
len
(
blocks
.
blocks
)
==
1
assert
len
(
blocks
.
blocks
)
==
1
assert
blocks
.
blocks
[
0
].
block_id
==
2
assert
blocks
.
blocks
[
0
].
block_id
==
2
...
@@ -443,6 +477,7 @@ def test_computed_blocks_not_evicted():
...
@@ -443,6 +477,7 @@ def test_computed_blocks_not_evicted():
assert
num_computed_tokens
==
block_size
assert
num_computed_tokens
==
block_size
blocks
=
manager
.
allocate_slots
(
req2
,
num_tokens
*
2
-
num_tokens
,
blocks
=
manager
.
allocate_slots
(
req2
,
num_tokens
*
2
-
num_tokens
,
len
(
computed_blocks
.
blocks
)
*
16
,
computed_blocks
)
computed_blocks
)
assert
len
(
blocks
.
blocks
)
==
1
assert
len
(
blocks
.
blocks
)
==
1
assert
blocks
.
blocks
[
0
].
block_id
==
2
assert
blocks
.
blocks
[
0
].
block_id
==
2
...
@@ -464,7 +499,9 @@ def test_basic_prefix_caching_disabled():
...
@@ -464,7 +499,9 @@ def test_basic_prefix_caching_disabled():
computed_blocks
,
num_computed_tokens
=
manager
.
get_computed_blocks
(
req1
)
computed_blocks
,
num_computed_tokens
=
manager
.
get_computed_blocks
(
req1
)
assert
not
computed_blocks
.
blocks
assert
not
computed_blocks
.
blocks
assert
num_computed_tokens
==
0
assert
num_computed_tokens
==
0
blocks
=
manager
.
allocate_slots
(
req1
,
10
,
computed_blocks
)
blocks
=
manager
.
allocate_slots
(
req1
,
10
,
len
(
computed_blocks
.
blocks
)
*
16
,
computed_blocks
)
assert
len
(
blocks
.
blocks
)
==
3
assert
len
(
blocks
.
blocks
)
==
3
# Free the blocks.
# Free the blocks.
...
@@ -475,7 +512,9 @@ def test_basic_prefix_caching_disabled():
...
@@ -475,7 +512,9 @@ def test_basic_prefix_caching_disabled():
computed_blocks
,
num_computed_tokens
=
manager
.
get_computed_blocks
(
req2
)
computed_blocks
,
num_computed_tokens
=
manager
.
get_computed_blocks
(
req2
)
assert
not
computed_blocks
.
blocks
assert
not
computed_blocks
.
blocks
assert
num_computed_tokens
==
0
assert
num_computed_tokens
==
0
blocks
=
manager
.
allocate_slots
(
req2
,
16
,
computed_blocks
)
blocks
=
manager
.
allocate_slots
(
req2
,
16
,
len
(
computed_blocks
.
blocks
)
*
16
,
computed_blocks
)
assert
len
(
blocks
.
blocks
)
==
4
assert
len
(
blocks
.
blocks
)
==
4
# New requests should not have any blocks.
# New requests should not have any blocks.
...
@@ -483,7 +522,9 @@ def test_basic_prefix_caching_disabled():
...
@@ -483,7 +522,9 @@ def test_basic_prefix_caching_disabled():
computed_blocks
,
num_computed_tokens
=
manager
.
get_computed_blocks
(
req3
)
computed_blocks
,
num_computed_tokens
=
manager
.
get_computed_blocks
(
req3
)
assert
not
computed_blocks
.
blocks
assert
not
computed_blocks
.
blocks
assert
num_computed_tokens
==
0
assert
num_computed_tokens
==
0
blocks
=
manager
.
allocate_slots
(
req3
,
4
,
computed_blocks
)
blocks
=
manager
.
allocate_slots
(
req3
,
4
,
len
(
computed_blocks
.
blocks
)
*
16
,
computed_blocks
)
assert
not
blocks
assert
not
blocks
...
@@ -581,14 +622,18 @@ def test_mm_prefix_caching():
...
@@ -581,14 +622,18 @@ def test_mm_prefix_caching():
assert
block_hashes
[
1
].
extra_keys
==
(
"aaa"
,
"bbb"
)
assert
block_hashes
[
1
].
extra_keys
==
(
"aaa"
,
"bbb"
)
assert
block_hashes
[
2
].
extra_keys
==
(
"bbb"
,
)
assert
block_hashes
[
2
].
extra_keys
==
(
"bbb"
,
)
blocks
=
manager
.
allocate_slots
(
req0
,
59
,
computed_blocks
)
blocks
=
manager
.
allocate_slots
(
req0
,
59
,
len
(
computed_blocks
.
blocks
)
*
16
,
computed_blocks
)
assert
blocks
.
get_block_ids
()
==
[
1
,
2
,
3
,
4
]
assert
blocks
.
get_block_ids
()
==
[
1
,
2
,
3
,
4
]
req0
.
num_computed_tokens
=
59
req0
.
num_computed_tokens
=
59
# Append slots without allocating a new block.
# Append slots without allocating a new block.
for
_
in
range
(
5
):
for
_
in
range
(
5
):
req0
.
append_output_token_ids
(
8
)
req0
.
append_output_token_ids
(
8
)
new_blocks
=
manager
.
allocate_slots
(
req0
,
5
)
new_blocks
=
manager
.
allocate_slots
(
req0
,
5
,
len
(
computed_blocks
.
blocks
)
*
16
,
computed_blocks
)
assert
new_blocks
is
not
None
and
len
(
new_blocks
.
blocks
)
==
0
assert
new_blocks
is
not
None
and
len
(
new_blocks
.
blocks
)
==
0
# The just completed block should have hashes with extra keys.
# The just completed block should have hashes with extra keys.
...
@@ -638,14 +683,18 @@ def test_cache_key_salting():
...
@@ -638,14 +683,18 @@ def test_cache_key_salting():
assert
block_hashes
[
1
].
extra_keys
is
None
assert
block_hashes
[
1
].
extra_keys
is
None
assert
block_hashes
[
2
].
extra_keys
is
None
assert
block_hashes
[
2
].
extra_keys
is
None
blocks
=
manager
.
allocate_slots
(
req0
,
59
,
computed_blocks
)
blocks
=
manager
.
allocate_slots
(
req0
,
59
,
len
(
computed_blocks
.
blocks
)
*
16
,
computed_blocks
)
assert
blocks
.
get_block_ids
()
==
[
1
,
2
,
3
,
4
]
assert
blocks
.
get_block_ids
()
==
[
1
,
2
,
3
,
4
]
req0
.
num_computed_tokens
=
59
req0
.
num_computed_tokens
=
59
# Append slots without allocating a new block.
# Append slots without allocating a new block.
for
_
in
range
(
5
):
for
_
in
range
(
5
):
req0
.
append_output_token_ids
(
8
)
req0
.
append_output_token_ids
(
8
)
new_blocks
=
manager
.
allocate_slots
(
req0
,
5
)
new_blocks
=
manager
.
allocate_slots
(
req0
,
5
,
len
(
computed_blocks
.
blocks
)
*
16
,
computed_blocks
)
assert
new_blocks
is
not
None
and
len
(
new_blocks
.
blocks
)
==
0
assert
new_blocks
is
not
None
and
len
(
new_blocks
.
blocks
)
==
0
# Now one more block that should not have extra keys.
# Now one more block that should not have extra keys.
...
@@ -691,7 +740,8 @@ def test_prefill_not_enough_free_blocks_with_computed_blocks():
...
@@ -691,7 +740,8 @@ def test_prefill_not_enough_free_blocks_with_computed_blocks():
computed_blocks
,
num_computed_tokens
=
manager
.
get_computed_blocks
(
req0
)
computed_blocks
,
num_computed_tokens
=
manager
.
get_computed_blocks
(
req0
)
assert
not
computed_blocks
.
blocks
assert
not
computed_blocks
.
blocks
assert
num_computed_tokens
==
0
assert
num_computed_tokens
==
0
manager
.
allocate_slots
(
req0
,
48
,
computed_blocks
)
manager
.
allocate_slots
(
req0
,
48
,
len
(
computed_blocks
.
blocks
)
*
16
,
computed_blocks
)
block_part0
=
manager
.
single_type_manager
.
req_to_blocks
[
req0
.
request_id
]
block_part0
=
manager
.
single_type_manager
.
req_to_blocks
[
req0
.
request_id
]
# | Common-0 | Common-1 | Common-2 | Req1-3 | Req1-4 | Req1-5 | ... |
# | Common-0 | Common-1 | Common-2 | Req1-3 | Req1-4 | Req1-5 | ... |
...
@@ -699,7 +749,8 @@ def test_prefill_not_enough_free_blocks_with_computed_blocks():
...
@@ -699,7 +749,8 @@ def test_prefill_not_enough_free_blocks_with_computed_blocks():
computed_blocks
,
num_computed_tokens
=
manager
.
get_computed_blocks
(
req1
)
computed_blocks
,
num_computed_tokens
=
manager
.
get_computed_blocks
(
req1
)
assert
computed_blocks
.
blocks
==
block_part0
assert
computed_blocks
.
blocks
==
block_part0
assert
num_computed_tokens
==
3
*
16
assert
num_computed_tokens
==
3
*
16
manager
.
allocate_slots
(
req1
,
48
,
computed_blocks
)
manager
.
allocate_slots
(
req1
,
48
,
len
(
computed_blocks
.
blocks
)
*
16
,
computed_blocks
)
block_part1
=
manager
.
single_type_manager
.
req_to_blocks
[
req1
.
request_id
]
block_part1
=
manager
.
single_type_manager
.
req_to_blocks
[
req1
.
request_id
]
# | Common-0 | Common-1 | Common-2 | Req1-3 (F) | Req1-4 (F) |
# | Common-0 | Common-1 | Common-2 | Req1-3 (F) | Req1-4 (F) |
# | Req1-5(F)| ... |
# | Req1-5(F)| ... |
...
@@ -713,7 +764,8 @@ def test_prefill_not_enough_free_blocks_with_computed_blocks():
...
@@ -713,7 +764,8 @@ def test_prefill_not_enough_free_blocks_with_computed_blocks():
computed_blocks
,
num_computed_tokens
=
manager
.
get_computed_blocks
(
req2
)
computed_blocks
,
num_computed_tokens
=
manager
.
get_computed_blocks
(
req2
)
assert
not
computed_blocks
.
blocks
assert
not
computed_blocks
.
blocks
assert
num_computed_tokens
==
0
assert
num_computed_tokens
==
0
manager
.
allocate_slots
(
req2
,
block_size
*
2
,
computed_blocks
)
manager
.
allocate_slots
(
req2
,
block_size
*
2
,
len
(
computed_blocks
.
blocks
)
*
16
,
computed_blocks
)
# Req3 is Req2 + 3 new blocks, so the first 6 blocks are computed,
# Req3 is Req2 + 3 new blocks, so the first 6 blocks are computed,
# but it cannot be allocated due to insufficient free blocks (2).
# but it cannot be allocated due to insufficient free blocks (2).
...
@@ -724,7 +776,9 @@ def test_prefill_not_enough_free_blocks_with_computed_blocks():
...
@@ -724,7 +776,9 @@ def test_prefill_not_enough_free_blocks_with_computed_blocks():
assert
computed_blocks
.
blocks
==
block_part1
assert
computed_blocks
.
blocks
==
block_part1
assert
num_computed_tokens
==
6
*
16
assert
num_computed_tokens
==
6
*
16
# Req3 cannot be allocated.
# Req3 cannot be allocated.
assert
manager
.
allocate_slots
(
req3
,
48
,
computed_blocks
)
is
None
assert
manager
.
allocate_slots
(
req3
,
48
,
len
(
computed_blocks
.
blocks
)
*
16
,
computed_blocks
)
is
None
# Block 0-2 are used by Req 1.
# Block 0-2 are used by Req 1.
assert
{
block
.
ref_cnt
for
block
in
block_part1
[:
3
]}
==
{
1
}
assert
{
block
.
ref_cnt
for
block
in
block_part1
[:
3
]}
==
{
1
}
# Block 3-5 are free.
# Block 3-5 are free.
...
@@ -751,7 +805,9 @@ def test_reset_prefix_cache():
...
@@ -751,7 +805,9 @@ def test_reset_prefix_cache():
computed_blocks
,
_
=
manager
.
get_computed_blocks
(
req1
)
computed_blocks
,
_
=
manager
.
get_computed_blocks
(
req1
)
assert
len
(
manager
.
req_to_block_hashes
[
req1
.
request_id
])
==
3
assert
len
(
manager
.
req_to_block_hashes
[
req1
.
request_id
])
==
3
assert
len
(
computed_blocks
.
blocks
)
==
3
assert
len
(
computed_blocks
.
blocks
)
==
3
blocks
=
manager
.
allocate_slots
(
req1
,
7
,
computed_blocks
)
blocks
=
manager
.
allocate_slots
(
req1
,
7
,
len
(
computed_blocks
.
blocks
)
*
16
,
computed_blocks
)
assert
blocks
.
get_block_ids
()
==
[
5
]
assert
blocks
.
get_block_ids
()
==
[
5
]
# Failed to reset prefix cache because some blocks are not freed yet.
# Failed to reset prefix cache because some blocks are not freed yet.
...
@@ -782,7 +838,8 @@ def test_prefix_cache_stats_disabled():
...
@@ -782,7 +838,8 @@ def test_prefix_cache_stats_disabled():
computed_blocks
,
num_computed_tokens
=
manager
.
get_computed_blocks
(
req
)
computed_blocks
,
num_computed_tokens
=
manager
.
get_computed_blocks
(
req
)
assert
not
computed_blocks
.
blocks
assert
not
computed_blocks
.
blocks
assert
num_computed_tokens
==
0
assert
num_computed_tokens
==
0
manager
.
allocate_slots
(
req
,
16
,
computed_blocks
)
manager
.
allocate_slots
(
req
,
16
,
len
(
computed_blocks
.
blocks
)
*
16
,
computed_blocks
)
manager
.
reset_prefix_cache
()
manager
.
reset_prefix_cache
()
# Ensure prefix_cache_stats remains None
# Ensure prefix_cache_stats remains None
...
@@ -860,7 +917,8 @@ def test_eagle_enabled_removes_last_block():
...
@@ -860,7 +917,8 @@ def test_eagle_enabled_removes_last_block():
# Prime the cache
# Prime the cache
computed_blocks
,
_
=
manager
.
get_computed_blocks
(
req
)
computed_blocks
,
_
=
manager
.
get_computed_blocks
(
req
)
manager
.
allocate_slots
(
req
,
len
(
token_ids
),
computed_blocks
)
manager
.
allocate_slots
(
req
,
len
(
token_ids
),
len
(
computed_blocks
.
blocks
)
*
16
,
computed_blocks
)
manager
.
free
(
req
)
manager
.
free
(
req
)
# New request with same tokens + Eagle enabled
# New request with same tokens + Eagle enabled
...
@@ -889,7 +947,8 @@ def test_eagle_with_partial_blocks():
...
@@ -889,7 +947,8 @@ def test_eagle_with_partial_blocks():
# Prime the cache
# Prime the cache
computed_blocks
,
_
=
manager
.
get_computed_blocks
(
req
)
computed_blocks
,
_
=
manager
.
get_computed_blocks
(
req
)
manager
.
allocate_slots
(
req
,
len
(
token_ids
),
computed_blocks
)
manager
.
allocate_slots
(
req
,
len
(
token_ids
),
len
(
computed_blocks
.
blocks
)
*
16
,
computed_blocks
)
manager
.
free
(
req
)
manager
.
free
(
req
)
# New request with Eagle enabled
# New request with Eagle enabled
...
@@ -928,7 +987,8 @@ def test_eagle_with_sliding_window():
...
@@ -928,7 +987,8 @@ def test_eagle_with_sliding_window():
# Prime the cache
# Prime the cache
computed_blocks
,
_
=
manager
.
get_computed_blocks
(
req
)
computed_blocks
,
_
=
manager
.
get_computed_blocks
(
req
)
manager
.
allocate_slots
(
req
,
len
(
token_ids
),
computed_blocks
)
manager
.
allocate_slots
(
req
,
len
(
token_ids
),
len
(
computed_blocks
.
blocks
)
*
16
,
computed_blocks
)
# record the block hash of the first block in the request for later use
# record the block hash of the first block in the request for later use
block_hash_first_block
=
manager
.
req_to_block_hashes
[
req
.
request_id
][
0
]
block_hash_first_block
=
manager
.
req_to_block_hashes
[
req
.
request_id
][
0
]
assert
block_hash_first_block
is
not
None
assert
block_hash_first_block
is
not
None
...
...
vllm/v1/core/kv_cache_manager.py
View file @
f2ae883b
...
@@ -121,13 +121,6 @@ class KVCacheManager:
...
@@ -121,13 +121,6 @@ class KVCacheManager:
- A list of blocks that are computed for the request.
- A list of blocks that are computed for the request.
- The number of computed tokens.
- The number of computed tokens.
"""
"""
# Request already has blocks from async load via KVConnector.
num_existing_blocks
=
len
(
self
.
single_type_manager
.
req_to_blocks
[
request
.
request_id
])
if
num_existing_blocks
>
0
:
return
KVCacheBlocks
.
create_empty
(),
request
.
num_computed_tokens
# Prefix caching is disabled or
# Prefix caching is disabled or
# When the request requires prompt logprobs, we skip prefix caching.
# When the request requires prompt logprobs, we skip prefix caching.
if
(
not
self
.
enable_caching
if
(
not
self
.
enable_caching
...
@@ -172,6 +165,7 @@ class KVCacheManager:
...
@@ -172,6 +165,7 @@ class KVCacheManager:
self
,
self
,
request
:
Request
,
request
:
Request
,
num_new_tokens
:
int
,
num_new_tokens
:
int
,
num_new_computed_tokens
:
int
=
0
,
new_computed_blocks
:
Optional
[
KVCacheBlocks
]
=
None
,
new_computed_blocks
:
Optional
[
KVCacheBlocks
]
=
None
,
num_lookahead_tokens
:
int
=
0
,
num_lookahead_tokens
:
int
=
0
,
delay_cache_blocks
:
bool
=
False
,
delay_cache_blocks
:
bool
=
False
,
...
@@ -183,8 +177,10 @@ class KVCacheManager:
...
@@ -183,8 +177,10 @@ class KVCacheManager:
num_new_tokens: The number of tokens to allocate, including external
num_new_tokens: The number of tokens to allocate, including external
tokens. Note that this does not include tokens that have
tokens. Note that this does not include tokens that have
already been computed locally (i.e. new_computed_blocks).
already been computed locally (i.e. new_computed_blocks).
new_computed_blocks: The new computed blocks just hitting the
num_new_computed_tokens: The number of new computed tokens just
prefix caching.
hitting the prefix caching, excluding external tokens.
new_computed_blocks: The cached blocks for the above new computed
tokens.
num_lookahead_tokens: The number of speculative tokens to allocate.
num_lookahead_tokens: The number of speculative tokens to allocate.
This is used by spec decode proposers with kv-cache such
This is used by spec decode proposers with kv-cache such
as eagle.
as eagle.
...
@@ -229,7 +225,7 @@ class KVCacheManager:
...
@@ -229,7 +225,7 @@ class KVCacheManager:
# The number of computed tokens is the number of computed tokens plus
# The number of computed tokens is the number of computed tokens plus
# the new prefix caching hits
# the new prefix caching hits
num_computed_tokens
=
(
request
.
num_computed_tokens
+
num_computed_tokens
=
(
request
.
num_computed_tokens
+
len
(
new_computed_
block_list
)
*
self
.
block_size
)
num_
new_computed_
tokens
)
num_tokens_need_slot
=
min
(
num_tokens_need_slot
=
min
(
num_computed_tokens
+
num_new_tokens
+
num_lookahead_tokens
,
num_computed_tokens
+
num_new_tokens
+
num_lookahead_tokens
,
self
.
max_model_len
)
self
.
max_model_len
)
...
...
vllm/v1/core/sched/scheduler.py
View file @
f2ae883b
...
@@ -18,7 +18,7 @@ from vllm.logger import init_logger
...
@@ -18,7 +18,7 @@ from vllm.logger import init_logger
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
,
MultiModalRegistry
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
,
MultiModalRegistry
from
vllm.v1.core.encoder_cache_manager
import
(
EncoderCacheManager
,
from
vllm.v1.core.encoder_cache_manager
import
(
EncoderCacheManager
,
compute_encoder_budget
)
compute_encoder_budget
)
from
vllm.v1.core.kv_cache_manager
import
KVCacheManager
from
vllm.v1.core.kv_cache_manager
import
KVCacheBlocks
,
KVCacheManager
from
vllm.v1.core.sched.interface
import
SchedulerInterface
from
vllm.v1.core.sched.interface
import
SchedulerInterface
from
vllm.v1.core.sched.output
import
(
CachedRequestData
,
NewRequestData
,
from
vllm.v1.core.sched.output
import
(
CachedRequestData
,
NewRequestData
,
SchedulerOutput
)
SchedulerOutput
)
...
@@ -311,12 +311,14 @@ class Scheduler(SchedulerInterface):
...
@@ -311,12 +311,14 @@ class Scheduler(SchedulerInterface):
break
break
request
=
self
.
waiting
[
0
]
request
=
self
.
waiting
[
0
]
num_prealloc_computed_tokens
=
0
# P/D: skip request if still waiting for remote kvs.
# P/D: skip request if still waiting for remote kvs.
if
request
.
status
==
RequestStatus
.
WAITING_FOR_REMOTE_KVS
:
if
request
.
status
==
RequestStatus
.
WAITING_FOR_REMOTE_KVS
:
is_ready
=
self
.
_update_waiting_for_remote_kv
(
request
)
is_ready
=
self
.
_update_waiting_for_remote_kv
(
request
)
if
is_ready
:
if
is_ready
:
request
.
status
=
RequestStatus
.
WAITING
request
.
status
=
RequestStatus
.
WAITING
num_prealloc_computed_tokens
=
(
request
.
num_computed_tokens
)
else
:
else
:
self
.
waiting
.
popleft
()
self
.
waiting
.
popleft
()
skipped_waiting_requests
.
appendleft
(
request
)
skipped_waiting_requests
.
appendleft
(
request
)
...
@@ -345,18 +347,25 @@ class Scheduler(SchedulerInterface):
...
@@ -345,18 +347,25 @@ class Scheduler(SchedulerInterface):
continue
continue
# Get already-cached tokens.
# Get already-cached tokens.
new_computed_blocks
,
num_computed_tokens
=
\
if
num_prealloc_computed_tokens
==
0
:
self
.
kv_cache_manager
.
get_computed_blocks
(
new_computed_blocks
,
num_native_computed_tokens
=
\
request
)
self
.
kv_cache_manager
.
get_computed_blocks
(
request
)
else
:
# P/D: skip checking prefix cache if loaded from remote kvs.
new_computed_blocks
=
KVCacheBlocks
.
create_empty
()
num_native_computed_tokens
=
0
# Get externally-cached tokens if using a KVConnector.
# Get externally-cached tokens if using a KVConnector.
num_external_tokens
,
load_kv_async
=
(
num_external_
computed_
tokens
,
load_kv_async
=
(
(
0
,
False
)
if
self
.
connector
is
None
else
(
0
,
False
)
if
self
.
connector
is
None
else
self
.
connector
.
get_num_new_matched_tokens
(
self
.
connector
.
get_num_new_matched_tokens
(
request
,
num_computed_tokens
))
request
,
num_
native_
computed_tokens
))
# Total computed tokens (local + external).
# Total computed tokens (local + external).
num_computed_tokens
+=
num_external_tokens
num_computed_tokens
=
(
num_native_computed_tokens
+
num_external_computed_tokens
+
num_prealloc_computed_tokens
)
encoder_inputs_to_schedule
=
None
encoder_inputs_to_schedule
=
None
new_encoder_budget
=
encoder_budget
new_encoder_budget
=
encoder_budget
...
@@ -390,7 +399,8 @@ class Scheduler(SchedulerInterface):
...
@@ -390,7 +399,8 @@ class Scheduler(SchedulerInterface):
new_blocks
=
self
.
kv_cache_manager
.
allocate_slots
(
new_blocks
=
self
.
kv_cache_manager
.
allocate_slots
(
request
,
request
,
num_new_tokens
+
num_external_tokens
,
num_new_tokens
+
num_external_computed_tokens
,
num_native_computed_tokens
,
new_computed_blocks
,
new_computed_blocks
,
num_lookahead_tokens
=
self
.
num_lookahead_tokens
,
num_lookahead_tokens
=
self
.
num_lookahead_tokens
,
delay_cache_blocks
=
load_kv_async
,
delay_cache_blocks
=
load_kv_async
,
...
@@ -406,7 +416,7 @@ class Scheduler(SchedulerInterface):
...
@@ -406,7 +416,7 @@ class Scheduler(SchedulerInterface):
self
.
connector
.
update_state_after_alloc
(
self
.
connector
.
update_state_after_alloc
(
request
,
request
,
new_computed_blocks
+
new_blocks
,
new_computed_blocks
+
new_blocks
,
num_external_tokens
,
num_external_
computed_
tokens
,
)
)
self
.
waiting
.
popleft
()
self
.
waiting
.
popleft
()
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment