Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
3d5f1c86
Unverified
Commit
3d5f1c86
authored
Oct 02, 2025
by
Chen Zhang
Committed by
GitHub
Oct 02, 2025
Browse files
[Mamba][KVCacheManager] Simplify kv cache manage logic for mamba + MTP (#25119)
Signed-off-by:
Chen Zhang
<
zhangch99@outlook.com
>
parent
1cab2f9c
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
4 additions
and
25 deletions
+4
-25
vllm/v1/core/single_type_kv_cache_manager.py
vllm/v1/core/single_type_kv_cache_manager.py
+4
-25
No files found.
vllm/v1/core/single_type_kv_cache_manager.py
View file @
3d5f1c86
...
@@ -565,35 +565,14 @@ class MambaManager(SingleTypeKVCacheManager):
...
@@ -565,35 +565,14 @@ class MambaManager(SingleTypeKVCacheManager):
def
get_num_blocks_to_allocate
(
def
get_num_blocks_to_allocate
(
self
,
request_id
:
str
,
num_tokens
:
int
,
self
,
request_id
:
str
,
num_tokens
:
int
,
new_computed_blocks
:
list
[
KVCacheBlock
])
->
int
:
new_computed_blocks
:
list
[
KVCacheBlock
])
->
int
:
"""
# Allocate extra `num_speculative_blocks` blocks for
Get the number of blocks needed to be allocated for the request.
# speculative decoding (MTP/EAGLE) with linear attention.
Args:
request_id: The request ID.
num_tokens: The total number of tokens that need a slot (including
tokens that are already allocated).
new_computed_blocks: The new computed blocks just hitting the
prefix caching.
Returns:
The number of blocks
"""
assert
isinstance
(
self
.
kv_cache_spec
,
MambaSpec
)
assert
isinstance
(
self
.
kv_cache_spec
,
MambaSpec
)
if
self
.
kv_cache_spec
.
num_speculative_blocks
>
0
:
if
self
.
kv_cache_spec
.
num_speculative_blocks
>
0
:
num_tokens
+=
(
self
.
kv_cache_spec
.
block_size
*
num_tokens
+=
(
self
.
kv_cache_spec
.
block_size
*
self
.
kv_cache_spec
.
num_speculative_blocks
)
self
.
kv_cache_spec
.
num_speculative_blocks
)
num_required_blocks
=
cdiv
(
num_tokens
,
self
.
block_size
)
return
super
().
get_num_blocks_to_allocate
(
request_id
,
num_tokens
,
num_new_blocks
=
(
num_required_blocks
-
len
(
new_computed_blocks
)
-
new_computed_blocks
)
len
(
self
.
req_to_blocks
[
request_id
]))
# If a computed block of a request is an eviction candidate (in the
# free queue and ref_cnt == 0), it will be changed from a free block
# to a computed block when the request is allocated, so we also count
# it as needed to be allocated.
num_evictable_computed_blocks
=
sum
(
blk
.
ref_cnt
==
0
and
not
blk
.
is_null
for
blk
in
new_computed_blocks
)
return
num_new_blocks
+
num_evictable_computed_blocks
def
allocate_new_blocks
(
self
,
request_id
:
str
,
def
allocate_new_blocks
(
self
,
request_id
:
str
,
num_tokens
:
int
)
->
list
[
KVCacheBlock
]:
num_tokens
:
int
)
->
list
[
KVCacheBlock
]:
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment