Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
5f52a846
Unverified
Commit
5f52a846
authored
Jun 18, 2025
by
Russell Bryant
Committed by
GitHub
Jun 18, 2025
Browse files
[V1] Add API docs for EncoderCacheManager (#19294)
Signed-off-by:
Russell Bryant
<
rbryant@redhat.com
>
parent
d4629dc4
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
106 additions
and
2 deletions
+106
-2
vllm/v1/core/encoder_cache_manager.py
vllm/v1/core/encoder_cache_manager.py
+106
-2
No files found.
vllm/v1/core/encoder_cache_manager.py
View file @
5f52a846
...
...
@@ -14,6 +14,39 @@ logger = init_logger(__name__)
class
EncoderCacheManager
:
"""Manages caching of encoder outputs for multimodal models in vLLM V1.
The EncoderCacheManager handles the lifecycle of multimodal encoder outputs
(such as vision embeddings from images) during request processing. It
provides memory-aware caching to avoid recomputing encoder outputs when the
same multimodal inputs appear in different stages of request processing.
This manager is particularly important for:
- Vision-language models (e.g., LLaVA) where image encoder outputs are
cached
- Any multimodal model where encoder computation is expensive and
cacheable
The cache operates at the granularity of individual multimodal input items
within requests, allowing for fine-grained memory management and enabling
chunked processing of multimodal inputs.
Note that no caching is shared between requests at this time. If the same
input is used across multiple requests, it will be reprocessed for each
request.
Args:
cache_size: Limit the size of the cache, measured by the number of
tokens from the input sequence.
Attributes:
cache_size: Total cache capacity in encoder tokens
num_free_slots: Current available cache capacity in encoder tokens
cached: Mapping from request_id to set of cached input_ids for that
request
freed: List of (request_id, input_id) pairs that were recently freed.
This is cleared after every call to get_freed_ids().
"""
def
__init__
(
self
,
cache_size
:
int
):
self
.
cache_size
=
cache_size
...
...
@@ -24,14 +57,48 @@ class EncoderCacheManager:
self
.
freed
:
list
[
tuple
[
str
,
int
]]
=
[]
def has_cache(self, request: Request, input_id: int) -> bool:
    """Report whether a multimodal input of a request is tracked as cached.

    Args:
        request: The request that owns the multimodal input; only its
            ``request_id`` is read.
        input_id: Index of the multimodal input within the request.

    Returns:
        True when ``request.request_id`` has an entry in ``self.cached``
        and ``input_id`` is a member of that entry, False otherwise.
    """
    cached_inputs = self.cached.get(request.request_id)
    if cached_inputs is None:
        # Nothing has been recorded for this request at all.
        return False
    return input_id in cached_inputs
def can_allocate(self, request: Request, input_id: int) -> bool:
    """Tell whether a multimodal input currently fits in the free cache space.

    Args:
        request: The request that owns the multimodal input; it is asked
            for the input's size via ``get_num_encoder_tokens``.
        input_id: Index of the multimodal input within the request.

    Returns:
        True when the number of encoder tokens reported for ``input_id``
        does not exceed ``self.num_free_slots``, False otherwise. No state
        is modified by this check.
    """
    needed_tokens = request.get_num_encoder_tokens(input_id)
    free_tokens = self.num_free_slots
    return needed_tokens <= free_tokens
def
allocate
(
self
,
request
:
Request
,
input_id
:
int
)
->
None
:
"""Allocate cache space for a multimodal input's encoder output.
This method reserves cache space for storing the encoder output of
the specified multimodal input. The actual encoder output storage
happens in the model runner, but this method ensures the cache
manager tracks the allocation.
Args:
request: The request containing the multimodal input
input_id: Index of the multimodal input within the request
Note:
This method assumes can_allocate() returned True for the same
request and input_id. It will reduce available cache space.
"""
req_id
=
request
.
request_id
if
req_id
not
in
self
.
cached
:
self
.
cached
[
req_id
]
=
set
()
...
...
@@ -39,10 +106,30 @@ class EncoderCacheManager:
self
.
num_free_slots
-=
request
.
get_num_encoder_tokens
(
input_id
)
def get_cached_input_ids(self, request: Request) -> set[int]:
    """Look up the cached multimodal input IDs recorded for a request.

    Args:
        request: The request to query; only its ``request_id`` is read.

    Returns:
        The set stored in ``self.cached`` for this request (the stored
        object itself, not a copy), or a new empty set when the request
        has no entry.
    """
    req_id = request.request_id
    if req_id in self.cached:
        return self.cached[req_id]
    return set()
def
free_encoder_input
(
self
,
request
:
Request
,
input_id
:
int
)
->
None
:
"""Free a single encoder input id for the request."""
"""Free cache space for a single multimodal input's encoder output.
This method is called when:
- The encoder output has been fully consumed by the decoder and is
no longer needed (e.g., in vision-language models after image
tokens are processed)
- A request is being cancelled or aborted
Args:
request: The request containing the multimodal input
input_id: Index of the multimodal input to free from cache
"""
req_id
=
request
.
request_id
if
req_id
not
in
self
.
cached
:
return
...
...
@@ -54,12 +141,29 @@ class EncoderCacheManager:
self
.
freed
.
append
((
req_id
,
input_id
))
def free(self, request: Request) -> None:
    """Free all cached encoder outputs for a request.

    Every input ID currently reported by ``get_cached_input_ids`` for the
    request is released through ``free_encoder_input``. A request with no
    cached inputs is a no-op.

    Args:
        request: The request whose encoder outputs should be freed.
    """
    # Iterate over a snapshot: get_cached_input_ids may hand back the live
    # set stored on the manager, and free_encoder_input is expected to
    # remove entries from it while we loop.
    input_ids = self.get_cached_input_ids(request).copy()
    for input_id in input_ids:
        self.free_encoder_input(request, input_id)
def get_freed_ids(self) -> list[tuple[str, int]]:
    """Hand back the accumulated freed entries and reset the record.

    Returns:
        The list object previously held in ``self.freed``, containing
        ``(request_id, input_id)`` tuples. ``self.freed`` is rebound to a
        new empty list, so a second call with no frees in between
        returns an empty list.
    """
    # Swap in a fresh list rather than clearing, so the returned list is
    # not emptied underneath the caller.
    released, self.freed = self.freed, []
    return released
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment