[V1] Add API docs for EncoderCacheManager (#19294)

Signed-off-by: Russell Bryant <rbryant@redhat.com>

[V1] Add API docs for EncoderCacheManager (#19294)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
5f52a846 · Russell Bryant · GitHub · d4629dc4 · 5f52a846
Unverified commit 5f52a846, authored Jun 18, 2025 by Russell Bryant; committed by GitHub on Jun 18, 2025.
Hide whitespace changes
Inline Side-by-side

Showing with 106 additions and 2 deletions

vllm/v1/core/encoder_cache_manager.py vllm/v1/core/encoder_cache_manager.py +106 -2

No files found.
--- a/vllm/v1/core/encoder_cache_manager.py
+++ b/vllm/v1/core/encoder_cache_manager.py
@@ -14,6 +14,39 @@ logger = init_logger(__name__)


 class EncoderCacheManager:
+    """Manages caching of encoder outputs for multimodal models in vLLM V1.
+
+    The EncoderCacheManager handles the lifecycle of multimodal encoder outputs
+    (such as vision embeddings from images) during request processing. It
+    provides memory-aware caching to avoid recomputing encoder outputs when the
+    same multimodal inputs appear in different stages of request processing.
+
+    This manager is particularly important for:
+    - Vision-language models (e.g., LLaVA) where image encoder outputs are
+      cached
+    - Any multimodal model where encoder computation is expensive and
+      cacheable
+
+    The cache operates at the granularity of individual multimodal input items
+    within requests, allowing for fine-grained memory management and enabling
+    chunked processing of multimodal inputs.
+
+    Note that no caching is shared between requests at this time. If the same
+    input is used across multiple requests, it will be reprocessed for each
+    request.
+
+    Args:
+        cache_size: Maximum capacity of the cache, measured in encoder
+                    tokens.
+
+    Attributes:
+        cache_size: Total cache capacity in encoder tokens
+        num_free_slots: Current available cache capacity in encoder tokens
+        cached: Mapping from request_id to set of cached input_ids for that
+                request
+        freed: List of (request_id, input_id) pairs freed since the last call
+               to get_freed_ids(), which resets it to an empty list.
+    """

    def __init__(self, cache_size: int):
        self.cache_size = cache_size
@@ -24,14 +57,48 @@ class EncoderCacheManager:
        self.freed: list[tuple[str, int]] = []

    def has_cache(self, request: Request, input_id: int) -> bool:
+        """Check if encoder output for a specific multimodal input is cached.
+
+        Args:
+            request: The request containing the multimodal input
+            input_id: Index of the multimodal input within the request
+
+        Returns:
+            True if the encoder output for this input is already cached
+        """
        req_id = request.request_id
        return req_id in self.cached and input_id in self.cached[req_id]

    def can_allocate(self, request: Request, input_id: int) -> bool:
+        """Check if there's sufficient cache space for a multimodal input.
+
+        Args:
+            request: The request containing the multimodal input
+            input_id: Index of the multimodal input within the request
+
+        Returns:
+            True if there's enough free cache space to store the encoder output
+            for this multimodal input
+        """
        num_tokens = request.get_num_encoder_tokens(input_id)
        return num_tokens <= self.num_free_slots

    def allocate(self, request: Request, input_id: int) -> None:
+        """Allocate cache space for a multimodal input's encoder output.
+
+        This method reserves cache space for storing the encoder output of
+        the specified multimodal input. The actual encoder output storage
+        happens in the model runner, but this method ensures the cache
+        manager tracks the allocation.
+
+        Args:
+            request: The request containing the multimodal input
+            input_id: Index of the multimodal input within the request
+
+        Note:
+            This method assumes can_allocate() returned True for the same
+            request and input_id. It will reduce available cache space.
+        """
        req_id = request.request_id
        if req_id not in self.cached:
            self.cached[req_id] = set()
@@ -39,10 +106,30 @@ class EncoderCacheManager:
        self.num_free_slots -= request.get_num_encoder_tokens(input_id)

    def get_cached_input_ids(self, request: Request) -> set[int]:
+        """Get all cached multimodal input IDs for a request.
+
+        Args:
+            request: The request to query
+
+        Returns:
+            Set of input_ids that have cached encoder outputs for this request.
+            Returns an empty set if no inputs are cached for this request.
+        """
        return self.cached.get(request.request_id, set())

    def free_encoder_input(self, request: Request, input_id: int) -> None:
-        """Free a single encoder input id for the request."""
+        """Free cache space for a single multimodal input's encoder output.
+
+        This method is called when:
+        - The encoder output has been fully consumed by the decoder and is
+          no longer needed (e.g., in vision-language models after image
+          tokens are processed)
+        - A request is being cancelled or aborted
+
+        Args:
+            request: The request containing the multimodal input
+            input_id: Index of the multimodal input to free from cache
+        """
        req_id = request.request_id
        if req_id not in self.cached:
            return
@@ -54,12 +141,29 @@ class EncoderCacheManager:
        self.freed.append((req_id, input_id))

    def free(self, request: Request) -> None:
-        """Free all cached input ids for the request."""
+        """Free all cached encoder outputs for a request.
+
+        This method is typically called when a request is finished, cancelled,
+        or aborted, and all its encoder outputs should be freed from cache.
+
+        Args:
+            request: The request whose encoder outputs should be freed
+        """
        input_ids = self.get_cached_input_ids(request).copy()
        for input_id in input_ids:
            self.free_encoder_input(request, input_id)

    def get_freed_ids(self) -> list[tuple[str, int]]:
+        """Get and clear the list of recently freed encoder cache entries.
+
+        This method returns all encoder cache entries that were freed since
+        the last call to this method. It's used by the scheduler to notify
+        workers about which encoder outputs can be removed from their caches.
+
+        Returns:
+            List of (request_id, input_id) tuples that were freed since the
+            last call. The internal freed list is reset to empty by this call.
+        """
        freed = self.freed
        self.freed = []
        return freed