[Bugfix] Fix Whisper/encoder-decoder GPU memory leak (#32789)

Signed-off-by: NickLucche <nlucches@redhat.com>

[Bugfix] Fix Whisper/encoder-decoder GPU memory leak (#32789)
Signed-off-by: NickLucche <nlucches@redhat.com>
ea6102b8 · Nicolò Lucchesi · GitHub · 328cbb27 · ea6102b8 · ea6102b8
Unverified Commit ea6102b8 authored Jan 22, 2026 by Nicolò Lucchesi Committed by GitHub Jan 22, 2026
Showing with 54 additions and 5 deletions

tests/models/multimodal/generation/test_whisper.py tests/models/multimodal/generation/test_whisper.py +43 -0

vllm/v1/core/encoder_cache_manager.py vllm/v1/core/encoder_cache_manager.py +11 -5

No files found.
--- a/tests/models/multimodal/generation/test_whisper.py
+++ b/tests/models/multimodal/generation/test_whisper.py
@@ -176,3 +176,46 @@ def test_models_distributed(
        distributed_executor_backend=distributed_executor_backend,
        enforce_eager=False,
    )
+@pytest.mark.core_model
+@pytest.mark.parametrize("model", ["openai/whisper-large-v3-turbo"])
+def test_encoder_cache_cleanup(
+    vllm_runner,
+    model: str,
+    input_audios,
+    monkeypatch,
+) -> None:
+    """Test that encoder cache is properly cleaned up after requests complete.
+    This is a regression test for a bug where encoder cache entries were freed
+    in the same scheduling step in which they were allocated, before the model
+    could use them.
+    """
+    # Set single-process mode to access the model runner's encoder cache directly
+    monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
+    check_model_available(model)
+    with vllm_runner(
+        model,
+        dtype="half",
+        max_model_len=448,
+        tensor_parallel_size=1,
+        limit_mm_per_prompt={"audio": 2},
+        enforce_eager=True,
+    ) as vllm_model:
+        engine_core = vllm_model.llm.llm_engine.engine_core.engine_core
+        model_runner = engine_core.model_executor.driver_worker.worker.model_runner
+        encoder_cache = model_runner.encoder_cache
+        # Run multiple sequential requests to ensure cache is properly managed
+        for vllm_prompts, _, audios in input_audios:
+            vllm_model.generate_greedy(vllm_prompts, max_tokens=50, audios=audios)
+        # After all requests complete, encoder cache should be empty
+        cache_size = len(encoder_cache)
+        assert cache_size == 0, (
+            f"Encoder cache should be empty after all requests complete, "
+            f"but has {cache_size} entries. This indicates encoder cache "
+            f"entries are not being properly freed."
+        )
--- a/vllm/v1/core/encoder_cache_manager.py
+++ b/vllm/v1/core/encoder_cache_manager.py
@@ -357,7 +357,8 @@ class EncoderDecoderCacheManager(EncoderCacheManager):
    def __init__(self, cache_size: int):
        self.cache_size = cache_size
        self.num_free_slots = cache_size
-        self.freed: list[str] = []
+        self.allocated: list[str] = []
+        self.to_free: list[str] = []
    def check_and_update_cache(self, request: Request, input_id: int) -> bool:
        return False
@@ -383,7 +384,7 @@ class EncoderDecoderCacheManager(EncoderCacheManager):
        self.num_free_slots -= num_encoder_embeds
        mm_hash = request.mm_features[input_id].identifier
-        self.freed.append(mm_hash)
+        self.allocated.append(mm_hash)
    def free(self, request: Request) -> None:
        for input_id in range(len(request.mm_features)):
@@ -393,9 +394,14 @@ class EncoderDecoderCacheManager(EncoderCacheManager):
        return set(range(len(request.mm_features)))
    def get_freed_mm_hashes(self) -> list[str]:
-        freed = self.freed
+        # As the encoder cache is not used for enc-dec models, we can free them here.
-        self.freed = []
+        # The actual free happens in the runner, *before* the model is executed.
-        return freed
+        # Therefore, `to_free` acts as a buffer to free the entries only after the
+        # model is executed, mimicking the state transition of `EncoderCacheManager`.
+        to_free = self.to_free
+        self.to_free = self.allocated
+        self.allocated = []
+        return to_free
    def free_encoder_input(self, request: Request, input_id: int) -> None:
        num_encoder_embeds = request.get_num_encoder_embeds(input_id)