Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
ea6102b8
Unverified
Commit
ea6102b8
authored
Jan 22, 2026
by
Nicolò Lucchesi
Committed by
GitHub
Jan 22, 2026
Browse files
[Bugfix] Fix Whisper/encoder-decoder GPU memory leak (#32789)
Signed-off-by:
NickLucche
<
nlucches@redhat.com
>
parent
328cbb27
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
54 additions
and
5 deletions
+54
-5
tests/models/multimodal/generation/test_whisper.py
tests/models/multimodal/generation/test_whisper.py
+43
-0
vllm/v1/core/encoder_cache_manager.py
vllm/v1/core/encoder_cache_manager.py
+11
-5
No files found.
tests/models/multimodal/generation/test_whisper.py
View file @
ea6102b8
...
@@ -176,3 +176,46 @@ def test_models_distributed(
...
@@ -176,3 +176,46 @@ def test_models_distributed(
distributed_executor_backend
=
distributed_executor_backend
,
distributed_executor_backend
=
distributed_executor_backend
,
enforce_eager
=
False
,
enforce_eager
=
False
,
)
)
@
pytest
.
mark
.
core_model
@
pytest
.
mark
.
parametrize
(
"model"
,
[
"openai/whisper-large-v3-turbo"
])
def
test_encoder_cache_cleanup
(
vllm_runner
,
model
:
str
,
input_audios
,
monkeypatch
,
)
->
None
:
"""Test that encoder cache is properly cleaned up after requests complete.
This is a regression test for a bug where encoder cache entries were marked
as freed in the same scheduling step they were allocated, before the model had
executed, so they were never actually released (GPU memory leak).
"""
# Set single-process mode to access the model runner's encoder cache directly
monkeypatch
.
setenv
(
"VLLM_ENABLE_V1_MULTIPROCESSING"
,
"0"
)
check_model_available
(
model
)
with
vllm_runner
(
model
,
dtype
=
"half"
,
max_model_len
=
448
,
tensor_parallel_size
=
1
,
limit_mm_per_prompt
=
{
"audio"
:
2
},
enforce_eager
=
True
,
)
as
vllm_model
:
engine_core
=
vllm_model
.
llm
.
llm_engine
.
engine_core
.
engine_core
model_runner
=
engine_core
.
model_executor
.
driver_worker
.
worker
.
model_runner
encoder_cache
=
model_runner
.
encoder_cache
# Run multiple sequential requests to ensure cache is properly managed
for
vllm_prompts
,
_
,
audios
in
input_audios
:
vllm_model
.
generate_greedy
(
vllm_prompts
,
max_tokens
=
50
,
audios
=
audios
)
# After all requests complete, encoder cache should be empty
cache_size
=
len
(
encoder_cache
)
assert
cache_size
==
0
,
(
f
"Encoder cache should be empty after all requests complete, "
f
"but has
{
cache_size
}
entries. This indicates encoder cache "
f
"entries are not being properly freed."
)
vllm/v1/core/encoder_cache_manager.py
View file @
ea6102b8
...
@@ -357,7 +357,8 @@ class EncoderDecoderCacheManager(EncoderCacheManager):
...
@@ -357,7 +357,8 @@ class EncoderDecoderCacheManager(EncoderCacheManager):
def
__init__
(
self
,
cache_size
:
int
):
def
__init__
(
self
,
cache_size
:
int
):
self
.
cache_size
=
cache_size
self
.
cache_size
=
cache_size
self
.
num_free_slots
=
cache_size
self
.
num_free_slots
=
cache_size
self
.
freed
:
list
[
str
]
=
[]
self
.
allocated
:
list
[
str
]
=
[]
self
.
to_free
:
list
[
str
]
=
[]
def
check_and_update_cache
(
self
,
request
:
Request
,
input_id
:
int
)
->
bool
:
def
check_and_update_cache
(
self
,
request
:
Request
,
input_id
:
int
)
->
bool
:
return
False
return
False
...
@@ -383,7 +384,7 @@ class EncoderDecoderCacheManager(EncoderCacheManager):
...
@@ -383,7 +384,7 @@ class EncoderDecoderCacheManager(EncoderCacheManager):
self
.
num_free_slots
-=
num_encoder_embeds
self
.
num_free_slots
-=
num_encoder_embeds
mm_hash
=
request
.
mm_features
[
input_id
].
identifier
mm_hash
=
request
.
mm_features
[
input_id
].
identifier
self
.
fre
ed
.
append
(
mm_hash
)
self
.
allocat
ed
.
append
(
mm_hash
)
def
free
(
self
,
request
:
Request
)
->
None
:
def
free
(
self
,
request
:
Request
)
->
None
:
for
input_id
in
range
(
len
(
request
.
mm_features
)):
for
input_id
in
range
(
len
(
request
.
mm_features
)):
...
@@ -393,9 +394,14 @@ class EncoderDecoderCacheManager(EncoderCacheManager):
...
@@ -393,9 +394,14 @@ class EncoderDecoderCacheManager(EncoderCacheManager):
return
set
(
range
(
len
(
request
.
mm_features
)))
return
set
(
range
(
len
(
request
.
mm_features
)))
def
get_freed_mm_hashes
(
self
)
->
list
[
str
]:
def
get_freed_mm_hashes
(
self
)
->
list
[
str
]:
freed
=
self
.
freed
# As encoder cache is not used for enc-dec models, we can free the entries here
self
.
freed
=
[]
# The actual free happens in the runner, *before* the model is executed.
return
freed
# Therefore, `to_free` acts as a buffer to free the entries only after the
# model is executed, mimicking the state transition of `EncoderCacheManager`.
to_free
=
self
.
to_free
self
.
to_free
=
self
.
allocated
self
.
allocated
=
[]
return
to_free
def
free_encoder_input
(
self
,
request
:
Request
,
input_id
:
int
)
->
None
:
def
free_encoder_input
(
self
,
request
:
Request
,
input_id
:
int
)
->
None
:
num_encoder_embeds
=
request
.
get_num_encoder_embeds
(
input_id
)
num_encoder_embeds
=
request
.
get_num_encoder_embeds
(
input_id
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment