Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
79b6ec6a
Unverified
Commit
79b6ec6a
authored
Feb 01, 2026
by
Cyrus Leung
Committed by
GitHub
Jan 31, 2026
Browse files
[Bugfix] Fix inconsistent handling of cache reset (#33481)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
parent
d6416fdd
Changes
7
Hide whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
35 additions
and
11 deletions
+35
-11
docs/benchmarking/sweeps.md
docs/benchmarking/sweeps.md
+1
-1
vllm/benchmarks/sweep/server.py
vllm/benchmarks/sweep/server.py
+9
-5
vllm/entrypoints/openai/engine/serving.py
vllm/entrypoints/openai/engine/serving.py
+0
-4
vllm/v1/engine/async_llm.py
vllm/v1/engine/async_llm.py
+1
-0
vllm/v1/worker/gpu/mm/encoder_runner.py
vllm/v1/worker/gpu/mm/encoder_runner.py
+16
-0
vllm/v1/worker/gpu/model_runner.py
vllm/v1/worker/gpu/model_runner.py
+4
-1
vllm/v1/worker/gpu_model_runner.py
vllm/v1/worker/gpu_model_runner.py
+4
-0
No files found.
docs/benchmarking/sweeps.md
View file @
79b6ec6a
...
...
@@ -82,7 +82,7 @@ vllm bench sweep serve \
You can use `--dry-run` to preview the commands to be run.
We only start the server once for each `--serve-params`, and keep it running for multiple `--bench-params`.
Between each benchmark run, we call the `/reset_prefix_cache` and `/reset_mm_cache` endpoints to get a clean slate for the next run.
Between each benchmark run, we call all `/reset_*_cache` endpoints to get a clean slate for the next run.
In case you are using a custom `--serve-cmd`, you can override the commands used for resetting the state by setting `--after-bench-cmd`.
!!! note
...
...
vllm/benchmarks/sweep/server.py
View file @
79b6ec6a
...
...
@@ -12,6 +12,12 @@ from typing_extensions import Self
class
ServerProcess
:
VLLM_RESET_CACHE_ENDPOINTS
=
[
"/reset_prefix_cache"
,
"/reset_mm_cache"
,
"/reset_encoder_cache"
,
]
def
__init__
(
self
,
server_cmd
:
list
[
str
],
...
...
@@ -120,11 +126,9 @@ class ServerProcess:
server_address
=
self
.
_get_vllm_server_address
()
print
(
f
"Resetting caches at
{
server_address
}
"
)
res
=
requests
.
post
(
f
"
{
server_address
}
/reset_prefix_cache"
)
res
.
raise_for_status
()
res
=
requests
.
post
(
f
"
{
server_address
}
/reset_mm_cache"
)
res
.
raise_for_status
()
for
endpoint
in
self
.
VLLM_RESET_CACHE_ENDPOINTS
:
res
=
requests
.
post
(
server_address
+
endpoint
)
res
.
raise_for_status
()
elif
server_cmd
[
0
].
endswith
(
"infinity_emb"
):
if
"--vector-disk-cache"
in
server_cmd
:
raise
NotImplementedError
(
...
...
vllm/entrypoints/openai/engine/serving.py
View file @
79b6ec6a
...
...
@@ -286,10 +286,6 @@ class OpenAIServing:
raise
TypeError
(
f
"
{
reasoning_parser_name
=
}
has not been registered"
)
from
e
return
parser
async
def
reset_mm_cache
(
self
)
->
None
:
self
.
input_processor
.
clear_mm_cache
()
await
self
.
engine_client
.
reset_mm_cache
()
async
def
beam_search
(
self
,
prompt
:
PromptType
,
...
...
vllm/v1/engine/async_llm.py
View file @
79b6ec6a
...
...
@@ -741,6 +741,7 @@ class AsyncLLM(EngineClient):
if
clear_cache
:
await
self
.
reset_prefix_cache
()
await
self
.
reset_mm_cache
()
await
self
.
reset_encoder_cache
()
async
def
resume_generation
(
self
)
->
None
:
"""Resume generation after :meth:`pause_generation`."""
...
...
vllm/v1/worker/gpu/mm/encoder_runner.py
View file @
79b6ec6a
...
...
@@ -31,6 +31,22 @@ class EncoderRunner:
self
.
req_id_to_mm_features
:
dict
[
str
,
list
[
MultiModalFeatureSpec
]]
=
{}
self
.
encoder_cache
:
dict
[
str
,
torch
.
Tensor
]
=
{}
def
reset_mm_cache
(
self
)
->
None
:
"""
Clear the multi-modal cache that was used during profiling,
but no longer needed during inference.
"""
# TODO: Implement MM budget for encoder dummy run
pass
def
reset_encoder_cache
(
self
)
->
None
:
"""Clear the GPU-side encoder cache storing vision embeddings.
This should be called when model weights are updated to ensure
stale embeddings computed with old weights are not reused.
"""
self
.
encoder_cache
.
clear
()
def
add_request
(
self
,
req_id
:
str
,
mm_features
:
list
[
MultiModalFeatureSpec
]):
self
.
req_id_to_mm_features
[
req_id
]
=
mm_features
...
...
vllm/v1/worker/gpu/model_runner.py
View file @
79b6ec6a
...
...
@@ -339,7 +339,10 @@ class GPUModelRunner(LoRAModelRunnerMixin):
gc
.
collect
()
def
reset_mm_cache
(
self
)
->
None
:
pass
self
.
encoder_runner
.
reset_mm_cache
()
def
reset_encoder_cache
(
self
)
->
None
:
self
.
encoder_runner
.
reset_encoder_cache
()
def
_get_num_input_tokens
(
self
,
num_scheduled_tokens
:
int
)
->
int
:
# SP is not supported yet.
...
...
vllm/v1/worker/gpu_model_runner.py
View file @
79b6ec6a
...
...
@@ -717,6 +717,10 @@ class GPUModelRunner(
self
.
effective_drafter_max_model_len
=
self
.
max_model_len
def
reset_mm_cache
(
self
)
->
None
:
"""
Clear the multi-modal cache that was used during profiling,
but no longer needed during inference.
"""
if
self
.
mm_budget
:
self
.
mm_budget
.
reset_cache
()
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment