Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
dd5ede44
Unverified
Commit
dd5ede44
authored
Feb 13, 2025
by
Roger Wang
Committed by
GitHub
Feb 13, 2025
Browse files
[V1] Consolidate MM cache size to vllm.envs (#13239)
parent
8c32b08a
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
18 additions
and
11 deletions
+18
-11
vllm/envs.py
vllm/envs.py
+9
-2
vllm/multimodal/registry.py
vllm/multimodal/registry.py
+2
-4
vllm/v1/engine/mm_input_cache.py
vllm/v1/engine/mm_input_cache.py
+7
-5
No files found.
vllm/envs.py
View file @
dd5ede44
...
...
@@ -55,6 +55,7 @@ if TYPE_CHECKING:
VLLM_IMAGE_FETCH_TIMEOUT
:
int
=
5
VLLM_VIDEO_FETCH_TIMEOUT
:
int
=
30
VLLM_AUDIO_FETCH_TIMEOUT
:
int
=
10
VLLM_MM_INPUT_CACHE_SIZE
:
int
=
256
VLLM_TARGET_DEVICE
:
str
=
"cuda"
MAX_JOBS
:
Optional
[
str
]
=
None
NVCC_THREADS
:
Optional
[
str
]
=
None
...
...
@@ -401,15 +402,21 @@ environment_variables: Dict[str, Callable[[], Any]] = {
lambda
:
int
(
os
.
getenv
(
"VLLM_IMAGE_FETCH_TIMEOUT"
,
"5"
)),
# Timeout for fetching videos when serving multimodal models
# Default is
15
seconds
# Default is
30
seconds
"VLLM_VIDEO_FETCH_TIMEOUT"
:
lambda
:
int
(
os
.
getenv
(
"VLLM_VIDEO_FETCH_TIMEOUT"
,
"
15
"
)),
lambda
:
int
(
os
.
getenv
(
"VLLM_VIDEO_FETCH_TIMEOUT"
,
"
30
"
)),
# Timeout for fetching audio when serving multimodal models
# Default is 10 seconds
"VLLM_AUDIO_FETCH_TIMEOUT"
:
lambda
:
int
(
os
.
getenv
(
"VLLM_AUDIO_FETCH_TIMEOUT"
,
"10"
)),
# Cache size for multimodal feature/input cache for multimodal models
# in unit of number of multimodal data items (e.g. image, video, audio).
# Default is 256 multimodal data items.
"VLLM_MM_INPUT_CACHE_SIZE"
:
lambda
:
int
(
os
.
getenv
(
"VLLM_MM_INPUT_CACHE_SIZE"
,
"256"
)),
# Path to the XLA persistent cache directory.
# Only used for XLA devices such as TPUs.
"VLLM_XLA_CACHE_PATH"
:
...
...
vllm/multimodal/registry.py
View file @
dd5ede44
...
...
@@ -8,6 +8,7 @@ from typing import (TYPE_CHECKING, Any, Dict, Generic, Mapping, Optional,
import
torch.nn
as
nn
from
vllm.envs
import
VLLM_MM_INPUT_CACHE_SIZE
from
vllm.inputs
import
InputProcessingContext
from
vllm.logger
import
init_logger
from
vllm.transformers_utils.tokenizer
import
AnyTokenizer
...
...
@@ -28,9 +29,6 @@ if TYPE_CHECKING:
logger
=
init_logger
(
__name__
)
# TODO: Tune the MM cache size
MM_CACHE_SIZE
=
256
N
=
TypeVar
(
"N"
,
bound
=
Type
[
nn
.
Module
])
_I
=
TypeVar
(
"_I"
,
bound
=
BaseProcessingInfo
)
_I_co
=
TypeVar
(
"_I_co"
,
bound
=
BaseProcessingInfo
,
covariant
=
True
)
...
...
@@ -121,7 +119,7 @@ class MultiModalRegistry:
self
.
_limits_by_model
=
_MultiModalLimits
()
self
.
_processing_cache
=
ProcessingCache
(
MM
_CACHE_SIZE
)
self
.
_processing_cache
=
ProcessingCache
(
VLLM_MM_INPUT
_CACHE_SIZE
)
def
register_plugin
(
self
,
plugin
:
MultiModalPlugin
)
->
None
:
"""
...
...
vllm/v1/engine/mm_input_cache.py
View file @
dd5ede44
...
...
@@ -3,6 +3,7 @@
from
typing
import
Any
,
Dict
,
List
,
Optional
from
vllm.config
import
ModelConfig
from
vllm.envs
import
VLLM_MM_INPUT_CACHE_SIZE
from
vllm.logger
import
init_logger
from
vllm.multimodal
import
(
MULTIMODAL_REGISTRY
,
MultiModalDataDict
,
MultiModalKwargs
,
MultiModalRegistry
)
...
...
@@ -28,9 +29,8 @@ logger = init_logger(__name__)
# client (=P0) and server (=P1) processes.
# Both Client and Server must use the same cache size
# (to perform mirrored caching)
# TODO: Tune the MM cache size
MM_CACHE_SIZE
=
256
# (to perform mirrored caching). This cache size is set by the environment
# variable VLLM_MM_INPUT_CACHE_SIZE.
# TODO(ywang96): Deprecate this class once all multimodal models migrate to use
...
...
@@ -50,7 +50,8 @@ class MMInputCacheClient:
# Init cache
self
.
use_cache
=
not
model_config
.
disable_mm_preprocessor_cache
self
.
mm_cache
=
LRUCache
[
str
,
MultiModalKwargs
](
MM_CACHE_SIZE
)
self
.
mm_cache
=
LRUCache
[
str
,
MultiModalKwargs
](
VLLM_MM_INPUT_CACHE_SIZE
)
# DEBUG: Set to None to disable
self
.
mm_debug_cache_hit_ratio_steps
=
None
...
...
@@ -127,7 +128,8 @@ class MMInputCacheServer:
def
__init__
(
self
,
model_config
):
self
.
use_cache
=
not
model_config
.
disable_mm_preprocessor_cache
self
.
mm_cache
=
LRUCache
[
str
,
MultiModalKwargs
](
MM_CACHE_SIZE
)
self
.
mm_cache
=
LRUCache
[
str
,
MultiModalKwargs
](
VLLM_MM_INPUT_CACHE_SIZE
)
def
get_and_update
(
self
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment