Unverified commit 10214b69, authored by Anand Roy, committed by GitHub
Browse files

[FEATURE]: Use pydantic validation in `multimodal.py` config (#26629)


Signed-off-by: Anand Roy <86306690+andycandy@users.noreply.github.com>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
parent 4a61950f
...@@ -3,10 +3,9 @@ ...@@ -3,10 +3,9 @@
import hashlib import hashlib
from collections.abc import Mapping from collections.abc import Mapping
from dataclasses import field
from typing import Any, Literal, TypeAlias from typing import Any, Literal, TypeAlias
from pydantic import ConfigDict, Field, field_validator from pydantic import ConfigDict, Field, field_validator, model_validator
from pydantic.dataclasses import dataclass from pydantic.dataclasses import dataclass
from vllm.config.utils import config from vllm.config.utils import config
...@@ -55,7 +54,7 @@ DummyOptions: TypeAlias = ( ...@@ -55,7 +54,7 @@ DummyOptions: TypeAlias = (
class MultiModalConfig: class MultiModalConfig:
"""Controls the behavior of multimodal models.""" """Controls the behavior of multimodal models."""
limit_per_prompt: dict[str, DummyOptions] = field(default_factory=dict) limit_per_prompt: dict[str, DummyOptions] = Field(default_factory=dict)
"""The maximum number of input items and options allowed per """The maximum number of input items and options allowed per
prompt for each modality. prompt for each modality.
Defaults to 999 for each modality. Defaults to 999 for each modality.
...@@ -71,7 +70,7 @@ class MultiModalConfig: ...@@ -71,7 +70,7 @@ class MultiModalConfig:
{"image": 16, "video": {"count": 1, "num_frames": 32, "width": 512, {"image": 16, "video": {"count": 1, "num_frames": 32, "width": 512,
"height": 512}} "height": 512}}
""" """
media_io_kwargs: dict[str, dict[str, Any]] = field(default_factory=dict) media_io_kwargs: dict[str, dict[str, Any]] = Field(default_factory=dict)
"""Additional args passed to process media inputs, keyed by modalities. """Additional args passed to process media inputs, keyed by modalities.
For example, to set num_frames for video, set For example, to set num_frames for video, set
`--media-io-kwargs '{"video": {"num_frames": 40} }'`""" `--media-io-kwargs '{"video": {"num_frames": 40} }'`"""
...@@ -84,7 +83,7 @@ class MultiModalConfig: ...@@ -84,7 +83,7 @@ class MultiModalConfig:
For example, for Phi-3-Vision: For example, for Phi-3-Vision:
`{"num_crops": 4}`.""" `{"num_crops": 4}`."""
mm_processor_cache_gb: float = 4 mm_processor_cache_gb: float = Field(default=4, ge=0)
"""The size (in GiB) of the multi-modal processor cache, which is used to """The size (in GiB) of the multi-modal processor cache, which is used to
avoid re-processing past multi-modal inputs. avoid re-processing past multi-modal inputs.
...@@ -96,7 +95,7 @@ class MultiModalConfig: ...@@ -96,7 +95,7 @@ class MultiModalConfig:
mm_processor_cache_type: MMCacheType = "lru" mm_processor_cache_type: MMCacheType = "lru"
"""Type of cache to use for the multi-modal preprocessor/mapper. If `shm`, """Type of cache to use for the multi-modal preprocessor/mapper. If `shm`,
use shared memory FIFO cache. If `lru`, use mirrored LRU cache.""" use shared memory FIFO cache. If `lru`, use mirrored LRU cache."""
mm_shm_cache_max_object_size_mb: int = 128 mm_shm_cache_max_object_size_mb: int = Field(default=128, ge=0)
"""Size limit (in MiB) for each object stored in the multi-modal processor """Size limit (in MiB) for each object stored in the multi-modal processor
shared memory cache. Only effective when `mm_processor_cache_type` is shared memory cache. Only effective when `mm_processor_cache_type` is
`"shm"`.""" `"shm"`."""
...@@ -123,7 +122,7 @@ class MultiModalConfig: ...@@ -123,7 +122,7 @@ class MultiModalConfig:
This reduces engine startup time but shifts the responsibility to users for This reduces engine startup time but shifts the responsibility to users for
estimating the peak memory usage of the activation of multimodal encoder and estimating the peak memory usage of the activation of multimodal encoder and
embedding cache.""" embedding cache."""
video_pruning_rate: float | None = None video_pruning_rate: float | None = Field(default=None, ge=0.0, lt=1.0)
"""Sets pruning rate for video pruning via Efficient Video Sampling. """Sets pruning rate for video pruning via Efficient Video Sampling.
Value sits in range [0;1) and determines fraction of media tokens Value sits in range [0;1) and determines fraction of media tokens
from each video to be pruned. from each video to be pruned.
...@@ -149,6 +148,18 @@ class MultiModalConfig: ...@@ -149,6 +148,18 @@ class MultiModalConfig:
value[k] = BaseDummyOptions(**v) value[k] = BaseDummyOptions(**v)
return value return value
@model_validator(mode="after")
def _validate_multimodal_config(self):
    """Reject a customised shm object-size limit when the shm cache is not in use.

    Runs after field validation. The config is accepted as-is when
    `mm_processor_cache_type` is `"shm"`, or when
    `mm_shm_cache_max_object_size_mb` still equals the class-level default.

    Returns:
        The validated config instance (`self`).

    Raises:
        ValueError: If `mm_shm_cache_max_object_size_mb` differs from the
            class-level default while `mm_processor_cache_type` is not
            `"shm"`.
    """
    # The size limit only has an effect for the shared-memory cache, so any
    # value is acceptable in that mode.
    if self.mm_processor_cache_type == "shm":
        return self

    # NOTE(review): this relies on the class attribute resolving to the
    # field's default value; confirm that holds for a `Field(...)` default.
    default_size_mb = MultiModalConfig.mm_shm_cache_max_object_size_mb
    if self.mm_shm_cache_max_object_size_mb != default_size_mb:
        raise ValueError(
            "'mm_shm_cache_max_object_size_mb' should only be set when "
            "'mm_processor_cache_type' is 'shm'."
        )
    return self
def compute_hash(self) -> str: def compute_hash(self) -> str:
""" """
WARNING: Whenever a new field is added to this config, WARNING: Whenever a new field is added to this config,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment