[Misc] Move config fields to MultiModalConfig (#17343)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>

[Misc] Move config fields to MultiModalConfig (#17343)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
ebb3930d · Cyrus Leung · GitHub · cde384cd · ebb3930d · ebb3930d
Unverified Commit ebb3930d authored Apr 29, 2025 by Cyrus Leung Committed by GitHub Apr 29, 2025
8 changed files
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -263,6 +263,10 @@ class ModelConfig:
            the model name will be the same as `model`.
        limit_mm_per_prompt: Maximum number of data items per modality
            per prompt. Only applicable for multimodal models.
+        mm_processor_kwargs: Overrides for the multi-modal processor obtained
+            from `AutoProcessor.from_pretrained`.
+        disable_mm_preprocessor_cache: If True, disable caching of the
+            processed multi-modal inputs.
        use_async_output_proc: Whether to use async output processor.
            Defaults to True.
        config_format: The config format which shall be loaded.
@@ -273,10 +277,6 @@ class ModelConfig:
        hf_overrides: If a dictionary, contains arguments to be forwarded to the
            HuggingFace config. If a callable, it is called to update the
            HuggingFace config.
-        mm_processor_kwargs: Arguments to be forwarded to the model's processor
-            for multi-modal data, e.g., image processor.
-        disable_mm_preprocessor_cache: If true, then disables caching of the
-            multi-modal preprocessor/mapper. (not recommended)
        override_neuron_config: Initialize non default neuron config or
            override default neuron config that are specific to Neuron devices,
            this argument will be used to configure the neuron config that
@@ -320,7 +320,6 @@ class ModelConfig:
        factors.append(self.max_logprobs)
        factors.append(self.disable_sliding_window)
        factors.append(self.trust_remote_code)
-        factors.append(self.mm_processor_kwargs)
        factors.append(self.generation_config)
        factors.append(self.model_impl)
        factors.append(self.override_generation_config)
@@ -359,12 +358,12 @@ class ModelConfig:
        skip_tokenizer_init: bool = False,
        served_model_name: Optional[Union[str, list[str]]] = None,
        limit_mm_per_prompt: Optional[dict[str, int]] = None,
+        mm_processor_kwargs: Optional[dict[str, Any]] = None,
+        disable_mm_preprocessor_cache: bool = False,
        use_async_output_proc: bool = True,
        config_format: ConfigFormat = ConfigFormat.AUTO,
        hf_token: Optional[Union[bool, str]] = None,
        hf_overrides: Optional[HfOverrides] = None,
-        mm_processor_kwargs: Optional[dict[str, Any]] = None,
-        disable_mm_preprocessor_cache: bool = False,
        override_neuron_config: Optional[dict[str, Any]] = None,
        override_pooler_config: Optional["PoolerConfig"] = None,
        logits_processor_pattern: Optional[str] = None,
@@ -469,8 +468,6 @@ class ModelConfig:
            self.model, hf_token=hf_token, revision=revision)
        self.dtype = _get_and_verify_dtype(self.hf_config, dtype)
        self.use_async_output_proc = use_async_output_proc
-        self.mm_processor_kwargs = mm_processor_kwargs
-        self.disable_mm_preprocessor_cache = disable_mm_preprocessor_cache

        # Set enforce_eager to False if the value is unset.
        if self.enforce_eager is None:
@@ -515,7 +512,10 @@ class ModelConfig:
        self.served_model_name = get_served_model_name(model,
                                                       served_model_name)
        self.multimodal_config = self._init_multimodal_config(
-            limit_mm_per_prompt)
+            limit_mm_per_prompt=limit_mm_per_prompt,
+            mm_processor_kwargs=mm_processor_kwargs,
+            disable_mm_preprocessor_cache=disable_mm_preprocessor_cache,
+        )
        if not self.skip_tokenizer_init:
            self._verify_tokenizer_mode()

@@ -581,14 +581,27 @@ class ModelConfig:
                self.tokenizer = s3_tokenizer.dir

    def _init_multimodal_config(
-        self, limit_mm_per_prompt: Optional[dict[str, int]]
+        self,
+        limit_mm_per_prompt: Optional[dict[str, int]],
+        mm_processor_kwargs: Optional[dict[str, Any]],
+        disable_mm_preprocessor_cache: bool,
    ) -> Optional["MultiModalConfig"]:
        if self.registry.is_multimodal_model(self.architectures):
-            return MultiModalConfig(limit_per_prompt=limit_mm_per_prompt or {})
+            return MultiModalConfig(
+                limit_per_prompt=limit_mm_per_prompt or {},
+                mm_processor_kwargs=mm_processor_kwargs or {},
+                disable_mm_preprocessor_cache=disable_mm_preprocessor_cache,
+            )

        if limit_mm_per_prompt:
            raise ValueError("`limit_mm_per_prompt` is only supported for "
                             "multimodal models.")
+        if mm_processor_kwargs:
+            raise ValueError("`mm_processor_kwargs` is only supported for "
+                             "multimodal models.")
+        if disable_mm_preprocessor_cache:
+            raise ValueError("`disable_mm_preprocessor_cache` is only "
+                             "supported for multimodal models.")

        return None

@@ -2776,7 +2789,23 @@ class MultiModalConfig:
    Defaults to 1 (V0) or 999 (V1) for each modality.

    For example, to allow up to 16 images and 2 videos per prompt:
-    ``{"images": 16, "videos": 2}``
+    :code:`{"images": 16, "videos": 2}`
+    """
+
+    mm_processor_kwargs: Optional[dict[str, object]] = None
+    """
+    Overrides for the multi-modal processor obtained from
+    :meth:`transformers.AutoProcessor.from_pretrained`.
+
+    The available overrides depend on the model that is being run.
+
+    For example, for Phi-3-Vision:
+    :code:`{"num_crops": 4}`.
+    """
+
+    disable_mm_preprocessor_cache: bool = False
+    """
+    If :code:`True`, disable caching of the processed multi-modal inputs.
    """

    def compute_hash(self) -> str:
@@ -4080,8 +4109,6 @@ class VllmConfig:
            f"enable_prefix_caching={self.cache_config.enable_prefix_caching}, "
            f"chunked_prefill_enabled={self.scheduler_config.chunked_prefill_enabled}, "  # noqa
            f"use_async_output_proc={self.model_config.use_async_output_proc}, "
-            f"disable_mm_preprocessor_cache={self.model_config.disable_mm_preprocessor_cache!r}, "  # noqa
-            f"mm_processor_kwargs={self.model_config.mm_processor_kwargs}, "
            f"pooler_config={self.model_config.pooler_config!r}, "
            f"compilation_config={self.compilation_config!r}")


--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -672,20 +672,12 @@ class EngineArgs:
        )
        multimodal_group.add_argument('--limit-mm-per-prompt',
                                      **multimodal_kwargs["limit_per_prompt"])
-
-        parser.add_argument(
+        multimodal_group.add_argument(
            '--mm-processor-kwargs',
-            default=None,
-            type=json.loads,
-            help=('Overrides for the multi-modal processor obtained from '
-                  '``AutoProcessor.from_pretrained``. The available overrides '
-                  'depend on the model that is being run.'
-                  'For example, for Phi-3-Vision: ``{"num_crops": 4}``.'))
-        parser.add_argument(
+            **multimodal_kwargs["mm_processor_kwargs"])
+        multimodal_group.add_argument(
            '--disable-mm-preprocessor-cache',
-            action='store_true',
-            help='If True, disable caching of the processed multi-modal '
-            'inputs.')
+            **multimodal_kwargs["disable_mm_preprocessor_cache"])

        # LoRA related configs
        lora_kwargs = get_kwargs(LoRAConfig)

--- a/vllm/inputs/registry.py
+++ b/vllm/inputs/registry.py
@@ -101,7 +101,8 @@ class InputContext:
        Initialize a HuggingFace-like processor class, merging the
        keyword arguments with those in the model's configuration.
        """
-        base_kwargs = self.model_config.mm_processor_kwargs
+        mm_config = self.model_config.get_multimodal_config()
+        base_kwargs = mm_config.mm_processor_kwargs
        if base_kwargs is None:
            base_kwargs = {}

@@ -139,7 +140,8 @@ class InputProcessingContext(InputContext):
        """
        assert callable(hf_processor)

-        base_kwargs = self.model_config.mm_processor_kwargs
+        mm_config = self.model_config.get_multimodal_config()
+        base_kwargs = mm_config.mm_processor_kwargs
        if base_kwargs is None:
            base_kwargs = {}


--- a/vllm/model_executor/models/qwen2_vl.py
+++ b/vllm/model_executor/models/qwen2_vl.py
@@ -774,8 +774,9 @@ class Qwen2VLProcessingInfo(BaseProcessingInfo):
        size: Optional[dict[str, int]] = None,
        **kwargs: object,
    ):
-        if self.ctx.model_config.mm_processor_kwargs:
-            kwargs.update(self.ctx.model_config.mm_processor_kwargs)
+        mm_config = self.ctx.model_config.get_multimodal_config()
+        if mm_config.mm_processor_kwargs:
+            kwargs.update(mm_config.mm_processor_kwargs)

        if min_pixels is not None:
            kwargs["min_pixels"] = min_pixels

--- a/vllm/multimodal/registry.py
+++ b/vllm/multimodal/registry.py
@@ -262,7 +262,8 @@ class MultiModalRegistry:
        if tokenizer is None:
            tokenizer = cached_tokenizer_from_config(model_config)
        if disable_cache is None:
-            disable_cache = model_config.disable_mm_preprocessor_cache
+            mm_config = model_config.get_multimodal_config()
+            disable_cache = mm_config.disable_mm_preprocessor_cache

        model_cls = self._get_model_cls(model_config)
        factories = self._processor_factories[model_cls]

--- a/vllm/transformers_utils/processor.py
+++ b/vllm/transformers_utils/processor.py
@@ -33,7 +33,8 @@ class HashableList(list):


 def _merge_mm_kwargs(model_config: "ModelConfig", **kwargs):
-    base_kwargs = model_config.mm_processor_kwargs
+    mm_config = model_config.get_multimodal_config()
+    base_kwargs = mm_config.mm_processor_kwargs
    if base_kwargs is None:
        base_kwargs = {}


--- a/vllm/v1/engine/mm_input_cache.py
+++ b/vllm/v1/engine/mm_input_cache.py
@@ -33,7 +33,10 @@ from vllm.utils import is_list_of
 class MirroredProcessingCache:

    def __init__(self, model_config):
-        self.use_cache = not model_config.disable_mm_preprocessor_cache
+        # mm_config is None for text-only models; the cache then stays enabled.
+        mm_config = model_config.multimodal_config
+        self.use_cache = not (mm_config is not None
+                              and mm_config.disable_mm_preprocessor_cache)
        self.mm_cache = ProcessingCache.get_lru_cache(VLLM_MM_INPUT_CACHE_GIB,
                                                      MultiModalKwargs)


--- a/vllm/v1/engine/processor.py
+++ b/vllm/v1/engine/processor.py
@@ -51,8 +51,7 @@ class Processor:
        self.mm_input_cache_client = MirroredProcessingCache(self.model_config)

        # Multi-modal hasher (for images)
-        self.use_hash = (
-            not self.model_config.disable_mm_preprocessor_cache) or \
+        self.use_hash = self.mm_input_cache_client.use_cache or \
            self.cache_config.enable_prefix_caching

    def _validate_logprobs(