[MM] Allow skipping memory profiling for multimodal models. (#22950)

Signed-off-by: Roger Wang <hey@rogerw.me>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>

[MM] Allow skipping memory profiling for multimodal models. (#22950)
Signed-off-by: Roger Wang <hey@rogerw.me>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
49252cf5 · Roger Wang · GitHub · 3e6dd400 · 49252cf5 · 49252cf5
Unverified Commit 49252cf5 authored Aug 15, 2025 by Roger Wang Committed by GitHub Aug 15, 2025
4 changed files
--- a/vllm/config/__init__.py
+++ b/vllm/config/__init__.py
@@ -388,6 +388,10 @@ class ModelConfig:
    interleave_mm_strings: bool = False
    """Enable fully interleaved support for multimodal prompts, while using
    --chat-template-content-format=string. Defaults to False."""
+    skip_mm_profiling: bool = False
+    """When enabled, skips multimodal memory profiling and only profiles with
+    the language backbone model during engine initialization.
+    """
    media_io_kwargs: dict[str, dict[str, Any]] = field(default_factory=dict)
    """Additional args passed to process media inputs, keyed by modalities.
    For example, to set num_frames for video, set
@@ -837,7 +841,8 @@ class ModelConfig:
                media_io_kwargs=self.media_io_kwargs,
                mm_processor_kwargs=self.mm_processor_kwargs,
                mm_processor_cache_gb=self.mm_processor_cache_gb,
-                interleave_mm_strings=self.interleave_mm_strings)
+                interleave_mm_strings=self.interleave_mm_strings,
+                skip_mm_profiling=self.skip_mm_profiling)

        return None

@@ -2511,6 +2516,16 @@ class MultiModalConfig:
    Enable fully interleaved support for multimodal prompts.
    """

+    skip_mm_profiling: bool = False
+    """
+    When enabled, skips multimodal memory profiling and only profiles with
+    the language backbone model during engine initialization.
+
+    This reduces engine startup time but shifts the responsibility to users for
+    estimating the peak memory usage of the multimodal encoder's activations
+    and the embedding cache.
+    """
+
    def compute_hash(self) -> str:
        """
        WARNING: Whenever a new field is added to this config,

--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -350,6 +350,7 @@ class EngineArgs:
        MultiModalConfig.mm_processor_kwargs
    disable_mm_preprocessor_cache: bool = False  # DEPRECATED
    mm_processor_cache_gb: int = MultiModalConfig.mm_processor_cache_gb
+    skip_mm_profiling: bool = MultiModalConfig.skip_mm_profiling
    # LoRA fields
    enable_lora: bool = False
    enable_lora_bias: bool = LoRAConfig.bias_enabled
@@ -716,6 +717,8 @@ class EngineArgs:
        multimodal_group.add_argument(
            "--interleave-mm-strings",
            **multimodal_kwargs["interleave_mm_strings"])
+        multimodal_group.add_argument("--skip-mm-profiling",
+                                      **multimodal_kwargs["skip_mm_profiling"])

        # LoRA related configs
        lora_kwargs = get_kwargs(LoRAConfig)
@@ -918,6 +921,7 @@ class EngineArgs:
            limit_mm_per_prompt=self.limit_mm_per_prompt,
            interleave_mm_strings=self.interleave_mm_strings,
            media_io_kwargs=self.media_io_kwargs,
+            skip_mm_profiling=self.skip_mm_profiling,
            use_async_output_proc=not self.disable_async_output_proc,
            config_format=self.config_format,
            mm_processor_kwargs=self.mm_processor_kwargs,

--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -2479,6 +2479,11 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
    def profile_run(self) -> None:
        # Profile with multimodal encoder & encoder cache.
        if self.supports_mm_inputs:
+            if self.model_config.multimodal_config.skip_mm_profiling:
+                logger.info(
+                    "Skipping memory profiling for multimodal encoder and "
+                    "encoder cache.")
+            else:
                mm_budget = self.mm_budget
                assert mm_budget is not None

@@ -2498,8 +2503,8 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):

                    logger.info(
                        "Encoder cache will be initialized with a budget of "
-                    "%s tokens, and profiled with %s %s items of the maximum "
-                    "feature size.",
+                        "%s tokens, and profiled with %s %s items of the "
+                        "maximum feature size.",
                        encoder_budget,
                        max_mm_items_per_batch,
                        dummy_modality,
@@ -2512,7 +2517,8 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
                    )

                    # Run multimodal encoder.
-                dummy_encoder_outputs = self.model.get_multimodal_embeddings(
+                    dummy_encoder_outputs = \
+                        self.model.get_multimodal_embeddings(
                        **batched_dummy_mm_inputs)

                    sanity_check_mm_encoder_outputs(

--- a/vllm/v1/worker/tpu_model_runner.py
+++ b/vllm/v1/worker/tpu_model_runner.py
@@ -1529,6 +1529,11 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
    ) -> None:
        # Profile with multimodal encoder & encoder cache.
        if self.supports_mm_inputs:
+            if self.model_config.multimodal_config.skip_mm_profiling:
+                logger.info(
+                    "Skipping memory profiling for multimodal encoder and "
+                    "encoder cache.")
+            else:
                mm_budget = self.mm_budget
                assert mm_budget is not None

@@ -1548,8 +1553,8 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):

                    logger.info(
                        "Encoder cache will be initialized with a budget of "
-                    "%s tokens, and profiled with %s %s items of the maximum "
-                    "feature size.",
+                        "%s tokens, and profiled with %s %s items of the "
+                        "maximum feature size.",
                        encoder_budget,
                        max_mm_items_per_batch,
                        dummy_modality,
@@ -1566,13 +1571,14 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
                    # impact of recompilation until it's fixed.
                    start = time.perf_counter()
                    xm.mark_step()
-                dummy_encoder_outputs = self.model.get_multimodal_embeddings(
+                    dummy_encoder_outputs = \
+                        self.model.get_multimodal_embeddings(
                        **batched_dummy_mm_inputs)
                    xm.mark_step()
                    xm.wait_device_ops()
                    end = time.perf_counter()
                    logger.info(
-                    "Multimodal Encoder profiling finished in in %.2f [secs].",
+                        "Multimodal Encoder profiling finished in %.2f [secs].",
                        end - start)

                    sanity_check_mm_encoder_outputs(