feat: Add security flag to MM flow in vllm (#4556)

Co-authored-by: KrishnanPrash <140860868+KrishnanPrash@users.noreply.github.com>

feat: Add security flag to MM flow in vllm (#4556)
Co-authored-by: KrishnanPrash <140860868+KrishnanPrash@users.noreply.github.com>
550bf98c · Indrajit Bhosale · GitHub · ef8cf365 · 550bf98c · 550bf98c
Unverified Commit 550bf98c authored Nov 24, 2025 by Indrajit Bhosale Committed by GitHub Nov 25, 2025
9 changed files
--- a/components/src/dynamo/vllm/args.py
+++ b/components/src/dynamo/vllm/args.py
@@ -60,6 +60,7 @@ class Config:
    multimodal_encode_worker: bool = False
    multimodal_worker: bool = False
    multimodal_decode_worker: bool = False
+    enable_multimodal: bool = False
    multimodal_encode_prefill_worker: bool = False
    mm_prompt_template: str = "USER: <image>\n<prompt> ASSISTANT:"
    # dump config to file
@@ -159,6 +160,11 @@ def parse_args() -> Config:
        action="store_true",
        help="Run as unified encode+prefill+decode worker for models requiring integrated image encoding (e.g., Llama 4)",
    )
+    parser.add_argument(
+        "--enable-multimodal",
+        action="store_true",
+        help="Enable multimodal processing. If not set, none of the multimodal components can be used.",
+    )
    parser.add_argument(
        "--mm-prompt-template",
        type=str,
@@ -224,6 +230,9 @@ def parse_args() -> Config:
            "Use only one of --multimodal-processor, --multimodal-encode-worker, --multimodal-worker, --multimodal-decode-worker, or --multimodal-encode-prefill-worker"
        )

+    if mm_flags == 1 and not args.enable_multimodal:
+        raise ValueError("Use --enable-multimodal to enable multimodal processing")
+
    # Set component and endpoint based on worker type
    if args.multimodal_processor:
        config.component = "processor"
@@ -262,6 +271,7 @@ def parse_args() -> Config:
    config.multimodal_worker = args.multimodal_worker
    config.multimodal_decode_worker = args.multimodal_decode_worker
    config.multimodal_encode_prefill_worker = args.multimodal_encode_prefill_worker
+    config.enable_multimodal = args.enable_multimodal
    config.mm_prompt_template = args.mm_prompt_template
    config.store_kv = args.store_kv
    config.request_plane = args.request_plane

--- a/components/src/dynamo/vllm/handlers.py
+++ b/components/src/dynamo/vllm/handlers.py
@@ -85,6 +85,7 @@ class BaseWorkerHandler(ABC):
        engine,
        default_sampling_params,
        model_max_len: int | None = None,
+        enable_multimodal: bool = False,
    ):
        self.runtime = runtime
        self.component = component
@@ -95,6 +96,7 @@ class BaseWorkerHandler(ABC):
        self.image_loader = ImageLoader()
        self.temp_dirs: list[tempfile.TemporaryDirectory] = []
        self.model_max_len = model_max_len
+        self.enable_multimodal = enable_multimodal

    @abstractmethod
    async def generate(self, request, context) -> AsyncGenerator[dict, None]:
@@ -159,6 +161,13 @@ class BaseWorkerHandler(ABC):
        if "multi_modal_data" not in request or request["multi_modal_data"] is None:
            return None

+        # Security check: reject multimodal data if not explicitly enabled
+        if not self.enable_multimodal:
+            raise ValueError(
+                "Received multimodal data but multimodal processing is not enabled. "
+                "Use --enable-multimodal flag to enable multimodal processing."
+            )
+
        mm_map = request["multi_modal_data"]
        vllm_mm_data = {}

@@ -271,9 +280,15 @@ class DecodeWorkerHandler(BaseWorkerHandler):
        engine,
        default_sampling_params,
        model_max_len: int | None = None,
+        enable_multimodal: bool = False,
    ):
        super().__init__(
-            runtime, component, engine, default_sampling_params, model_max_len
+            runtime,
+            component,
+            engine,
+            default_sampling_params,
+            model_max_len,
+            enable_multimodal,
        )

    async def generate(self, request, context):
@@ -339,9 +354,15 @@ class PrefillWorkerHandler(BaseWorkerHandler):
        engine,
        default_sampling_params,
        model_max_len: int | None = None,
+        enable_multimodal: bool = False,
    ):
        super().__init__(
-            runtime, component, engine, default_sampling_params, model_max_len
+            runtime,
+            component,
+            engine,
+            default_sampling_params,
+            model_max_len,
+            enable_multimodal,
        )

    async def generate(self, request, context):

--- a/components/src/dynamo/vllm/multimodal_handlers/worker_handler.py
+++ b/components/src/dynamo/vllm/multimodal_handlers/worker_handler.py
@@ -38,7 +38,13 @@ class MultimodalDecodeWorkerHandler(BaseWorkerHandler):
        )

        # Call BaseWorkerHandler.__init__ with proper parameters
-        super().__init__(runtime, component, engine_client, default_sampling_params)
+        super().__init__(
+            runtime,
+            component,
+            engine_client,
+            default_sampling_params,
+            enable_multimodal=config.enable_multimodal,
+        )

        self.config = config
        self.enable_disagg = config.is_prefill_worker
@@ -98,7 +104,13 @@ class MultimodalPDWorkerHandler(BaseWorkerHandler):
        )

        # Call BaseWorkerHandler.__init__ with proper parameters
-        super().__init__(runtime, component, engine_client, default_sampling_params)
+        super().__init__(
+            runtime,
+            component,
+            engine_client,
+            default_sampling_params,
+            enable_multimodal=config.enable_multimodal,
+        )

        self.config = config
        self.decode_worker_client = decode_worker_client

--- a/docs/backends/vllm/multimodal.md
+++ b/docs/backends/vllm/multimodal.md
@@ -22,6 +22,10 @@ Dynamo supports multimodal models with vLLM v1. In general, multimodal models ca
 > [!WARNING]
 > **LLaVA Model Limitation**: Do not use LLaVA models (e.g., `llava-hf/llava-1.5-7b-hf`) with the standard aggregated serving setup, as they contain keywords that Dynamo cannot yet parse. LLaVA models can still be used with the EPD (Encode-Prefill-Decode) setup described below.

+> [!IMPORTANT]
+> **Security Requirement**: All multimodal workers require the `--enable-multimodal` flag to be explicitly set at startup. This is a security feature to prevent unintended processing of multimodal data from untrusted sources. Workers will fail at startup if multimodal flags (e.g., `--multimodal-worker`, `--multimodal-processor`) are used without `--enable-multimodal`.
+> This flag is analogous to `--enable-mm-embeds` in `vllm serve`, but extends the restriction to all multimodal content (URLs, embeddings, and base64-encoded data).
+
 # Multimodal EPD Deployment Examples

 This section provides example workflows and reference implementations for deploying a multimodal model using Dynamo and vLLM v1 with EPD(Encode-Prefill-Decode) pipeline.

--- a/examples/backends/vllm/launch/agg_multimodal.sh
+++ b/examples/backends/vllm/launch/agg_multimodal.sh
@@ -60,7 +60,7 @@ fi
 # --enforce-eager: Quick deployment (remove for production)
 # --connector none: No KV transfer needed for aggregated serving
 DYN_SYSTEM_PORT=8081 \
-    python -m dynamo.vllm --model $MODEL_NAME --enforce-eager --connector none $EXTRA_ARGS
+    python -m dynamo.vllm --enable-multimodal --model $MODEL_NAME --enforce-eager --connector none $EXTRA_ARGS

 # Wait for all background processes to complete
 wait

--- a/examples/backends/vllm/launch/agg_multimodal_epd.sh
+++ b/examples/backends/vllm/launch/agg_multimodal_epd.sh
@@ -75,11 +75,11 @@ elif [[ "$MODEL_NAME" == "llava-hf/llava-1.5-7b-hf" ]]; then
 fi

 # Start processor (Python-based preprocessing, handles prompt templating)
-python -m dynamo.vllm --multimodal-processor --model $MODEL_NAME --mm-prompt-template "$PROMPT_TEMPLATE" &
+python -m dynamo.vllm --multimodal-processor --enable-multimodal --model $MODEL_NAME --mm-prompt-template "$PROMPT_TEMPLATE" &

 # run E/P/D workers
-CUDA_VISIBLE_DEVICES=0 python -m dynamo.vllm --multimodal-encode-worker --model $MODEL_NAME &
-CUDA_VISIBLE_DEVICES=1 python -m dynamo.vllm --multimodal-worker --model $MODEL_NAME $EXTRA_ARGS &
+CUDA_VISIBLE_DEVICES=0 python -m dynamo.vllm --multimodal-encode-worker --enable-multimodal --model $MODEL_NAME &
+CUDA_VISIBLE_DEVICES=1 python -m dynamo.vllm --multimodal-worker --enable-multimodal --model $MODEL_NAME $EXTRA_ARGS &

 # Wait for all background processes to complete
 wait
--- a/examples/backends/vllm/launch/agg_multimodal_llama.sh
+++ b/examples/backends/vllm/launch/agg_multimodal_llama.sh
@@ -11,10 +11,10 @@ MODEL_NAME="meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
 python -m dynamo.frontend --http-port=8000 &

 # run processor
-python -m dynamo.vllm --multimodal-processor --model $MODEL_NAME --mm-prompt-template "<|image|>\n<prompt>" &
+python -m dynamo.vllm --multimodal-processor --enable-multimodal --model $MODEL_NAME --mm-prompt-template "<|image|>\n<prompt>" &
 # Llama 4 doesn't support image embedding input, so use encode+prefill worker
 # that handles image encoding inline
-python -m dynamo.vllm --multimodal-encode-prefill-worker --model $MODEL_NAME --tensor-parallel-size=8 --max-model-len=208960 --gpu-memory-utilization 0.80 &
+python -m dynamo.vllm --multimodal-encode-prefill-worker --enable-multimodal --model $MODEL_NAME --tensor-parallel-size=8 --max-model-len=208960 --gpu-memory-utilization 0.80 &

 # Wait for all background processes to complete
 wait
--- a/examples/backends/vllm/launch/disagg_multimodal_epd.sh
+++ b/examples/backends/vllm/launch/disagg_multimodal_epd.sh
@@ -76,7 +76,7 @@ python -m dynamo.frontend --http-port=8000 &

 # Start processor
 echo "Starting processor..."
-python -m dynamo.vllm --multimodal-processor --model $MODEL_NAME --mm-prompt-template "$PROMPT_TEMPLATE" &
+python -m dynamo.vllm --multimodal-processor --enable-multimodal --model $MODEL_NAME --mm-prompt-template "$PROMPT_TEMPLATE" &

 # Configure GPU memory optimization for specific models
 EXTRA_ARGS=""
@@ -86,17 +86,17 @@ fi

 # Start encode worker
 echo "Starting encode worker on GPU 1..."
-VLLM_NIXL_SIDE_CHANNEL_PORT=20097 CUDA_VISIBLE_DEVICES=1 python -m dynamo.vllm --multimodal-encode-worker --model $MODEL_NAME  $EXTRA_ARGS --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20080"}' &
+VLLM_NIXL_SIDE_CHANNEL_PORT=20097 CUDA_VISIBLE_DEVICES=1 python -m dynamo.vllm --multimodal-encode-worker --enable-multimodal --model $MODEL_NAME  $EXTRA_ARGS --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20080"}' &

 # Start prefill worker
 echo "Starting prefill worker on GPU 2..."
 VLLM_NIXL_SIDE_CHANNEL_PORT=20098 \
-CUDA_VISIBLE_DEVICES=2 python -m dynamo.vllm --multimodal-worker --is-prefill-worker --model $MODEL_NAME $EXTRA_ARGS --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20081"}' &
+CUDA_VISIBLE_DEVICES=2 python -m dynamo.vllm --multimodal-worker --is-prefill-worker --enable-multimodal --model $MODEL_NAME $EXTRA_ARGS --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20081"}' &

 # Start decode worker
 echo "Starting decode worker on GPU 3..."
 VLLM_NIXL_SIDE_CHANNEL_PORT=20099 \
-CUDA_VISIBLE_DEVICES=3 python -m dynamo.vllm --multimodal-decode-worker --model $MODEL_NAME $EXTRA_ARGS --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20082"}' &
+CUDA_VISIBLE_DEVICES=3 python -m dynamo.vllm --multimodal-decode-worker --enable-multimodal --model $MODEL_NAME $EXTRA_ARGS --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20082"}' &

 echo "=================================================="
 echo "All components started. Waiting for initialization..."

--- a/examples/backends/vllm/launch/disagg_multimodal_llama.sh
+++ b/examples/backends/vllm/launch/disagg_multimodal_llama.sh
@@ -48,15 +48,15 @@ if [[ $HEAD_NODE -eq 1 ]]; then
    python -m dynamo.frontend --http-port=8000 &

    # run processor
-    python -m dynamo.vllm --multimodal-processor --model $MODEL_NAME --mm-prompt-template "<|image|>\n<prompt>" &
+    python -m dynamo.vllm --multimodal-processor --enable-multimodal --model $MODEL_NAME --mm-prompt-template "<|image|>\n<prompt>" &

    # Llama 4 doesn't support image embedding input, so the prefill worker will also
    # handle image encoding inline.
    # run prefill worker
-    VLLM_NIXL_SIDE_CHANNEL_PORT=20097 python -m dynamo.vllm --multimodal-encode-prefill-worker --is-prefill-worker --model $MODEL_NAME --tensor-parallel-size=8 --max-model-len=208960 --gpu-memory-utilization 0.80 --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20080"}' &
+    VLLM_NIXL_SIDE_CHANNEL_PORT=20097 python -m dynamo.vllm --multimodal-encode-prefill-worker --is-prefill-worker --enable-multimodal --model $MODEL_NAME --tensor-parallel-size=8 --max-model-len=208960 --gpu-memory-utilization 0.80 --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20080"}' &
 else
    # run decode worker on non-head node
-    VLLM_NIXL_SIDE_CHANNEL_PORT=20098 python -m dynamo.vllm --multimodal-decode-worker --model $MODEL_NAME --tensor-parallel-size=8 --max-model-len=208960 --gpu-memory-utilization 0.80 --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20081"}' &
+    VLLM_NIXL_SIDE_CHANNEL_PORT=20098 python -m dynamo.vllm --multimodal-decode-worker --enable-multimodal --model $MODEL_NAME --tensor-parallel-size=8 --max-model-len=208960 --gpu-memory-utilization 0.80 --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20081"}' &
 fi

 # Wait for all background processes to complete