feat(frontend): forward multimodal URLs from Python preprocessors to backend (#7837)

Signed-off-by: Neal Vaidya <nealv@nvidia.com> Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

feat(frontend): forward multimodal URLs from Python preprocessors to backend (#7837)
Signed-off-by: Neal Vaidya <nealv@nvidia.com> Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
23144df5 · Neal Vaidya · GitHub · 63915ef5 · 23144df5 · 23144df5
Unverified Commit 23144df5 authored Apr 03, 2026 by Neal Vaidya Committed by GitHub Apr 03, 2026
4 changed files
--- a/components/src/dynamo/frontend/sglang_processor.py
+++ b/components/src/dynamo/frontend/sglang_processor.py
@@ -35,7 +35,7 @@ from .sglang_prepost import (
    create_parsers,
    preprocess_chat_request,
 )
-from .utils import PreprocessError, random_uuid, worker_warmup
+from .utils import PreprocessError, extract_mm_urls, random_uuid, worker_warmup
 logger = logging.getLogger(__name__)
@@ -173,7 +173,7 @@ def _build_dynamo_preproc(
    elif top_logprobs not in (None, 0):
        logprobs_val = top_logprobs
-    return {
+    preproc = {
        "model": model_name,
        "token_ids": prompt_token_ids,
        "stop_conditions": {
@@ -204,6 +204,13 @@ def _build_dynamo_preproc(
        "annotations": [],
    }
+    # Forward multimodal URLs so the backend handler can load the media.
+    mm_data = extract_mm_urls(request.get("messages", []))
+    if mm_data:
+        preproc["multi_modal_data"] = mm_data
+    return preproc
 class SglangProcessor:
    def __init__(
@@ -403,6 +410,7 @@ class SglangProcessor:
                    stop_conditions=dynamo_preproc["stop_conditions"],
                    sampling_options=dynamo_preproc["sampling_options"],
                    output_options=dynamo_preproc["output_options"],
+                    multi_modal_data=dynamo_preproc.get("multi_modal_data"),
                )
            else:
                dynamo_stream = await self.router.generate(

--- a/components/src/dynamo/frontend/tests/test_multimodal_utils.py
+++ b/components/src/dynamo/frontend/tests/test_multimodal_utils.py
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+import pytest
+from dynamo.frontend.utils import extract_mm_urls
+pytestmark = [
+    pytest.mark.unit,
+    pytest.mark.gpu_0,
+    pytest.mark.pre_merge,
+]
+def test_returns_none_for_text_only():
+    messages = [
+        {"role": "user", "content": "What is the capital of France?"},
+    ]
+    assert extract_mm_urls(messages) is None
+def test_returns_none_for_empty_messages():
+    assert extract_mm_urls([]) is None
+def test_extracts_image_urls():
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                {
+                    "type": "image_url",
+                    "image_url": {"url": "https://example.com/cat.png"},
+                },
+                {"type": "text", "text": "What is this?"},
+            ],
+        }
+    ]
+    result = extract_mm_urls(messages)
+    assert result == {"image_url": [{"Url": "https://example.com/cat.png"}]}
+def test_extracts_audio_urls():
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                {
+                    "type": "audio_url",
+                    "audio_url": {"url": "data:audio/wav;base64,UklGRg=="},
+                },
+                {"type": "text", "text": "What sound is this?"},
+            ],
+        }
+    ]
+    result = extract_mm_urls(messages)
+    assert result == {"audio_url": [{"Url": "data:audio/wav;base64,UklGRg=="}]}
+def test_extracts_video_urls():
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                {
+                    "type": "video_url",
+                    "video_url": {"url": "https://example.com/clip.mp4"},
+                },
+            ],
+        }
+    ]
+    result = extract_mm_urls(messages)
+    assert result == {"video_url": [{"Url": "https://example.com/clip.mp4"}]}
+def test_extracts_mixed_modalities():
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                {
+                    "type": "image_url",
+                    "image_url": {"url": "https://example.com/img.jpg"},
+                },
+                {
+                    "type": "audio_url",
+                    "audio_url": {"url": "https://example.com/audio.wav"},
+                },
+                {
+                    "type": "video_url",
+                    "video_url": {"url": "https://example.com/video.mp4"},
+                },
+                {"type": "text", "text": "Describe all of these."},
+            ],
+        }
+    ]
+    result = extract_mm_urls(messages)
+    assert result == {
+        "image_url": [{"Url": "https://example.com/img.jpg"}],
+        "audio_url": [{"Url": "https://example.com/audio.wav"}],
+        "video_url": [{"Url": "https://example.com/video.mp4"}],
+    }
+def test_extracts_multiple_items_per_modality():
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                {
+                    "type": "image_url",
+                    "image_url": {"url": "https://example.com/a.png"},
+                },
+                {
+                    "type": "image_url",
+                    "image_url": {"url": "https://example.com/b.png"},
+                },
+                {"type": "text", "text": "Compare these images."},
+            ],
+        }
+    ]
+    result = extract_mm_urls(messages)
+    assert result == {
+        "image_url": [
+            {"Url": "https://example.com/a.png"},
+            {"Url": "https://example.com/b.png"},
+        ]
+    }
+def test_ignores_non_user_messages():
+    messages = [
+        {"role": "system", "content": "You are helpful."},
+        {
+            "role": "assistant",
+            "content": [
+                {
+                    "type": "image_url",
+                    "image_url": {"url": "https://example.com/fake.png"},
+                },
+            ],
+        },
+        {"role": "user", "content": "Hello"},
+    ]
+    assert extract_mm_urls(messages) is None
+def test_handles_malformed_content_non_dict():
+    """Non-dict items in content list should be skipped, not crash."""
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                "a plain string instead of a dict",
+                42,
+                None,
+                {
+                    "type": "image_url",
+                    "image_url": {"url": "https://example.com/ok.png"},
+                },
+            ],
+        }
+    ]
+    result = extract_mm_urls(messages)
+    assert result == {"image_url": [{"Url": "https://example.com/ok.png"}]}
--- a/components/src/dynamo/frontend/utils.py
+++ b/components/src/dynamo/frontend/utils.py
@@ -30,3 +30,48 @@ class PreprocessError(Exception):
    def __init__(self, error_dict: dict[str, Any]):
        self.error_dict = error_dict
        super().__init__(str(error_dict))
+# Content part types that carry media URLs, mapped to the key used in the
+# multimodal data dict sent to the backend handler.
+_MEDIA_CONTENT_TYPES = ("image_url", "audio_url", "video_url")
+def extract_mm_urls(
+    messages: list[dict[str, Any]],
+) -> dict[str, list[dict[str, str]]] | None:
+    """Extract multimodal URLs from OpenAI chat completion messages.
+    Walks user message content arrays and collects ``image_url``, ``audio_url``,
+    and ``video_url`` entries.  Returns them in the format expected by the
+    backend handler's ``_extract_multimodal_data()``::
+        {
+            "image_url": [{"Url": "https://..."}, ...],
+            "audio_url": [{"Url": "data:audio/wav;base64,..."}],
+        }
+    Returns ``None`` if no multimodal content is found.
+    """
+    mm_data: dict[str, list[dict[str, str]]] = {}
+    for msg in messages:
+        if not isinstance(msg, dict) or msg.get("role") != "user":
+            continue
+        content = msg.get("content")
+        if not isinstance(content, list):
+            continue
+        for part in content:
+            if not isinstance(part, dict):
+                continue
+            part_type = part.get("type")
+            if part_type not in _MEDIA_CONTENT_TYPES:
+                continue
+            media_value = part.get(part_type)
+            if not isinstance(media_value, dict):
+                continue
+            url = media_value.get("url")
+            if isinstance(url, str) and url:
+                mm_data.setdefault(part_type, []).append({"Url": url})
+    return mm_data or None
--- a/components/src/dynamo/frontend/vllm_processor.py
+++ b/components/src/dynamo/frontend/vllm_processor.py
@@ -37,7 +37,7 @@ from dynamo.llm import (
 from dynamo.runtime import Client, DistributedRuntime
 from .prepost import StreamingPostProcessor, preprocess_chat_request
-from .utils import random_uuid
+from .utils import extract_mm_urls, random_uuid
 logger = logging.getLogger(__name__)
@@ -121,6 +121,23 @@ class VllmProcessor:
    ) -> AsyncGenerator[dict[str, Any], None]:
        request_id = random_uuid()
+        # vLLM's Pydantic model requires image_url.detail to be 'auto'/'low'/'high'.
+        # The Rust HTTP layer accepts None/missing, so normalize before validation.
+        messages = request.get("messages") or []
+        for msg in messages:
+            if not isinstance(msg, dict):
+                continue
+            content = msg.get("content")
+            if not isinstance(content, list):
+                continue
+            for part in content:
+                if not isinstance(part, dict):
+                    continue
+                if part.get("type") == "image_url":
+                    img_url = part.get("image_url")
+                    if isinstance(img_url, dict) and img_url.get("detail") is None:
+                        img_url["detail"] = "auto"
        pre = await preprocess_chat_request(
            request,
            tokenizer=self.tokenizer,
@@ -251,6 +268,11 @@ class VllmProcessor:
            "annotations": [],
        }
+        # Forward multimodal URLs so the backend handler can load the media.
+        mm_data = extract_mm_urls(request.get("messages") or [])
+        if mm_data:
+            dynamo_preproc["multi_modal_data"] = mm_data
        post = StreamingPostProcessor(
            tokenizer=self.tokenizer,
            request_for_sampling=request_for_sampling,
@@ -290,6 +312,7 @@ class VllmProcessor:
                    stop_conditions=dynamo_preproc["stop_conditions"],
                    sampling_options=dynamo_preproc["sampling_options"],
                    output_options=dynamo_preproc["output_options"],
+                    multi_modal_data=dynamo_preproc.get("multi_modal_data"),
                )
            else:
                dynamo_stream = await self.router.generate(