Unverified Commit 23144df5 authored by Neal Vaidya's avatar Neal Vaidya Committed by GitHub
Browse files

feat(frontend): forward multimodal URLs from Python preprocessors to backend (#7837)


Signed-off-by: default avatarNeal Vaidya <nealv@nvidia.com>
Co-authored-by: default avatarClaude Opus 4.6 (1M context) <noreply@anthropic.com>
parent 63915ef5
...@@ -35,7 +35,7 @@ from .sglang_prepost import ( ...@@ -35,7 +35,7 @@ from .sglang_prepost import (
create_parsers, create_parsers,
preprocess_chat_request, preprocess_chat_request,
) )
from .utils import PreprocessError, random_uuid, worker_warmup from .utils import PreprocessError, extract_mm_urls, random_uuid, worker_warmup
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
...@@ -173,7 +173,7 @@ def _build_dynamo_preproc( ...@@ -173,7 +173,7 @@ def _build_dynamo_preproc(
elif top_logprobs not in (None, 0): elif top_logprobs not in (None, 0):
logprobs_val = top_logprobs logprobs_val = top_logprobs
return { preproc = {
"model": model_name, "model": model_name,
"token_ids": prompt_token_ids, "token_ids": prompt_token_ids,
"stop_conditions": { "stop_conditions": {
...@@ -204,6 +204,13 @@ def _build_dynamo_preproc( ...@@ -204,6 +204,13 @@ def _build_dynamo_preproc(
"annotations": [], "annotations": [],
} }
# Forward multimodal URLs so the backend handler can load the media.
mm_data = extract_mm_urls(request.get("messages", []))
if mm_data:
preproc["multi_modal_data"] = mm_data
return preproc
class SglangProcessor: class SglangProcessor:
def __init__( def __init__(
...@@ -403,6 +410,7 @@ class SglangProcessor: ...@@ -403,6 +410,7 @@ class SglangProcessor:
stop_conditions=dynamo_preproc["stop_conditions"], stop_conditions=dynamo_preproc["stop_conditions"],
sampling_options=dynamo_preproc["sampling_options"], sampling_options=dynamo_preproc["sampling_options"],
output_options=dynamo_preproc["output_options"], output_options=dynamo_preproc["output_options"],
multi_modal_data=dynamo_preproc.get("multi_modal_data"),
) )
else: else:
dynamo_stream = await self.router.generate( dynamo_stream = await self.router.generate(
......
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
import pytest
from dynamo.frontend.utils import extract_mm_urls
pytestmark = [
pytest.mark.unit,
pytest.mark.gpu_0,
pytest.mark.pre_merge,
]
def test_returns_none_for_text_only():
messages = [
{"role": "user", "content": "What is the capital of France?"},
]
assert extract_mm_urls(messages) is None
def test_returns_none_for_empty_messages():
assert extract_mm_urls([]) is None
def test_extracts_image_urls():
messages = [
{
"role": "user",
"content": [
{
"type": "image_url",
"image_url": {"url": "https://example.com/cat.png"},
},
{"type": "text", "text": "What is this?"},
],
}
]
result = extract_mm_urls(messages)
assert result == {"image_url": [{"Url": "https://example.com/cat.png"}]}
def test_extracts_audio_urls():
messages = [
{
"role": "user",
"content": [
{
"type": "audio_url",
"audio_url": {"url": "data:audio/wav;base64,UklGRg=="},
},
{"type": "text", "text": "What sound is this?"},
],
}
]
result = extract_mm_urls(messages)
assert result == {"audio_url": [{"Url": "data:audio/wav;base64,UklGRg=="}]}
def test_extracts_video_urls():
messages = [
{
"role": "user",
"content": [
{
"type": "video_url",
"video_url": {"url": "https://example.com/clip.mp4"},
},
],
}
]
result = extract_mm_urls(messages)
assert result == {"video_url": [{"Url": "https://example.com/clip.mp4"}]}
def test_extracts_mixed_modalities():
messages = [
{
"role": "user",
"content": [
{
"type": "image_url",
"image_url": {"url": "https://example.com/img.jpg"},
},
{
"type": "audio_url",
"audio_url": {"url": "https://example.com/audio.wav"},
},
{
"type": "video_url",
"video_url": {"url": "https://example.com/video.mp4"},
},
{"type": "text", "text": "Describe all of these."},
],
}
]
result = extract_mm_urls(messages)
assert result == {
"image_url": [{"Url": "https://example.com/img.jpg"}],
"audio_url": [{"Url": "https://example.com/audio.wav"}],
"video_url": [{"Url": "https://example.com/video.mp4"}],
}
def test_extracts_multiple_items_per_modality():
messages = [
{
"role": "user",
"content": [
{
"type": "image_url",
"image_url": {"url": "https://example.com/a.png"},
},
{
"type": "image_url",
"image_url": {"url": "https://example.com/b.png"},
},
{"type": "text", "text": "Compare these images."},
],
}
]
result = extract_mm_urls(messages)
assert result == {
"image_url": [
{"Url": "https://example.com/a.png"},
{"Url": "https://example.com/b.png"},
]
}
def test_ignores_non_user_messages():
messages = [
{"role": "system", "content": "You are helpful."},
{
"role": "assistant",
"content": [
{
"type": "image_url",
"image_url": {"url": "https://example.com/fake.png"},
},
],
},
{"role": "user", "content": "Hello"},
]
assert extract_mm_urls(messages) is None
def test_handles_malformed_content_non_dict():
"""Non-dict items in content list should be skipped, not crash."""
messages = [
{
"role": "user",
"content": [
"a plain string instead of a dict",
42,
None,
{
"type": "image_url",
"image_url": {"url": "https://example.com/ok.png"},
},
],
}
]
result = extract_mm_urls(messages)
assert result == {"image_url": [{"Url": "https://example.com/ok.png"}]}
...@@ -30,3 +30,48 @@ class PreprocessError(Exception): ...@@ -30,3 +30,48 @@ class PreprocessError(Exception):
def __init__(self, error_dict: dict[str, Any]): def __init__(self, error_dict: dict[str, Any]):
self.error_dict = error_dict self.error_dict = error_dict
super().__init__(str(error_dict)) super().__init__(str(error_dict))
# Content part types that carry media URLs, mapped to the key used in the
# multimodal data dict sent to the backend handler.
_MEDIA_CONTENT_TYPES = ("image_url", "audio_url", "video_url")
def extract_mm_urls(
messages: list[dict[str, Any]],
) -> dict[str, list[dict[str, str]]] | None:
"""Extract multimodal URLs from OpenAI chat completion messages.
Walks user message content arrays and collects ``image_url``, ``audio_url``,
and ``video_url`` entries. Returns them in the format expected by the
backend handler's ``_extract_multimodal_data()``::
{
"image_url": [{"Url": "https://..."}, ...],
"audio_url": [{"Url": "data:audio/wav;base64,..."}],
}
Returns ``None`` if no multimodal content is found.
"""
mm_data: dict[str, list[dict[str, str]]] = {}
for msg in messages:
if not isinstance(msg, dict) or msg.get("role") != "user":
continue
content = msg.get("content")
if not isinstance(content, list):
continue
for part in content:
if not isinstance(part, dict):
continue
part_type = part.get("type")
if part_type not in _MEDIA_CONTENT_TYPES:
continue
media_value = part.get(part_type)
if not isinstance(media_value, dict):
continue
url = media_value.get("url")
if isinstance(url, str) and url:
mm_data.setdefault(part_type, []).append({"Url": url})
return mm_data or None
...@@ -37,7 +37,7 @@ from dynamo.llm import ( ...@@ -37,7 +37,7 @@ from dynamo.llm import (
from dynamo.runtime import Client, DistributedRuntime from dynamo.runtime import Client, DistributedRuntime
from .prepost import StreamingPostProcessor, preprocess_chat_request from .prepost import StreamingPostProcessor, preprocess_chat_request
from .utils import random_uuid from .utils import extract_mm_urls, random_uuid
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
...@@ -121,6 +121,23 @@ class VllmProcessor: ...@@ -121,6 +121,23 @@ class VllmProcessor:
) -> AsyncGenerator[dict[str, Any], None]: ) -> AsyncGenerator[dict[str, Any], None]:
request_id = random_uuid() request_id = random_uuid()
# vLLM's Pydantic model requires image_url.detail to be 'auto'/'low'/'high'.
# The Rust HTTP layer accepts None/missing, so normalize before validation.
messages = request.get("messages") or []
for msg in messages:
if not isinstance(msg, dict):
continue
content = msg.get("content")
if not isinstance(content, list):
continue
for part in content:
if not isinstance(part, dict):
continue
if part.get("type") == "image_url":
img_url = part.get("image_url")
if isinstance(img_url, dict) and img_url.get("detail") is None:
img_url["detail"] = "auto"
pre = await preprocess_chat_request( pre = await preprocess_chat_request(
request, request,
tokenizer=self.tokenizer, tokenizer=self.tokenizer,
...@@ -251,6 +268,11 @@ class VllmProcessor: ...@@ -251,6 +268,11 @@ class VllmProcessor:
"annotations": [], "annotations": [],
} }
# Forward multimodal URLs so the backend handler can load the media.
mm_data = extract_mm_urls(request.get("messages") or [])
if mm_data:
dynamo_preproc["multi_modal_data"] = mm_data
post = StreamingPostProcessor( post = StreamingPostProcessor(
tokenizer=self.tokenizer, tokenizer=self.tokenizer,
request_for_sampling=request_for_sampling, request_for_sampling=request_for_sampling,
...@@ -290,6 +312,7 @@ class VllmProcessor: ...@@ -290,6 +312,7 @@ class VllmProcessor:
stop_conditions=dynamo_preproc["stop_conditions"], stop_conditions=dynamo_preproc["stop_conditions"],
sampling_options=dynamo_preproc["sampling_options"], sampling_options=dynamo_preproc["sampling_options"],
output_options=dynamo_preproc["output_options"], output_options=dynamo_preproc["output_options"],
multi_modal_data=dynamo_preproc.get("multi_modal_data"),
) )
else: else:
dynamo_stream = await self.router.generate( dynamo_stream = await self.router.generate(
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment