ci: Testing b64 multimodal support with real image (#4280)

Signed-off-by: Krishnan Prashanth <kprashanth@nvidia.com>
Signed-off-by: KrishnanPrash <140860868+KrishnanPrash@users.noreply.github.com>
Co-authored-by: Harrison Saturley-Hall <hsaturleyhal@nvidia.com>

ci: Testing b64 multimodal support with real image (#4280)
Signed-off-by: Krishnan Prashanth <kprashanth@nvidia.com>
Signed-off-by: KrishnanPrash <140860868+KrishnanPrash@users.noreply.github.com>
Co-authored-by: Harrison Saturley-Hall <hsaturleyhal@nvidia.com>
e7544f19 · KrishnanPrash · GitHub · 6e241236 · e7544f19 · e7544f19
Unverified Commit e7544f19 authored Nov 20, 2025 by KrishnanPrash Committed by GitHub Nov 21, 2025
6 changed files
--- a/container/deps/requirements.test.txt
+++ b/container/deps/requirements.test.txt
@@ -25,6 +25,7 @@ pytest-benchmark
 pytest-codeblocks
 pytest-cov
 pytest-forked
+pytest-httpserver
 pytest-md-report
 pytest-mypy
 pytest-timeout

--- a/examples/backends/vllm/launch/agg_multimodal.sh
+++ b/examples/backends/vllm/launch/agg_multimodal.sh
@@ -39,13 +39,18 @@ while [[ $# -gt 0 ]]; do
    esac
 done
+# Use the TCP request plane instead of the default NATS transport.
+# NATS limits message payloads to 1MB by default, and base64-encoded images in
+# multimodal requests can exceed that limit.
+export DYN_REQUEST_PLANE=tcp
 # Start frontend with Rust OpenAIPreprocessor
 python -m dynamo.frontend --http-port=8000 &
 # Configure GPU memory optimization for specific models
 EXTRA_ARGS=""
 if [[ "$MODEL_NAME" == "Qwen/Qwen2.5-VL-7B-Instruct" ]]; then
-    EXTRA_ARGS="--gpu-memory-utilization 0.85 --max-model-len 2048"
+    EXTRA_ARGS="--gpu-memory-utilization 0.85 --max-model-len 4096"
 elif [[ "$MODEL_NAME" == "llava-hf/llava-1.5-7b-hf" ]]; then
    EXTRA_ARGS="--gpu-memory-utilization 0.85 --max-model-len 2048"
 fi

--- a/examples/backends/vllm/launch/agg_multimodal_epd.sh
+++ b/examples/backends/vllm/launch/agg_multimodal_epd.sh
@@ -69,7 +69,9 @@ python -m dynamo.frontend --http-port=8000 &
 # To make Qwen2.5-VL fit in A100 40GB, set the following extra arguments
 EXTRA_ARGS=""
 if [[ "$MODEL_NAME" == "Qwen/Qwen2.5-VL-7B-Instruct" ]]; then
-    EXTRA_ARGS="--gpu-memory-utilization 0.85 --max-model-len 2048"
+    EXTRA_ARGS="--gpu-memory-utilization 0.85 --max-model-len 4096"
+elif [[ "$MODEL_NAME" == "llava-hf/llava-1.5-7b-hf" ]]; then
+    EXTRA_ARGS="--gpu-memory-utilization 0.85 --max-model-len 4096"
 fi
 # Start processor (Python-based preprocessing, handles prompt templating)

--- a/tests/serve/conftest.py
+++ b/tests/serve/conftest.py
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+import os
+import pytest
+from pytest_httpserver import HTTPServer
+from dynamo.common.utils.paths import WORKSPACE_DIR
+# Shared constants for multimodal testing
+IMAGE_SERVER_PORT = 8765
+MULTIMODAL_IMG_PATH = os.path.join(
+    WORKSPACE_DIR, "lib/llm/tests/data/media/llm-optimize-deploy-graphic.png"
+)
+MULTIMODAL_IMG_URL = f"http://localhost:{IMAGE_SERVER_PORT}/llm-graphic.png"
+@pytest.fixture(scope="session")
+def httpserver_listen_address():
+    return ("127.0.0.1", IMAGE_SERVER_PORT)
+@pytest.fixture(scope="function")
+def image_server(httpserver: HTTPServer):
+    """
+    Provide an HTTP server that serves test images for multimodal inference.
+    This function-scoped fixture configures pytest-httpserver to serve
+    the LLM optimization diagram image. It's designed for testing multimodal
+    inference capabilities where models need to fetch images via HTTP.
+    Currently serves:
+        - /llm-graphic.png - LLM diagram image for multimodal tests
+    Usage:
+        def test_multimodal(image_server):
+            url = MULTIMODAL_IMG_URL  # http://localhost:8765/llm-graphic.png
+            # ... use url in your test payload
+    """
+    # Load LLM graphic image from shared test data
+    with open(MULTIMODAL_IMG_PATH, "rb") as f:
+        image_data = f.read()
+    # Configure server endpoint
+    httpserver.expect_request("/llm-graphic.png").respond_with_data(
+        image_data,
+        content_type="image/png",
+    )
+    return httpserver
--- a/tests/serve/test_vllm.py
+++ b/tests/serve/test_vllm.py
 # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
+import base64
 import logging
 import os
 from dataclasses import dataclass, field
@@ -12,6 +13,7 @@ from tests.serve.common import (
    params_with_model_mark,
    run_serve_deployment,
 )
+from tests.serve.conftest import MULTIMODAL_IMG_PATH, MULTIMODAL_IMG_URL
 from tests.utils.engine_process import EngineConfig
 from tests.utils.payload_builder import (
    chat_payload,
@@ -34,6 +36,7 @@ vllm_dir = os.environ.get("VLLM_DIR") or os.path.join(
    WORKSPACE_DIR, "examples/backends/vllm"
 )
 # vLLM test configurations
 vllm_configs = {
    "aggregated": VLLMConfig(
@@ -151,17 +154,19 @@ vllm_configs = {
        request_payloads=[
            chat_payload(
                [
-                    {"type": "text", "text": "What is in this image?"},
                    {
-                        "type": "image_url",
+                        "type": "text",
-                        "image_url": {
+                        "text": "What colors are in the following image? Respond only with the colors.",
-                            "url": "http://images.cocodataset.org/test2017/000000155781.jpg"
                    },
+                    {
+                        "type": "image_url",
+                        "image_url": {"url": MULTIMODAL_IMG_URL},
                    },
                ],
                repeat_count=1,
-                expected_response=["bus"],
+                expected_response=["purple"],
                temperature=0.0,
+                max_tokens=100,
            )
        ],
    ),
@@ -177,16 +182,18 @@ vllm_configs = {
        request_payloads=[
            chat_payload(
                [
-                    {"type": "text", "text": "What is in this image?"},
                    {
-                        "type": "image_url",
+                        "type": "text",
-                        "image_url": {
+                        "text": "What colors are in the following image? Respond only with the colors.",
-                            "url": "http://images.cocodataset.org/test2017/000000155781.jpg"
                    },
+                    {
+                        "type": "image_url",
+                        "image_url": {"url": MULTIMODAL_IMG_URL},
                    },
                ],
                repeat_count=1,
-                expected_response=["bus"],
+                expected_response=["purple"],
+                max_tokens=100,
            )
        ],
    ),
@@ -200,33 +207,20 @@ vllm_configs = {
        delayed_start=0,
        timeout=360,
        request_payloads=[
-            # HTTP URL test
            chat_payload(
                [
-                    {"type": "text", "text": "What is in this image?"},
                    {
-                        "type": "image_url",
+                        "type": "text",
-                        "image_url": {
+                        "text": "What colors are in the following image? Respond only with the colors.",
-                            "url": "http://images.cocodataset.org/test2017/000000155781.jpg"
-                        },
                    },
-                ],
-                repeat_count=1,
-                expected_response=["bus"],
-            ),
-            # Base64 data URL test (1x1 PNG inline, avoids network fetch)
-            chat_payload(
-                [
-                    {"type": "text", "text": "What do you see in this image?"},
                    {
                        "type": "image_url",
-                        "image_url": {
+                        "image_url": {"url": MULTIMODAL_IMG_URL},
-                            "url": "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAAAAAA6fptVAAAACklEQVR4nGNoAAAAggCBd81ytgAAAABJRU5ErkJggg=="
-                        },
                    },
                ],
                repeat_count=1,
-                expected_response=[],  # Just validate no error
+                expected_response=["purple"],
+                max_tokens=100,
            ),
        ],
    ),
@@ -343,10 +337,57 @@ def vllm_config_test(request):
 @pytest.mark.vllm
 @pytest.mark.e2e
 def test_serve_deployment(
-    vllm_config_test, request, runtime_services, predownload_models
+    vllm_config_test, request, runtime_services, predownload_models, image_server
 ):
    """
    Test dynamo serve deployments with different graph configurations.
    """
    config = vllm_config_test
    run_serve_deployment(config, request)
+@pytest.mark.vllm
+@pytest.mark.e2e
+@pytest.mark.gpu_2
+def test_multimodal_b64(request, runtime_services, predownload_models):
+    """
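+    Test multimodal inference with an image passed inline as a base64 data URL.
+    This test is kept separate from the parametrized configs because it reads
+    the image file at run time rather than at collection time, so a problem
+    loading the image fails only this test, and only when it is executed.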
+    """
+    # Read the image and base64-encode it at test execution time
+    with open(MULTIMODAL_IMG_PATH, "rb") as f:
+        b64_img = base64.b64encode(f.read()).decode()
+    # Create payload with B64 image
+    b64_payload = chat_payload(
+        [
+            {
+                "type": "text",
+                "text": "What colors are in the following image? Respond only with the colors.",
+            },
+            {
+                "type": "image_url",
+                "image_url": {"url": f"data:image/png;base64,{b64_img}"},
+            },
+        ],
+        repeat_count=1,
+        expected_response=["purple"],
+        max_tokens=100,
+    )
+    # Create test config
+    config = VLLMConfig(
+        name="test_multimodal_b64",
+        directory=vllm_dir,
+        script_name="agg_multimodal.sh",
+        marks=[],  # markers at function-level
+        model="Qwen/Qwen2.5-VL-7B-Instruct",
+        script_args=["--model", "Qwen/Qwen2.5-VL-7B-Instruct"],
+        delayed_start=0,
+        timeout=360,
+        request_payloads=[b64_payload],
+    )
+    run_serve_deployment(config, request)
--- a/tests/utils/client.py
+++ b/tests/utils/client.py
@@ -3,7 +3,9 @@
 import json
 import logging
+import re
 import time
+from copy import deepcopy
 from typing import Any, Dict
 import requests
@@ -11,6 +13,39 @@ import requests
 logger = logging.getLogger(__name__)
+def _truncate_base64_url(url: str, max_length: int = 100) -> str:
+    """Helper to truncate a single base64 data URL."""
+    if (m := re.match(r"^(data:image/[^;]+;base64,)(.+)$", url)) and len(
+        m.group(2)
+    ) > max_length:
+        data = m.group(2)
+        return f"{m.group(1)}{data[:max_length]}...<{len(data)} chars, truncated>"
+    return url
+def _sanitize_payload_for_logging(payload: Dict[str, Any]) -> Dict[str, Any]:
+    """
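+    Truncate base64-encoded images in multimodal payloads for cleaner logging.
+    Multimodal payloads can contain base64 images with multiple MB of data in
+    content parts of the form
+    {"type": "image_url", "image_url": {"url": "data:image/...;base64,<MB of data>"}}.
+    Returns a sanitized deep copy; the payload passed in is not modified.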
+    """
+    sanitized = deepcopy(payload)
+    # Handle chat completions with multimodal content
+    if "messages" in sanitized:
+        for message in sanitized["messages"]:
+            content = message.get("content")
+            # Content can be string or list of content parts (multimodal)
+            if isinstance(content, list):
+                for part in content:
+                    if isinstance(part, dict) and part.get("type") == "image_url":
+                        image_url = part.get("image_url", {})
+                        if "url" in image_url:
+                            image_url["url"] = _truncate_base64_url(image_url["url"])
+    return sanitized
 def send_request(
    url: str,
    payload: Dict[str, Any],
@@ -35,7 +70,10 @@ def send_request(
    """
    method_upper = method.upper()
-    payload_json = json.dumps(payload, indent=2)
+    # Sanitize payload for logging (truncate base64 images)
+    sanitized_payload = _sanitize_payload_for_logging(payload)
+    payload_json = json.dumps(sanitized_payload, indent=2)
    curl_command = ""
    if method_upper == "GET":