Unverified Commit e7544f19 authored by KrishnanPrash's avatar KrishnanPrash Committed by GitHub
Browse files

ci: Testing b64 multimodal support with real image (#4280)


Signed-off-by: default avatarKrishnan Prashanth <kprashanth@nvidia.com>
Signed-off-by: default avatarKrishnanPrash <140860868+KrishnanPrash@users.noreply.github.com>
Co-authored-by: default avatarHarrison Saturley-Hall <hsaturleyhal@nvidia.com>
parent 6e241236
......@@ -25,6 +25,7 @@ pytest-benchmark
pytest-codeblocks
pytest-cov
pytest-forked
pytest-httpserver
pytest-md-report
pytest-mypy
pytest-timeout
......
......@@ -39,13 +39,18 @@ while [[ $# -gt 0 ]]; do
esac
done
# Use TCP transport (instead of default NATS)
# TCP is preferred for multimodal workloads because it avoids the NATS default
# 1MB max payload limit, which base64-encoded images can easily exceed.
export DYN_REQUEST_PLANE=tcp
# Start frontend with Rust OpenAIPreprocessor
python -m dynamo.frontend --http-port=8000 &
# Configure GPU memory optimization for specific models
EXTRA_ARGS=""
if [[ "$MODEL_NAME" == "Qwen/Qwen2.5-VL-7B-Instruct" ]]; then
EXTRA_ARGS="--gpu-memory-utilization 0.85 --max-model-len 2048"
EXTRA_ARGS="--gpu-memory-utilization 0.85 --max-model-len 4096"
elif [[ "$MODEL_NAME" == "llava-hf/llava-1.5-7b-hf" ]]; then
EXTRA_ARGS="--gpu-memory-utilization 0.85 --max-model-len 2048"
fi
......
......@@ -69,7 +69,9 @@ python -m dynamo.frontend --http-port=8000 &
# To make Qwen2.5-VL fit in A100 40GB, set the following extra arguments
EXTRA_ARGS=""
if [[ "$MODEL_NAME" == "Qwen/Qwen2.5-VL-7B-Instruct" ]]; then
EXTRA_ARGS="--gpu-memory-utilization 0.85 --max-model-len 2048"
EXTRA_ARGS="--gpu-memory-utilization 0.85 --max-model-len 4096"
elif [[ "$MODEL_NAME" == "llava-hf/llava-1.5-7b-hf" ]]; then
EXTRA_ARGS="--gpu-memory-utilization 0.85 --max-model-len 4096"
fi
# Start processor (Python-based preprocessing, handles prompt templating)
......
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
import os
import pytest
from pytest_httpserver import HTTPServer
from dynamo.common.utils.paths import WORKSPACE_DIR
# Shared constants for multimodal testing
# Port used both for the HTTP server's listen address (see
# httpserver_listen_address) and for the image URL below.
IMAGE_SERVER_PORT = 8765
# Location of the PNG used by the multimodal tests, joined onto WORKSPACE_DIR.
MULTIMODAL_IMG_PATH = os.path.join(
WORKSPACE_DIR, "lib/llm/tests/data/media/llm-optimize-deploy-graphic.png"
)
# URL for the "/llm-graphic.png" route registered by the image_server fixture.
# NOTE(review): this uses "localhost" while httpserver_listen_address binds
# "127.0.0.1" -- confirm "localhost" resolves to IPv4 loopback in CI.
MULTIMODAL_IMG_URL = f"http://localhost:{IMAGE_SERVER_PORT}/llm-graphic.png"
@pytest.fixture(scope="session")
def httpserver_listen_address():
    """Return the (host, port) pair the test HTTP server should listen on.

    The host is the IPv4 loopback address and the port is the shared
    ``IMAGE_SERVER_PORT`` constant, so image URLs are stable across tests.
    """
    loopback_host = "127.0.0.1"
    return (loopback_host, IMAGE_SERVER_PORT)
@pytest.fixture(scope="function")
def image_server(httpserver: HTTPServer):
    """
    Expose the multimodal test image over HTTP for a single test.

    The fixture reads the PNG at ``MULTIMODAL_IMG_PATH`` and registers one
    route on the supplied pytest-httpserver instance:

        /llm-graphic.png  ->  image bytes, served as ``image/png``

    Example:
        def test_multimodal(image_server):
            url = "http://localhost:8765/llm-graphic.png"
            # ... put url into the request payload

    Returns:
        The same ``httpserver`` object, with the image route registered.
    """
    route = "/llm-graphic.png"

    # Read the shared test image once per test invocation.
    with open(MULTIMODAL_IMG_PATH, "rb") as image_file:
        png_bytes = image_file.read()

    # Answer requests for the route with the raw PNG bytes.
    handler = httpserver.expect_request(route)
    handler.respond_with_data(png_bytes, content_type="image/png")

    return httpserver
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
import base64
import logging
import os
from dataclasses import dataclass, field
......@@ -12,6 +13,7 @@ from tests.serve.common import (
params_with_model_mark,
run_serve_deployment,
)
from tests.serve.conftest import MULTIMODAL_IMG_PATH, MULTIMODAL_IMG_URL
from tests.utils.engine_process import EngineConfig
from tests.utils.payload_builder import (
chat_payload,
......@@ -34,6 +36,7 @@ vllm_dir = os.environ.get("VLLM_DIR") or os.path.join(
WORKSPACE_DIR, "examples/backends/vllm"
)
# vLLM test configurations
vllm_configs = {
"aggregated": VLLMConfig(
......@@ -151,17 +154,19 @@ vllm_configs = {
request_payloads=[
chat_payload(
[
{"type": "text", "text": "What is in this image?"},
{
"type": "image_url",
"image_url": {
"url": "http://images.cocodataset.org/test2017/000000155781.jpg"
"type": "text",
"text": "What colors are in the following image? Respond only with the colors.",
},
{
"type": "image_url",
"image_url": {"url": MULTIMODAL_IMG_URL},
},
],
repeat_count=1,
expected_response=["bus"],
expected_response=["purple"],
temperature=0.0,
max_tokens=100,
)
],
),
......@@ -177,16 +182,18 @@ vllm_configs = {
request_payloads=[
chat_payload(
[
{"type": "text", "text": "What is in this image?"},
{
"type": "image_url",
"image_url": {
"url": "http://images.cocodataset.org/test2017/000000155781.jpg"
"type": "text",
"text": "What colors are in the following image? Respond only with the colors.",
},
{
"type": "image_url",
"image_url": {"url": MULTIMODAL_IMG_URL},
},
],
repeat_count=1,
expected_response=["bus"],
expected_response=["purple"],
max_tokens=100,
)
],
),
......@@ -200,33 +207,20 @@ vllm_configs = {
delayed_start=0,
timeout=360,
request_payloads=[
# HTTP URL test
chat_payload(
[
{"type": "text", "text": "What is in this image?"},
{
"type": "image_url",
"image_url": {
"url": "http://images.cocodataset.org/test2017/000000155781.jpg"
},
"type": "text",
"text": "What colors are in the following image? Respond only with the colors.",
},
],
repeat_count=1,
expected_response=["bus"],
),
# Base64 data URL test (1x1 PNG inline, avoids network fetch)
chat_payload(
[
{"type": "text", "text": "What do you see in this image?"},
{
"type": "image_url",
"image_url": {
"url": "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAAAAAA6fptVAAAACklEQVR4nGNoAAAAggCBd81ytgAAAABJRU5ErkJggg=="
},
"image_url": {"url": MULTIMODAL_IMG_URL},
},
],
repeat_count=1,
expected_response=[], # Just validate no error
expected_response=["purple"],
max_tokens=100,
),
],
),
......@@ -343,10 +337,57 @@ def vllm_config_test(request):
@pytest.mark.vllm
@pytest.mark.e2e
def test_serve_deployment(
vllm_config_test, request, runtime_services, predownload_models
vllm_config_test, request, runtime_services, predownload_models, image_server
):
"""
Test dynamo serve deployments with different graph configurations.
"""
config = vllm_config_test
run_serve_deployment(config, request)
@pytest.mark.vllm
@pytest.mark.e2e
@pytest.mark.gpu_2
def test_multimodal_b64(request, runtime_services, predownload_models):
    """
    Exercise multimodal inference when the image arrives as a base64 data URL.

    The image file is read inside the test body rather than at import or
    collection time, so a missing file only fails this test when it actually
    runs. That is why this case is not part of the static config table.
    """
    model_id = "Qwen/Qwen2.5-VL-7B-Instruct"

    # Encode the image into a data URL at execution time.
    with open(MULTIMODAL_IMG_PATH, "rb") as image_file:
        raw_image = image_file.read()
    b64_img = base64.b64encode(raw_image).decode()
    data_url = f"data:image/png;base64,{b64_img}"

    # One chat request: a text question followed by the inline image.
    question_part = {
        "type": "text",
        "text": "What colors are in the following image? Respond only with the colors.",
    }
    image_part = {
        "type": "image_url",
        "image_url": {"url": data_url},
    }
    b64_payload = chat_payload(
        [question_part, image_part],
        repeat_count=1,
        expected_response=["purple"],
        max_tokens=100,
    )

    # Deployment configuration for the aggregated multimodal launch script.
    config = VLLMConfig(
        name="test_multimodal_b64",
        directory=vllm_dir,
        script_name="agg_multimodal.sh",
        marks=[],  # markers at function-level
        model=model_id,
        script_args=["--model", model_id],
        delayed_start=0,
        timeout=360,
        request_payloads=[b64_payload],
    )

    run_serve_deployment(config, request)
......@@ -3,7 +3,9 @@
import json
import logging
import re
import time
from copy import deepcopy
from typing import Any, Dict
import requests
......@@ -11,6 +13,39 @@ import requests
logger = logging.getLogger(__name__)
def _truncate_base64_url(url: str, max_length: int = 100) -> str:
    """Shorten the base64 payload of an image data URL for logging.

    Args:
        url: Any URL string. Only ``data:image/<subtype>;base64,<data>`` URLs
            are candidates for truncation.
        max_length: Maximum number of payload characters to keep.

    Returns:
        ``url`` unchanged when it is not a base64 image data URL or when its
        payload is at most ``max_length`` characters long. Otherwise the
        prefix, the first ``max_length`` payload characters, and a marker
        stating the original payload length.
    """
    # re.DOTALL lets the payload group span line breaks: line-wrapped base64
    # (e.g. MIME-style 76-column output) must still be recognised, otherwise
    # the whole multi-megabyte string would be logged untouched.
    m = re.match(r"^(data:image/[^;]+;base64,)(.+)$", url, re.DOTALL)
    if m is None:
        return url

    prefix, data = m.groups()
    if len(data) <= max_length:
        return url
    return f"{prefix}{data[:max_length]}...<{len(data)} chars, truncated>"
def _sanitize_payload_for_logging(payload: Dict[str, Any]) -> Dict[str, Any]:
    """
    Return a deep copy of ``payload`` with base64 image data shortened.

    Multimodal chat payloads can carry several MB of base64 data in content
    parts of the form ``{"type": "image_url", "image_url": ...}``. Both the
    nested form (``"image_url": {"url": "data:..."}``) and the flat form
    (``"image_url": "data:..."``) are handled.

    This helper exists only to make log output readable, so it never raises
    on unexpected shapes: anything that is not a recognised image part is
    copied through untouched. The input ``payload`` is never mutated.

    Args:
        payload: Request body about to be sent.

    Returns:
        A sanitized deep copy suitable for ``json.dumps`` in log messages.
    """
    sanitized = deepcopy(payload)

    # Only chat-completions style payloads carry multimodal content parts.
    messages = sanitized.get("messages")
    if not isinstance(messages, (list, tuple)):
        return sanitized

    for message in messages:
        if not isinstance(message, dict):
            continue
        content = message.get("content")
        # Content is either a plain string or a list of content parts.
        if not isinstance(content, list):
            continue
        for part in content:
            if not isinstance(part, dict) or part.get("type") != "image_url":
                continue
            image_url = part.get("image_url")
            if isinstance(image_url, str):
                # Flat form: the URL is stored directly on the part.
                part["image_url"] = _truncate_base64_url(image_url)
            elif isinstance(image_url, dict) and isinstance(
                image_url.get("url"), str
            ):
                image_url["url"] = _truncate_base64_url(image_url["url"])

    return sanitized
def send_request(
url: str,
payload: Dict[str, Any],
......@@ -35,7 +70,10 @@ def send_request(
"""
method_upper = method.upper()
payload_json = json.dumps(payload, indent=2)
# Sanitize payload for logging (truncate base64 images)
sanitized_payload = _sanitize_payload_for_logging(payload)
payload_json = json.dumps(sanitized_payload, indent=2)
curl_command = ""
if method_upper == "GET":
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment