chore: add multimodal image benchmark scripts for performance evaluation (#5509)

Signed-off-by: Guan Luo <41310872+GuanLuo@users.noreply.github.com>

chore: add multimodal image benchmark scripts for performance evaluation (#5509)
Signed-off-by: Guan Luo <41310872+GuanLuo@users.noreply.github.com>
903f8184 · GuanLuo · GitHub · 8e72fb69 · 903f8184 · 903f8184
Unverified Commit 903f8184 authored Jan 27, 2026 by GuanLuo Committed by GitHub Jan 27, 2026
9 changed files
--- a/benchmarks/multimodal/image/aiperf_large_image.sh
+++ b/benchmarks/multimodal/image/aiperf_large_image.sh
+#!/bin/bash
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+MODEL_NAME="Qwen/Qwen2.5-VL-7B-Instruct"
+CONCURRENCY=1
+
+IMG_URL="https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/duck.jpg"
+
+# Create a JSONL file with 11 identical large image URLs
+# NOTE: any kind of caching can significantly affect the benchmark results,
+# should make sure what you are doing.
+echo '{"images": ["'"$IMG_URL"'", "'"$IMG_URL"'", "'"$IMG_URL"'", "'"$IMG_URL"'", "'"$IMG_URL"'", "'"$IMG_URL"'", "'"$IMG_URL"'", "'"$IMG_URL"'", "'"$IMG_URL"'", "'"$IMG_URL"'", "'"$IMG_URL"'"]}' \
+    > data_large.jsonl
+echo "This benchmark uses duplicate image urls, so any kind of caching can significantly affect the benchmark results, please make sure the caching setting is properly configured for your experiment."
+
+# Parse command line arguments
+while [[ $# -gt 0 ]]; do
+    case $1 in
+        --model)
+            MODEL_NAME=$2
+            shift 2
+            ;;
+        --concurrency)
+            CONCURRENCY=$2
+            shift 2
+            ;;
+        -h|--help)
+            echo "Usage: $0 [OPTIONS]"
+            echo "Options:"
+            echo "  --model <model_name> Specify the model to use (default: $MODEL_NAME)"
+            echo "  --concurrency <level> Specify the concurrency level to use (default: $CONCURRENCY)"
+            echo "  -h, --help           Show this help message"
+            exit 0
+            ;;
+        *)
+            echo "Unknown option: $1"
+            echo "Use --help for usage information"
+            exit 1
+            ;;
+    esac
+done
+
+aiperf profile -m $MODEL_NAME --endpoint-type chat \
+    --synthetic-input-tokens-mean 1 --synthetic-input-tokens-stddev 0 \
+    --streaming --request-count 20 --warmup-request-count 2 \
+    --concurrency $CONCURRENCY --osl 1 \
+    --input-file data_large.jsonl \
+    --custom-dataset-type single_turn --ui none \
+    --no-server-metrics
--- a/benchmarks/multimodal/image/aiperf_small_image.sh
+++ b/benchmarks/multimodal/image/aiperf_small_image.sh
+#!/bin/bash
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+MODEL_NAME="Qwen/Qwen2.5-VL-7B-Instruct"
+CONCURRENCY=1
+
+# 640 * 424 pixels image
+IMG_URL="http://images.cocodataset.org/test2017/000000155781.jpg"
+
+# Create a JSONL file with 12 identical small image URLs
+# NOTE: any kind of caching can significantly affect the benchmark results,
+# should make sure what you are doing.
+echo '{"images": ["'"$IMG_URL"'","'"$IMG_URL"'","'"$IMG_URL"'","'"$IMG_URL"'","'"$IMG_URL"'","'"$IMG_URL"'","'"$IMG_URL"'","'"$IMG_URL"'","'"$IMG_URL"'","'"$IMG_URL"'","'"$IMG_URL"'","'"$IMG_URL"'"]}' \
+    > data_small.jsonl
+echo "This benchmark uses duplicate image urls, so any kind of caching can significantly affect the benchmark results, please make sure the caching setting is properly configured for your experiment."
+
+# Parse command line arguments
+while [[ $# -gt 0 ]]; do
+    case $1 in
+        --model)
+            MODEL_NAME=$2
+            shift 2
+            ;;
+        --concurrency)
+            CONCURRENCY=$2
+            shift 2
+            ;;
+        -h|--help)
+            echo "Usage: $0 [OPTIONS]"
+            echo "Options:"
+            echo "  --model <model_name> Specify the model to use (default: $MODEL_NAME)"
+            echo "  --concurrency <level> Specify the concurrency level to use (default: $CONCURRENCY)"
+            echo "  -h, --help           Show this help message"
+            exit 0
+            ;;
+        *)
+            echo "Unknown option: $1"
+            echo "Use --help for usage information"
+            exit 1
+            ;;
+    esac
+done
+
+aiperf profile -m $MODEL_NAME --endpoint-type chat \
+    --synthetic-input-tokens-mean 1 --synthetic-input-tokens-stddev 0 \
+    --streaming --request-count 100 --warmup-request-count 2 \
+    --concurrency $CONCURRENCY --osl 1 \
+    --input-file data_small.jsonl \
+    --custom-dataset-type single_turn --ui none \
+    --no-server-metrics
--- a/benchmarks/multimodal/image/aiperf_small_image_50_isl_ratio.sh
+++ b/benchmarks/multimodal/image/aiperf_small_image_50_isl_ratio.sh
+#!/bin/bash
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+MODEL_NAME="Qwen/Qwen2.5-VL-7B-Instruct"
+CONCURRENCY=1
+
+# 500 * 333 pixels image
+IMG_URL="http://images.cocodataset.org/test2017/000000000183.jpg"
+
+# Create a JSONL file with 30 identical small image URLs
+# NOTE: any kind of caching can significantly affect the benchmark results,
+# should make sure what you are doing.
+# ~ 11 tokens
+DUMMY_PROMPT="This is a prompt to describe the image content briefly."
+for i in {1..1500}; do
+    DUMMY_PROMPT+=" This is a prompt to describe the image content briefly."
+done
+echo '{"texts": ["'"$DUMMY_PROMPT"'"], "images": ["'"$IMG_URL"'","'"$IMG_URL"'","'"$IMG_URL"'","'"$IMG_URL"'","'"$IMG_URL"'","'"$IMG_URL"'","'"$IMG_URL"'","'"$IMG_URL"'","'"$IMG_URL"'","'"$IMG_URL"'","'"$IMG_URL"'","'"$IMG_URL"'","'"$IMG_URL"'","'"$IMG_URL"'","'"$IMG_URL"'","'"$IMG_URL"'","'"$IMG_URL"'","'"$IMG_URL"'","'"$IMG_URL"'","'"$IMG_URL"'","'"$IMG_URL"'","'"$IMG_URL"'","'"$IMG_URL"'","'"$IMG_URL"'","'"$IMG_URL"'","'"$IMG_URL"'","'"$IMG_URL"'","'"$IMG_URL"'","'"$IMG_URL"'","'"$IMG_URL"'"]}' \
+    > data_small.jsonl
+echo "This benchmark uses duplicate image urls, so any kind of caching can significantly affect the benchmark results, please make sure the caching setting is properly configured for your experiment."
+
+# Parse command line arguments
+while [[ $# -gt 0 ]]; do
+    case $1 in
+        --model)
+            MODEL_NAME=$2
+            shift 2
+            ;;
+        --concurrency)
+            CONCURRENCY=$2
+            shift 2
+            ;;
+        -h|--help)
+            echo "Usage: $0 [OPTIONS]"
+            echo "Options:"
+            echo "  --model <model_name> Specify the model to use (default: $MODEL_NAME)"
+            echo "  --concurrency <level> Specify the concurrency level to use (default: $CONCURRENCY)"
+            echo "  -h, --help           Show this help message"
+            exit 0
+            ;;
+        *)
+            echo "Unknown option: $1"
+            echo "Use --help for usage information"
+            exit 1
+            ;;
+    esac
+done
+
+aiperf profile -m $MODEL_NAME --endpoint-type chat \
+    --streaming --request-count 100 --warmup-request-count 5 \
+    --concurrency $CONCURRENCY --osl 1 \
+    --input-file data_small.jsonl \
+    --custom-dataset-type single_turn --ui none \
+    --no-server-metrics
--- a/benchmarks/multimodal/local_media_server.py
+++ b/benchmarks/multimodal/local_media_server.py
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+from http.server import BaseHTTPRequestHandler, HTTPServer
+from io import BytesIO
+from urllib.parse import urlparse
+
+import requests
+
+# [NOTE] This is kept as a reference in case we need to run a local media server to eliminate the influence of the image server.
+# However, this implementation is not used, as it was actually slower than directly using public image URLs in our benchmark experiments.
+#
+# Example usage:
+# python local_media_server.py \
+#     --image test.jpg:https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/duck.jpg &
+# IMG_SERVER_PID=$!
+# trap "kill $IMG_SERVER_PID" EXIT
+
+# # Wait for the server to start
+# for i in {1..10}; do
+#     HTTP_CODE=$(curl -s -o /dev/null -w "%{http_code}" localhost:8233/test.jpg)
+#     if [[ "$HTTP_CODE" -eq 200 ]]; then
+#         echo "Server is responding with HTTP 200."
+#         break
+#     else
+#         echo "Server did not respond with HTTP 200. Response code: $HTTP_CODE. Retrying in 1 second..."
+#         sleep 1
+#     fi
+#     if [[ $i -eq 10 ]]; then
+#         echo "Server did not respond with HTTP 200 after 10 attempts. Exiting."
+#         exit 1
+#     fi
+# done
+
+
+class LocalMediaServer(BaseHTTPRequestHandler):
+    image_store = {}
+
+    @classmethod
+    def initialize_images(cls, images):
+        for name, url in images.items():
+            try:
+                response = requests.get(url)
+                if response.status_code == 200:
+                    cls.image_store[name] = BytesIO(response.content)
+                else:
+                    print(f"Failed to load image from {url}")
+            except Exception as e:
+                print(f"Error loading image from {url}: {e}")
+
+    def do_GET(self):
+        parsed_path = urlparse(self.path)
+        resource = parsed_path.path.lstrip("/")
+
+        if resource and resource in self.image_store:
+            self.send_response(200)
+            self.send_header("Content-type", "image/jpeg")
+            self.end_headers()
+            self.wfile.write(self.image_store[resource].getvalue())
+        else:
+            self.send_response(404)
+            self.end_headers()
+            self.wfile.write(b"Image not found")
+
+
+def run_server(port, images):
+    LocalMediaServer.initialize_images(images)
+    server_address = ("", port)
+    httpd = HTTPServer(server_address, LocalMediaServer)
+    print(f"Server running on port {port}")
+    httpd.serve_forever()
+
+
+if __name__ == "__main__":
+    # Example usage
+    parser = argparse.ArgumentParser(description="Start a local media server.")
+    parser.add_argument(
+        "--image",
+        action="append",
+        help='Specify images in the format "file_name:url". Can be used multiple times.',
+        required=True,
+    )
+    parser.add_argument(
+        "--port",
+        type=int,
+        default=8233,
+        help="Specify the port number for the server. Default is 8233.",
+    )
+    args = parser.parse_args()
+
+    images = {}
+    for image_arg in args.image:
+        try:
+            file_name, url = image_arg.split(":", 1)
+            images[file_name] = url
+        except ValueError:
+            print(
+                f"Invalid format for image argument: {image_arg}. Expected format is 'file_name:url'."
+            )
+            exit(1)
+    run_server(args.port, images)
--- a/components/src/dynamo/vllm/multimodal_handlers/encode_worker_handler.py
+++ b/components/src/dynamo/vllm/multimodal_handlers/encode_worker_handler.py
 # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0

+import asyncio
 import logging
 import os
 import shutil
@@ -142,7 +143,8 @@ class EncodeWorkerHandler:
                image_embeds = self.image_processor(images=image, return_tensors="pt")

                # Encode the image embeddings using model-specific encoder
-                embeddings = encode_image_embeddings(
+                embeddings = await asyncio.to_thread(
+                    encode_image_embeddings,
                    model_name=self.model,
                    image_embeds=image_embeds,
                    vision_encoder=self.vision_encoder,

--- a/components/src/dynamo/vllm/multimodal_utils/encode_utils.py
+++ b/components/src/dynamo/vllm/multimodal_utils/encode_utils.py
@@ -16,6 +16,7 @@
 import hashlib
 import json
 import logging
+import os
 from typing import Any, Dict, Optional

 import torch
@@ -25,6 +26,10 @@ from .model import SupportedModels, is_model_supported, is_qwen_vl_model

 logger = logging.getLogger(__name__)

+# [gluo NOTE] Debug flag to compare the vLLM encoder against the transformers encoder;
+# it should be removed once there is a proper way to extract the vLLM encoder.
+VLLM_ENCODER = int(os.getenv("VLLM_ENCODER", 1))
+

 def get_embedding_hash(key: str) -> str:
    """
@@ -54,6 +59,16 @@ def get_qwen_image_features(
    Raises:
        ValueError: If grid_thw is not provided for Qwen model
    """
+    logger.debug(f"Encoding image of shape: {image_embeds['pixel_values'].shape}")
+    if VLLM_ENCODER:
+        pixel_values = image_embeds["pixel_values"].to(vision_encoder.device)
+        grid_thw = image_embeds.get("image_grid_thw")
+        if grid_thw is None:
+            raise ValueError("grid_thw is not provided")
+        grid_thw = grid_thw.tolist()
+        image_embeds = vision_encoder(pixel_values, grid_thw=grid_thw)
+        return image_embeds
+
    pixel_values = image_embeds["pixel_values"].to(vision_encoder.device)

    grid_thw = image_embeds.get("image_grid_thw", None)

--- a/components/src/dynamo/vllm/multimodal_utils/model.py
+++ b/components/src/dynamo/vllm/multimodal_utils/model.py
@@ -14,14 +14,21 @@
 # limitations under the License.

 import logging
+import os
 from pathlib import Path
 from typing import Any, Dict, List, Optional

 import torch
 from transformers import AutoModel
+from vllm import LLM
+from vllm.utils.system_utils import update_environment_variables

 logger = logging.getLogger(__name__)

+# [gluo NOTE] Debug flag to compare the vLLM encoder against the transformers encoder;
+# it should be removed once there is a proper way to extract the vLLM encoder.
+VLLM_ENCODER = int(os.getenv("VLLM_ENCODER", 1))
+

 class SupportedModels:
    """Supported multimodal model identifiers"""
@@ -129,10 +136,27 @@ def load_vision_model(model_id: str) -> torch.nn.Module:
    """
    Load a vision model from a HuggingFace model ID.
    """
-    model = AutoModel.from_pretrained(
+    if VLLM_ENCODER and is_qwen_vl_model(model_id):
+        # Disable to get ViT from the same process
+        update_environment_variables(
+            {
+                "VLLM_ENABLE_V1_MULTIPROCESSING": "0",
+            }
+        )
+        # [gluo NOTE] this actually loads the full model,
+        # which requires more GPU memory than needed.
+        vllm_model = LLM(
+            model=model_id,
+            enforce_eager=True,
+            gpu_memory_utilization=0.4,
+            max_model_len=10,
+        )
+        return (
+            vllm_model.llm_engine.engine_core.engine_core.model_executor.driver_worker.worker.model_runner.model.visual
+        )
+    return AutoModel.from_pretrained(
        model_id, device_map="auto", torch_dtype=torch.float16, trust_remote_code=True
    )
-    return model


 def construct_mm_data(

--- a/examples/backends/vllm/launch/agg_multimodal.sh
+++ b/examples/backends/vllm/launch/agg_multimodal.sh
@@ -57,7 +57,7 @@ MODEL_SPECIFIC_ARGS=""
 if [[ "$MODEL_NAME" == "Qwen/Qwen2.5-VL-7B-Instruct" ]]; then
    MODEL_SPECIFIC_ARGS="--gpu-memory-utilization 0.85 --max-model-len 4096"
 elif [[ "$MODEL_NAME" == "llava-hf/llava-1.5-7b-hf" ]]; then
-    MODEL_SPECIFIC_ARGS="--gpu-memory-utilization 0.85 --max-model-len 2048"
+    MODEL_SPECIFIC_ARGS="--gpu-memory-utilization 0.85 --max-model-len 4096"
 fi

 # Start vLLM worker with vision model
@@ -66,7 +66,7 @@ fi
 # --connector none: No KV transfer needed for aggregated serving
 # Extra args from command line come last to allow overrides
 DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT:-8081} \
-    python -m dynamo.vllm --enable-multimodal --model $MODEL_NAME --enforce-eager --connector none $MODEL_SPECIFIC_ARGS "${EXTRA_ARGS[@]}"
+    python -m dynamo.vllm --enable-multimodal --model $MODEL_NAME --connector none $MODEL_SPECIFIC_ARGS "${EXTRA_ARGS[@]}"

 # Wait for all background processes to complete
 wait

--- a/examples/backends/vllm/launch/agg_multimodal_epd.sh
+++ b/examples/backends/vllm/launch/agg_multimodal_epd.sh
@@ -55,10 +55,10 @@ python -m dynamo.frontend &
 # Multi-GPU mode: Each worker gets its own GPU, so use higher memory settings
 EXTRA_ARGS=""
 if [[ "$SINGLE_GPU" == "true" ]]; then
-    EXTRA_ARGS="--gpu-memory-utilization 0.5 --enforce-eager --max-model-len 30426"
+    EXTRA_ARGS="--gpu-memory-utilization 0.4 --enforce-eager --max-model-len 30426"
 else
    # Multi-GPU mode: standard memory settings
-    EXTRA_ARGS="--gpu-memory-utilization 0.85 --max-model-len 34096"
+    EXTRA_ARGS="--gpu-memory-utilization 0.85 --max-model-len 30426"
 fi

 # Start processor (Python-based preprocessing, handles prompt templating)
@@ -69,10 +69,13 @@ python -m dynamo.vllm --multimodal-processor --enable-multimodal --model $MODEL_
 if [[ "$SINGLE_GPU" == "true" ]]; then
    # Single GPU mode: both workers share GPU 0 with reduced memory
    CUDA_VISIBLE_DEVICES=0 python -m dynamo.vllm --multimodal-encode-worker --enable-multimodal --model $MODEL_NAME $EXTRA_ARGS &
+    # Both the encode worker and the PD worker are now vLLM engines, so make sure they are not initialized concurrently
+    # on the same GPU; otherwise they can interfere with each other's startup process (checks and allocations).
+    sleep 60
    CUDA_VISIBLE_DEVICES=0 python -m dynamo.vllm --multimodal-worker --enable-multimodal --enable-mm-embeds --model $MODEL_NAME $EXTRA_ARGS &
 else
-    CUDA_VISIBLE_DEVICES=0 python -m dynamo.vllm --multimodal-encode-worker --enable-multimodal --model $MODEL_NAME $EXTRA_ARGS &
-    CUDA_VISIBLE_DEVICES=1 python -m dynamo.vllm --multimodal-worker --enable-multimodal --enable-mm-embeds --model $MODEL_NAME $EXTRA_ARGS &
+    CUDA_VISIBLE_DEVICES=0 python -m dynamo.vllm --multimodal-worker --enable-multimodal --enable-mm-embeds --model $MODEL_NAME $EXTRA_ARGS &
+    CUDA_VISIBLE_DEVICES=1 python -m dynamo.vllm --multimodal-encode-worker --enable-multimodal --model $MODEL_NAME $EXTRA_ARGS &
 fi

 # Wait for all background processes to complete