Unverified Commit 903f8184 authored by GuanLuo's avatar GuanLuo Committed by GitHub
Browse files

chore: add multimodal image benchmark scripts for performance evaluation (#5509)


Signed-off-by: default avatarGuan Luo <41310872+GuanLuo@users.noreply.github.com>
parent 8e72fb69
#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Benchmark a chat endpoint with aiperf using a single-entry dataset whose
# request references the same large image URL 11 times.
#
# Usage: <this script> [--model <model_name>] [--concurrency <level>] [-h|--help]
#
# NOTE: every request reuses one image URL, so any kind of caching can
# significantly affect the benchmark results. Make sure the caching
# configuration matches what you intend to measure.

MODEL_NAME="Qwen/Qwen2.5-VL-7B-Instruct"
CONCURRENCY=1
IMG_URL="https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/duck.jpg"
# Number of times IMG_URL is repeated inside the single dataset entry.
IMAGE_COUNT=11
# aiperf input file generated by this script (relative to the current directory).
DATA_FILE="data_large.jsonl"

# Print the command line help. The defaults shown are the current values of
# MODEL_NAME and CONCURRENCY.
usage() {
  echo "Usage: $0 [OPTIONS]"
  echo "Options:"
  echo " --model <model_name> Specify the model to use (default: $MODEL_NAME)"
  echo " --concurrency <level> Specify the concurrency level to use (default: $CONCURRENCY)"
  echo " -h, --help Show this help message"
}

# Parse command line arguments into MODEL_NAME and CONCURRENCY.
# Exits with 0 after printing help, and with 1 on an unknown option or when an
# option is missing its value.
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --model | --concurrency)
        # `shift 2` fails and consumes nothing when only the flag is left,
        # which would make this loop spin forever; reject that case up front.
        if [[ $# -lt 2 ]]; then
          echo "Option $1 requires a value" >&2
          echo "Use --help for usage information" >&2
          exit 1
        fi
        if [[ "$1" == "--model" ]]; then
          MODEL_NAME=$2
        else
          CONCURRENCY=$2
        fi
        shift 2
        ;;
      -h | --help)
        usage
        exit 0
        ;;
      *)
        echo "Unknown option: $1" >&2
        echo "Use --help for usage information" >&2
        exit 1
        ;;
    esac
  done
}

# Write DATA_FILE: one JSON line of the form {"images": ["<url>", "<url>", ...]}
# containing IMAGE_COUNT copies of IMG_URL.
write_dataset() {
  local urls="" i
  for ((i = 0; i < IMAGE_COUNT; i++)); do
    # ${urls:+, } adds the separator only once the list is non-empty.
    urls+="${urls:+, }\"${IMG_URL}\""
  done
  printf '{"images": [%s]}\n' "$urls" > "$DATA_FILE"
}

# Entry point: parse options, generate the dataset, then run aiperf.
# The return status is that of aiperf (or 1 if the dataset cannot be written).
main() {
  parse_args "$@"

  if ! write_dataset; then
    echo "Failed to write $DATA_FILE" >&2
    return 1
  fi
  echo "This benchmark uses duplicate image urls, so any kind of caching can significantly affect the benchmark results, please make sure the caching setting is properly configured for your experiment."

  aiperf profile -m "$MODEL_NAME" --endpoint-type chat \
    --synthetic-input-tokens-mean 1 --synthetic-input-tokens-stddev 0 \
    --streaming --request-count 20 --warmup-request-count 2 \
    --concurrency "$CONCURRENCY" --osl 1 \
    --input-file "$DATA_FILE" \
    --custom-dataset-type single_turn --ui none \
    --no-server-metrics
}

main "$@"
#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Benchmark a chat endpoint with aiperf using a single-entry dataset whose
# request references the same small image URL 12 times.
#
# Usage: <this script> [--model <model_name>] [--concurrency <level>] [-h|--help]
#
# NOTE: every request reuses one image URL, so any kind of caching can
# significantly affect the benchmark results. Make sure the caching
# configuration matches what you intend to measure.

MODEL_NAME="Qwen/Qwen2.5-VL-7B-Instruct"
CONCURRENCY=1
# 640 * 424 pixels image
IMG_URL="http://images.cocodataset.org/test2017/000000155781.jpg"
# Number of times IMG_URL is repeated inside the single dataset entry.
IMAGE_COUNT=12
# aiperf input file generated by this script (relative to the current directory).
DATA_FILE="data_small.jsonl"

# Print the command line help. The defaults shown are the current values of
# MODEL_NAME and CONCURRENCY.
usage() {
  echo "Usage: $0 [OPTIONS]"
  echo "Options:"
  echo " --model <model_name> Specify the model to use (default: $MODEL_NAME)"
  echo " --concurrency <level> Specify the concurrency level to use (default: $CONCURRENCY)"
  echo " -h, --help Show this help message"
}

# Parse command line arguments into MODEL_NAME and CONCURRENCY.
# Exits with 0 after printing help, and with 1 on an unknown option or when an
# option is missing its value.
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --model | --concurrency)
        # `shift 2` fails and consumes nothing when only the flag is left,
        # which would make this loop spin forever; reject that case up front.
        if [[ $# -lt 2 ]]; then
          echo "Option $1 requires a value" >&2
          echo "Use --help for usage information" >&2
          exit 1
        fi
        if [[ "$1" == "--model" ]]; then
          MODEL_NAME=$2
        else
          CONCURRENCY=$2
        fi
        shift 2
        ;;
      -h | --help)
        usage
        exit 0
        ;;
      *)
        echo "Unknown option: $1" >&2
        echo "Use --help for usage information" >&2
        exit 1
        ;;
    esac
  done
}

# Write DATA_FILE: one JSON line of the form {"images": ["<url>","<url>",...]}
# containing IMAGE_COUNT copies of IMG_URL.
write_dataset() {
  local urls="" i
  for ((i = 0; i < IMAGE_COUNT; i++)); do
    # ${urls:+,} adds the separator only once the list is non-empty.
    urls+="${urls:+,}\"${IMG_URL}\""
  done
  printf '{"images": [%s]}\n' "$urls" > "$DATA_FILE"
}

# Entry point: parse options, generate the dataset, then run aiperf.
# The return status is that of aiperf (or 1 if the dataset cannot be written).
main() {
  parse_args "$@"

  if ! write_dataset; then
    echo "Failed to write $DATA_FILE" >&2
    return 1
  fi
  echo "This benchmark uses duplicate image urls, so any kind of caching can significantly affect the benchmark results, please make sure the caching setting is properly configured for your experiment."

  aiperf profile -m "$MODEL_NAME" --endpoint-type chat \
    --synthetic-input-tokens-mean 1 --synthetic-input-tokens-stddev 0 \
    --streaming --request-count 100 --warmup-request-count 2 \
    --concurrency "$CONCURRENCY" --osl 1 \
    --input-file "$DATA_FILE" \
    --custom-dataset-type single_turn --ui none \
    --no-server-metrics
}

main "$@"
#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Benchmark a chat endpoint with aiperf using a single-entry dataset whose
# request carries a long repeated text prompt plus the same small image URL
# 30 times.
#
# Usage: <this script> [--model <model_name>] [--concurrency <level>] [-h|--help]
#
# NOTE: every request reuses one image URL, so any kind of caching can
# significantly affect the benchmark results. Make sure the caching
# configuration matches what you intend to measure.

MODEL_NAME="Qwen/Qwen2.5-VL-7B-Instruct"
CONCURRENCY=1
# 500 * 333 pixels image
IMG_URL="http://images.cocodataset.org/test2017/000000000183.jpg"
# Number of times IMG_URL is repeated inside the single dataset entry.
IMAGE_COUNT=30
# The prompt is one sentence followed by PROMPT_REPEATS more copies of it.
# ~ 11 tokens per sentence
PROMPT_SENTENCE="This is a prompt to describe the image content briefly."
PROMPT_REPEATS=1500
# aiperf input file generated by this script (relative to the current directory).
# NOTE(review): this is the same file name the 12-image script writes; running
# both from one directory overwrites the file - confirm this is intended.
DATA_FILE="data_small.jsonl"

# Print the command line help. The defaults shown are the current values of
# MODEL_NAME and CONCURRENCY.
usage() {
  echo "Usage: $0 [OPTIONS]"
  echo "Options:"
  echo " --model <model_name> Specify the model to use (default: $MODEL_NAME)"
  echo " --concurrency <level> Specify the concurrency level to use (default: $CONCURRENCY)"
  echo " -h, --help Show this help message"
}

# Parse command line arguments into MODEL_NAME and CONCURRENCY.
# Exits with 0 after printing help, and with 1 on an unknown option or when an
# option is missing its value.
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --model | --concurrency)
        # `shift 2` fails and consumes nothing when only the flag is left,
        # which would make this loop spin forever; reject that case up front.
        if [[ $# -lt 2 ]]; then
          echo "Option $1 requires a value" >&2
          echo "Use --help for usage information" >&2
          exit 1
        fi
        if [[ "$1" == "--model" ]]; then
          MODEL_NAME=$2
        else
          CONCURRENCY=$2
        fi
        shift 2
        ;;
      -h | --help)
        usage
        exit 0
        ;;
      *)
        echo "Unknown option: $1" >&2
        echo "Use --help for usage information" >&2
        exit 1
        ;;
    esac
  done
}

# Write DATA_FILE: one JSON line of the form
#   {"texts": ["<prompt>"], "images": ["<url>","<url>",...]}
# where <prompt> is PROMPT_SENTENCE repeated (PROMPT_REPEATS + 1) times joined
# by single spaces, and the image list holds IMAGE_COUNT copies of IMG_URL.
write_dataset() {
  local prompt="$PROMPT_SENTENCE" urls="" i
  for ((i = 0; i < PROMPT_REPEATS; i++)); do
    prompt+=" ${PROMPT_SENTENCE}"
  done
  for ((i = 0; i < IMAGE_COUNT; i++)); do
    # ${urls:+,} adds the separator only once the list is non-empty.
    urls+="${urls:+,}\"${IMG_URL}\""
  done
  printf '{"texts": ["%s"], "images": [%s]}\n' "$prompt" "$urls" > "$DATA_FILE"
}

# Entry point: parse options, generate the dataset, then run aiperf.
# The return status is that of aiperf (or 1 if the dataset cannot be written).
main() {
  parse_args "$@"

  if ! write_dataset; then
    echo "Failed to write $DATA_FILE" >&2
    return 1
  fi
  echo "This benchmark uses duplicate image urls, so any kind of caching can significantly affect the benchmark results, please make sure the caching setting is properly configured for your experiment."

  aiperf profile -m "$MODEL_NAME" --endpoint-type chat \
    --streaming --request-count 100 --warmup-request-count 5 \
    --concurrency "$CONCURRENCY" --osl 1 \
    --input-file "$DATA_FILE" \
    --custom-dataset-type single_turn --ui none \
    --no-server-metrics
}

main "$@"
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
from http.server import BaseHTTPRequestHandler, HTTPServer
from io import BytesIO
from urllib.parse import urlparse
import requests
# [NOTE] This is kept as a reference in case we need to run a local media server to eliminate the influence of the image server.
# However, this implementation is not used, as it was actually slower than directly using public image URLs in our benchmark experiments.
#
# Example usage:
# python local_media_server.py \
# --image test.jpg:https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/duck.jpg &
# IMG_SERVER_PID=$!
# trap "kill $IMG_SERVER_PID" EXIT
# # Wait for the server to start
# for i in {1..10}; do
# HTTP_CODE=$(curl -s -o /dev/null -w "%{http_code}" localhost:8233/test.jpg)
# if [[ "$HTTP_CODE" -eq 200 ]]; then
# echo "Server is responding with HTTP 200."
# break
# else
# echo "Server did not respond with HTTP 200. Response code: $HTTP_CODE. Retrying in 1 second..."
# sleep 1
# fi
# if [[ $i -eq 10 ]]; then
# echo "Server did not respond with HTTP 200 after 10 attempts. Exiting."
# exit 1
# fi
# done
class LocalMediaServer(BaseHTTPRequestHandler):
    """HTTP request handler that serves pre-downloaded images from memory.

    Images are fetched once by ``initialize_images`` and kept in the
    class-level ``image_store``; ``do_GET`` then answers requests for
    ``/<name>`` with the stored bytes, or with 404 when the name is unknown.
    """

    # Resource name -> BytesIO holding the downloaded image bytes. This is a
    # class attribute, so it is shared by every handler instance.
    image_store = {}

    # Upper bound, in seconds, passed to requests.get for each download so an
    # unresponsive image host cannot block startup indefinitely.
    download_timeout = 30

    @classmethod
    def initialize_images(cls, images):
        """Download every image and cache its bytes in ``image_store``.

        Args:
            images: Mapping of resource name to the URL to download it from.

        Failures (non-200 status or any exception raised while downloading)
        are printed and skipped, so the affected names are simply absent
        from the store.
        """
        for name, url in images.items():
            try:
                response = requests.get(url, timeout=cls.download_timeout)
                if response.status_code == 200:
                    cls.image_store[name] = BytesIO(response.content)
                else:
                    print(f"Failed to load image from {url}")
            except Exception as e:
                print(f"Error loading image from {url}: {e}")

    def do_GET(self):
        """Serve the stored image named by the request path, else reply 404."""
        # Only the path component selects the image; the query string is ignored.
        parsed_path = urlparse(self.path)
        resource = parsed_path.path.lstrip("/")
        if resource and resource in self.image_store:
            payload = self.image_store[resource].getvalue()
            self.send_response(200)
            # The content type is fixed to JPEG regardless of the stored data.
            self.send_header("Content-type", "image/jpeg")
            self.send_header("Content-Length", str(len(payload)))
            self.end_headers()
            self.wfile.write(payload)
        else:
            payload = b"Image not found"
            self.send_response(404)
            self.send_header("Content-Length", str(len(payload)))
            self.end_headers()
            self.wfile.write(payload)
def run_server(port, images):
    """Download the given images, then serve them over HTTP until interrupted.

    Args:
        port: TCP port to listen on; the server binds to all interfaces ("").
        images: Mapping of resource name to source URL, forwarded to
            ``LocalMediaServer.initialize_images``.

    This call blocks in ``serve_forever``. The listening socket is closed when
    the ``with`` block exits, including when an exception such as
    KeyboardInterrupt propagates out of it.
    """
    LocalMediaServer.initialize_images(images)
    with HTTPServer(("", port), LocalMediaServer) as httpd:
        print(f"Server running on port {port}")
        httpd.serve_forever()
def parse_image_args(image_args):
    """Convert ``file_name:url`` strings into a ``{file_name: url}`` mapping.

    Args:
        image_args: Iterable of strings in the format ``file_name:url``. Only
            the first ``:`` separates the two parts, so the URL may itself
            contain colons.

    Returns:
        Dict mapping each file name to its URL; a later duplicate name
        overrides an earlier one.

    Raises:
        SystemExit: With status 1, after printing a message, when an entry has
            no ``:`` separator or has an empty file name or URL.
    """
    images = {}
    for image_arg in image_args:
        file_name, separator, url = image_arg.partition(":")
        # An empty name can never be requested and an empty URL can never be
        # downloaded, so both are rejected together with a missing separator.
        if not separator or not file_name or not url:
            print(
                f"Invalid format for image argument: {image_arg}. Expected format is 'file_name:url'."
            )
            # Raise SystemExit directly instead of calling exit(), which is
            # only injected by the site module.
            raise SystemExit(1)
        images[file_name] = url
    return images


if __name__ == "__main__":
    # Command line entry point: parse --image/--port and start the server.
    parser = argparse.ArgumentParser(description="Start a local media server.")
    parser.add_argument(
        "--image",
        action="append",
        help='Specify images in the format "file_name:url". Can be used multiple times.',
        required=True,
    )
    parser.add_argument(
        "--port",
        type=int,
        default=8233,
        help="Specify the port number for the server. Default is 8233.",
    )
    args = parser.parse_args()
    run_server(args.port, parse_image_args(args.image))
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
import asyncio
import logging
import os
import shutil
......@@ -142,7 +143,8 @@ class EncodeWorkerHandler:
image_embeds = self.image_processor(images=image, return_tensors="pt")
# Encode the image embeddings using model-specific encoder
embeddings = encode_image_embeddings(
embeddings = await asyncio.to_thread(
encode_image_embeddings,
model_name=self.model,
image_embeds=image_embeds,
vision_encoder=self.vision_encoder,
......
......@@ -16,6 +16,7 @@
import hashlib
import json
import logging
import os
from typing import Any, Dict, Optional
import torch
......@@ -25,6 +26,10 @@ from .model import SupportedModels, is_model_supported, is_qwen_vl_model
logger = logging.getLogger(__name__)
# [gluo NOTE] Debug flag to compare the vLLM encoder vs. the transformers encoder;
# it should be removed once there is a proper way to extract the vLLM encoder.
VLLM_ENCODER = int(os.getenv("VLLM_ENCODER", 1))
def get_embedding_hash(key: str) -> str:
"""
......@@ -54,6 +59,16 @@ def get_qwen_image_features(
Raises:
ValueError: If grid_thw is not provided for Qwen model
"""
logger.debug(f"Encoding image of shape: {image_embeds['pixel_values'].shape}")
if VLLM_ENCODER:
pixel_values = image_embeds["pixel_values"].to(vision_encoder.device)
grid_thw = image_embeds.get("image_grid_thw")
if grid_thw is None:
raise ValueError("grid_thw is not provided")
grid_thw = grid_thw.tolist()
image_embeds = vision_encoder(pixel_values, grid_thw=grid_thw)
return image_embeds
pixel_values = image_embeds["pixel_values"].to(vision_encoder.device)
grid_thw = image_embeds.get("image_grid_thw", None)
......
......@@ -14,14 +14,21 @@
# limitations under the License.
import logging
import os
from pathlib import Path
from typing import Any, Dict, List, Optional
import torch
from transformers import AutoModel
from vllm import LLM
from vllm.utils.system_utils import update_environment_variables
logger = logging.getLogger(__name__)
# [gluo NOTE] Debug flag to compare the vLLM encoder vs. the transformers encoder;
# it should be removed once there is a proper way to extract the vLLM encoder.
VLLM_ENCODER = int(os.getenv("VLLM_ENCODER", 1))
class SupportedModels:
"""Supported multimodal model identifiers"""
......@@ -129,10 +136,27 @@ def load_vision_model(model_id: str) -> torch.nn.Module:
"""
Load a vision model from a HuggingFace model ID.
"""
model = AutoModel.from_pretrained(
if VLLM_ENCODER and is_qwen_vl_model(model_id):
# Disable to get ViT from the same process
update_environment_variables(
{
"VLLM_ENABLE_V1_MULTIPROCESSING": "0",
}
)
# [gluo NOTE] this actually loads the full model,
# which requires more GPU memory than needed.
vllm_model = LLM(
model=model_id,
enforce_eager=True,
gpu_memory_utilization=0.4,
max_model_len=10,
)
return (
vllm_model.llm_engine.engine_core.engine_core.model_executor.driver_worker.worker.model_runner.model.visual
)
return AutoModel.from_pretrained(
model_id, device_map="auto", torch_dtype=torch.float16, trust_remote_code=True
)
return model
def construct_mm_data(
......
......@@ -57,7 +57,7 @@ MODEL_SPECIFIC_ARGS=""
if [[ "$MODEL_NAME" == "Qwen/Qwen2.5-VL-7B-Instruct" ]]; then
MODEL_SPECIFIC_ARGS="--gpu-memory-utilization 0.85 --max-model-len 4096"
elif [[ "$MODEL_NAME" == "llava-hf/llava-1.5-7b-hf" ]]; then
MODEL_SPECIFIC_ARGS="--gpu-memory-utilization 0.85 --max-model-len 2048"
MODEL_SPECIFIC_ARGS="--gpu-memory-utilization 0.85 --max-model-len 4096"
fi
# Start vLLM worker with vision model
......@@ -66,7 +66,7 @@ fi
# --connector none: No KV transfer needed for aggregated serving
# Extra args from command line come last to allow overrides
DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT:-8081} \
python -m dynamo.vllm --enable-multimodal --model $MODEL_NAME --enforce-eager --connector none $MODEL_SPECIFIC_ARGS "${EXTRA_ARGS[@]}"
python -m dynamo.vllm --enable-multimodal --model $MODEL_NAME --connector none $MODEL_SPECIFIC_ARGS "${EXTRA_ARGS[@]}"
# Wait for all background processes to complete
wait
......
......@@ -55,10 +55,10 @@ python -m dynamo.frontend &
# Multi-GPU mode: Each worker gets its own GPU, so use higher memory settings
EXTRA_ARGS=""
if [[ "$SINGLE_GPU" == "true" ]]; then
EXTRA_ARGS="--gpu-memory-utilization 0.5 --enforce-eager --max-model-len 30426"
EXTRA_ARGS="--gpu-memory-utilization 0.4 --enforce-eager --max-model-len 30426"
else
# Multi-GPU mode: standard memory settings
EXTRA_ARGS="--gpu-memory-utilization 0.85 --max-model-len 34096"
EXTRA_ARGS="--gpu-memory-utilization 0.85 --max-model-len 30426"
fi
# Start processor (Python-based preprocessing, handles prompt templating)
......@@ -69,10 +69,13 @@ python -m dynamo.vllm --multimodal-processor --enable-multimodal --model $MODEL_
if [[ "$SINGLE_GPU" == "true" ]]; then
# Single GPU mode: both workers share GPU 0 with reduced memory
CUDA_VISIBLE_DEVICES=0 python -m dynamo.vllm --multimodal-encode-worker --enable-multimodal --model $MODEL_NAME $EXTRA_ARGS &
# Now that both the encode worker and the PD worker are vLLM engines, make sure they are not initialized concurrently
# on the same GPU, so that they do not influence each other's startup process (checks and allocations).
sleep 60
CUDA_VISIBLE_DEVICES=0 python -m dynamo.vllm --multimodal-worker --enable-multimodal --enable-mm-embeds --model $MODEL_NAME $EXTRA_ARGS &
else
CUDA_VISIBLE_DEVICES=0 python -m dynamo.vllm --multimodal-encode-worker --enable-multimodal --model $MODEL_NAME $EXTRA_ARGS &
CUDA_VISIBLE_DEVICES=1 python -m dynamo.vllm --multimodal-worker --enable-multimodal --enable-mm-embeds --model $MODEL_NAME $EXTRA_ARGS &
CUDA_VISIBLE_DEVICES=0 python -m dynamo.vllm --multimodal-worker --enable-multimodal --enable-mm-embeds --model $MODEL_NAME $EXTRA_ARGS &
CUDA_VISIBLE_DEVICES=1 python -m dynamo.vllm --multimodal-encode-worker --enable-multimodal --model $MODEL_NAME $EXTRA_ARGS &
fi
# Wait for all background processes to complete
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment