Unverified Commit b73c571f authored by KrishnanPrash's avatar KrishnanPrash Committed by GitHub
Browse files

feat: Add base64 and HTTP image URL support to vLLM workers (#4114)


Signed-off-by: default avatarKrishnan Prashanth <kprashanth@nvidia.com>
parent 1a9aeab4
...@@ -6,7 +6,7 @@ import logging ...@@ -6,7 +6,7 @@ import logging
import os import os
from abc import ABC, abstractmethod from abc import ABC, abstractmethod
from contextlib import asynccontextmanager from contextlib import asynccontextmanager
from typing import Any, AsyncGenerator, Dict from typing import Any, AsyncGenerator, Dict, Final
from vllm.inputs import TokensPrompt from vllm.inputs import TokensPrompt
from vllm.sampling_params import SamplingParams from vllm.sampling_params import SamplingParams
...@@ -16,6 +16,13 @@ from dynamo.llm import ZmqKvEventPublisher ...@@ -16,6 +16,13 @@ from dynamo.llm import ZmqKvEventPublisher
from dynamo.runtime.logging import configure_dynamo_logging from dynamo.runtime.logging import configure_dynamo_logging
from .engine_monitor import VllmEngineMonitor from .engine_monitor import VllmEngineMonitor
from .multimodal_utils.image_loader import ImageLoader
# Multimodal data dictionary keys
IMAGE_URL_KEY: Final = "image_url"
VIDEO_URL_KEY: Final = "video_url"
URL_VARIANT_KEY: Final = "Url"
DECODED_VARIANT_KEY: Final = "Decoded"
configure_dynamo_logging() configure_dynamo_logging()
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
...@@ -65,6 +72,7 @@ class BaseWorkerHandler(ABC): ...@@ -65,6 +72,7 @@ class BaseWorkerHandler(ABC):
self.default_sampling_params = default_sampling_params self.default_sampling_params = default_sampling_params
self.kv_publishers: list[ZmqKvEventPublisher] | None = None self.kv_publishers: list[ZmqKvEventPublisher] | None = None
self.engine_monitor = VllmEngineMonitor(runtime, engine) self.engine_monitor = VllmEngineMonitor(runtime, engine)
self.image_loader = ImageLoader()
@abstractmethod @abstractmethod
async def generate(self, request, context) -> AsyncGenerator[dict, None]: async def generate(self, request, context) -> AsyncGenerator[dict, None]:
...@@ -111,6 +119,50 @@ class BaseWorkerHandler(ABC): ...@@ -111,6 +119,50 @@ class BaseWorkerHandler(ABC):
"""Override in subclasses if cleanup is needed.""" """Override in subclasses if cleanup is needed."""
pass pass
async def _extract_multimodal_data(
self, request: Dict[str, Any]
) -> Dict[str, Any] | None:
"""
Extract and decode multimodal data from PreprocessedRequest.
"""
if "multi_modal_data" not in request or request["multi_modal_data"] is None:
return None
mm_map = request["multi_modal_data"]
vllm_mm_data = {}
# Process image_url entries
images = []
for item in mm_map.get(IMAGE_URL_KEY, []):
if isinstance(item, dict) and URL_VARIANT_KEY in item:
url = item[URL_VARIANT_KEY]
try:
# ImageLoader supports both data: and http(s): URLs with caching
image = await self.image_loader.load_image(url)
images.append(image)
logger.debug(f"Loaded image from URL: {url[:80]}...")
except Exception:
logger.exception(f"Failed to load image from {url[:80]}...")
raise
elif isinstance(item, dict) and DECODED_VARIANT_KEY in item:
# Decoded support from PRs #3971/#3988 (frontend decoding + NIXL transfer)
# Will contain NIXL metadata for direct memory access
# TODO: Implement NIXL read when PRs merge
logger.warning(
"Decoded multimodal data not yet supported in standard worker"
)
if images:
# vLLM expects single image or list
vllm_mm_data["image"] = images[0] if len(images) == 1 else images
logger.debug(f"Extracted {len(images)} image(s) for multimodal processing")
# Handle video_url entries (future expansion)
if VIDEO_URL_KEY in mm_map:
logger.warning("Video multimodal data not yet supported in standard worker")
return vllm_mm_data if vllm_mm_data else None
async def generate_tokens( async def generate_tokens(
self, prompt, sampling_params, request_id, data_parallel_rank=None self, prompt, sampling_params, request_id, data_parallel_rank=None
): ):
...@@ -168,7 +220,12 @@ class DecodeWorkerHandler(BaseWorkerHandler): ...@@ -168,7 +220,12 @@ class DecodeWorkerHandler(BaseWorkerHandler):
request_id = context.id() request_id = context.id()
logger.debug(f"Decode Request ID: {request_id}") logger.debug(f"Decode Request ID: {request_id}")
prompt = TokensPrompt(prompt_token_ids=request["token_ids"]) # Extract and decode multimodal data if present
multi_modal_data = await self._extract_multimodal_data(request)
prompt = TokensPrompt(
prompt_token_ids=request["token_ids"], multi_modal_data=multi_modal_data
)
# Build sampling params from request # Build sampling params from request
sampling_params = build_sampling_params(request, self.default_sampling_params) sampling_params = build_sampling_params(request, self.default_sampling_params)
...@@ -210,8 +267,13 @@ class PrefillWorkerHandler(BaseWorkerHandler): ...@@ -210,8 +267,13 @@ class PrefillWorkerHandler(BaseWorkerHandler):
request_id = context.id() request_id = context.id()
logger.debug(f"Prefill Request ID: {request_id}") logger.debug(f"Prefill Request ID: {request_id}")
# Extract and decode multimodal data if present
multi_modal_data = await self._extract_multimodal_data(request)
token_ids = request["token_ids"] token_ids = request["token_ids"]
prompt = TokensPrompt(prompt_token_ids=token_ids) prompt = TokensPrompt(
prompt_token_ids=token_ids, multi_modal_data=multi_modal_data
)
# Build sampling params from request using shared utility # Build sampling params from request using shared utility
sampling_params = build_sampling_params(request, self.default_sampling_params) sampling_params = build_sampling_params(request, self.default_sampling_params)
......
#!/bin/bash #!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
#
# Aggregated multimodal serving with standard Dynamo preprocessing
#
# Architecture: Single-worker PD (Prefill-Decode)
# - Frontend: Rust OpenAIPreprocessor handles image URLs (HTTP and data:// base64)
# - Worker: Standard vLLM worker with vision model support
#
# For EPD (Encode-Prefill-Decode) architecture with dedicated encoding worker,
# see agg_multimodal_epd.sh
set -e set -e
trap 'echo Cleaning up...; kill 0' EXIT trap 'echo Cleaning up...; kill 0' EXIT
# Default values # Default values
MODEL_NAME="llava-hf/llava-1.5-7b-hf" MODEL_NAME="Qwen/Qwen2.5-VL-7B-Instruct"
PROMPT_TEMPLATE="USER: <image>\n<prompt> ASSISTANT:"
PROVIDED_PROMPT_TEMPLATE=""
# Parse command line arguments # Parse command line arguments
while [[ $# -gt 0 ]]; do while [[ $# -gt 0 ]]; do
...@@ -16,15 +24,10 @@ while [[ $# -gt 0 ]]; do ...@@ -16,15 +24,10 @@ while [[ $# -gt 0 ]]; do
MODEL_NAME=$2 MODEL_NAME=$2
shift 2 shift 2
;; ;;
--prompt-template)
PROVIDED_PROMPT_TEMPLATE=$2
shift 2
;;
-h|--help) -h|--help)
echo "Usage: $0 [OPTIONS]" echo "Usage: $0 [OPTIONS]"
echo "Options:" echo "Options:"
echo " --model <model_name> Specify the model to use (default: $MODEL_NAME)" echo " --model <model_name> Specify the VLM model to use (default: $MODEL_NAME)"
echo " --prompt-template <template> Specify the multi-modal prompt template to use. LLaVA 1.5 7B, Qwen2.5-VL, and Phi3V models have predefined templates."
echo " -h, --help Show this help message" echo " -h, --help Show this help message"
exit 0 exit 0
;; ;;
...@@ -36,37 +39,23 @@ while [[ $# -gt 0 ]]; do ...@@ -36,37 +39,23 @@ while [[ $# -gt 0 ]]; do
esac esac
done done
# Set PROMPT_TEMPLATE based on the MODEL_NAME # Start frontend with Rust OpenAIPreprocessor
if [[ -n "$PROVIDED_PROMPT_TEMPLATE" ]]; then
PROMPT_TEMPLATE="$PROVIDED_PROMPT_TEMPLATE"
elif [[ "$MODEL_NAME" == "llava-hf/llava-1.5-7b-hf" ]]; then
PROMPT_TEMPLATE="USER: <image>\n<prompt> ASSISTANT:"
elif [[ "$MODEL_NAME" == "microsoft/Phi-3.5-vision-instruct" ]]; then
PROMPT_TEMPLATE="<|user|>\n<|image_1|>\n<prompt><|end|>\n<|assistant|>\n"
elif [[ "$MODEL_NAME" == "Qwen/Qwen2.5-VL-7B-Instruct" ]]; then
PROMPT_TEMPLATE="<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|><prompt><|im_end|>\n<|im_start|>assistant\n"
else
echo "No multi-modal prompt template is defined for the model: $MODEL_NAME"
echo "Please provide a prompt template using --prompt-template option."
echo "Example: --prompt-template 'USER: <image>\n<prompt> ASSISTANT:'"
exit 1
fi
# run ingress
python -m dynamo.frontend --http-port=8000 & python -m dynamo.frontend --http-port=8000 &
# To make Qwen2.5-VL fit in A100 40GB, set the following extra arguments # Configure GPU memory optimization for specific models
EXTRA_ARGS="" EXTRA_ARGS=""
if [[ "$MODEL_NAME" == "Qwen/Qwen2.5-VL-7B-Instruct" ]]; then if [[ "$MODEL_NAME" == "Qwen/Qwen2.5-VL-7B-Instruct" ]]; then
EXTRA_ARGS="--gpu-memory-utilization 0.85 --max-model-len 2048" EXTRA_ARGS="--gpu-memory-utilization 0.85 --max-model-len 2048"
fi fi
# run processor # Start vLLM worker with vision model
python -m dynamo.vllm --multimodal-processor --model $MODEL_NAME --mm-prompt-template "$PROMPT_TEMPLATE" & # Multimodal data (images) are decoded in the backend worker using ImageLoader
# --enforce-eager: Quick deployment (remove for production)
# run E/P/D workers # --connector none: No KV transfer needed for aggregated serving
CUDA_VISIBLE_DEVICES=0 python -m dynamo.vllm --multimodal-encode-worker --model $MODEL_NAME & DYN_SYSTEM_ENABLED=true DYN_SYSTEM_PORT=8081 \
CUDA_VISIBLE_DEVICES=1 python -m dynamo.vllm --multimodal-worker --model $MODEL_NAME $EXTRA_ARGS & python -m dynamo.vllm --model $MODEL_NAME --enforce-eager --connector none $EXTRA_ARGS
# Wait for all background processes to complete # Wait for all background processes to complete
wait wait
#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# EPD (Encode-Prefill-Decode) multimodal deployment
#
# Architecture: 3-component disaggregation
# - Processor: Python-based preprocessor (bypasses Rust OpenAIPreprocessor)
# - Encode Worker: Dedicated vision encoder that extracts image embeddings
# - PD Worker: Standard prefill/decode worker that receives embeddings via NIXL
#
# Benefits: Decouples encoding from inference, enables independent scaling
# For standard single-worker deployment, see agg_multimodal.sh
set -e
trap 'echo Cleaning up...; kill 0' EXIT
# Default values
MODEL_NAME="llava-hf/llava-1.5-7b-hf"
PROMPT_TEMPLATE="USER: <image>\n<prompt> ASSISTANT:"
PROVIDED_PROMPT_TEMPLATE=""
# Parse command line arguments
while [[ $# -gt 0 ]]; do
case $1 in
--model)
MODEL_NAME=$2
shift 2
;;
--prompt-template)
PROVIDED_PROMPT_TEMPLATE=$2
shift 2
;;
-h|--help)
echo "Usage: $0 [OPTIONS]"
echo "Options:"
echo " --model <model_name> Specify the model to use (default: $MODEL_NAME)"
echo " --prompt-template <template> Specify the multi-modal prompt template to use. LLaVA 1.5 7B, Qwen2.5-VL, and Phi3V models have predefined templates."
echo " -h, --help Show this help message"
exit 0
;;
*)
echo "Unknown option: $1"
echo "Use --help for usage information"
exit 1
;;
esac
done
# Set PROMPT_TEMPLATE based on the MODEL_NAME
if [[ -n "$PROVIDED_PROMPT_TEMPLATE" ]]; then
PROMPT_TEMPLATE="$PROVIDED_PROMPT_TEMPLATE"
elif [[ "$MODEL_NAME" == "llava-hf/llava-1.5-7b-hf" ]]; then
PROMPT_TEMPLATE="USER: <image>\n<prompt> ASSISTANT:"
elif [[ "$MODEL_NAME" == "microsoft/Phi-3.5-vision-instruct" ]]; then
PROMPT_TEMPLATE="<|user|>\n<|image_1|>\n<prompt><|end|>\n<|assistant|>\n"
elif [[ "$MODEL_NAME" == "Qwen/Qwen2.5-VL-7B-Instruct" ]]; then
PROMPT_TEMPLATE="<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|><prompt><|im_end|>\n<|im_start|>assistant\n"
else
echo "No multi-modal prompt template is defined for the model: $MODEL_NAME"
echo "Please provide a prompt template using --prompt-template option."
echo "Example: --prompt-template 'USER: <image>\n<prompt> ASSISTANT:'"
exit 1
fi
# Start frontend (HTTP endpoint)
python -m dynamo.frontend --http-port=8000 &
# To make Qwen2.5-VL fit in A100 40GB, set the following extra arguments
EXTRA_ARGS=""
if [[ "$MODEL_NAME" == "Qwen/Qwen2.5-VL-7B-Instruct" ]]; then
EXTRA_ARGS="--gpu-memory-utilization 0.85 --max-model-len 2048"
fi
# Start processor (Python-based preprocessing, handles prompt templating)
python -m dynamo.vllm --multimodal-processor --model $MODEL_NAME --mm-prompt-template "$PROMPT_TEMPLATE" &
# run E/P/D workers
CUDA_VISIBLE_DEVICES=0 python -m dynamo.vllm --multimodal-encode-worker --model $MODEL_NAME &
CUDA_VISIBLE_DEVICES=1 python -m dynamo.vllm --multimodal-worker --model $MODEL_NAME $EXTRA_ARGS &
# Wait for all background processes to complete
wait
...@@ -1457,6 +1457,7 @@ dependencies = [ ...@@ -1457,6 +1457,7 @@ dependencies = [
"async_zmq", "async_zmq",
"axum", "axum",
"axum-server", "axum-server",
"base64 0.22.1",
"bincode 2.0.1", "bincode 2.0.1",
"bitflags 2.9.3", "bitflags 2.9.3",
"blake3", "blake3",
...@@ -1494,6 +1495,7 @@ dependencies = [ ...@@ -1494,6 +1495,7 @@ dependencies = [
"rand 0.9.2", "rand 0.9.2",
"rayon", "rayon",
"regex", "regex",
"reqwest",
"rmp-serde", "rmp-serde",
"rustls", "rustls",
"serde", "serde",
......
...@@ -104,10 +104,10 @@ vllm_configs = { ...@@ -104,10 +104,10 @@ vllm_configs = {
completion_payload_default(expected_response=["joke"]), completion_payload_default(expected_response=["joke"]),
], ],
), ),
"multimodal_agg_llava": VLLMConfig( "multimodal_agg_llava_epd": VLLMConfig(
name="multimodal_agg_llava", name="multimodal_agg_llava_epd",
directory=vllm_dir, directory=vllm_dir,
script_name="agg_multimodal.sh", script_name="agg_multimodal_epd.sh",
marks=[pytest.mark.gpu_2], marks=[pytest.mark.gpu_2],
model="llava-hf/llava-1.5-7b-hf", model="llava-hf/llava-1.5-7b-hf",
script_args=["--model", "llava-hf/llava-1.5-7b-hf"], script_args=["--model", "llava-hf/llava-1.5-7b-hf"],
...@@ -128,16 +128,42 @@ vllm_configs = { ...@@ -128,16 +128,42 @@ vllm_configs = {
) )
], ],
), ),
"multimodal_agg_qwen_epd": VLLMConfig(
name="multimodal_agg_qwen_epd",
directory=vllm_dir,
script_name="agg_multimodal_epd.sh",
marks=[pytest.mark.gpu_2],
model="Qwen/Qwen2.5-VL-7B-Instruct",
delayed_start=0,
script_args=["--model", "Qwen/Qwen2.5-VL-7B-Instruct"],
timeout=360,
request_payloads=[
chat_payload(
[
{"type": "text", "text": "What is in this image?"},
{
"type": "image_url",
"image_url": {
"url": "http://images.cocodataset.org/test2017/000000155781.jpg"
},
},
],
repeat_count=1,
expected_response=["bus"],
)
],
),
"multimodal_agg_qwen": VLLMConfig( "multimodal_agg_qwen": VLLMConfig(
name="multimodal_agg_qwen", name="multimodal_agg_qwen",
directory=vllm_dir, directory=vllm_dir,
script_name="agg_multimodal.sh", script_name="agg_multimodal.sh",
marks=[pytest.mark.gpu_2], marks=[pytest.mark.gpu_2],
model="Qwen/Qwen2.5-VL-7B-Instruct", model="Qwen/Qwen2.5-VL-7B-Instruct",
delayed_start=0,
script_args=["--model", "Qwen/Qwen2.5-VL-7B-Instruct"], script_args=["--model", "Qwen/Qwen2.5-VL-7B-Instruct"],
delayed_start=0,
timeout=360, timeout=360,
request_payloads=[ request_payloads=[
# HTTP URL test
chat_payload( chat_payload(
[ [
{"type": "text", "text": "What is in this image?"}, {"type": "text", "text": "What is in this image?"},
...@@ -150,7 +176,21 @@ vllm_configs = { ...@@ -150,7 +176,21 @@ vllm_configs = {
], ],
repeat_count=1, repeat_count=1,
expected_response=["bus"], expected_response=["bus"],
) ),
# Base64 data URL test (1x1 PNG inline, avoids network fetch)
chat_payload(
[
{"type": "text", "text": "What do you see in this image?"},
{
"type": "image_url",
"image_url": {
"url": "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAAAAAA6fptVAAAACklEQVR4nGNoAAAAggCBd81ytgAAAABJRU5ErkJggg=="
},
},
],
repeat_count=1,
expected_response=[], # Just validate no error
),
], ],
), ),
# TODO: Update this test case when we have video multimodal support in vllm official components # TODO: Update this test case when we have video multimodal support in vllm official components
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment