Unverified Commit 1182e207 authored by Ayush Agarwal's avatar Ayush Agarwal Committed by GitHub
Browse files

feat: vllm omni image to video support (#6530)


Signed-off-by: default avatarayushag <ayushag@nvidia.com>
parent 5c7e66ec
...@@ -33,6 +33,7 @@ from .http_client import get_http_client ...@@ -33,6 +33,7 @@ from .http_client import get_http_client
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
# Constants for multimodal data variants # Constants for multimodal data variants
URL_VARIANT_KEY: Final = "Url" URL_VARIANT_KEY: Final = "Url"
DECODED_VARIANT_KEY: Final = "Decoded" DECODED_VARIANT_KEY: Final = "Decoded"
...@@ -87,6 +88,16 @@ class ImageLoader: ...@@ -87,6 +88,16 @@ class ImageLoader:
raise ValueError("Empty response content from image URL") raise ValueError("Empty response content from image URL")
image_data = BytesIO(response.content) image_data = BytesIO(response.content)
elif parsed_url.scheme in ("", "file"):
# Local file path (plain path or file:// URI)
path = image_url if parsed_url.scheme == "" else parsed_url.path
def _read_local_file(p: str) -> bytes:
with open(p, "rb") as f:
return f.read()
image_bytes = await asyncio.to_thread(_read_local_file, path)
image_data = BytesIO(image_bytes)
else: else:
raise ValueError(f"Invalid image source scheme: {parsed_url.scheme}") raise ValueError(f"Invalid image source scheme: {parsed_url.scheme}")
......
...@@ -40,6 +40,12 @@ class VideoNvExt(BaseModel): ...@@ -40,6 +40,12 @@ class VideoNvExt(BaseModel):
seed: Optional[int] = None seed: Optional[int] = None
"""Random seed for reproducibility.""" """Random seed for reproducibility."""
boundary_ratio: Optional[float] = None
"""MoE expert switching boundary as a fraction of the denoising schedule (vLLM-Omni I2V)."""
guidance_scale_2: Optional[float] = None
"""CFG scale for the low-noise expert (vLLM-Omni I2V dual-guidance)."""
class NvCreateVideoRequest(BaseModel): class NvCreateVideoRequest(BaseModel):
"""Request for video generation (/v1/videos endpoint). """Request for video generation (/v1/videos endpoint).
......
...@@ -10,10 +10,12 @@ from dataclasses import dataclass ...@@ -10,10 +10,12 @@ from dataclasses import dataclass
from io import BytesIO from io import BytesIO
from typing import Any, AsyncGenerator, Dict, Optional, Union from typing import Any, AsyncGenerator, Dict, Optional, Union
import PIL.Image
from diffusers.utils import export_to_video from diffusers.utils import export_to_video
from fsspec.implementations.dirfs import DirFileSystem from fsspec.implementations.dirfs import DirFileSystem
from vllm_omni.inputs.data import OmniDiffusionSamplingParams, OmniTextPrompt from vllm_omni.inputs.data import OmniDiffusionSamplingParams, OmniTextPrompt
from dynamo.common.multimodal import ImageLoader
from dynamo.common.protocols.image_protocol import ( from dynamo.common.protocols.image_protocol import (
ImageData, ImageData,
NvCreateImageRequest, NvCreateImageRequest,
...@@ -94,6 +96,7 @@ class OmniHandler(BaseOmniHandler): ...@@ -94,6 +96,7 @@ class OmniHandler(BaseOmniHandler):
) )
self.media_output_fs = media_output_fs self.media_output_fs = media_output_fs
self.media_output_http_url = media_output_http_url self.media_output_http_url = media_output_http_url
self._image_loader = ImageLoader()
async def generate( async def generate(
self, request: Dict[str, Any], context self, request: Dict[str, Any], context
...@@ -121,7 +124,30 @@ class OmniHandler(BaseOmniHandler): ...@@ -121,7 +124,30 @@ class OmniHandler(BaseOmniHandler):
parsed_request, request_type = parse_request_type( parsed_request, request_type = parse_request_type(
request, self.config.output_modalities request, self.config.output_modalities
) )
inputs = self.build_engine_inputs(parsed_request, request_type)
# Pre-load input image for I2V requests (async I/O before sync build)
image = None
if (
request_type == RequestType.VIDEO_GENERATION
and isinstance(parsed_request, NvCreateVideoRequest)
and parsed_request.input_reference
):
try:
image = await self._image_loader.load_image(
parsed_request.input_reference
)
except Exception as e:
logger.warning("Failed to load I2V input_reference: %s", e)
yield {
"id": request_id,
"object": "video",
"model": self.config.model,
"status": "failed",
"error": f"Failed to load input_reference: {e}",
}
return
inputs = self.build_engine_inputs(parsed_request, request_type, image=image)
generate_kwargs: Dict[str, Any] = { generate_kwargs: Dict[str, Any] = {
"prompt": inputs.prompt, "prompt": inputs.prompt,
...@@ -187,6 +213,7 @@ class OmniHandler(BaseOmniHandler): ...@@ -187,6 +213,7 @@ class OmniHandler(BaseOmniHandler):
NvCreateImageRequest, NvCreateVideoRequest, Dict[str, Any] NvCreateImageRequest, NvCreateVideoRequest, Dict[str, Any]
], ],
request_type: RequestType, request_type: RequestType,
image: PIL.Image.Image | None = None,
) -> EngineInputs: ) -> EngineInputs:
"""Convert a parsed request into AsyncOmni engine inputs. """Convert a parsed request into AsyncOmni engine inputs.
...@@ -194,6 +221,7 @@ class OmniHandler(BaseOmniHandler): ...@@ -194,6 +221,7 @@ class OmniHandler(BaseOmniHandler):
parsed_request: Output from parse_request_type -- a Pydantic model parsed_request: Output from parse_request_type -- a Pydantic model
for image/video requests, or a raw dict for chat completions. for image/video requests, or a raw dict for chat completions.
request_type: The RequestType determined by parse_request_type. request_type: The RequestType determined by parse_request_type.
image: Pre-loaded PIL Image for I2V requests (from input_reference).
Returns: Returns:
EngineInputs ready for engine_client.generate(). EngineInputs ready for engine_client.generate().
...@@ -203,7 +231,7 @@ class OmniHandler(BaseOmniHandler): ...@@ -203,7 +231,7 @@ class OmniHandler(BaseOmniHandler):
elif request_type == RequestType.IMAGE_GENERATION: elif request_type == RequestType.IMAGE_GENERATION:
return self._engine_inputs_from_image(parsed_request) return self._engine_inputs_from_image(parsed_request)
elif request_type == RequestType.VIDEO_GENERATION: elif request_type == RequestType.VIDEO_GENERATION:
return self._engine_inputs_from_video(parsed_request) return self._engine_inputs_from_video(parsed_request, image=image)
elif request_type == RequestType.AUDIO_GENERATION: elif request_type == RequestType.AUDIO_GENERATION:
raise NotImplementedError("Audio generation is not yet supported") raise NotImplementedError("Audio generation is not yet supported")
...@@ -264,8 +292,19 @@ class OmniHandler(BaseOmniHandler): ...@@ -264,8 +292,19 @@ class OmniHandler(BaseOmniHandler):
response_format=req.response_format, response_format=req.response_format,
) )
def _engine_inputs_from_video(self, req: NvCreateVideoRequest) -> EngineInputs: def _engine_inputs_from_video(
"""Build engine inputs from an NvCreateVideoRequest.""" self,
req: NvCreateVideoRequest,
image: PIL.Image.Image | None = None,
) -> EngineInputs:
"""Build engine inputs from an NvCreateVideoRequest.
Args:
req: Parsed video generation request.
image: Pre-loaded PIL Image for I2V. When provided, the image is
attached to the prompt via ``multi_modal_data`` so vllm-omni's
I2V pipeline pre-process can use it.
"""
width, height = parse_size(req.size) width, height = parse_size(req.size)
nvext = req.nvext nvext = req.nvext
...@@ -287,6 +326,14 @@ class OmniHandler(BaseOmniHandler): ...@@ -287,6 +326,14 @@ class OmniHandler(BaseOmniHandler):
else None, else None,
) )
if image is not None:
prompt["multi_modal_data"] = {"image": image}
logger.info(
"I2V: attached image (%dx%d) to multi_modal_data",
image.size[0],
image.size[1],
)
sp = OmniDiffusionSamplingParams( sp = OmniDiffusionSamplingParams(
height=height, height=height,
width=width, width=width,
...@@ -299,6 +346,10 @@ class OmniHandler(BaseOmniHandler): ...@@ -299,6 +346,10 @@ class OmniHandler(BaseOmniHandler):
sp.guidance_scale = nvext.guidance_scale sp.guidance_scale = nvext.guidance_scale
if nvext.seed is not None: if nvext.seed is not None:
sp.seed = nvext.seed sp.seed = nvext.seed
if nvext.boundary_ratio is not None:
sp.boundary_ratio = nvext.boundary_ratio
if nvext.guidance_scale_2 is not None:
sp.guidance_scale_2 = nvext.guidance_scale_2
if fps is not None: if fps is not None:
sp.fps = fps sp.fps = fps
......
...@@ -6,8 +6,10 @@ from unittest.mock import MagicMock, patch ...@@ -6,8 +6,10 @@ from unittest.mock import MagicMock, patch
import pytest import pytest
try: try:
from PIL import Image
from dynamo.common.protocols.image_protocol import NvCreateImageRequest from dynamo.common.protocols.image_protocol import NvCreateImageRequest
from dynamo.common.protocols.video_protocol import NvCreateVideoRequest from dynamo.common.protocols.video_protocol import NvCreateVideoRequest, VideoNvExt
from dynamo.common.utils.output_modalities import RequestType from dynamo.common.utils.output_modalities import RequestType
from dynamo.vllm.omni.omni_handler import EngineInputs, OmniHandler from dynamo.vllm.omni.omni_handler import EngineInputs, OmniHandler
except ImportError: except ImportError:
...@@ -247,3 +249,60 @@ class TestFormatVideoChunk: ...@@ -247,3 +249,60 @@ class TestFormatVideoChunk:
chunk = await handler._format_video_chunk([MagicMock()], "req-1", fps=16) chunk = await handler._format_video_chunk([MagicMock()], "req-1", fps=16)
assert chunk["status"] == "failed" assert chunk["status"] == "failed"
assert "boom" in chunk["error"] assert "boom" in chunk["error"]
class TestI2VEngineInputs:
"""Tests for image-to-video: multi_modal_data attachment, I2V nvext params, and protocol fields."""
def test_t2v_no_multi_modal_data_and_i2v_attaches_image(self):
"""T2V has no multi_modal_data; I2V attaches image to prompt."""
handler = _make_handler()
req = NvCreateVideoRequest(
prompt="a drone", model="test", size="832x480", seconds=2
)
# T2V: no image
t2v = handler.build_engine_inputs(req, RequestType.VIDEO_GENERATION)
assert "multi_modal_data" not in t2v.prompt
# I2V: image attached
img = Image.new("RGB", (64, 64), color="red")
i2v = handler.build_engine_inputs(req, RequestType.VIDEO_GENERATION, image=img)
assert i2v.prompt["multi_modal_data"]["image"] is img
def test_i2v_nvext_params_on_sampling_params(self):
"""boundary_ratio and guidance_scale_2 are forwarded to sampling params."""
handler = _make_handler()
req = NvCreateVideoRequest(
prompt="bear",
model="test",
size="832x480",
nvext=VideoNvExt(
boundary_ratio=0.875, guidance_scale_2=1.0, num_inference_steps=40
),
)
sp = handler.build_engine_inputs(
req, RequestType.VIDEO_GENERATION
).sampling_params_list[0]
assert sp.boundary_ratio == 0.875
assert sp.guidance_scale_2 == 1.0
assert sp.num_inference_steps == 40
def test_i2v_protocol_roundtrip(self):
"""VideoNvExt and NvCreateVideoRequest serialize/deserialize I2V fields correctly."""
req = NvCreateVideoRequest(
prompt="bear playing",
model="Wan-AI/Wan2.2-TI2V-5B-Diffusers",
input_reference="/tmp/bear.png",
size="832x480",
nvext=VideoNvExt(boundary_ratio=0.9, guidance_scale_2=2.0, seed=42),
)
data = req.model_dump()
assert data["input_reference"] == "/tmp/bear.png"
assert data["nvext"]["boundary_ratio"] == 0.9
assert data["nvext"]["guidance_scale_2"] == 2.0
# Defaults are None
empty = VideoNvExt()
assert empty.boundary_ratio is None
assert empty.guidance_scale_2 is None
...@@ -25,6 +25,7 @@ pip install git+https://github.com/vllm-project/vllm-omni.git@v0.16.0rc1 ...@@ -25,6 +25,7 @@ pip install git+https://github.com/vllm-project/vllm-omni.git@v0.16.0rc1
| Text-to-Text | `/v1/chat/completions` | `text` (default) | | Text-to-Text | `/v1/chat/completions` | `text` (default) |
| Text-to-Image | `/v1/chat/completions`, `/v1/images/generations` | `image` | | Text-to-Image | `/v1/chat/completions`, `/v1/images/generations` | `image` |
| Text-to-Video | `/v1/videos` | `video` | | Text-to-Video | `/v1/videos` | `video` |
| Image-to-Video | `/v1/videos` | `video` |
The `--output-modalities` flag determines which endpoint(s) the worker registers. When set to `image`, both `/v1/chat/completions` (returns inline base64 images) and `/v1/images/generations` are available. When set to `video`, the worker serves `/v1/videos`. The `--output-modalities` flag determines which endpoint(s) the worker registers. When set to `image`, both `/v1/chat/completions` (returns inline base64 images) and `/v1/images/generations` are available. When set to `video`, the worker serves `/v1/videos`.
...@@ -35,6 +36,7 @@ The `--output-modalities` flag determines which endpoint(s) the worker registers ...@@ -35,6 +36,7 @@ The `--output-modalities` flag determines which endpoint(s) the worker registers
| Text-to-Text | `Qwen/Qwen2.5-Omni-7B` | | Text-to-Text | `Qwen/Qwen2.5-Omni-7B` |
| Text-to-Image | `Qwen/Qwen-Image`, `AIDC-AI/Ovis-Image-7B` | | Text-to-Image | `Qwen/Qwen-Image`, `AIDC-AI/Ovis-Image-7B` |
| Text-to-Video | `Wan-AI/Wan2.1-T2V-1.3B-Diffusers`, `Wan-AI/Wan2.2-T2V-A14B-Diffusers` | | Text-to-Video | `Wan-AI/Wan2.1-T2V-1.3B-Diffusers`, `Wan-AI/Wan2.2-T2V-A14B-Diffusers` |
| Image-to-Video | `Wan-AI/Wan2.2-TI2V-5B-Diffusers`, `Wan-AI/Wan2.2-I2V-A14B-Diffusers` |
To run a non-default model, pass `--model` to any launch script: To run a non-default model, pass `--model` to any launch script:
...@@ -159,6 +161,47 @@ The `/v1/videos` endpoint also accepts NVIDIA extensions via the `nvext` field f ...@@ -159,6 +161,47 @@ The `/v1/videos` endpoint also accepts NVIDIA extensions via the `nvext` field f
| `nvext.num_inference_steps` | Number of denoising steps | 50 | | `nvext.num_inference_steps` | Number of denoising steps | 50 |
| `nvext.guidance_scale` | CFG guidance scale | 5.0 | | `nvext.guidance_scale` | CFG guidance scale | 5.0 |
| `nvext.seed` | Random seed for reproducibility | -- | | `nvext.seed` | Random seed for reproducibility | -- |
| `nvext.boundary_ratio` | MoE expert switching boundary (I2V) | 0.875 |
| `nvext.guidance_scale_2` | CFG scale for low-noise expert (I2V) | 1.0 |
## Image-to-Video
Image-to-video (I2V) uses the same `/v1/videos` endpoint as text-to-video, with an additional `input_reference` field that provides the source image. The image can be an HTTP URL, a base64 data URI, or a local file path.
Launch with the provided script using `Wan-AI/Wan2.2-TI2V-5B-Diffusers`:
```bash
bash examples/backends/vllm/launch/agg_omni_i2v.sh
```
Generate a video from an image:
```bash
curl -s http://localhost:8000/v1/videos \
-H "Content-Type: application/json" \
-d '{
"model": "Wan-AI/Wan2.2-TI2V-5B-Diffusers",
"prompt": "A bear playing with yarn, smooth motion",
"input_reference": "https://example.com/bear.png",
"size": "832x480",
"response_format": "url",
"nvext": {
"num_inference_steps": 40,
"num_frames": 33,
"guidance_scale": 1.0,
"boundary_ratio": 0.875,
"guidance_scale_2": 1.0,
"seed": 42
}
}'
```
The `input_reference` field accepts:
- **HTTP/HTTPS URL**: `"https://example.com/image.png"`
- **Base64 data URI**: `"data:image/png;base64,iVBORw0KGgo..."`
- **Local file path**: `"/path/to/image.png"` or `"file:///path/to/image.png"`
The I2V-specific `nvext` fields (`boundary_ratio`, `guidance_scale_2`) control the dual-expert MoE denoising schedule in Wan2.x models. See [Wan2.2-I2V model card](https://huggingface.co/Wan-AI/Wan2.2-I2V-A14B-Diffusers) for details.
## CLI Reference ## CLI Reference
...@@ -192,6 +235,6 @@ Omni pipelines are configured via YAML stage configs. See [`examples/backends/vl ...@@ -192,6 +235,6 @@ Omni pipelines are configured via YAML stage configs. See [`examples/backends/vl
## Current Limitations ## Current Limitations
- Only text prompts are supported as input (no multimodal input yet). - Image input is supported only for I2V via `input_reference` in `/v1/videos`. Other endpoints accept text prompts only.
- KV cache events are not published for omni workers. - KV cache events are not published for omni workers.
- Each worker supports a single output modality at a time. - Each worker supports a single output modality at a time.
#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Launch an aggregated vLLM-Omni deployment for image-to-video (I2V).
#
# Usage:
# bash agg_omni_i2v.sh [OPTIONS]
#
# Options:
# --model <model> Model to use (default: Wan-AI/Wan2.2-TI2V-5B-Diffusers)
# Any other flags are forwarded to the vLLM worker.
set -e
trap 'echo Cleaning up...; kill 0' EXIT
MODEL="Wan-AI/Wan2.2-TI2V-5B-Diffusers"
# Parse command line arguments
EXTRA_ARGS=()
while [[ $# -gt 0 ]]; do
case $1 in
--model)
if [[ $# -lt 2 || "$2" == --* ]]; then
echo "Error: --model requires a value" >&2
exit 1
fi
MODEL="$2"
shift 2
;;
*)
EXTRA_ARGS+=("$1")
shift
;;
esac
done
echo "=========================================="
echo "Starting vLLM-Omni I2V Worker"
echo "Model: $MODEL"
echo "=========================================="
echo "Starting frontend on port ${DYN_HTTP_PORT:-8000}..."
python -m dynamo.frontend &
FRONTEND_PID=$!
sleep 2
echo "Starting Omni worker..."
DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT:-8081} \
python -m dynamo.vllm \
--model "$MODEL" \
--omni \
--output-modalities video \
--media-output-fs-url file:///tmp/dynamo_media \
"${EXTRA_ARGS[@]}"
...@@ -50,6 +50,16 @@ pub struct NvExt { ...@@ -50,6 +50,16 @@ pub struct NvExt {
#[serde(skip_serializing_if = "Option::is_none")] #[serde(skip_serializing_if = "Option::is_none")]
#[builder(default, setter(strip_option))] #[builder(default, setter(strip_option))]
pub seed: Option<i64>, pub seed: Option<i64>,
/// MoE expert switching boundary as a fraction of the denoising schedule (vLLM-Omni I2V).
#[serde(skip_serializing_if = "Option::is_none")]
#[builder(default, setter(strip_option))]
pub boundary_ratio: Option<f32>,
/// CFG scale for the low-noise expert (vLLM-Omni I2V dual-guidance).
#[serde(skip_serializing_if = "Option::is_none")]
#[builder(default, setter(strip_option))]
pub guidance_scale_2: Option<f32>,
} }
impl Default for NvExt { impl Default for NvExt {
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment