Unverified Commit 1182e207 authored by Ayush Agarwal's avatar Ayush Agarwal Committed by GitHub
Browse files

feat: vllm omni image to video support (#6530)


Signed-off-by: default avatarayushag <ayushag@nvidia.com>
parent 5c7e66ec
......@@ -33,6 +33,7 @@ from .http_client import get_http_client
logger = logging.getLogger(__name__)
# Constants for multimodal data variants
URL_VARIANT_KEY: Final = "Url"
DECODED_VARIANT_KEY: Final = "Decoded"
......@@ -87,6 +88,16 @@ class ImageLoader:
raise ValueError("Empty response content from image URL")
image_data = BytesIO(response.content)
elif parsed_url.scheme in ("", "file"):
# Local file path (plain path or file:// URI)
path = image_url if parsed_url.scheme == "" else parsed_url.path
def _read_local_file(p: str) -> bytes:
with open(p, "rb") as f:
return f.read()
image_bytes = await asyncio.to_thread(_read_local_file, path)
image_data = BytesIO(image_bytes)
else:
raise ValueError(f"Invalid image source scheme: {parsed_url.scheme}")
......
......@@ -40,6 +40,12 @@ class VideoNvExt(BaseModel):
seed: Optional[int] = None
"""Random seed for reproducibility."""
boundary_ratio: Optional[float] = None
"""MoE expert switching boundary as a fraction of the denoising schedule (vLLM-Omni I2V)."""
guidance_scale_2: Optional[float] = None
"""CFG scale for the low-noise expert (vLLM-Omni I2V dual-guidance)."""
class NvCreateVideoRequest(BaseModel):
"""Request for video generation (/v1/videos endpoint).
......
......@@ -10,10 +10,12 @@ from dataclasses import dataclass
from io import BytesIO
from typing import Any, AsyncGenerator, Dict, Optional, Union
import PIL.Image
from diffusers.utils import export_to_video
from fsspec.implementations.dirfs import DirFileSystem
from vllm_omni.inputs.data import OmniDiffusionSamplingParams, OmniTextPrompt
from dynamo.common.multimodal import ImageLoader
from dynamo.common.protocols.image_protocol import (
ImageData,
NvCreateImageRequest,
......@@ -94,6 +96,7 @@ class OmniHandler(BaseOmniHandler):
)
self.media_output_fs = media_output_fs
self.media_output_http_url = media_output_http_url
self._image_loader = ImageLoader()
async def generate(
self, request: Dict[str, Any], context
......@@ -121,7 +124,30 @@ class OmniHandler(BaseOmniHandler):
parsed_request, request_type = parse_request_type(
request, self.config.output_modalities
)
inputs = self.build_engine_inputs(parsed_request, request_type)
# Pre-load input image for I2V requests (async I/O before sync build)
image = None
if (
request_type == RequestType.VIDEO_GENERATION
and isinstance(parsed_request, NvCreateVideoRequest)
and parsed_request.input_reference
):
try:
image = await self._image_loader.load_image(
parsed_request.input_reference
)
except Exception as e:
logger.warning("Failed to load I2V input_reference: %s", e)
yield {
"id": request_id,
"object": "video",
"model": self.config.model,
"status": "failed",
"error": f"Failed to load input_reference: {e}",
}
return
inputs = self.build_engine_inputs(parsed_request, request_type, image=image)
generate_kwargs: Dict[str, Any] = {
"prompt": inputs.prompt,
......@@ -187,6 +213,7 @@ class OmniHandler(BaseOmniHandler):
NvCreateImageRequest, NvCreateVideoRequest, Dict[str, Any]
],
request_type: RequestType,
image: PIL.Image.Image | None = None,
) -> EngineInputs:
"""Convert a parsed request into AsyncOmni engine inputs.
......@@ -194,6 +221,7 @@ class OmniHandler(BaseOmniHandler):
parsed_request: Output from parse_request_type -- a Pydantic model
for image/video requests, or a raw dict for chat completions.
request_type: The RequestType determined by parse_request_type.
image: Pre-loaded PIL Image for I2V requests (from input_reference).
Returns:
EngineInputs ready for engine_client.generate().
......@@ -203,7 +231,7 @@ class OmniHandler(BaseOmniHandler):
elif request_type == RequestType.IMAGE_GENERATION:
return self._engine_inputs_from_image(parsed_request)
elif request_type == RequestType.VIDEO_GENERATION:
return self._engine_inputs_from_video(parsed_request)
return self._engine_inputs_from_video(parsed_request, image=image)
elif request_type == RequestType.AUDIO_GENERATION:
raise NotImplementedError("Audio generation is not yet supported")
......@@ -264,8 +292,19 @@ class OmniHandler(BaseOmniHandler):
response_format=req.response_format,
)
def _engine_inputs_from_video(self, req: NvCreateVideoRequest) -> EngineInputs:
"""Build engine inputs from an NvCreateVideoRequest."""
def _engine_inputs_from_video(
self,
req: NvCreateVideoRequest,
image: PIL.Image.Image | None = None,
) -> EngineInputs:
"""Build engine inputs from an NvCreateVideoRequest.
Args:
req: Parsed video generation request.
image: Pre-loaded PIL Image for I2V. When provided, the image is
attached to the prompt via ``multi_modal_data`` so vllm-omni's
I2V pipeline pre-process can use it.
"""
width, height = parse_size(req.size)
nvext = req.nvext
......@@ -287,6 +326,14 @@ class OmniHandler(BaseOmniHandler):
else None,
)
if image is not None:
prompt["multi_modal_data"] = {"image": image}
logger.info(
"I2V: attached image (%dx%d) to multi_modal_data",
image.size[0],
image.size[1],
)
sp = OmniDiffusionSamplingParams(
height=height,
width=width,
......@@ -299,6 +346,10 @@ class OmniHandler(BaseOmniHandler):
sp.guidance_scale = nvext.guidance_scale
if nvext.seed is not None:
sp.seed = nvext.seed
if nvext.boundary_ratio is not None:
sp.boundary_ratio = nvext.boundary_ratio
if nvext.guidance_scale_2 is not None:
sp.guidance_scale_2 = nvext.guidance_scale_2
if fps is not None:
sp.fps = fps
......
......@@ -6,8 +6,10 @@ from unittest.mock import MagicMock, patch
import pytest
try:
from PIL import Image
from dynamo.common.protocols.image_protocol import NvCreateImageRequest
from dynamo.common.protocols.video_protocol import NvCreateVideoRequest
from dynamo.common.protocols.video_protocol import NvCreateVideoRequest, VideoNvExt
from dynamo.common.utils.output_modalities import RequestType
from dynamo.vllm.omni.omni_handler import EngineInputs, OmniHandler
except ImportError:
......@@ -247,3 +249,60 @@ class TestFormatVideoChunk:
chunk = await handler._format_video_chunk([MagicMock()], "req-1", fps=16)
assert chunk["status"] == "failed"
assert "boom" in chunk["error"]
class TestI2VEngineInputs:
"""Tests for image-to-video: multi_modal_data attachment, I2V nvext params, and protocol fields."""
def test_t2v_no_multi_modal_data_and_i2v_attaches_image(self):
"""T2V has no multi_modal_data; I2V attaches image to prompt."""
handler = _make_handler()
req = NvCreateVideoRequest(
prompt="a drone", model="test", size="832x480", seconds=2
)
# T2V: no image
t2v = handler.build_engine_inputs(req, RequestType.VIDEO_GENERATION)
assert "multi_modal_data" not in t2v.prompt
# I2V: image attached
img = Image.new("RGB", (64, 64), color="red")
i2v = handler.build_engine_inputs(req, RequestType.VIDEO_GENERATION, image=img)
assert i2v.prompt["multi_modal_data"]["image"] is img
def test_i2v_nvext_params_on_sampling_params(self):
"""boundary_ratio and guidance_scale_2 are forwarded to sampling params."""
handler = _make_handler()
req = NvCreateVideoRequest(
prompt="bear",
model="test",
size="832x480",
nvext=VideoNvExt(
boundary_ratio=0.875, guidance_scale_2=1.0, num_inference_steps=40
),
)
sp = handler.build_engine_inputs(
req, RequestType.VIDEO_GENERATION
).sampling_params_list[0]
assert sp.boundary_ratio == 0.875
assert sp.guidance_scale_2 == 1.0
assert sp.num_inference_steps == 40
def test_i2v_protocol_roundtrip(self):
"""VideoNvExt and NvCreateVideoRequest serialize/deserialize I2V fields correctly."""
req = NvCreateVideoRequest(
prompt="bear playing",
model="Wan-AI/Wan2.2-TI2V-5B-Diffusers",
input_reference="/tmp/bear.png",
size="832x480",
nvext=VideoNvExt(boundary_ratio=0.9, guidance_scale_2=2.0, seed=42),
)
data = req.model_dump()
assert data["input_reference"] == "/tmp/bear.png"
assert data["nvext"]["boundary_ratio"] == 0.9
assert data["nvext"]["guidance_scale_2"] == 2.0
# Defaults are None
empty = VideoNvExt()
assert empty.boundary_ratio is None
assert empty.guidance_scale_2 is None
......@@ -25,6 +25,7 @@ pip install git+https://github.com/vllm-project/vllm-omni.git@v0.16.0rc1
| Text-to-Text | `/v1/chat/completions` | `text` (default) |
| Text-to-Image | `/v1/chat/completions`, `/v1/images/generations` | `image` |
| Text-to-Video | `/v1/videos` | `video` |
| Image-to-Video | `/v1/videos` | `video` |
The `--output-modalities` flag determines which endpoint(s) the worker registers. When set to `image`, both `/v1/chat/completions` (returns inline base64 images) and `/v1/images/generations` are available. When set to `video`, the worker serves `/v1/videos`.
......@@ -35,6 +36,7 @@ The `--output-modalities` flag determines which endpoint(s) the worker registers
| Text-to-Text | `Qwen/Qwen2.5-Omni-7B` |
| Text-to-Image | `Qwen/Qwen-Image`, `AIDC-AI/Ovis-Image-7B` |
| Text-to-Video | `Wan-AI/Wan2.1-T2V-1.3B-Diffusers`, `Wan-AI/Wan2.2-T2V-A14B-Diffusers` |
| Image-to-Video | `Wan-AI/Wan2.2-TI2V-5B-Diffusers`, `Wan-AI/Wan2.2-I2V-A14B-Diffusers` |
To run a non-default model, pass `--model` to any launch script:
......@@ -159,6 +161,47 @@ The `/v1/videos` endpoint also accepts NVIDIA extensions via the `nvext` field f
| `nvext.num_inference_steps` | Number of denoising steps | 50 |
| `nvext.guidance_scale` | CFG guidance scale | 5.0 |
| `nvext.seed` | Random seed for reproducibility | -- |
| `nvext.boundary_ratio` | MoE expert switching boundary (I2V) | 0.875 |
| `nvext.guidance_scale_2` | CFG scale for low-noise expert (I2V) | 1.0 |
## Image-to-Video
Image-to-video (I2V) uses the same `/v1/videos` endpoint as text-to-video, with an additional `input_reference` field that provides the source image. The image can be an HTTP URL, a base64 data URI, or a local file path.
Launch with the provided script using `Wan-AI/Wan2.2-TI2V-5B-Diffusers`:
```bash
bash examples/backends/vllm/launch/agg_omni_i2v.sh
```
Generate a video from an image:
```bash
curl -s http://localhost:8000/v1/videos \
-H "Content-Type: application/json" \
-d '{
"model": "Wan-AI/Wan2.2-TI2V-5B-Diffusers",
"prompt": "A bear playing with yarn, smooth motion",
"input_reference": "https://example.com/bear.png",
"size": "832x480",
"response_format": "url",
"nvext": {
"num_inference_steps": 40,
"num_frames": 33,
"guidance_scale": 1.0,
"boundary_ratio": 0.875,
"guidance_scale_2": 1.0,
"seed": 42
}
}'
```
The `input_reference` field accepts:
- **HTTP/HTTPS URL**: `"https://example.com/image.png"`
- **Base64 data URI**: `"data:image/png;base64,iVBORw0KGgo..."`
- **Local file path**: `"/path/to/image.png"` or `"file:///path/to/image.png"`
The I2V-specific `nvext` fields (`boundary_ratio`, `guidance_scale_2`) control the dual-expert MoE denoising schedule in Wan2.x models. See [Wan2.2-I2V model card](https://huggingface.co/Wan-AI/Wan2.2-I2V-A14B-Diffusers) for details.
## CLI Reference
......@@ -192,6 +235,6 @@ Omni pipelines are configured via YAML stage configs. See [`examples/backends/vl
## Current Limitations
- Only text prompts are supported as input (no multimodal input yet).
- Image input is supported only for I2V via `input_reference` in `/v1/videos`. Other endpoints accept text prompts only.
- KV cache events are not published for omni workers.
- Each worker supports a single output modality at a time.
#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Launch an aggregated vLLM-Omni deployment for image-to-video (I2V).
#
# Usage:
# bash agg_omni_i2v.sh [OPTIONS]
#
# Options:
# --model <model> Model to use (default: Wan-AI/Wan2.2-TI2V-5B-Diffusers)
# Any other flags are forwarded to the vLLM worker.
set -e
trap 'echo Cleaning up...; kill 0' EXIT
MODEL="Wan-AI/Wan2.2-TI2V-5B-Diffusers"
# Parse command line arguments
EXTRA_ARGS=()
while [[ $# -gt 0 ]]; do
case $1 in
--model)
if [[ $# -lt 2 || "$2" == --* ]]; then
echo "Error: --model requires a value" >&2
exit 1
fi
MODEL="$2"
shift 2
;;
*)
EXTRA_ARGS+=("$1")
shift
;;
esac
done
echo "=========================================="
echo "Starting vLLM-Omni I2V Worker"
echo "Model: $MODEL"
echo "=========================================="
echo "Starting frontend on port ${DYN_HTTP_PORT:-8000}..."
python -m dynamo.frontend &
FRONTEND_PID=$!
sleep 2
echo "Starting Omni worker..."
DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT:-8081} \
python -m dynamo.vllm \
--model "$MODEL" \
--omni \
--output-modalities video \
--media-output-fs-url file:///tmp/dynamo_media \
"${EXTRA_ARGS[@]}"
......@@ -50,6 +50,16 @@ pub struct NvExt {
#[serde(skip_serializing_if = "Option::is_none")]
#[builder(default, setter(strip_option))]
pub seed: Option<i64>,
/// MoE expert switching boundary as a fraction of the denoising schedule (vLLM-Omni I2V).
#[serde(skip_serializing_if = "Option::is_none")]
#[builder(default, setter(strip_option))]
pub boundary_ratio: Option<f32>,
/// CFG scale for the low-noise expert (vLLM-Omni I2V dual-guidance).
#[serde(skip_serializing_if = "Option::is_none")]
#[builder(default, setter(strip_option))]
pub guidance_scale_2: Option<f32>,
}
impl Default for NvExt {
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment