feat: vllm omni image to video support (#6530)

Signed-off-by: ayushag <ayushag@nvidia.com>

feat: vllm omni image to video support (#6530)
Signed-off-by: ayushag <ayushag@nvidia.com>
1182e207 · Ayush Agarwal · GitHub · 5c7e66ec · 1182e207 · 1182e207
Unverified Commit 1182e207 authored Mar 12, 2026 by Ayush Agarwal Committed by GitHub Mar 12, 2026
7 changed files
--- a/components/src/dynamo/common/multimodal/image_loader.py
+++ b/components/src/dynamo/common/multimodal/image_loader.py
@@ -33,6 +33,7 @@ from .http_client import get_http_client
 logger = logging.getLogger(__name__)
 # Constants for multimodal data variants
 URL_VARIANT_KEY: Final = "Url"
 DECODED_VARIANT_KEY: Final = "Decoded"
@@ -87,6 +88,16 @@ class ImageLoader:
                        raise ValueError("Empty response content from image URL")
                    image_data = BytesIO(response.content)
+            elif parsed_url.scheme in ("", "file"):
+                # Local file: either a plain filesystem path (no scheme) or a
+                # file:// URI. The file is opened on the host running this loader.
+                if parsed_url.scheme == "":
+                    # Plain path: use the caller's string unchanged.
+                    path = image_url
+                else:
+                    # URI paths are percent-encoded (e.g. "%20" for a space);
+                    # decode before handing the path to open(), otherwise
+                    # "file:///tmp/my%20image.png" never matches "my image.png".
+                    from urllib.parse import unquote
+                    path = unquote(parsed_url.path)
+                def _read_local_file(p: str) -> bytes:
+                    """Return the full contents of the file at ``p``."""
+                    with open(p, "rb") as f:
+                        return f.read()
+                # open()/read() are blocking calls, so run them in a worker
+                # thread instead of on the event loop.
+                image_bytes = await asyncio.to_thread(_read_local_file, path)
+                image_data = BytesIO(image_bytes)
            else:
                raise ValueError(f"Invalid image source scheme: {parsed_url.scheme}")

--- a/components/src/dynamo/common/protocols/video_protocol.py
+++ b/components/src/dynamo/common/protocols/video_protocol.py
@@ -40,6 +40,12 @@ class VideoNvExt(BaseModel):
    seed: Optional[int] = None
    """Random seed for reproducibility."""
+    boundary_ratio: Optional[float] = None
+    """MoE expert switching boundary as a fraction of the denoising schedule (vLLM-Omni I2V)."""
+    guidance_scale_2: Optional[float] = None
+    """CFG scale for the low-noise expert (vLLM-Omni I2V dual-guidance)."""
 class NvCreateVideoRequest(BaseModel):
    """Request for video generation (/v1/videos endpoint).

--- a/components/src/dynamo/vllm/omni/omni_handler.py
+++ b/components/src/dynamo/vllm/omni/omni_handler.py
@@ -10,10 +10,12 @@ from dataclasses import dataclass
 from io import BytesIO
 from typing import Any, AsyncGenerator, Dict, Optional, Union
+import PIL.Image
 from diffusers.utils import export_to_video
 from fsspec.implementations.dirfs import DirFileSystem
 from vllm_omni.inputs.data import OmniDiffusionSamplingParams, OmniTextPrompt
+from dynamo.common.multimodal import ImageLoader
 from dynamo.common.protocols.image_protocol import (
    ImageData,
    NvCreateImageRequest,
@@ -94,6 +96,7 @@ class OmniHandler(BaseOmniHandler):
        )
        self.media_output_fs = media_output_fs
        self.media_output_http_url = media_output_http_url
+        self._image_loader = ImageLoader()
    async def generate(
        self, request: Dict[str, Any], context
@@ -121,7 +124,30 @@ class OmniHandler(BaseOmniHandler):
        parsed_request, request_type = parse_request_type(
            request, self.config.output_modalities
        )
-        inputs = self.build_engine_inputs(parsed_request, request_type)
+        # Pre-load input image for I2V requests (async I/O before sync build)
+        image = None
+        if (
+            request_type == RequestType.VIDEO_GENERATION
+            and isinstance(parsed_request, NvCreateVideoRequest)
+            and parsed_request.input_reference
+        ):
+            try:
+                image = await self._image_loader.load_image(
+                    parsed_request.input_reference
+                )
+            except Exception as e:
+                logger.warning("Failed to load I2V input_reference: %s", e)
+                yield {
+                    "id": request_id,
+                    "object": "video",
+                    "model": self.config.model,
+                    "status": "failed",
+                    "error": f"Failed to load input_reference: {e}",
+                }
+                return
+        inputs = self.build_engine_inputs(parsed_request, request_type, image=image)
        generate_kwargs: Dict[str, Any] = {
            "prompt": inputs.prompt,
@@ -187,6 +213,7 @@ class OmniHandler(BaseOmniHandler):
            NvCreateImageRequest, NvCreateVideoRequest, Dict[str, Any]
        ],
        request_type: RequestType,
+        image: PIL.Image.Image | None = None,
    ) -> EngineInputs:
        """Convert a parsed request into AsyncOmni engine inputs.
@@ -194,6 +221,7 @@ class OmniHandler(BaseOmniHandler):
            parsed_request: Output from parse_request_type -- a Pydantic model
                for image/video requests, or a raw dict for chat completions.
            request_type: The RequestType determined by parse_request_type.
+            image: Pre-loaded PIL Image for I2V requests (from input_reference).
        Returns:
            EngineInputs ready for engine_client.generate().
@@ -203,7 +231,7 @@ class OmniHandler(BaseOmniHandler):
        elif request_type == RequestType.IMAGE_GENERATION:
            return self._engine_inputs_from_image(parsed_request)
        elif request_type == RequestType.VIDEO_GENERATION:
-            return self._engine_inputs_from_video(parsed_request)
+            return self._engine_inputs_from_video(parsed_request, image=image)
        elif request_type == RequestType.AUDIO_GENERATION:
            raise NotImplementedError("Audio generation is not yet supported")
@@ -264,8 +292,19 @@ class OmniHandler(BaseOmniHandler):
            response_format=req.response_format,
        )
-    def _engine_inputs_from_video(self, req: NvCreateVideoRequest) -> EngineInputs:
+    def _engine_inputs_from_video(
-        """Build engine inputs from an NvCreateVideoRequest."""
+        self,
+        req: NvCreateVideoRequest,
+        image: PIL.Image.Image | None = None,
+    ) -> EngineInputs:
+        """Build engine inputs from an NvCreateVideoRequest.
+        Args:
+            req: Parsed video generation request.
+            image: Pre-loaded PIL Image for I2V. When provided, the image is
+                attached to the prompt via ``multi_modal_data`` so vllm-omni's
+                I2V pipeline pre-process can use it.
+        """
        width, height = parse_size(req.size)
        nvext = req.nvext
@@ -287,6 +326,14 @@ class OmniHandler(BaseOmniHandler):
            else None,
        )
+        if image is not None:
+            prompt["multi_modal_data"] = {"image": image}
+            logger.info(
+                "I2V: attached image (%dx%d) to multi_modal_data",
+                image.size[0],
+                image.size[1],
+            )
        sp = OmniDiffusionSamplingParams(
            height=height,
            width=width,
@@ -299,6 +346,10 @@ class OmniHandler(BaseOmniHandler):
                sp.guidance_scale = nvext.guidance_scale
            if nvext.seed is not None:
                sp.seed = nvext.seed
+            if nvext.boundary_ratio is not None:
+                sp.boundary_ratio = nvext.boundary_ratio
+            if nvext.guidance_scale_2 is not None:
+                sp.guidance_scale_2 = nvext.guidance_scale_2
        if fps is not None:
            sp.fps = fps

--- a/components/src/dynamo/vllm/tests/test_vllm_omni_handler.py
+++ b/components/src/dynamo/vllm/tests/test_vllm_omni_handler.py
@@ -6,8 +6,10 @@ from unittest.mock import MagicMock, patch
 import pytest
 try:
+    from PIL import Image
    from dynamo.common.protocols.image_protocol import NvCreateImageRequest
-    from dynamo.common.protocols.video_protocol import NvCreateVideoRequest
+    from dynamo.common.protocols.video_protocol import NvCreateVideoRequest, VideoNvExt
    from dynamo.common.utils.output_modalities import RequestType
    from dynamo.vllm.omni.omni_handler import EngineInputs, OmniHandler
 except ImportError:
@@ -247,3 +249,60 @@ class TestFormatVideoChunk:
            chunk = await handler._format_video_chunk([MagicMock()], "req-1", fps=16)
        assert chunk["status"] == "failed"
        assert "boom" in chunk["error"]
+class TestI2VEngineInputs:
+    """Tests for image-to-video: multi_modal_data attachment, I2V nvext params, and protocol fields."""
+    def test_t2v_no_multi_modal_data_and_i2v_attaches_image(self):
+        """T2V has no multi_modal_data; I2V attaches image to prompt."""
+        handler = _make_handler()
+        req = NvCreateVideoRequest(
+            prompt="a drone", model="test", size="832x480", seconds=2
+        )
+        # T2V: no image is passed, so the prompt must not carry multi_modal_data.
+        t2v = handler.build_engine_inputs(req, RequestType.VIDEO_GENERATION)
+        assert "multi_modal_data" not in t2v.prompt
+        # I2V: the very same PIL object (identity, not a copy) is attached.
+        img = Image.new("RGB", (64, 64), color="red")
+        i2v = handler.build_engine_inputs(req, RequestType.VIDEO_GENERATION, image=img)
+        assert i2v.prompt["multi_modal_data"]["image"] is img
+    def test_i2v_nvext_params_on_sampling_params(self):
+        """boundary_ratio and guidance_scale_2 are forwarded to sampling params."""
+        handler = _make_handler()
+        req = NvCreateVideoRequest(
+            prompt="bear",
+            model="test",
+            size="832x480",
+            nvext=VideoNvExt(
+                boundary_ratio=0.875, guidance_scale_2=1.0, num_inference_steps=40
+            ),
+        )
+        sp = handler.build_engine_inputs(
+            req, RequestType.VIDEO_GENERATION
+        ).sampling_params_list[0]
+        assert sp.boundary_ratio == 0.875
+        assert sp.guidance_scale_2 == 1.0
+        assert sp.num_inference_steps == 40
+    def test_i2v_protocol_roundtrip(self):
+        """VideoNvExt and NvCreateVideoRequest serialize/deserialize I2V fields correctly."""
+        req = NvCreateVideoRequest(
+            prompt="bear playing",
+            model="Wan-AI/Wan2.2-TI2V-5B-Diffusers",
+            input_reference="/tmp/bear.png",
+            size="832x480",
+            nvext=VideoNvExt(boundary_ratio=0.9, guidance_scale_2=2.0, seed=42),
+        )
+        # Serialize: the I2V fields must appear in the dumped dict.
+        data = req.model_dump()
+        assert data["input_reference"] == "/tmp/bear.png"
+        assert data["nvext"]["boundary_ratio"] == 0.9
+        assert data["nvext"]["guidance_scale_2"] == 2.0
+        assert data["nvext"]["seed"] == 42
+        # Deserialize: feeding the dump back through validation completes the
+        # round trip and must preserve the same I2V fields.
+        restored = NvCreateVideoRequest.model_validate(data)
+        assert restored.input_reference == "/tmp/bear.png"
+        assert restored.nvext is not None
+        assert restored.nvext.boundary_ratio == 0.9
+        assert restored.nvext.guidance_scale_2 == 2.0
+        assert restored.nvext.seed == 42
+        # Defaults are None
+        empty = VideoNvExt()
+        assert empty.boundary_ratio is None
+        assert empty.guidance_scale_2 is None
--- a/docs/backends/vllm/vllm-omni.md
+++ b/docs/backends/vllm/vllm-omni.md
@@ -25,6 +25,7 @@ pip install git+https://github.com/vllm-project/vllm-omni.git@v0.16.0rc1
 | Text-to-Text | `/v1/chat/completions` | `text` (default) |
 | Text-to-Image | `/v1/chat/completions`, `/v1/images/generations` | `image` |
 | Text-to-Video | `/v1/videos` | `video` |
+| Image-to-Video | `/v1/videos` | `video` |
 The `--output-modalities` flag determines which endpoint(s) the worker registers. When set to `image`, both `/v1/chat/completions` (returns inline base64 images) and `/v1/images/generations` are available. When set to `video`, the worker serves `/v1/videos`.
@@ -35,6 +36,7 @@ The `--output-modalities` flag determines which endpoint(s) the worker registers
 | Text-to-Text | `Qwen/Qwen2.5-Omni-7B` |
 | Text-to-Image | `Qwen/Qwen-Image`, `AIDC-AI/Ovis-Image-7B` |
 | Text-to-Video | `Wan-AI/Wan2.1-T2V-1.3B-Diffusers`, `Wan-AI/Wan2.2-T2V-A14B-Diffusers` |
+| Image-to-Video | `Wan-AI/Wan2.2-TI2V-5B-Diffusers`, `Wan-AI/Wan2.2-I2V-A14B-Diffusers` |
 To run a non-default model, pass `--model` to any launch script:
@@ -159,6 +161,47 @@ The `/v1/videos` endpoint also accepts NVIDIA extensions via the `nvext` field f
 | `nvext.num_inference_steps` | Number of denoising steps | 50 |
 | `nvext.guidance_scale` | CFG guidance scale | 5.0 |
 | `nvext.seed` | Random seed for reproducibility | -- |
+| `nvext.boundary_ratio` | MoE expert switching boundary (I2V) | 0.875 |
+| `nvext.guidance_scale_2` | CFG scale for low-noise expert (I2V) | 1.0 |
+## Image-to-Video
+Image-to-video (I2V) uses the same `/v1/videos` endpoint as text-to-video, with an additional `input_reference` field that provides the source image. The image can be an HTTP URL, a base64 data URI, or a local file path.
+Launch with the provided script using `Wan-AI/Wan2.2-TI2V-5B-Diffusers`:
+```bash
+bash examples/backends/vllm/launch/agg_omni_i2v.sh
+```
+Generate a video from an image:
+```bash
+curl -s http://localhost:8000/v1/videos \
+  -H "Content-Type: application/json" \
+  -d '{
+    "model": "Wan-AI/Wan2.2-TI2V-5B-Diffusers",
+    "prompt": "A bear playing with yarn, smooth motion",
+    "input_reference": "https://example.com/bear.png",
+    "size": "832x480",
+    "response_format": "url",
+    "nvext": {
+      "num_inference_steps": 40,
+      "num_frames": 33,
+      "guidance_scale": 1.0,
+      "boundary_ratio": 0.875,
+      "guidance_scale_2": 1.0,
+      "seed": 42
+    }
+  }'
+```
+The `input_reference` field accepts:
+- **HTTP/HTTPS URL**: `"https://example.com/image.png"`
+- **Base64 data URI**: `"data:image/png;base64,iVBORw0KGgo..."`
+- **Local file path**: `"/path/to/image.png"` or `"file:///path/to/image.png"` (the path is read from the filesystem of the worker process, not the client)
+The I2V-specific `nvext` fields (`boundary_ratio`, `guidance_scale_2`) control the dual-expert MoE denoising schedule in Wan2.x models. See the [Wan2.2-I2V model card](https://huggingface.co/Wan-AI/Wan2.2-I2V-A14B-Diffusers) for details.
 ## CLI Reference
@@ -192,6 +235,6 @@ Omni pipelines are configured via YAML stage configs. See [`examples/backends/vl
 ## Current Limitations
- Only text prompts are supported as input (no multimodal input yet).
+- Image input is supported only for I2V via `input_reference` in `/v1/videos`. Other endpoints accept text prompts only.
 - KV cache events are not published for omni workers.
 - Each worker supports a single output modality at a time.
--- a/examples/backends/vllm/launch/agg_omni_i2v.sh
+++ b/examples/backends/vllm/launch/agg_omni_i2v.sh
+#!/bin/bash
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Launch an aggregated vLLM-Omni deployment for image-to-video (I2V).
+#
+# Starts the Dynamo frontend in the background, then runs one vLLM-Omni worker
+# in the foreground with video output enabled.
+#
+# Usage:
+#   bash agg_omni_i2v.sh [OPTIONS]
+#
+# Options:
+#   --model <model>   Model to use (default: Wan-AI/Wan2.2-TI2V-5B-Diffusers)
+#   Any other flags are forwarded to the vLLM worker.
+#
+# Environment:
+#   DYN_HTTP_PORT     Port printed in the startup message (default: 8000);
+#                     presumably also read by the frontend itself -- confirm.
+#   DYN_SYSTEM_PORT   Set in the worker's environment (default: 8081).
+# Abort on the first command that fails.
+set -e
+# On exit (normal or via set -e), signal every process in this script's process
+# group ("kill 0") so the backgrounded frontend does not outlive the script.
+trap 'echo Cleaning up...; kill 0' EXIT
+# Default model; overridden by --model.
+MODEL="Wan-AI/Wan2.2-TI2V-5B-Diffusers"
+# Parse command line arguments
+# Everything other than --model is collected here and passed to the worker unchanged.
+EXTRA_ARGS=()
+while [[ $# -gt 0 ]]; do
+    case $1 in
+        --model)
+            # Reject a missing value, or a value that looks like another flag.
+            if [[ $# -lt 2 || "$2" == --* ]]; then
+                echo "Error: --model requires a value" >&2
+                exit 1
+            fi
+            MODEL="$2"
+            shift 2
+            ;;
+        *)
+            EXTRA_ARGS+=("$1")
+            shift
+            ;;
+    esac
+done
+echo "=========================================="
+echo "Starting vLLM-Omni I2V Worker"
+echo "Model: $MODEL"
+echo "=========================================="
+echo "Starting frontend on port ${DYN_HTTP_PORT:-8000}..."
+# Frontend runs in the background; its PID is recorded but not used elsewhere
+# in this script (cleanup goes through the EXIT trap instead).
+python -m dynamo.frontend &
+FRONTEND_PID=$!
+# Fixed delay before starting the worker.
+# NOTE(review): this is a pause, not a readiness check on the frontend.
+sleep 2
+echo "Starting Omni worker..."
+# Worker runs in the foreground; when it exits the script ends and the EXIT
+# trap fires. Media output is directed to file:///tmp/dynamo_media.
+DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT:-8081} \
+    python -m dynamo.vllm \
+    --model "$MODEL" \
+    --omni \
+    --output-modalities video \
+    --media-output-fs-url file:///tmp/dynamo_media \
+    "${EXTRA_ARGS[@]}"
--- a/lib/llm/src/protocols/openai/videos/nvext.rs
+++ b/lib/llm/src/protocols/openai/videos/nvext.rs
@@ -50,6 +50,16 @@ pub struct NvExt {
    #[serde(skip_serializing_if = "Option::is_none")]
    #[builder(default, setter(strip_option))]
    pub seed: Option<i64>,
+    /// MoE expert switching boundary as a fraction of the denoising schedule (vLLM-Omni I2V).
+    #[serde(skip_serializing_if = "Option::is_none")]
+    #[builder(default, setter(strip_option))]
+    pub boundary_ratio: Option<f32>,
+    /// CFG scale for the low-noise expert (vLLM-Omni I2V dual-guidance).
+    #[serde(skip_serializing_if = "Option::is_none")]
+    #[builder(default, setter(strip_option))]
+    pub guidance_scale_2: Option<f32>,
 }
 impl Default for NvExt {