chore: install vllm-omni in vllm container (#6458)

Signed-off-by: ayushag <ayushag@nvidia.com>

chore: install vllm-omni in vllm container (#6458)
Signed-off-by: ayushag <ayushag@nvidia.com>
e94f4647 · Ayush Agarwal · GitHub · efa89448 · e94f4647 · e94f4647
Commit e94f4647 (unverified signature) · authored Feb 24, 2026 by Ayush Agarwal · committed by GitHub on Feb 24, 2026
7 changed files
--- a/components/src/dynamo/vllm/tests/test_vllm_omni_handler.py
+++ b/components/src/dynamo/vllm/tests/test_vllm_omni_handler.py
@@ -5,16 +5,11 @@ from unittest.mock import MagicMock, patch

 import pytest

-from dynamo.common.protocols.image_protocol import NvCreateImageRequest
-from dynamo.common.protocols.video_protocol import NvCreateVideoRequest
-from dynamo.common.utils.output_modalities import RequestType
-
 try:
-    from dynamo.vllm.omni.omni_handler import (
-        EngineInputs,
-        OmniHandler,
-        prepare_image_output,
-    )
+    from dynamo.common.protocols.image_protocol import NvCreateImageRequest
+    from dynamo.common.protocols.video_protocol import NvCreateVideoRequest
+    from dynamo.common.utils.output_modalities import RequestType
+    from dynamo.vllm.omni.omni_handler import EngineInputs, OmniHandler
 except ImportError:
    pytest.skip("vLLM omni dependencies not available", allow_module_level=True)

@@ -51,32 +46,40 @@ class TestEngineInputs:


 class TestPrepareImageOutput:
-    def test_b64_json(self):
+    @pytest.mark.asyncio
+    async def test_b64_json(self):
        """b64_json format returns data URI with base64 prefix."""
+        handler = _make_handler()
        img = MagicMock()
        img.save = lambda b, format: b.write(b"fake_png_data")
-        results = prepare_image_output([img], "b64_json")
+        results = await handler._prepare_image_output([img], "req-1", "b64_json")
        assert len(results) == 1
        assert results[0].startswith("data:image/png;base64,")

-    def test_b64_default_when_none(self):
+    @pytest.mark.asyncio
+    async def test_b64_default_when_none(self):
        """None response_format defaults to base64 encoding."""
+        handler = _make_handler()
        img = MagicMock()
        img.save = lambda b, format: b.write(b"data")
-        results = prepare_image_output([img], None)
+        results = await handler._prepare_image_output([img], "req-1", None)
        assert results[0].startswith("data:image/png;base64,")

-    def test_invalid_format(self):
+    @pytest.mark.asyncio
+    async def test_invalid_format(self):
        """Unsupported response_format raises ValueError."""
+        handler = _make_handler()
        with pytest.raises(ValueError, match="Invalid response format"):
-            prepare_image_output([MagicMock()], "invalid")
+            await handler._prepare_image_output([MagicMock()], "req-1", "invalid")

-    def test_multiple_images(self):
+    @pytest.mark.asyncio
+    async def test_multiple_images(self):
        """Multiple input images produce one output entry each."""
+        handler = _make_handler()
        imgs = [MagicMock() for _ in range(3)]
        for img in imgs:
            img.save = lambda b, format: b.write(b"px")
-        results = prepare_image_output(imgs, "b64_json")
+        results = await handler._prepare_image_output(imgs, "req-1", "b64_json")
        assert len(results) == 3


@@ -160,23 +163,25 @@ class TestFormatTextChunk:


 class TestFormatImageChunk:
-    def test_chat_completion_format(self):
+    @pytest.mark.asyncio
+    async def test_chat_completion_format(self):
        """Chat completion route returns image_url content parts."""
        handler = _make_handler()
        img = MagicMock()
        img.save = lambda b, format: b.write(b"px")
-        chunk = handler._format_image_chunk(
+        chunk = await handler._format_image_chunk(
            [img], "req-1", request_type=RequestType.CHAT_COMPLETION
        )
        assert chunk["object"] == "chat.completion.chunk"
        assert chunk["choices"][0]["delta"]["content"][0]["type"] == "image_url"

-    def test_image_generation_b64_format(self):
+    @pytest.mark.asyncio
+    async def test_image_generation_b64_format(self):
        """Image generation with b64_json format returns base64 data."""
        handler = _make_handler()
        img = MagicMock()
        img.save = lambda b, format: b.write(b"px")
-        chunk = handler._format_image_chunk(
+        chunk = await handler._format_image_chunk(
            [img],
            "req-1",
            response_format="b64_json",
@@ -184,12 +189,13 @@ class TestFormatImageChunk:
        )
        assert chunk["data"][0]["b64_json"] is not None

-    def test_image_generation_default_format_returns_b64(self):
+    @pytest.mark.asyncio
+    async def test_image_generation_default_format_returns_b64(self):
        """Image generation with response_format=None defaults to b64_json."""
        handler = _make_handler()
        img = MagicMock()
        img.save = lambda b, format: b.write(b"px")
-        chunk = handler._format_image_chunk(
+        chunk = await handler._format_image_chunk(
            [img],
            "req-1",
            response_format=None,
@@ -197,10 +203,11 @@ class TestFormatImageChunk:
        )
        assert chunk["data"][0]["b64_json"] is not None

-    def test_empty_images_returns_error(self):
+    @pytest.mark.asyncio
+    async def test_empty_images_returns_error(self):
        """Empty image list produces an error chunk."""
        handler = _make_handler()
-        chunk = handler._format_image_chunk([], "req-1")
+        chunk = await handler._format_image_chunk([], "req-1")
        assert "Error" in chunk["choices"][0]["delta"]["content"]



--- a/components/src/dynamo/vllm/tests/test_vllm_renderer_api.py
+++ b/components/src/dynamo/vllm/tests/test_vllm_renderer_api.py
@@ -358,7 +358,7 @@ class TestVllmRendererApi:
        position. vllm_processor.py constructs EngineCoreOutput by keyword
        and reads fields from EngineCoreRequest positionally.
        """
-        expected_request_fields = (
+        base_request_fields = (
            "request_id",
            "prompt_token_ids",
            "mm_features",
@@ -377,10 +377,14 @@ class TestVllmRendererApi:
            "resumable",
            "external_req_id",
        )
+        # vllm-omni monkey-patches EngineCoreRequest with an extra field
+        # (only installed on amd64, not arm64)
+        omni_fields = base_request_fields + ("additional_information",)
        actual_request_fields = EngineCoreRequest.__struct_fields__
-        assert actual_request_fields == expected_request_fields, (
+        assert actual_request_fields in (base_request_fields, omni_fields), (
            "EngineCoreRequest fields changed!\n"
-            f"Expected: {expected_request_fields}\n"
+            f"Expected (base): {base_request_fields}\n"
+            f"Expected (omni): {omni_fields}\n"
            f"Actual:          {actual_request_fields}\n"
            "Update request construction in components/src/dynamo/frontend/vllm_processor.py"
        )

--- a/container/context.yaml
+++ b/container/context.yaml
@@ -45,6 +45,7 @@ vllm:
  vllm_ref: v0.15.1
  flashinf_ref: v0.6.1
  lmcache_ref: 0.3.13
+  vllm_omni_ref: "0.14.0"
  max_jobs: "10"
  enable_media_ffmpeg: "true"
  enable_gpu_memory_service: "true"

--- a/container/deps/vllm/install_vllm.sh
+++ b/container/deps/vllm/install_vllm.sh
@@ -6,8 +6,9 @@
 # Installation order:
 # 1. LMCache (installed first so vLLM's dependencies take precedence)
 # 2. vLLM
-# 3. DeepGEMM
-# 4. EP kernels
+# 3. vLLM-Omni
+# 4. DeepGEMM
+# 5. EP kernels

 set -euo pipefail

@@ -25,6 +26,7 @@ DEEPGEMM_REF=""
 CUDA_VERSION="12.9"
 FLASHINF_REF="v0.6.1"
 LMCACHE_REF="0.3.13"
+VLLM_OMNI_REF="0.14.0"

 while [[ $# -gt 0 ]]; do
    case $1 in
@@ -56,6 +58,10 @@ while [[ $# -gt 0 ]]; do
            LMCACHE_REF="$2"
            shift 2
            ;;
+        --vllm-omni-ref)
+            VLLM_OMNI_REF="$2"
+            shift 2
+            ;;
        --torch-cuda-arch-list)
            TORCH_CUDA_ARCH_LIST="$2"
            shift 2
@@ -65,7 +71,7 @@ while [[ $# -gt 0 ]]; do
            shift 2
            ;;
        -h|--help)
-            echo "Usage: $0 [--vllm-ref REF] [--max-jobs NUM] [--arch ARCH] [--deepgemm-ref REF] [--flashinf-ref REF] [--lmcache-ref REF] [--torch-cuda-arch-list LIST] [--cuda-version VERSION]"
+            echo "Usage: $0 [--vllm-ref REF] [--max-jobs NUM] [--arch ARCH] [--deepgemm-ref REF] [--flashinf-ref REF] [--lmcache-ref REF] [--vllm-omni-ref REF] [--torch-cuda-arch-list LIST] [--cuda-version VERSION]"
            echo "Options:"
            echo "  --vllm-ref REF      vLLM release version (default: ${VLLM_REF})"
            echo "  --max-jobs NUM      Maximum parallel jobs (default: ${MAX_JOBS})"
@@ -74,6 +80,7 @@ while [[ $# -gt 0 ]]; do
            echo "  --deepgemm-ref REF  DeepGEMM git ref (default: ${DEEPGEMM_REF})"
            echo "  --flashinf-ref REF  FlashInfer version (default: ${FLASHINF_REF})"
            echo "  --lmcache-ref REF   LMCache version (default: ${LMCACHE_REF})"
+            echo "  --vllm-omni-ref REF vLLM-Omni version (default: ${VLLM_OMNI_REF})"
            echo "  --torch-cuda-arch-list LIST  CUDA architectures (default: ${TORCH_CUDA_ARCH_LIST})"
            echo "  --cuda-version VERSION  CUDA version (default: ${CUDA_VERSION})"
            exit 0
@@ -160,6 +167,22 @@ else
 fi
 echo "✓ vLLM installation completed"

+echo "\n=== Installing vLLM-Omni ==="
+# vLLM-Omni is installed only when a version is set and the target arch is amd64.
+if [ -n "$VLLM_OMNI_REF" ] && [ "$ARCH" = "amd64" ]; then
+    # Save original vllm entrypoint before vllm-omni overwrites it.
+    # `command -v` is a shell builtin, so this does not depend on `which` being installed.
+    VLLM_BIN=$(command -v vllm)
+    cp "$VLLM_BIN" /tmp/vllm-entrypoint-backup
+    uv pip install "vllm-omni==${VLLM_OMNI_REF}"
+    # Restore original vllm CLI entrypoint (vllm-omni replaces it with its own)
+    cp /tmp/vllm-entrypoint-backup "$VLLM_BIN"
+    rm -f /tmp/vllm-entrypoint-backup
+    echo "✓ vLLM-Omni ${VLLM_OMNI_REF} installed (original vllm entrypoint preserved)"
+else
+    echo "⚠ Skipping vLLM-Omni (no ref provided or ARM64 not supported)"
+fi
+
 echo "\n=== Installing DeepGEMM ==="
 cd $INSTALLATION_DIR/vllm/tools
 if [ -n "$DEEPGEMM_REF" ]; then

--- a/container/templates/args.Dockerfile
+++ b/container/templates/args.Dockerfile
@@ -71,6 +71,7 @@ ARG MAX_JOBS={{ context.vllm.max_jobs }}
 # FlashInfer only respected when building vLLM from source, ie when VLLM_REF does not start with 'v' or for arm64 builds
 ARG FLASHINF_REF={{ context.vllm.flashinf_ref }}
 ARG LMCACHE_REF={{ context.vllm.lmcache_ref }}
+ARG VLLM_OMNI_REF={{ context.vllm.vllm_omni_ref }}

 # If left blank, then we will fallback to vLLM defaults
 ARG DEEPGEMM_REF=""

--- a/container/templates/vllm_framework.Dockerfile
+++ b/container/templates/vllm_framework.Dockerfile
@@ -68,6 +68,7 @@ ARG VLLM_GIT_URL
 ARG DEEPGEMM_REF
 ARG FLASHINF_REF
 ARG LMCACHE_REF
+ARG VLLM_OMNI_REF
 ARG CUDA_VERSION

 ARG MAX_JOBS
@@ -88,6 +89,7 @@ RUN --mount=type=bind,source=./container/deps/,target=/tmp/deps \
        ${DEEPGEMM_REF:+--deepgemm-ref "$DEEPGEMM_REF"} \
        ${FLASHINF_REF:+--flashinf-ref "$FLASHINF_REF"} \
        ${LMCACHE_REF:+--lmcache-ref "$LMCACHE_REF"} \
+        ${VLLM_OMNI_REF:+--vllm-omni-ref "$VLLM_OMNI_REF"} \
        --cuda-version $CUDA_VERSION

 ENV LD_LIBRARY_PATH=\

--- a/container/templates/vllm_runtime.Dockerfile
+++ b/container/templates/vllm_runtime.Dockerfile
@@ -140,6 +140,9 @@ COPY --chmod=775 --chown=dynamo:0 --from=framework ${SITE_PACKAGES}/nvidia ${SIT
 COPY --chmod=775 --chown=dynamo:0 --from=framework ${SITE_PACKAGES}/flashinfer_jit_cache ${SITE_PACKAGES}/flashinfer_jit_cache
 COPY --chmod=775 --chown=dynamo:0 --from=framework ${SITE_PACKAGES}/torch ${SITE_PACKAGES}/torch
 COPY --chmod=775 --chown=dynamo:0 --from=framework ${SITE_PACKAGES}/vllm ${SITE_PACKAGES}/vllm
+{% if platform == "amd64" -%}
+COPY --chmod=775 --chown=dynamo:0 --from=framework ${SITE_PACKAGES}/vllm_omni ${SITE_PACKAGES}/vllm_omni
+{% endif -%}
 COPY --chmod=775 --chown=dynamo:0 --from=framework ${SITE_PACKAGES}/triton ${SITE_PACKAGES}/triton
 COPY --chmod=775 --chown=dynamo:0 --from=framework ${SITE_PACKAGES}/flashinfer_cubin ${SITE_PACKAGES}/flashinfer_cubin
 # Remaining packages and venv structure (bin/, include/, share/, etc.)
@@ -148,6 +151,9 @@ COPY --chmod=775 --chown=dynamo:0 --from=framework \
    --exclude=lib/python*/site-packages/flashinfer_jit_cache \
    --exclude=lib/python*/site-packages/torch \
    --exclude=lib/python*/site-packages/vllm \
+{%- if platform == "amd64" %}
+    --exclude=lib/python*/site-packages/vllm_omni \
+{%- endif %}
    --exclude=lib/python*/site-packages/triton \
    --exclude=lib/python*/site-packages/flashinfer_cubin \
    ${VIRTUAL_ENV} ${VIRTUAL_ENV}