Unverified commit e94f4647, authored by Ayush Agarwal and committed by GitHub
Browse files

chore: install vllm-omni in vllm container (#6458)


Signed-off-by: ayushag <ayushag@nvidia.com>
parent efa89448
......@@ -5,16 +5,11 @@ from unittest.mock import MagicMock, patch
import pytest
from dynamo.common.protocols.image_protocol import NvCreateImageRequest
from dynamo.common.protocols.video_protocol import NvCreateVideoRequest
from dynamo.common.utils.output_modalities import RequestType
try:
from dynamo.vllm.omni.omni_handler import (
EngineInputs,
OmniHandler,
prepare_image_output,
)
from dynamo.common.protocols.image_protocol import NvCreateImageRequest
from dynamo.common.protocols.video_protocol import NvCreateVideoRequest
from dynamo.common.utils.output_modalities import RequestType
from dynamo.vllm.omni.omni_handler import EngineInputs, OmniHandler
except ImportError:
pytest.skip("vLLM omni dependencies not available", allow_module_level=True)
......@@ -51,32 +46,40 @@ class TestEngineInputs:
class TestPrepareImageOutput:
def test_b64_json(self):
@pytest.mark.asyncio
async def test_b64_json(self):
"""b64_json format returns data URI with base64 prefix."""
handler = _make_handler()
img = MagicMock()
img.save = lambda b, format: b.write(b"fake_png_data")
results = prepare_image_output([img], "b64_json")
results = await handler._prepare_image_output([img], "req-1", "b64_json")
assert len(results) == 1
assert results[0].startswith("data:image/png;base64,")
def test_b64_default_when_none(self):
@pytest.mark.asyncio
async def test_b64_default_when_none(self):
"""None response_format defaults to base64 encoding."""
handler = _make_handler()
img = MagicMock()
img.save = lambda b, format: b.write(b"data")
results = prepare_image_output([img], None)
results = await handler._prepare_image_output([img], "req-1", None)
assert results[0].startswith("data:image/png;base64,")
def test_invalid_format(self):
@pytest.mark.asyncio
async def test_invalid_format(self):
"""Unsupported response_format raises ValueError."""
handler = _make_handler()
with pytest.raises(ValueError, match="Invalid response format"):
prepare_image_output([MagicMock()], "invalid")
await handler._prepare_image_output([MagicMock()], "req-1", "invalid")
def test_multiple_images(self):
@pytest.mark.asyncio
async def test_multiple_images(self):
"""Multiple input images produce one output entry each."""
handler = _make_handler()
imgs = [MagicMock() for _ in range(3)]
for img in imgs:
img.save = lambda b, format: b.write(b"px")
results = prepare_image_output(imgs, "b64_json")
results = await handler._prepare_image_output(imgs, "req-1", "b64_json")
assert len(results) == 3
......@@ -160,23 +163,25 @@ class TestFormatTextChunk:
class TestFormatImageChunk:
def test_chat_completion_format(self):
@pytest.mark.asyncio
async def test_chat_completion_format(self):
"""Chat completion route returns image_url content parts."""
handler = _make_handler()
img = MagicMock()
img.save = lambda b, format: b.write(b"px")
chunk = handler._format_image_chunk(
chunk = await handler._format_image_chunk(
[img], "req-1", request_type=RequestType.CHAT_COMPLETION
)
assert chunk["object"] == "chat.completion.chunk"
assert chunk["choices"][0]["delta"]["content"][0]["type"] == "image_url"
def test_image_generation_b64_format(self):
@pytest.mark.asyncio
async def test_image_generation_b64_format(self):
"""Image generation with b64_json format returns base64 data."""
handler = _make_handler()
img = MagicMock()
img.save = lambda b, format: b.write(b"px")
chunk = handler._format_image_chunk(
chunk = await handler._format_image_chunk(
[img],
"req-1",
response_format="b64_json",
......@@ -184,12 +189,13 @@ class TestFormatImageChunk:
)
assert chunk["data"][0]["b64_json"] is not None
def test_image_generation_default_format_returns_b64(self):
@pytest.mark.asyncio
async def test_image_generation_default_format_returns_b64(self):
"""Image generation with response_format=None defaults to b64_json."""
handler = _make_handler()
img = MagicMock()
img.save = lambda b, format: b.write(b"px")
chunk = handler._format_image_chunk(
chunk = await handler._format_image_chunk(
[img],
"req-1",
response_format=None,
......@@ -197,10 +203,11 @@ class TestFormatImageChunk:
)
assert chunk["data"][0]["b64_json"] is not None
def test_empty_images_returns_error(self):
@pytest.mark.asyncio
async def test_empty_images_returns_error(self):
"""Empty image list produces an error chunk."""
handler = _make_handler()
chunk = handler._format_image_chunk([], "req-1")
chunk = await handler._format_image_chunk([], "req-1")
assert "Error" in chunk["choices"][0]["delta"]["content"]
......
......@@ -358,7 +358,7 @@ class TestVllmRendererApi:
position. vllm_processor.py constructs EngineCoreOutput by keyword
and reads fields from EngineCoreRequest positionally.
"""
expected_request_fields = (
base_request_fields = (
"request_id",
"prompt_token_ids",
"mm_features",
......@@ -377,10 +377,14 @@ class TestVllmRendererApi:
"resumable",
"external_req_id",
)
# vllm-omni monkey-patches EngineCoreRequest with an extra field
# (only installed on amd64, not arm64)
omni_fields = base_request_fields + ("additional_information",)
actual_request_fields = EngineCoreRequest.__struct_fields__
assert actual_request_fields == expected_request_fields, (
assert actual_request_fields in (base_request_fields, omni_fields), (
"EngineCoreRequest fields changed!\n"
f"Expected: {expected_request_fields}\n"
f"Expected (base): {base_request_fields}\n"
f"Expected (omni): {omni_fields}\n"
f"Actual: {actual_request_fields}\n"
"Update request construction in components/src/dynamo/frontend/vllm_processor.py"
)
......
......@@ -45,6 +45,7 @@ vllm:
vllm_ref: v0.15.1
flashinf_ref: v0.6.1
lmcache_ref: 0.3.13
vllm_omni_ref: "0.14.0"
max_jobs: "10"
enable_media_ffmpeg: "true"
enable_gpu_memory_service: "true"
......
......@@ -6,8 +6,9 @@
# Installation order:
# 1. LMCache (installed first so vLLM's dependencies take precedence)
# 2. vLLM
# 3. DeepGEMM
# 4. EP kernels
# 3. vLLM-Omni
# 4. DeepGEMM
# 5. EP kernels
set -euo pipefail
......@@ -25,6 +26,7 @@ DEEPGEMM_REF=""
CUDA_VERSION="12.9"
FLASHINF_REF="v0.6.1"
LMCACHE_REF="0.3.13"
VLLM_OMNI_REF="0.14.0"
while [[ $# -gt 0 ]]; do
case $1 in
......@@ -56,6 +58,10 @@ while [[ $# -gt 0 ]]; do
LMCACHE_REF="$2"
shift 2
;;
--vllm-omni-ref)
VLLM_OMNI_REF="$2"
shift 2
;;
--torch-cuda-arch-list)
TORCH_CUDA_ARCH_LIST="$2"
shift 2
......@@ -65,7 +71,7 @@ while [[ $# -gt 0 ]]; do
shift 2
;;
-h|--help)
echo "Usage: $0 [--vllm-ref REF] [--max-jobs NUM] [--arch ARCH] [--deepgemm-ref REF] [--flashinf-ref REF] [--lmcache-ref REF] [--torch-cuda-arch-list LIST] [--cuda-version VERSION]"
echo "Usage: $0 [--vllm-ref REF] [--max-jobs NUM] [--arch ARCH] [--deepgemm-ref REF] [--flashinf-ref REF] [--lmcache-ref REF] [--vllm-omni-ref REF] [--torch-cuda-arch-list LIST] [--cuda-version VERSION]"
echo "Options:"
echo " --vllm-ref REF vLLM release version (default: ${VLLM_REF})"
echo " --max-jobs NUM Maximum parallel jobs (default: ${MAX_JOBS})"
......@@ -74,6 +80,7 @@ while [[ $# -gt 0 ]]; do
echo " --deepgemm-ref REF DeepGEMM git ref (default: ${DEEPGEMM_REF})"
echo " --flashinf-ref REF FlashInfer version (default: ${FLASHINF_REF})"
echo " --lmcache-ref REF LMCache version (default: ${LMCACHE_REF})"
echo " --vllm-omni-ref REF vLLM-Omni version (default: ${VLLM_OMNI_REF})"
echo " --torch-cuda-arch-list LIST CUDA architectures (default: ${TORCH_CUDA_ARCH_LIST})"
echo " --cuda-version VERSION CUDA version (default: ${CUDA_VERSION})"
exit 0
......@@ -160,6 +167,19 @@ else
fi
echo "✓ vLLM installation completed"
# --- vLLM-Omni -------------------------------------------------------------
# Installs the vllm-omni package on top of the vLLM installation above.
# Only runs when a version is given via VLLM_OMNI_REF and ARCH is amd64;
# every other case is skipped with a warning rather than failing the build.
#
# printf is used for the banner because bash's builtin `echo` does not expand
# "\n" unless invoked with -e, so `echo "\n=== ..."` prints a literal backslash-n.
printf '\n=== Installing vLLM-Omni ===\n'
if [ -n "$VLLM_OMNI_REF" ] && [ "$ARCH" = "amd64" ]; then
    # Save original vllm entrypoint before vllm-omni overwrites it.
    # `command -v` is a shell builtin, so this does not depend on the external
    # `which` binary being present in the build image.
    VLLM_BIN=$(command -v vllm)
    cp "$VLLM_BIN" /tmp/vllm-entrypoint-backup
    # Quote the requirement so the version string is always passed to uv as a
    # single argument, regardless of what --vllm-omni-ref was set to.
    uv pip install "vllm-omni==${VLLM_OMNI_REF}"
    # Restore original vllm CLI entrypoint (vllm-omni replaces it with its own)
    cp /tmp/vllm-entrypoint-backup "$VLLM_BIN"
    # The backup is no longer needed; remove it so it does not linger in the layer.
    rm -f /tmp/vllm-entrypoint-backup
    echo "✓ vLLM-Omni ${VLLM_OMNI_REF} installed (original vllm entrypoint preserved)"
else
    echo "⚠ Skipping vLLM-Omni (no ref provided or ARM64 not supported)"
fi
echo "\n=== Installing DeepGEMM ==="
cd $INSTALLATION_DIR/vllm/tools
if [ -n "$DEEPGEMM_REF" ]; then
......
......@@ -71,6 +71,7 @@ ARG MAX_JOBS={{ context.vllm.max_jobs }}
# FlashInfer only respected when building vLLM from source, ie when VLLM_REF does not start with 'v' or for arm64 builds
ARG FLASHINF_REF={{ context.vllm.flashinf_ref }}
ARG LMCACHE_REF={{ context.vllm.lmcache_ref }}
ARG VLLM_OMNI_REF={{ context.vllm.vllm_omni_ref }}
# If left blank, then we will fallback to vLLM defaults
ARG DEEPGEMM_REF=""
......
......@@ -68,6 +68,7 @@ ARG VLLM_GIT_URL
ARG DEEPGEMM_REF
ARG FLASHINF_REF
ARG LMCACHE_REF
ARG VLLM_OMNI_REF
ARG CUDA_VERSION
ARG MAX_JOBS
......@@ -88,6 +89,7 @@ RUN --mount=type=bind,source=./container/deps/,target=/tmp/deps \
${DEEPGEMM_REF:+--deepgemm-ref "$DEEPGEMM_REF"} \
${FLASHINF_REF:+--flashinf-ref "$FLASHINF_REF"} \
${LMCACHE_REF:+--lmcache-ref "$LMCACHE_REF"} \
${VLLM_OMNI_REF:+--vllm-omni-ref "$VLLM_OMNI_REF"} \
--cuda-version $CUDA_VERSION
ENV LD_LIBRARY_PATH=\
......
......@@ -140,6 +140,9 @@ COPY --chmod=775 --chown=dynamo:0 --from=framework ${SITE_PACKAGES}/nvidia ${SIT
COPY --chmod=775 --chown=dynamo:0 --from=framework ${SITE_PACKAGES}/flashinfer_jit_cache ${SITE_PACKAGES}/flashinfer_jit_cache
COPY --chmod=775 --chown=dynamo:0 --from=framework ${SITE_PACKAGES}/torch ${SITE_PACKAGES}/torch
COPY --chmod=775 --chown=dynamo:0 --from=framework ${SITE_PACKAGES}/vllm ${SITE_PACKAGES}/vllm
{% if platform == "amd64" -%}
COPY --chmod=775 --chown=dynamo:0 --from=framework ${SITE_PACKAGES}/vllm_omni ${SITE_PACKAGES}/vllm_omni
{% endif -%}
COPY --chmod=775 --chown=dynamo:0 --from=framework ${SITE_PACKAGES}/triton ${SITE_PACKAGES}/triton
COPY --chmod=775 --chown=dynamo:0 --from=framework ${SITE_PACKAGES}/flashinfer_cubin ${SITE_PACKAGES}/flashinfer_cubin
# Remaining packages and venv structure (bin/, include/, share/, etc.)
......@@ -148,6 +151,9 @@ COPY --chmod=775 --chown=dynamo:0 --from=framework \
--exclude=lib/python*/site-packages/flashinfer_jit_cache \
--exclude=lib/python*/site-packages/torch \
--exclude=lib/python*/site-packages/vllm \
{%- if platform == "amd64" %}
--exclude=lib/python*/site-packages/vllm_omni \
{%- endif %}
--exclude=lib/python*/site-packages/triton \
--exclude=lib/python*/site-packages/flashinfer_cubin \
${VIRTUAL_ENV} ${VIRTUAL_ENV}
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment