Unverified commit e94f4647, authored by Ayush Agarwal and committed by GitHub
Browse files

chore: install vllm-omni in vllm container (#6458)


Signed-off-by: ayushag <ayushag@nvidia.com>
parent efa89448
...@@ -5,16 +5,11 @@ from unittest.mock import MagicMock, patch ...@@ -5,16 +5,11 @@ from unittest.mock import MagicMock, patch
import pytest import pytest
from dynamo.common.protocols.image_protocol import NvCreateImageRequest
from dynamo.common.protocols.video_protocol import NvCreateVideoRequest
from dynamo.common.utils.output_modalities import RequestType
try: try:
from dynamo.vllm.omni.omni_handler import ( from dynamo.common.protocols.image_protocol import NvCreateImageRequest
EngineInputs, from dynamo.common.protocols.video_protocol import NvCreateVideoRequest
OmniHandler, from dynamo.common.utils.output_modalities import RequestType
prepare_image_output, from dynamo.vllm.omni.omni_handler import EngineInputs, OmniHandler
)
except ImportError: except ImportError:
pytest.skip("vLLM omni dependencies not available", allow_module_level=True) pytest.skip("vLLM omni dependencies not available", allow_module_level=True)
...@@ -51,32 +46,40 @@ class TestEngineInputs: ...@@ -51,32 +46,40 @@ class TestEngineInputs:
class TestPrepareImageOutput: class TestPrepareImageOutput:
def test_b64_json(self): @pytest.mark.asyncio
async def test_b64_json(self):
"""b64_json format returns data URI with base64 prefix.""" """b64_json format returns data URI with base64 prefix."""
handler = _make_handler()
img = MagicMock() img = MagicMock()
img.save = lambda b, format: b.write(b"fake_png_data") img.save = lambda b, format: b.write(b"fake_png_data")
results = prepare_image_output([img], "b64_json") results = await handler._prepare_image_output([img], "req-1", "b64_json")
assert len(results) == 1 assert len(results) == 1
assert results[0].startswith("data:image/png;base64,") assert results[0].startswith("data:image/png;base64,")
def test_b64_default_when_none(self): @pytest.mark.asyncio
async def test_b64_default_when_none(self):
"""None response_format defaults to base64 encoding.""" """None response_format defaults to base64 encoding."""
handler = _make_handler()
img = MagicMock() img = MagicMock()
img.save = lambda b, format: b.write(b"data") img.save = lambda b, format: b.write(b"data")
results = prepare_image_output([img], None) results = await handler._prepare_image_output([img], "req-1", None)
assert results[0].startswith("data:image/png;base64,") assert results[0].startswith("data:image/png;base64,")
def test_invalid_format(self): @pytest.mark.asyncio
async def test_invalid_format(self):
"""Unsupported response_format raises ValueError.""" """Unsupported response_format raises ValueError."""
handler = _make_handler()
with pytest.raises(ValueError, match="Invalid response format"): with pytest.raises(ValueError, match="Invalid response format"):
prepare_image_output([MagicMock()], "invalid") await handler._prepare_image_output([MagicMock()], "req-1", "invalid")
def test_multiple_images(self): @pytest.mark.asyncio
async def test_multiple_images(self):
"""Multiple input images produce one output entry each.""" """Multiple input images produce one output entry each."""
handler = _make_handler()
imgs = [MagicMock() for _ in range(3)] imgs = [MagicMock() for _ in range(3)]
for img in imgs: for img in imgs:
img.save = lambda b, format: b.write(b"px") img.save = lambda b, format: b.write(b"px")
results = prepare_image_output(imgs, "b64_json") results = await handler._prepare_image_output(imgs, "req-1", "b64_json")
assert len(results) == 3 assert len(results) == 3
...@@ -160,23 +163,25 @@ class TestFormatTextChunk: ...@@ -160,23 +163,25 @@ class TestFormatTextChunk:
class TestFormatImageChunk: class TestFormatImageChunk:
def test_chat_completion_format(self): @pytest.mark.asyncio
async def test_chat_completion_format(self):
"""Chat completion route returns image_url content parts.""" """Chat completion route returns image_url content parts."""
handler = _make_handler() handler = _make_handler()
img = MagicMock() img = MagicMock()
img.save = lambda b, format: b.write(b"px") img.save = lambda b, format: b.write(b"px")
chunk = handler._format_image_chunk( chunk = await handler._format_image_chunk(
[img], "req-1", request_type=RequestType.CHAT_COMPLETION [img], "req-1", request_type=RequestType.CHAT_COMPLETION
) )
assert chunk["object"] == "chat.completion.chunk" assert chunk["object"] == "chat.completion.chunk"
assert chunk["choices"][0]["delta"]["content"][0]["type"] == "image_url" assert chunk["choices"][0]["delta"]["content"][0]["type"] == "image_url"
def test_image_generation_b64_format(self): @pytest.mark.asyncio
async def test_image_generation_b64_format(self):
"""Image generation with b64_json format returns base64 data.""" """Image generation with b64_json format returns base64 data."""
handler = _make_handler() handler = _make_handler()
img = MagicMock() img = MagicMock()
img.save = lambda b, format: b.write(b"px") img.save = lambda b, format: b.write(b"px")
chunk = handler._format_image_chunk( chunk = await handler._format_image_chunk(
[img], [img],
"req-1", "req-1",
response_format="b64_json", response_format="b64_json",
...@@ -184,12 +189,13 @@ class TestFormatImageChunk: ...@@ -184,12 +189,13 @@ class TestFormatImageChunk:
) )
assert chunk["data"][0]["b64_json"] is not None assert chunk["data"][0]["b64_json"] is not None
def test_image_generation_default_format_returns_b64(self): @pytest.mark.asyncio
async def test_image_generation_default_format_returns_b64(self):
"""Image generation with response_format=None defaults to b64_json.""" """Image generation with response_format=None defaults to b64_json."""
handler = _make_handler() handler = _make_handler()
img = MagicMock() img = MagicMock()
img.save = lambda b, format: b.write(b"px") img.save = lambda b, format: b.write(b"px")
chunk = handler._format_image_chunk( chunk = await handler._format_image_chunk(
[img], [img],
"req-1", "req-1",
response_format=None, response_format=None,
...@@ -197,10 +203,11 @@ class TestFormatImageChunk: ...@@ -197,10 +203,11 @@ class TestFormatImageChunk:
) )
assert chunk["data"][0]["b64_json"] is not None assert chunk["data"][0]["b64_json"] is not None
def test_empty_images_returns_error(self): @pytest.mark.asyncio
async def test_empty_images_returns_error(self):
"""Empty image list produces an error chunk.""" """Empty image list produces an error chunk."""
handler = _make_handler() handler = _make_handler()
chunk = handler._format_image_chunk([], "req-1") chunk = await handler._format_image_chunk([], "req-1")
assert "Error" in chunk["choices"][0]["delta"]["content"] assert "Error" in chunk["choices"][0]["delta"]["content"]
......
...@@ -358,7 +358,7 @@ class TestVllmRendererApi: ...@@ -358,7 +358,7 @@ class TestVllmRendererApi:
position. vllm_processor.py constructs EngineCoreOutput by keyword position. vllm_processor.py constructs EngineCoreOutput by keyword
and reads fields from EngineCoreRequest positionally. and reads fields from EngineCoreRequest positionally.
""" """
expected_request_fields = ( base_request_fields = (
"request_id", "request_id",
"prompt_token_ids", "prompt_token_ids",
"mm_features", "mm_features",
...@@ -377,11 +377,15 @@ class TestVllmRendererApi: ...@@ -377,11 +377,15 @@ class TestVllmRendererApi:
"resumable", "resumable",
"external_req_id", "external_req_id",
) )
# vllm-omni monkey-patches EngineCoreRequest with an extra field
# (only installed on amd64, not arm64)
omni_fields = base_request_fields + ("additional_information",)
actual_request_fields = EngineCoreRequest.__struct_fields__ actual_request_fields = EngineCoreRequest.__struct_fields__
assert actual_request_fields == expected_request_fields, ( assert actual_request_fields in (base_request_fields, omni_fields), (
"EngineCoreRequest fields changed!\n" "EngineCoreRequest fields changed!\n"
f"Expected: {expected_request_fields}\n" f"Expected (base): {base_request_fields}\n"
f"Actual: {actual_request_fields}\n" f"Expected (omni): {omni_fields}\n"
f"Actual: {actual_request_fields}\n"
"Update request construction in components/src/dynamo/frontend/vllm_processor.py" "Update request construction in components/src/dynamo/frontend/vllm_processor.py"
) )
......
...@@ -45,6 +45,7 @@ vllm: ...@@ -45,6 +45,7 @@ vllm:
vllm_ref: v0.15.1 vllm_ref: v0.15.1
flashinf_ref: v0.6.1 flashinf_ref: v0.6.1
lmcache_ref: 0.3.13 lmcache_ref: 0.3.13
vllm_omni_ref: "0.14.0"
max_jobs: "10" max_jobs: "10"
enable_media_ffmpeg: "true" enable_media_ffmpeg: "true"
enable_gpu_memory_service: "true" enable_gpu_memory_service: "true"
......
...@@ -6,8 +6,9 @@ ...@@ -6,8 +6,9 @@
# Installation order: # Installation order:
# 1. LMCache (installed first so vLLM's dependencies take precedence) # 1. LMCache (installed first so vLLM's dependencies take precedence)
# 2. vLLM # 2. vLLM
# 3. DeepGEMM # 3. vLLM-Omni
# 4. EP kernels # 4. DeepGEMM
# 5. EP kernels
set -euo pipefail set -euo pipefail
...@@ -25,6 +26,7 @@ DEEPGEMM_REF="" ...@@ -25,6 +26,7 @@ DEEPGEMM_REF=""
CUDA_VERSION="12.9" CUDA_VERSION="12.9"
FLASHINF_REF="v0.6.1" FLASHINF_REF="v0.6.1"
LMCACHE_REF="0.3.13" LMCACHE_REF="0.3.13"
VLLM_OMNI_REF="0.14.0"
while [[ $# -gt 0 ]]; do while [[ $# -gt 0 ]]; do
case $1 in case $1 in
...@@ -56,6 +58,10 @@ while [[ $# -gt 0 ]]; do ...@@ -56,6 +58,10 @@ while [[ $# -gt 0 ]]; do
LMCACHE_REF="$2" LMCACHE_REF="$2"
shift 2 shift 2
;; ;;
--vllm-omni-ref)
VLLM_OMNI_REF="$2"
shift 2
;;
--torch-cuda-arch-list) --torch-cuda-arch-list)
TORCH_CUDA_ARCH_LIST="$2" TORCH_CUDA_ARCH_LIST="$2"
shift 2 shift 2
...@@ -65,7 +71,7 @@ while [[ $# -gt 0 ]]; do ...@@ -65,7 +71,7 @@ while [[ $# -gt 0 ]]; do
shift 2 shift 2
;; ;;
-h|--help) -h|--help)
echo "Usage: $0 [--vllm-ref REF] [--max-jobs NUM] [--arch ARCH] [--deepgemm-ref REF] [--flashinf-ref REF] [--lmcache-ref REF] [--torch-cuda-arch-list LIST] [--cuda-version VERSION]" echo "Usage: $0 [--vllm-ref REF] [--max-jobs NUM] [--arch ARCH] [--deepgemm-ref REF] [--flashinf-ref REF] [--lmcache-ref REF] [--vllm-omni-ref REF] [--torch-cuda-arch-list LIST] [--cuda-version VERSION]"
echo "Options:" echo "Options:"
echo " --vllm-ref REF vLLM release version (default: ${VLLM_REF})" echo " --vllm-ref REF vLLM release version (default: ${VLLM_REF})"
echo " --max-jobs NUM Maximum parallel jobs (default: ${MAX_JOBS})" echo " --max-jobs NUM Maximum parallel jobs (default: ${MAX_JOBS})"
...@@ -74,6 +80,7 @@ while [[ $# -gt 0 ]]; do ...@@ -74,6 +80,7 @@ while [[ $# -gt 0 ]]; do
echo " --deepgemm-ref REF DeepGEMM git ref (default: ${DEEPGEMM_REF})" echo " --deepgemm-ref REF DeepGEMM git ref (default: ${DEEPGEMM_REF})"
echo " --flashinf-ref REF FlashInfer version (default: ${FLASHINF_REF})" echo " --flashinf-ref REF FlashInfer version (default: ${FLASHINF_REF})"
echo " --lmcache-ref REF LMCache version (default: ${LMCACHE_REF})" echo " --lmcache-ref REF LMCache version (default: ${LMCACHE_REF})"
echo " --vllm-omni-ref REF vLLM-Omni version (default: ${VLLM_OMNI_REF})"
echo " --torch-cuda-arch-list LIST CUDA architectures (default: ${TORCH_CUDA_ARCH_LIST})" echo " --torch-cuda-arch-list LIST CUDA architectures (default: ${TORCH_CUDA_ARCH_LIST})"
echo " --cuda-version VERSION CUDA version (default: ${CUDA_VERSION})" echo " --cuda-version VERSION CUDA version (default: ${CUDA_VERSION})"
exit 0 exit 0
...@@ -160,6 +167,19 @@ else ...@@ -160,6 +167,19 @@ else
fi fi
echo "✓ vLLM installation completed" echo "✓ vLLM installation completed"
echo "\n=== Installing vLLM-Omni ==="
if [ -n "$VLLM_OMNI_REF" ] && [ "$ARCH" = "amd64" ]; then
# Save original vllm entrypoint before vllm-omni overwrites it
VLLM_BIN=$(which vllm)
cp "$VLLM_BIN" /tmp/vllm-entrypoint-backup
uv pip install vllm-omni==${VLLM_OMNI_REF}
# Restore original vllm CLI entrypoint (vllm-omni replaces it with its own)
cp /tmp/vllm-entrypoint-backup "$VLLM_BIN"
echo "✓ vLLM-Omni ${VLLM_OMNI_REF} installed (original vllm entrypoint preserved)"
else
echo "⚠ Skipping vLLM-Omni (no ref provided or ARM64 not supported)"
fi
echo "\n=== Installing DeepGEMM ===" echo "\n=== Installing DeepGEMM ==="
cd $INSTALLATION_DIR/vllm/tools cd $INSTALLATION_DIR/vllm/tools
if [ -n "$DEEPGEMM_REF" ]; then if [ -n "$DEEPGEMM_REF" ]; then
......
...@@ -71,6 +71,7 @@ ARG MAX_JOBS={{ context.vllm.max_jobs }} ...@@ -71,6 +71,7 @@ ARG MAX_JOBS={{ context.vllm.max_jobs }}
# FlashInfer only respected when building vLLM from source, ie when VLLM_REF does not start with 'v' or for arm64 builds # FlashInfer only respected when building vLLM from source, ie when VLLM_REF does not start with 'v' or for arm64 builds
ARG FLASHINF_REF={{ context.vllm.flashinf_ref }} ARG FLASHINF_REF={{ context.vllm.flashinf_ref }}
ARG LMCACHE_REF={{ context.vllm.lmcache_ref }} ARG LMCACHE_REF={{ context.vllm.lmcache_ref }}
ARG VLLM_OMNI_REF={{ context.vllm.vllm_omni_ref }}
# If left blank, then we will fallback to vLLM defaults # If left blank, then we will fallback to vLLM defaults
ARG DEEPGEMM_REF="" ARG DEEPGEMM_REF=""
......
...@@ -68,6 +68,7 @@ ARG VLLM_GIT_URL ...@@ -68,6 +68,7 @@ ARG VLLM_GIT_URL
ARG DEEPGEMM_REF ARG DEEPGEMM_REF
ARG FLASHINF_REF ARG FLASHINF_REF
ARG LMCACHE_REF ARG LMCACHE_REF
ARG VLLM_OMNI_REF
ARG CUDA_VERSION ARG CUDA_VERSION
ARG MAX_JOBS ARG MAX_JOBS
...@@ -88,6 +89,7 @@ RUN --mount=type=bind,source=./container/deps/,target=/tmp/deps \ ...@@ -88,6 +89,7 @@ RUN --mount=type=bind,source=./container/deps/,target=/tmp/deps \
${DEEPGEMM_REF:+--deepgemm-ref "$DEEPGEMM_REF"} \ ${DEEPGEMM_REF:+--deepgemm-ref "$DEEPGEMM_REF"} \
${FLASHINF_REF:+--flashinf-ref "$FLASHINF_REF"} \ ${FLASHINF_REF:+--flashinf-ref "$FLASHINF_REF"} \
${LMCACHE_REF:+--lmcache-ref "$LMCACHE_REF"} \ ${LMCACHE_REF:+--lmcache-ref "$LMCACHE_REF"} \
${VLLM_OMNI_REF:+--vllm-omni-ref "$VLLM_OMNI_REF"} \
--cuda-version $CUDA_VERSION --cuda-version $CUDA_VERSION
ENV LD_LIBRARY_PATH=\ ENV LD_LIBRARY_PATH=\
......
...@@ -140,6 +140,9 @@ COPY --chmod=775 --chown=dynamo:0 --from=framework ${SITE_PACKAGES}/nvidia ${SIT ...@@ -140,6 +140,9 @@ COPY --chmod=775 --chown=dynamo:0 --from=framework ${SITE_PACKAGES}/nvidia ${SIT
COPY --chmod=775 --chown=dynamo:0 --from=framework ${SITE_PACKAGES}/flashinfer_jit_cache ${SITE_PACKAGES}/flashinfer_jit_cache COPY --chmod=775 --chown=dynamo:0 --from=framework ${SITE_PACKAGES}/flashinfer_jit_cache ${SITE_PACKAGES}/flashinfer_jit_cache
COPY --chmod=775 --chown=dynamo:0 --from=framework ${SITE_PACKAGES}/torch ${SITE_PACKAGES}/torch COPY --chmod=775 --chown=dynamo:0 --from=framework ${SITE_PACKAGES}/torch ${SITE_PACKAGES}/torch
COPY --chmod=775 --chown=dynamo:0 --from=framework ${SITE_PACKAGES}/vllm ${SITE_PACKAGES}/vllm COPY --chmod=775 --chown=dynamo:0 --from=framework ${SITE_PACKAGES}/vllm ${SITE_PACKAGES}/vllm
{% if platform == "amd64" -%}
COPY --chmod=775 --chown=dynamo:0 --from=framework ${SITE_PACKAGES}/vllm_omni ${SITE_PACKAGES}/vllm_omni
{% endif -%}
COPY --chmod=775 --chown=dynamo:0 --from=framework ${SITE_PACKAGES}/triton ${SITE_PACKAGES}/triton COPY --chmod=775 --chown=dynamo:0 --from=framework ${SITE_PACKAGES}/triton ${SITE_PACKAGES}/triton
COPY --chmod=775 --chown=dynamo:0 --from=framework ${SITE_PACKAGES}/flashinfer_cubin ${SITE_PACKAGES}/flashinfer_cubin COPY --chmod=775 --chown=dynamo:0 --from=framework ${SITE_PACKAGES}/flashinfer_cubin ${SITE_PACKAGES}/flashinfer_cubin
# Remaining packages and venv structure (bin/, include/, share/, etc.) # Remaining packages and venv structure (bin/, include/, share/, etc.)
...@@ -148,6 +151,9 @@ COPY --chmod=775 --chown=dynamo:0 --from=framework \ ...@@ -148,6 +151,9 @@ COPY --chmod=775 --chown=dynamo:0 --from=framework \
--exclude=lib/python*/site-packages/flashinfer_jit_cache \ --exclude=lib/python*/site-packages/flashinfer_jit_cache \
--exclude=lib/python*/site-packages/torch \ --exclude=lib/python*/site-packages/torch \
--exclude=lib/python*/site-packages/vllm \ --exclude=lib/python*/site-packages/vllm \
{%- if platform == "amd64" %}
--exclude=lib/python*/site-packages/vllm_omni \
{%- endif %}
--exclude=lib/python*/site-packages/triton \ --exclude=lib/python*/site-packages/triton \
--exclude=lib/python*/site-packages/flashinfer_cubin \ --exclude=lib/python*/site-packages/flashinfer_cubin \
${VIRTUAL_ENV} ${VIRTUAL_ENV} ${VIRTUAL_ENV} ${VIRTUAL_ENV}
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment