Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
dynamo
Commits
e94f4647
Unverified
Commit
e94f4647
authored
Feb 24, 2026
by
Ayush Agarwal
Committed by
GitHub
Feb 24, 2026
Browse files
chore: install vllm-omni in vllm container (#6458)
Signed-off-by:
ayushag
<
ayushag@nvidia.com
>
parent
efa89448
Changes
7
Show whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
73 additions
and
32 deletions
+73
-32
components/src/dynamo/vllm/tests/test_vllm_omni_handler.py
components/src/dynamo/vllm/tests/test_vllm_omni_handler.py
+32
-25
components/src/dynamo/vllm/tests/test_vllm_renderer_api.py
components/src/dynamo/vllm/tests/test_vllm_renderer_api.py
+8
-4
container/context.yaml
container/context.yaml
+1
-0
container/deps/vllm/install_vllm.sh
container/deps/vllm/install_vllm.sh
+23
-3
container/templates/args.Dockerfile
container/templates/args.Dockerfile
+1
-0
container/templates/vllm_framework.Dockerfile
container/templates/vllm_framework.Dockerfile
+2
-0
container/templates/vllm_runtime.Dockerfile
container/templates/vllm_runtime.Dockerfile
+6
-0
No files found.
components/src/dynamo/vllm/tests/test_vllm_omni_handler.py
View file @
e94f4647
...
...
@@ -5,16 +5,11 @@ from unittest.mock import MagicMock, patch
import
pytest
from
dynamo.common.protocols.image_protocol
import
NvCreateImageRequest
from
dynamo.common.protocols.video_protocol
import
NvCreateVideoRequest
from
dynamo.common.utils.output_modalities
import
RequestType
try
:
from
dynamo.vllm.omni.omni_handler
import
(
EngineInputs
,
OmniHandler
,
prepare_image_output
,
)
from
dynamo.common.protocols.image_protocol
import
NvCreateImageRequest
from
dynamo.common.protocols.video_protocol
import
NvCreateVideoRequest
from
dynamo.common.utils.output_modalities
import
RequestType
from
dynamo.vllm.omni.omni_handler
import
EngineInputs
,
OmniHandler
except
ImportError
:
pytest
.
skip
(
"vLLM omni dependencies not available"
,
allow_module_level
=
True
)
...
...
@@ -51,32 +46,40 @@ class TestEngineInputs:
class
TestPrepareImageOutput
:
def
test_b64_json
(
self
):
@
pytest
.
mark
.
asyncio
async
def
test_b64_json
(
self
):
"""b64_json format returns data URI with base64 prefix."""
handler
=
_make_handler
()
img
=
MagicMock
()
img
.
save
=
lambda
b
,
format
:
b
.
write
(
b
"fake_png_data"
)
results
=
prepare_image_output
([
img
],
"b64_json"
)
results
=
await
handler
.
_
prepare_image_output
([
img
],
"req-1"
,
"b64_json"
)
assert
len
(
results
)
==
1
assert
results
[
0
].
startswith
(
"data:image/png;base64,"
)
def
test_b64_default_when_none
(
self
):
@
pytest
.
mark
.
asyncio
async
def
test_b64_default_when_none
(
self
):
"""None response_format defaults to base64 encoding."""
handler
=
_make_handler
()
img
=
MagicMock
()
img
.
save
=
lambda
b
,
format
:
b
.
write
(
b
"data"
)
results
=
prepare_image_output
([
img
],
None
)
results
=
await
handler
.
_
prepare_image_output
([
img
],
"req-1"
,
None
)
assert
results
[
0
].
startswith
(
"data:image/png;base64,"
)
def
test_invalid_format
(
self
):
@
pytest
.
mark
.
asyncio
async
def
test_invalid_format
(
self
):
"""Unsupported response_format raises ValueError."""
handler
=
_make_handler
()
with
pytest
.
raises
(
ValueError
,
match
=
"Invalid response format"
):
prepare_image_output
([
MagicMock
()],
"invalid"
)
await
handler
.
_
prepare_image_output
([
MagicMock
()],
"req-1"
,
"invalid"
)
def
test_multiple_images
(
self
):
@
pytest
.
mark
.
asyncio
async
def
test_multiple_images
(
self
):
"""Multiple input images produce one output entry each."""
handler
=
_make_handler
()
imgs
=
[
MagicMock
()
for
_
in
range
(
3
)]
for
img
in
imgs
:
img
.
save
=
lambda
b
,
format
:
b
.
write
(
b
"px"
)
results
=
prepare_image_output
(
imgs
,
"b64_json"
)
results
=
await
handler
.
_
prepare_image_output
(
imgs
,
"req-1"
,
"b64_json"
)
assert
len
(
results
)
==
3
...
...
@@ -160,23 +163,25 @@ class TestFormatTextChunk:
class
TestFormatImageChunk
:
def
test_chat_completion_format
(
self
):
@
pytest
.
mark
.
asyncio
async
def
test_chat_completion_format
(
self
):
"""Chat completion route returns image_url content parts."""
handler
=
_make_handler
()
img
=
MagicMock
()
img
.
save
=
lambda
b
,
format
:
b
.
write
(
b
"px"
)
chunk
=
handler
.
_format_image_chunk
(
chunk
=
await
handler
.
_format_image_chunk
(
[
img
],
"req-1"
,
request_type
=
RequestType
.
CHAT_COMPLETION
)
assert
chunk
[
"object"
]
==
"chat.completion.chunk"
assert
chunk
[
"choices"
][
0
][
"delta"
][
"content"
][
0
][
"type"
]
==
"image_url"
def
test_image_generation_b64_format
(
self
):
@
pytest
.
mark
.
asyncio
async
def
test_image_generation_b64_format
(
self
):
"""Image generation with b64_json format returns base64 data."""
handler
=
_make_handler
()
img
=
MagicMock
()
img
.
save
=
lambda
b
,
format
:
b
.
write
(
b
"px"
)
chunk
=
handler
.
_format_image_chunk
(
chunk
=
await
handler
.
_format_image_chunk
(
[
img
],
"req-1"
,
response_format
=
"b64_json"
,
...
...
@@ -184,12 +189,13 @@ class TestFormatImageChunk:
)
assert
chunk
[
"data"
][
0
][
"b64_json"
]
is
not
None
def
test_image_generation_default_format_returns_b64
(
self
):
@
pytest
.
mark
.
asyncio
async
def
test_image_generation_default_format_returns_b64
(
self
):
"""Image generation with response_format=None defaults to b64_json."""
handler
=
_make_handler
()
img
=
MagicMock
()
img
.
save
=
lambda
b
,
format
:
b
.
write
(
b
"px"
)
chunk
=
handler
.
_format_image_chunk
(
chunk
=
await
handler
.
_format_image_chunk
(
[
img
],
"req-1"
,
response_format
=
None
,
...
...
@@ -197,10 +203,11 @@ class TestFormatImageChunk:
)
assert
chunk
[
"data"
][
0
][
"b64_json"
]
is
not
None
def
test_empty_images_returns_error
(
self
):
@
pytest
.
mark
.
asyncio
async
def
test_empty_images_returns_error
(
self
):
"""Empty image list produces an error chunk."""
handler
=
_make_handler
()
chunk
=
handler
.
_format_image_chunk
([],
"req-1"
)
chunk
=
await
handler
.
_format_image_chunk
([],
"req-1"
)
assert
"Error"
in
chunk
[
"choices"
][
0
][
"delta"
][
"content"
]
...
...
components/src/dynamo/vllm/tests/test_vllm_renderer_api.py
View file @
e94f4647
...
...
@@ -358,7 +358,7 @@ class TestVllmRendererApi:
position. vllm_processor.py constructs EngineCoreOutput by keyword
and reads fields from EngineCoreRequest positionally.
"""
expected
_request_fields
=
(
base
_request_fields
=
(
"request_id"
,
"prompt_token_ids"
,
"mm_features"
,
...
...
@@ -377,10 +377,14 @@ class TestVllmRendererApi:
"resumable"
,
"external_req_id"
,
)
# vllm-omni monkey-patches EngineCoreRequest with an extra field
# (only installed on amd64, not arm64)
omni_fields
=
base_request_fields
+
(
"additional_information"
,)
actual_request_fields
=
EngineCoreRequest
.
__struct_fields__
assert
actual_request_fields
==
expected
_request_fields
,
(
assert
actual_request_fields
in
(
base
_request_fields
,
omni_fields
),
(
"EngineCoreRequest fields changed!
\n
"
f
"Expected:
{
expected_request_fields
}
\n
"
f
"Expected (base):
{
base_request_fields
}
\n
"
f
"Expected (omni):
{
omni_fields
}
\n
"
f
"Actual:
{
actual_request_fields
}
\n
"
"Update request construction in components/src/dynamo/frontend/vllm_processor.py"
)
...
...
container/context.yaml
View file @
e94f4647
...
...
@@ -45,6 +45,7 @@ vllm:
vllm_ref
:
v0.15.1
flashinf_ref
:
v0.6.1
lmcache_ref
:
0.3.13
vllm_omni_ref
:
"
0.14.0"
max_jobs
:
"
10"
enable_media_ffmpeg
:
"
true"
enable_gpu_memory_service
:
"
true"
...
...
container/deps/vllm/install_vllm.sh
View file @
e94f4647
...
...
@@ -6,8 +6,9 @@
# Installation order:
# 1. LMCache (installed first so vLLM's dependencies take precedence)
# 2. vLLM
# 3. DeepGEMM
# 4. EP kernels
# 3. vLLM-Omni
# 4. DeepGEMM
# 5. EP kernels
set
-euo
pipefail
...
...
@@ -25,6 +26,7 @@ DEEPGEMM_REF=""
CUDA_VERSION
=
"12.9"
FLASHINF_REF
=
"v0.6.1"
LMCACHE_REF
=
"0.3.13"
VLLM_OMNI_REF
=
"0.14.0"
while
[[
$#
-gt
0
]]
;
do
case
$1
in
...
...
@@ -56,6 +58,10 @@ while [[ $# -gt 0 ]]; do
LMCACHE_REF
=
"
$2
"
shift
2
;;
--vllm-omni-ref
)
VLLM_OMNI_REF
=
"
$2
"
shift
2
;;
--torch-cuda-arch-list
)
TORCH_CUDA_ARCH_LIST
=
"
$2
"
shift
2
...
...
@@ -65,7 +71,7 @@ while [[ $# -gt 0 ]]; do
shift
2
;;
-h
|
--help
)
echo
"Usage:
$0
[--vllm-ref REF] [--max-jobs NUM] [--arch ARCH] [--deepgemm-ref REF] [--flashinf-ref REF] [--lmcache-ref REF] [--torch-cuda-arch-list LIST] [--cuda-version VERSION]"
echo
"Usage:
$0
[--vllm-ref REF] [--max-jobs NUM] [--arch ARCH] [--deepgemm-ref REF] [--flashinf-ref REF] [--lmcache-ref REF]
[--vllm-omni-ref REF]
[--torch-cuda-arch-list LIST] [--cuda-version VERSION]"
echo
"Options:"
echo
" --vllm-ref REF vLLM release version (default:
${
VLLM_REF
}
)"
echo
" --max-jobs NUM Maximum parallel jobs (default:
${
MAX_JOBS
}
)"
...
...
@@ -74,6 +80,7 @@ while [[ $# -gt 0 ]]; do
echo
" --deepgemm-ref REF DeepGEMM git ref (default:
${
DEEPGEMM_REF
}
)"
echo
" --flashinf-ref REF FlashInfer version (default:
${
FLASHINF_REF
}
)"
echo
" --lmcache-ref REF LMCache version (default:
${
LMCACHE_REF
}
)"
echo
" --vllm-omni-ref REF vLLM-Omni version (default:
${
VLLM_OMNI_REF
}
)"
echo
" --torch-cuda-arch-list LIST CUDA architectures (default:
${
TORCH_CUDA_ARCH_LIST
}
)"
echo
" --cuda-version VERSION CUDA version (default:
${
CUDA_VERSION
}
)"
exit
0
...
...
@@ -160,6 +167,19 @@ else
fi
echo
"✓ vLLM installation completed"
echo
"
\n
=== Installing vLLM-Omni ==="
if
[
-n
"
$VLLM_OMNI_REF
"
]
&&
[
"
$ARCH
"
=
"amd64"
]
;
then
# Save original vllm entrypoint before vllm-omni overwrites it
VLLM_BIN
=
$(
which vllm
)
cp
"
$VLLM_BIN
"
/tmp/vllm-entrypoint-backup
uv pip
install
vllm-omni
==
${
VLLM_OMNI_REF
}
# Restore original vllm CLI entrypoint (vllm-omni replaces it with its own)
cp
/tmp/vllm-entrypoint-backup
"
$VLLM_BIN
"
echo
"✓ vLLM-Omni
${
VLLM_OMNI_REF
}
installed (original vllm entrypoint preserved)"
else
echo
"⚠ Skipping vLLM-Omni (no ref provided or ARM64 not supported)"
fi
echo
"
\n
=== Installing DeepGEMM ==="
cd
$INSTALLATION_DIR
/vllm/tools
if
[
-n
"
$DEEPGEMM_REF
"
]
;
then
...
...
container/templates/args.Dockerfile
View file @
e94f4647
...
...
@@ -71,6 +71,7 @@ ARG MAX_JOBS={{ context.vllm.max_jobs }}
# FlashInfer is only respected when building vLLM from source, i.e., when VLLM_REF does not start with 'v' or for arm64 builds
ARG
FLASHINF_REF={{ context.vllm.flashinf_ref }}
ARG
LMCACHE_REF={{ context.vllm.lmcache_ref }}
ARG
VLLM_OMNI_REF={{ context.vllm.vllm_omni_ref }}
# If left blank, then we will fallback to vLLM defaults
ARG
DEEPGEMM_REF=""
...
...
container/templates/vllm_framework.Dockerfile
View file @
e94f4647
...
...
@@ -68,6 +68,7 @@ ARG VLLM_GIT_URL
ARG
DEEPGEMM_REF
ARG
FLASHINF_REF
ARG
LMCACHE_REF
ARG
VLLM_OMNI_REF
ARG
CUDA_VERSION
ARG
MAX_JOBS
...
...
@@ -88,6 +89,7 @@ RUN --mount=type=bind,source=./container/deps/,target=/tmp/deps \
${
DEEPGEMM_REF
:+--deepgemm-ref
"
$DEEPGEMM_REF
"
}
\
${
FLASHINF_REF
:+--flashinf-ref
"
$FLASHINF_REF
"
}
\
${
LMCACHE_REF
:+--lmcache-ref
"
$LMCACHE_REF
"
}
\
${
VLLM_OMNI_REF
:+--vllm-omni-ref
"
$VLLM_OMNI_REF
"
}
\
--cuda-version
$CUDA_VERSION
ENV
LD_LIBRARY_PATH=\
...
...
container/templates/vllm_runtime.Dockerfile
View file @
e94f4647
...
...
@@ -140,6 +140,9 @@ COPY --chmod=775 --chown=dynamo:0 --from=framework ${SITE_PACKAGES}/nvidia ${SIT
COPY
--chmod=775 --chown=dynamo:0 --from=framework ${SITE_PACKAGES}/flashinfer_jit_cache ${SITE_PACKAGES}/flashinfer_jit_cache
COPY
--chmod=775 --chown=dynamo:0 --from=framework ${SITE_PACKAGES}/torch ${SITE_PACKAGES}/torch
COPY
--chmod=775 --chown=dynamo:0 --from=framework ${SITE_PACKAGES}/vllm ${SITE_PACKAGES}/vllm
{% if platform == "amd64" -%}
COPY
--chmod=775 --chown=dynamo:0 --from=framework ${SITE_PACKAGES}/vllm_omni ${SITE_PACKAGES}/vllm_omni
{% endif -%}
COPY
--chmod=775 --chown=dynamo:0 --from=framework ${SITE_PACKAGES}/triton ${SITE_PACKAGES}/triton
COPY
--chmod=775 --chown=dynamo:0 --from=framework ${SITE_PACKAGES}/flashinfer_cubin ${SITE_PACKAGES}/flashinfer_cubin
# Remaining packages and venv structure (bin/, include/, share/, etc.)
...
...
@@ -148,6 +151,9 @@ COPY --chmod=775 --chown=dynamo:0 --from=framework \
--exclude=lib/python*/site-packages/flashinfer_jit_cache \
--exclude=lib/python*/site-packages/torch \
--exclude=lib/python*/site-packages/vllm \
{%- if platform == "amd64" %}
--exclude=lib/python*/site-packages/vllm_omni \
{%- endif %}
--exclude=lib/python*/site-packages/triton \
--exclude=lib/python*/site-packages/flashinfer_cubin \
${VIRTUAL_ENV} ${VIRTUAL_ENV}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment