vllm_runtime.Dockerfile

{#
# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#}
# === BEGIN templates/vllm_runtime.Dockerfile ===
##################################################
########## Runtime Image ########################
##################################################
#
# PURPOSE: Production runtime environment
#
# This stage creates a lightweight production-ready image containing:
# - Pre-compiled vLLM and framework dependencies
# - Dynamo runtime libraries and Python packages
# - Essential runtime dependencies and configurations
# - Optimized for inference workloads and deployment
#
# Use this stage when you need:
# - Production deployment of Dynamo with vLLM
# - Minimal runtime footprint without build tools
# - Ready-to-run inference server environment
# - Base for custom application containers
#

FROM ${RUNTIME_IMAGE}:${RUNTIME_IMAGE_TAG} AS runtime

ARG DEVICE
WORKDIR /workspace
ENV DYNAMO_HOME=/opt/dynamo
ENV VIRTUAL_ENV=/opt/dynamo/venv
ENV PATH="${VIRTUAL_ENV}/bin:${PATH}"

{% if device == "xpu" %}
RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null && \
    echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | tee /etc/apt/sources.list.d/oneAPI.list && \
    add-apt-repository -y ppa:kobuk-team/intel-graphics
{% endif %}

{% if device == "cuda" %}
# Set CUDA_DEVICE_ORDER to ensure CUDA logical device IDs match NVML physical device IDs
# This fixes NVML InvalidArgument errors when CUDA_VISIBLE_DEVICES is set
ENV CUDA_DEVICE_ORDER=PCI_BUS_ID

# Copy full CUDA toolkit directories from base devel image.
# Avoids cherry-picking individual binaries/libs which breaks when new CUDA deps are introduced.
COPY --from=dynamo_base /usr/local/cuda/bin/ /usr/local/cuda/bin/
COPY --from=dynamo_base /usr/local/cuda/lib64/ /usr/local/cuda/lib64/
COPY --from=dynamo_base /usr/local/cuda/include/ /usr/local/cuda/include/
COPY --from=dynamo_base /usr/local/cuda/nvvm/ /usr/local/cuda/nvvm/
RUN CUDA_VERSION_MAJOR="${CUDA_VERSION%%.*}" &&\
    ln -sf /usr/local/cuda/lib64/libcublas.so.${CUDA_VERSION_MAJOR} /usr/local/cuda/lib64/libcublas.so &&\
    ln -sf /usr/local/cuda/lib64/libcublasLt.so.${CUDA_VERSION_MAJOR} /usr/local/cuda/lib64/libcublasLt.so

# DeepGemm runs nvcc for JIT kernel compilation, however the CUDA include path
# is not properly set for complilation. Set CPATH to help nvcc find the headers.
ENV CPATH=/usr/local/cuda/include \
    TRITON_CUPTI_PATH=/usr/local/cuda/include \
    TRITON_CUDACRT_PATH=/usr/local/cuda/include \
    TRITON_CUOBJDUMP_PATH=/usr/local/cuda/bin/cuobjdump \
    TRITON_NVDISASM_PATH=/usr/local/cuda/bin/nvdisasm \
    TRITON_PTXAS_PATH=/usr/local/cuda/bin/ptxas \
    TRITON_CUDART_PATH=/usr/local/cuda/include
{% endif %}

### COPY NATS & ETCD ###
# Copy nats and etcd from dev image
COPY --from=dynamo_base /usr/bin/nats-server /usr/bin/nats-server
COPY --from=dynamo_base /usr/local/bin/etcd/ /usr/local/bin/etcd/

{% if device == "xpu" %}
ENV PATH=/usr/local/bin/etcd/:$PATH
{% else %}
# Add ETCD and CUDA binaries to PATH so cicc and other CUDA tools are accessible
ENV PATH=/usr/local/bin/etcd/:/usr/local/cuda/nvvm/bin:$PATH
{% endif %}

# Copy uv to system /bin
COPY --from=dynamo_base /bin/uv /bin/uvx /bin/

# Create dynamo user with group 0 for OpenShift compatibility
RUN userdel -r ubuntu > /dev/null 2>&1 || true \
    && useradd -m -s /bin/bash -g 0 dynamo \
    && [ `id -u dynamo` -eq 1000 ] \
    && mkdir -p /home/dynamo/.cache /opt/dynamo \
    # Non-recursive chown - only the directories themselves, not contents
    && chown dynamo:0 /home/dynamo /home/dynamo/.cache /opt/dynamo /workspace \
    # No chmod needed: umask 002 handles new files, COPY --chmod handles copied content
    # Set umask globally for all subsequent RUN commands (must be done as root before USER dynamo)
    # NOTE: Setting ENV UMASK=002 does NOT work - umask is a shell builtin, not an environment variable
    && mkdir -p /etc/profile.d && echo 'umask 002' > /etc/profile.d/00-umask.sh

ARG PYTHON_VERSION
ENV PYTHON_VERSION=${PYTHON_VERSION}

# Install Python, build-essential and python3-dev as apt dependencies
# Cache apt downloads; sharing=locked avoids apt/dpkg races with concurrent builds.
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
    apt-get update && \
    if [ "$DEVICE" = "cuda" ]; then \
        CUDA_VERSION_MAJOR=${CUDA_VERSION%%.*} &&\
        CUDA_VERSION_MINOR=$(echo "${CUDA_VERSION#*.}" | cut -d. -f1); \
    fi && \
    DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
        # Python runtime - CRITICAL for virtual environment to work
        python${PYTHON_VERSION}-dev \
        build-essential \
        # jq and curl for polling various endpoints and health checks
        jq \
        git \
        git-lfs \
        # required for verification of GPG keys
        gnupg2 \
        curl \
        # Libraries required by UCX to find RDMA devices
        libibverbs1 rdma-core ibverbs-utils libibumad3 \
        libnuma1 librdmacm1 ibverbs-providers \
        # JIT Kernel Compilation, flashinfer
        ninja-build \
        g++ \
        # prometheus dependencies
        ca-certificates \
        # opencv-python-headless (vLLM dependency) requires libxcb for some functions
        libxcb1 && \
    if [ "$DEVICE" = "cuda" ]; then \
        DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
        # DeepGemm uses 'cuobjdump' which does not come with CUDA image
        cuda-command-line-tools-${CUDA_VERSION_MAJOR}-${CUDA_VERSION_MINOR}; \
    fi && \
    rm -rf /var/lib/apt/lists/*

{% if device == "xpu" %}
RUN apt-get update && \
    apt-get install -y --no-install-recommends --fix-missing \
    #ffmpeg \
    libsndfile1 \
    libsm6 \
    libxext6 \
    libgl1 \
    lsb-release \
    numactl \
    wget \
    vim \
    linux-libc-dev && \
    # Install Intel GPU runtime packages
    apt-get install -y libze1 libze-dev libze-intel-gpu1 intel-opencl-icd libze-intel-gpu-raytracing \
    intel-ocloc intel-oneapi-compiler-dpcpp-cpp-2025.3 && \
    apt-get clean && rm -rf /var/lib/apt/lists/*

RUN wget https://github.com/uxlfoundation/oneCCL/releases/download/2021.15.7/intel-oneccl-2021.15.7.8_offline.sh && \
    bash intel-oneccl-2021.15.7.8_offline.sh -a --silent --eula accept && \
    echo "source /opt/intel/oneapi/setvars.sh --force" >> /etc/bash.bashrc && \
    rm -f /opt/intel/oneapi/ccl/latest && \
    ln -s /opt/intel/oneapi/ccl/2021.15 /opt/intel/oneapi/ccl/latest
{% endif %}

{% if context.vllm.enable_media_ffmpeg == "true" %}
# Copy ffmpeg libraries from wheel_builder (requires root, runs before USER dynamo)
RUN --mount=type=bind,from=wheel_builder,source=/usr/local/,target=/tmp/usr/local/ \
    mkdir -p /usr/local/lib/pkgconfig && \
    cp -rnL /tmp/usr/local/include/libav* /tmp/usr/local/include/libsw* /usr/local/include/ && \
    cp -nL /tmp/usr/local/lib/libav*.so /tmp/usr/local/lib/libsw*.so /usr/local/lib/ && \
    cp -nL /tmp/usr/local/lib/pkgconfig/libav*.pc /tmp/usr/local/lib/pkgconfig/libsw*.pc /usr/local/lib/pkgconfig/ && \
    cp -r /tmp/usr/local/src/ffmpeg /usr/local/src/
{% endif %}

USER dynamo
ENV HOME=/home/dynamo
# This picks up the umask 002 from the /etc/profile.d/00-umask.sh file for subsequent RUN commands
SHELL ["/bin/bash", "-l", "-o", "pipefail", "-c"]

{% if device == "xpu" %}
ENV NIXL_PREFIX=/opt/intel/intel_nixl
ENV NIXL_LIB_DIR=$NIXL_PREFIX/lib/x86_64-linux-gnu
ENV NIXL_PLUGIN_DIR=$NIXL_LIB_DIR/plugins
{% else %}
ENV NIXL_PREFIX=/opt/nvidia/nvda_nixl
ENV NIXL_LIB_DIR=$NIXL_PREFIX/lib64
ENV NIXL_PLUGIN_DIR=$NIXL_LIB_DIR/plugins
{% endif %}

# Site-packages path derived from PYTHON_VERSION ARG
ARG SITE_PACKAGES=${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages

### VIRTUAL ENVIRONMENT SETUP ###
# Copy virtual environment from framework container, splitting large packages into separate layers
# to enable parallel downloads. Pattern: COPY --chmod=775 <path>; chmod g+w <path> done later as
# root because COPY --chmod only affects <path>/*, not <path>
#
# Layer sizes (uncompressed): nvidia=4.5GB, flashinfer_jit_cache=4.1GB, torch=2.1GB,
#                             vllm=1.2GB, triton=592MB, flashinfer_cubin=437MB
{% if device == "cuda" %}
COPY --chmod=775 --chown=dynamo:0 --from=framework ${SITE_PACKAGES}/nvidia ${SITE_PACKAGES}/nvidia
COPY --chmod=775 --chown=dynamo:0 --from=framework ${SITE_PACKAGES}/flashinfer_jit_cache ${SITE_PACKAGES}/flashinfer_jit_cache
{% endif %}
COPY --chmod=775 --chown=dynamo:0 --from=framework ${SITE_PACKAGES}/torch ${SITE_PACKAGES}/torch
COPY --chmod=775 --chown=dynamo:0 --from=framework ${SITE_PACKAGES}/vllm ${SITE_PACKAGES}/vllm
{% if platform == "amd64" -%}
COPY --chmod=775 --chown=dynamo:0 --from=framework ${SITE_PACKAGES}/vllm_omni ${SITE_PACKAGES}/vllm_omni
{% endif -%}
COPY --chmod=775 --chown=dynamo:0 --from=framework ${SITE_PACKAGES}/triton ${SITE_PACKAGES}/triton
{% if device == "cuda" %}
COPY --chmod=775 --chown=dynamo:0 --from=framework ${SITE_PACKAGES}/flashinfer_cubin ${SITE_PACKAGES}/flashinfer_cubin
{% endif %}
# Remaining packages and venv structure (bin/, include/, share/, etc.)
COPY --chmod=775 --chown=dynamo:0 --from=framework \
    --exclude=lib/python*/site-packages/nvidia \
    --exclude=lib/python*/site-packages/flashinfer_jit_cache \
    --exclude=lib/python*/site-packages/torch \
    --exclude=lib/python*/site-packages/vllm \
{%- if platform == "amd64" %}
    --exclude=lib/python*/site-packages/vllm_omni \
{%- endif %}
    --exclude=lib/python*/site-packages/triton \
    --exclude=lib/python*/site-packages/flashinfer_cubin \
    ${VIRTUAL_ENV} ${VIRTUAL_ENV}

# Copy vllm with correct ownership (read-only, no group-write needed)
COPY --chown=dynamo:0 --from=framework /opt/vllm /opt/vllm

# Copy UCX and NIXL to system directories (read-only, no group-write needed)
COPY --from=wheel_builder /usr/local/ucx /usr/local/ucx
COPY --chown=dynamo: --from=wheel_builder $NIXL_PREFIX $NIXL_PREFIX
{% if device == "xpu" %}
{# XPU NIXL uses lib/x86_64-linux-gnu; copy to NIXL_LIB_DIR to ensure lib dir is populated #}
COPY --chown=dynamo: --from=wheel_builder /opt/intel/intel_nixl/lib/x86_64-linux-gnu/. ${NIXL_LIB_DIR}/
{% endif %}
{# For cuda: NIXL_LIB_DIR = lib64, already included in the $NIXL_PREFIX COPY above #}
COPY --chown=dynamo: --from=wheel_builder /opt/dynamo/dist/nixl/ /opt/dynamo/wheelhouse/nixl/
COPY --chown=dynamo: --from=wheel_builder /workspace/nixl/build/src/bindings/python/nixl-meta/nixl-*.whl /opt/dynamo/wheelhouse/nixl/


ENV PATH=/usr/local/ucx/bin:$PATH

ENV LD_LIBRARY_PATH=\
$NIXL_LIB_DIR:\
$NIXL_PLUGIN_DIR:\
/usr/local/ucx/lib:\
/usr/local/ucx/lib/ucx:\
$LD_LIBRARY_PATH

{% if device == "cuda" %}
ENV LD_LIBRARY_PATH=\
/opt/vllm/tools/ep_kernels/ep_kernels_workspace/nvshmem_install/lib:\
$LD_LIBRARY_PATH
ENV NVIDIA_DRIVER_CAPABILITIES=video,compute,utility
{% endif %}

# TODO: skip /workspace COPYs for dev/local-dev (bind-mounted from host, these get shadowed)
COPY --chmod=664 --chown=dynamo:0 ATTRIBUTION* LICENSE /workspace/
{% if target not in ("dev", "local-dev") %}
# Pattern: COPY --chmod=775 <path>; chmod g+w <path> done later as root because COPY --chmod only affects <path>/*, not <path>
COPY --chmod=775 --chown=dynamo:0 benchmarks/ /workspace/benchmarks/
{% endif %}

# Pattern: COPY --chmod=775 <path>; chmod g+w <path> done later as root because COPY --chmod only affects <path>/*, not <path>
COPY --chmod=775 --chown=dynamo:0 --from=wheel_builder /opt/dynamo/dist/*.whl /opt/dynamo/wheelhouse/

{% if target not in ("dev", "local-dev") %}
# Install dynamo, NIXL, and dynamo-specific dependencies
ARG ENABLE_KVBM
RUN --mount=type=cache,target=/home/dynamo/.cache/uv,uid=1000,gid=0,mode=0775 \
    export UV_CACHE_DIR=/home/dynamo/.cache/uv && \
    uv pip install \
      /opt/dynamo/wheelhouse/ai_dynamo_runtime*.whl \
      /opt/dynamo/wheelhouse/ai_dynamo*any.whl \
      /opt/dynamo/wheelhouse/nixl/nixl*.whl && \
    if [ "${ENABLE_KVBM}" = "true" ]; then \
        KVBM_WHEEL=$(ls /opt/dynamo/wheelhouse/kvbm*.whl 2>/dev/null | head -1); \
        if [ -z "$KVBM_WHEEL" ]; then \
            echo "ERROR: ENABLE_KVBM is true but no KVBM wheel found in wheelhouse" >&2; \
            exit 1; \
        fi; \
        uv pip install "$KVBM_WHEEL"; \
    fi && \
    cd /workspace/benchmarks && \
    export UV_GIT_LFS=1 UV_HTTP_TIMEOUT=300 UV_HTTP_RETRIES=5 && \
    uv pip install . && \
    chmod -R g+w /workspace/benchmarks
{% else %}
# Dev/local-dev: skip dynamo wheel install (users build from source via cargo build + maturin develop).
# Install NIXL wheel only (pre-built C++ binary, not buildable from source).
RUN --mount=type=cache,target=/home/dynamo/.cache/uv,uid=1000,gid=0,mode=0775 \
    export UV_CACHE_DIR=/home/dynamo/.cache/uv && \
    uv pip install /opt/dynamo/wheelhouse/nixl/nixl*.whl
{% endif %}

{% if device == "cuda" %}
# Install gpu_memory_service wheel if enabled (all targets)
ARG ENABLE_GPU_MEMORY_SERVICE
RUN --mount=type=cache,target=/home/dynamo/.cache/uv,uid=1000,gid=0,mode=0775 \
    if [ "${ENABLE_GPU_MEMORY_SERVICE}" = "true" ]; then \
        export UV_CACHE_DIR=/home/dynamo/.cache/uv && \
        GMS_WHEEL=$(ls /opt/dynamo/wheelhouse/gpu_memory_service*.whl 2>/dev/null | head -1); \
        if [ -n "$GMS_WHEEL" ]; then uv pip install "$GMS_WHEEL"; fi; \
    fi

# Install ModelExpress for P2P weight transfer (optional)
ARG ENABLE_MODELEXPRESS_P2P
ARG MODELEXPRESS_REF
RUN if [ "${ENABLE_MODELEXPRESS_P2P}" = "true" ]; then \
        echo "Installing ModelExpress from ref: ${MODELEXPRESS_REF}" && \
        uv pip install "modelexpress @ git+https://github.com/ai-dynamo/modelexpress.git@${MODELEXPRESS_REF}#subdirectory=modelexpress_client/python"; \
    fi
{% endif %}

# Install runtime dependencies (common + vllm-specific + planner + benchmarks).
# Test and dev dependencies are NOT installed here — they go in the test and dev images.
RUN --mount=type=bind,source=./container/deps/requirements.common.txt,target=/tmp/requirements.common.txt \
    --mount=type=bind,source=./container/deps/requirements.vllm.txt,target=/tmp/requirements.vllm.txt \
    --mount=type=bind,source=./container/deps/requirements.planner.txt,target=/tmp/requirements.planner.txt \
    --mount=type=bind,source=./container/deps/requirements.benchmark.txt,target=/tmp/requirements.benchmark.txt \
    --mount=type=cache,target=/home/dynamo/.cache/uv,uid=1000,gid=0,mode=0775 \
    export UV_CACHE_DIR=/home/dynamo/.cache/uv UV_GIT_LFS=1 UV_HTTP_TIMEOUT=300 UV_HTTP_RETRIES=5 && \
    uv pip install \
        --requirement /tmp/requirements.common.txt \
        --requirement /tmp/requirements.vllm.txt \
        --requirement /tmp/requirements.planner.txt \
        --requirement /tmp/requirements.benchmark.txt

# Copy tests, deploy and components for CI with correct ownership
# Pattern: COPY --chmod=775 <path>; chmod g+w <path> done later as root because COPY --chmod only affects <path>/*, not <path>
COPY --chmod=775 --chown=dynamo:0 tests /workspace/tests
COPY --chmod=775 --chown=dynamo:0 examples /workspace/examples
COPY --chmod=775 --chown=dynamo:0 deploy /workspace/deploy
COPY --chmod=775 --chown=dynamo:0 recipes/ /workspace/recipes/
COPY --chmod=775 --chown=dynamo:0 components/ /workspace/components/
COPY --chmod=775 --chown=dynamo:0 lib/ /workspace/lib/

# Setup launch banner in common directory accessible to all users
RUN --mount=type=bind,source=./container/launch_message/runtime.txt,target=/opt/dynamo/launch_message.txt \
    sed '/^#\s/d' /opt/dynamo/launch_message.txt > /opt/dynamo/.launch_screen

# Setup environment for all users
USER root
# Fix directory permissions: COPY --chmod only affects contents, not the directory itself
RUN chmod g+w /workspace /workspace/* /opt/dynamo /opt/dynamo/* ${VIRTUAL_ENV} && \
    chmod 755 /opt/dynamo/.launch_screen && \
    echo 'source /opt/dynamo/venv/bin/activate' >> /etc/bash.bashrc && \
    echo 'cat /opt/dynamo/.launch_screen' >> /etc/bash.bashrc

{% if device == "cuda" %}
# Copy AWS SDK C++ libraries (required for NIXL OBJ backend / S3 support)
COPY --chown=dynamo: --from=wheel_builder /usr/local/lib64/libaws* /usr/local/lib/
COPY --chown=dynamo: --from=wheel_builder /usr/local/lib64/libs2n* /usr/local/lib/
COPY --chown=dynamo: --from=wheel_builder /usr/lib64/libcrypto.so.1.1* /usr/local/lib/
COPY --chown=dynamo: --from=wheel_builder /usr/lib64/libssl.so.1.1* /usr/local/lib/

# Fix library symlinks that Docker COPY dereferenced (COPY always follows symlinks)
# This recreates proper symlinks to save space and suppress ldconfig warnings
RUN cd /usr/local/lib && \
    # libaws-c-common: .so.1 should symlink to .so.1.0.0
    if [ -f libaws-c-common.so.1.0.0 ] && [ ! -L libaws-c-common.so.1 ]; then \
        rm -f libaws-c-common.so.1 libaws-c-common.so && \
        ln -s libaws-c-common.so.1.0.0 libaws-c-common.so.1 && \
        ln -s libaws-c-common.so.1 libaws-c-common.so; \
    fi && \
    # libaws-c-s3: .so.0unstable should symlink to .so.1.0.0
    if [ -f libaws-c-s3.so.1.0.0 ] && [ ! -L libaws-c-s3.so.0unstable ]; then \
        rm -f libaws-c-s3.so.0unstable libaws-c-s3.so && \
        ln -s libaws-c-s3.so.1.0.0 libaws-c-s3.so.0unstable && \
        ln -s libaws-c-s3.so.0unstable libaws-c-s3.so; \
    fi && \
    # libs2n: .so.1 should symlink to .so.1.0.0
    if [ -f libs2n.so.1.0.0 ] && [ ! -L libs2n.so.1 ]; then \
        rm -f libs2n.so.1 libs2n.so && \
        ln -s libs2n.so.1.0.0 libs2n.so.1 && \
        ln -s libs2n.so.1 libs2n.so; \
    fi && \
    # OpenSSL 1.1: check for versioned files (e.g., .so.1.1.1k)
    for lib in libcrypto libssl; do \
        versioned=$(ls -1 ${lib}.so.1.1.* 2>/dev/null | head -1); \
        if [ -n "$versioned" ] && [ ! -L "${lib}.so.1.1" ]; then \
            rm -f "${lib}.so.1.1" && \
            ln -s "$(basename "$versioned")" "${lib}.so.1.1"; \
        fi; \
    done && \
    ldconfig
{% endif %}

USER dynamo

ARG DYNAMO_COMMIT_SHA
ENV DYNAMO_COMMIT_SHA=$DYNAMO_COMMIT_SHA

{% if device == "xpu" %}
RUN uv pip uninstall triton triton-xpu && \
    uv pip install triton-xpu==3.6.0 --extra-index-url=https://download.pytorch.org/whl/test/xpu && \
    uv pip uninstall oneccl && \
    uv pip uninstall oneccl-devel

SHELL ["bash", "-c"]
CMD ["bash", "-c", "source /etc/bash.bashrc && exec bash"]
{% else %}
# In vLLM 0.12 the default sampler changed on the forward pass.
# We need to enable this to enable the cuda kernels.
ENV VLLM_USE_FLASHINFER_SAMPLER=1
ENTRYPOINT ["/opt/nvidia/nvidia_entrypoint.sh"]
CMD []
{% endif %}