Dockerfile.vllm

# syntax=docker/dockerfile:1.10.0
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

# This section contains build arguments that are common and shared with
# the plain Dockerfile, so they should NOT have a default. The source of truth is from build.sh.
ARG BASE_IMAGE
ARG BASE_IMAGE_TAG

ARG PYTHON_VERSION
ARG ENABLE_KVBM

ARG RUNTIME_IMAGE="nvcr.io/nvidia/cuda"
ARG RUNTIME_IMAGE_TAG="12.9.0-runtime-ubuntu24.04"
ARG CUDA_VERSION="12.9"

# Make sure to update the dependency version in pyproject.toml when updating this
ARG VLLM_REF="v0.11.2"
# FlashInfer Ref used to install flashinfer-cubin and flashinfer-jit-cache
ARG FLASHINF_REF="v0.5.2"

# If left blank, then we will fallback to vLLM defaults
ARG DEEPGEMM_REF=""
# LMCache version - 0.3.9+ required for vLLM 0.11.2 compatibility
ARG LMCACHE_REF="0.3.9.post2"

# sccache configuration - inherit from base build
ARG USE_SCCACHE
ARG SCCACHE_BUCKET=""
ARG SCCACHE_REGION=""

# Define general architecture ARGs for supporting both x86 and aarch64 builds.
#   ARCH: Used for package suffixes (e.g., amd64, arm64)
#   ARCH_ALT: Used for Rust targets, manylinux suffix (e.g., x86_64, aarch64)
#
# Default values are for x86/amd64:
#   --build-arg ARCH=amd64 --build-arg ARCH_ALT=x86_64
#
# For arm64/aarch64, build with:
#   --build-arg ARCH=arm64 --build-arg ARCH_ALT=aarch64
#
# NOTE: There isn't an easy way to define one of these values based on the other value
# without adding if statements everywhere, so just define both as ARGs for now.
ARG ARCH=amd64
ARG ARCH_ALT=x86_64

ARG DYNAMO_BASE_IMAGE="dynamo:latest-none"
FROM ${DYNAMO_BASE_IMAGE} AS dynamo_base

# Copy cuda tools and libs from base image
FROM ${BASE_IMAGE}:${BASE_IMAGE_TAG} AS base

########################################################
########## Framework Development Image ################
########################################################
#
# PURPOSE: Framework development and vLLM compilation
#
# This stage builds and compiles framework dependencies including:
# - vLLM inference engine with CUDA support
# - DeepGEMM and FlashInfer optimizations
# - All necessary build tools and compilation dependencies
# - Framework-level Python packages and extensions
#
# Use this stage when you need to:
# - Build vLLM from source with custom modifications
# - Develop or debug framework-level components
# - Create custom builds with specific optimization flags
#

# Use dynamo base image (see /container/Dockerfile for more details)
FROM ${BASE_IMAGE}:${BASE_IMAGE_TAG} AS framework

COPY --from=dynamo_base /bin/uv /bin/uvx /bin/

ARG PYTHON_VERSION

RUN apt-get update -y \
    && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
        # Python runtime - CRITICAL for virtual environment to work
        python${PYTHON_VERSION}-dev \
        build-essential \
        # vLLM build dependencies
        cmake \
        ibverbs-providers \
        ibverbs-utils \
        libibumad-dev \
        libibverbs-dev \
        libnuma-dev \
        librdmacm-dev \
        rdma-core \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

# if libmlx5.so not shipped with 24.04 rdma-core packaging, CMAKE will fail when looking for
# generic dev name .so so we symlink .s0.1 -> .so
RUN ln -sf /usr/lib/aarch64-linux-gnu/libmlx5.so.1 /usr/lib/aarch64-linux-gnu/libmlx5.so || true

# Create virtual environment
RUN mkdir -p /opt/dynamo/venv && \
    uv venv /opt/dynamo/venv --python $PYTHON_VERSION

# Activate virtual environment
ENV VIRTUAL_ENV=/opt/dynamo/venv \
    PATH="/opt/dynamo/venv/bin:${PATH}"

ARG ARCH
# Install vllm - keep this early in Dockerfile to avoid
# rebuilds from unrelated source code changes
ARG VLLM_REF
ARG VLLM_GIT_URL
ARG DEEPGEMM_REF
ARG FLASHINF_REF
ARG LMCACHE_REF
ARG CUDA_VERSION

ARG MAX_JOBS=16
ENV MAX_JOBS=$MAX_JOBS
ENV CUDA_HOME=/usr/local/cuda

# Install sccache if requested
COPY container/use-sccache.sh /tmp/use-sccache.sh
# Install sccache if requested
ARG USE_SCCACHE
ARG ARCH_ALT
ARG SCCACHE_BUCKET
ARG SCCACHE_REGION

ENV ARCH_ALT=${ARCH_ALT}
RUN if [ "$USE_SCCACHE" = "true" ]; then \
        /tmp/use-sccache.sh install; \
    fi

# Set environment variables - they'll be empty strings if USE_SCCACHE=false
ENV SCCACHE_BUCKET=${USE_SCCACHE:+${SCCACHE_BUCKET}} \
    SCCACHE_REGION=${USE_SCCACHE:+${SCCACHE_REGION}} \
    CMAKE_C_COMPILER_LAUNCHER=${USE_SCCACHE:+sccache} \
    CMAKE_CXX_COMPILER_LAUNCHER=${USE_SCCACHE:+sccache} \
    CMAKE_CUDA_COMPILER_LAUNCHER=${USE_SCCACHE:+sccache}
# Install VLLM and related dependencies
RUN --mount=type=bind,source=./container/deps/,target=/tmp/deps \
    --mount=type=cache,target=/root/.cache/uv \
    --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \
    --mount=type=secret,id=aws-secret-id,env=AWS_SECRET_ACCESS_KEY \
    export SCCACHE_S3_KEY_PREFIX=${SCCACHE_S3_KEY_PREFIX:-${ARCH}} && \
        cp /tmp/deps/vllm/install_vllm.sh /tmp/install_vllm.sh && \
        chmod +x /tmp/install_vllm.sh && \
        /tmp/install_vllm.sh --vllm-ref $VLLM_REF --max-jobs $MAX_JOBS --arch $ARCH --installation-dir /opt ${DEEPGEMM_REF:+--deepgemm-ref "$DEEPGEMM_REF"} ${FLASHINF_REF:+--flashinf-ref "$FLASHINF_REF"} ${LMCACHE_REF:+--lmcache-ref "$LMCACHE_REF"} --cuda-version $CUDA_VERSION && \
        /tmp/use-sccache.sh show-stats "vLLM";

ENV LD_LIBRARY_PATH=\
/opt/vllm/tools/ep_kernels/ep_kernels_workspace/nvshmem_install/lib:\
$LD_LIBRARY_PATH

##################################################
########## Runtime Image ########################
##################################################
#
# PURPOSE: Production runtime environment
#
# This stage creates a lightweight production-ready image containing:
# - Pre-compiled vLLM and framework dependencies
# - Dynamo runtime libraries and Python packages
# - Essential runtime dependencies and configurations
# - Optimized for inference workloads and deployment
#
# Use this stage when you need:
# - Production deployment of Dynamo with vLLM
# - Minimal runtime footprint without build tools
# - Ready-to-run inference server environment
# - Base for custom application containers
#

FROM ${RUNTIME_IMAGE}:${RUNTIME_IMAGE_TAG} AS runtime

WORKDIR /workspace
ENV DYNAMO_HOME=/opt/dynamo
ENV VIRTUAL_ENV=/opt/dynamo/venv
ENV PATH="${VIRTUAL_ENV}/bin:${PATH}"
# Set CUDA_DEVICE_ORDER to ensure CUDA logical device IDs match NVML physical device IDs
# This fixes NVML InvalidArgument errors when CUDA_VISIBLE_DEVICES is set
ENV CUDA_DEVICE_ORDER=PCI_BUS_ID

# Copy CUDA development tools (nvcc, headers, dependencies, etc.) from base devel image
COPY --from=base /usr/local/cuda/bin/nvcc /usr/local/cuda/bin/nvcc
COPY --from=base /usr/local/cuda/bin/cudafe++ /usr/local/cuda/bin/cudafe++
COPY --from=base /usr/local/cuda/bin/ptxas /usr/local/cuda/bin/ptxas
COPY --from=base /usr/local/cuda/bin/fatbinary /usr/local/cuda/bin/fatbinary
COPY --from=base /usr/local/cuda/include/ /usr/local/cuda/include/
COPY --from=base /usr/local/cuda/nvvm /usr/local/cuda/nvvm
COPY --from=base /usr/local/cuda/lib64/libcudart.so* /usr/local/cuda/lib64/
RUN ln -s /usr/local/cuda/lib64/libcublas.so.12 /usr/local/cuda/lib64/libcublas.so
RUN ln -s /usr/local/cuda/lib64/libcublasLt.so.12 /usr/local/cuda/lib64/libcublasLt.so

# DeepGemm runs nvcc for JIT kernel compilation, however the CUDA include path
# is not properly set for complilation. Set CPATH to help nvcc find the headers.
ENV CPATH=/usr/local/cuda/include

### COPY NATS & ETCD ###
# Copy nats and etcd from dev image
COPY --from=dynamo_base /usr/bin/nats-server /usr/bin/nats-server
COPY --from=dynamo_base /usr/local/bin/etcd/ /usr/local/bin/etcd/
# Add ETCD and CUDA binaries to PATH so cicc and other CUDA tools are accessible
ENV PATH=/usr/local/bin/etcd/:/usr/local/cuda/nvvm/bin:$PATH

# Copy uv to system /bin
COPY --from=dynamo_base /bin/uv /bin/uvx /bin/

# Create dynamo user with group 0 for OpenShift compatibility
RUN userdel -r ubuntu > /dev/null 2>&1 || true \
    && useradd -m -s /bin/bash -g 0 dynamo \
    && [ `id -u dynamo` -eq 1000 ] \
    && mkdir -p /home/dynamo/.cache /opt/dynamo \
    && chown -R dynamo: /workspace /home/dynamo /opt/dynamo \
    && chmod -R g+w /workspace /home/dynamo/.cache /opt/dynamo

ARG ARCH_ALT
ARG PYTHON_VERSION

# Install Python, build-essential and python3-dev as apt dependencies
RUN apt-get update && \
    DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
        # Python runtime - CRITICAL for virtual environment to work
        python${PYTHON_VERSION}-dev \
        build-essential \
        # jq and curl for polling various endpoints and health checks
        jq \
        git \
        git-lfs \
        curl \
        # Libraries required by UCX to find RDMA devices
        libibverbs1 rdma-core ibverbs-utils libibumad3 \
        libnuma1 librdmacm1 ibverbs-providers \
        # JIT Kernel Compilation, flashinfer
        ninja-build \
        g++ \
        # prometheus dependencies
        ca-certificates \
        # DeepGemm uses 'cuobjdump' which does not come with CUDA image
        cuda-command-line-tools-12-9 && \
    rm -rf /var/lib/apt/lists/*

USER dynamo
ENV HOME=/home/dynamo
ENV NIXL_PREFIX=/opt/nvidia/nvda_nixl
ENV NIXL_LIB_DIR=$NIXL_PREFIX/lib/${ARCH_ALT}-linux-gnu
ENV NIXL_PLUGIN_DIR=$NIXL_LIB_DIR/plugins

### VIRTUAL ENVIRONMENT SETUP ###
# Copy entire virtual environment from framework container with correct ownership
COPY --chown=dynamo: --from=framework ${VIRTUAL_ENV} ${VIRTUAL_ENV}

# Copy vllm with correct ownership
COPY --chown=dynamo: --from=framework /opt/vllm /opt/vllm

# Copy UCX and NIXL to system directories
COPY --chown=dynamo: --from=dynamo_base /usr/local/ucx /usr/local/ucx
COPY --chown=dynamo: --from=dynamo_base $NIXL_PREFIX $NIXL_PREFIX
ENV PATH=/usr/local/ucx/bin:$PATH

ENV LD_LIBRARY_PATH=\
/opt/vllm/tools/ep_kernels/ep_kernels_workspace/nvshmem_install/lib:\
$NIXL_LIB_DIR:\
$NIXL_PLUGIN_DIR:\
/usr/local/ucx/lib:\
/usr/local/ucx/lib/ucx:\
$LD_LIBRARY_PATH

# Copy local files
COPY --chown=dynamo: ATTRIBUTION* LICENSE /workspace/
COPY --chown=dynamo: benchmarks/ /workspace/benchmarks/

# Install dynamo, NIXL, and dynamo-specific dependencies
ARG ENABLE_KVBM
COPY --chown=dynamo: --from=dynamo_base /opt/dynamo/wheelhouse/ /opt/dynamo/wheelhouse/
RUN uv pip install \
      /opt/dynamo/wheelhouse/ai_dynamo_runtime*.whl \
      /opt/dynamo/wheelhouse/ai_dynamo*any.whl \
      /opt/dynamo/wheelhouse/nixl/nixl*.whl \
    && if [ "${ENABLE_KVBM}" = "true" ]; then \
        uv pip install /opt/dynamo/wheelhouse/kvbm*.whl; \
       fi \
    && cd /workspace/benchmarks \
    && UV_GIT_LFS=1 uv pip install --no-cache .

# Install common and test dependencies
RUN --mount=type=bind,source=./container/deps/requirements.txt,target=/tmp/requirements.txt \
    --mount=type=bind,source=./container/deps/requirements.test.txt,target=/tmp/requirements.test.txt \
    UV_GIT_LFS=1 uv pip install \
        --no-cache \
        --requirement /tmp/requirements.txt \
        --requirement /tmp/requirements.test.txt

# Copy tests, benchmarks, deploy and components for CI
COPY --chown=dynamo: tests /workspace/tests
COPY --chown=dynamo: examples /workspace/examples
COPY --chown=dynamo: deploy /workspace/deploy
COPY --chown=dynamo: recipes/ /workspace/recipes/
COPY --chown=dynamo: components/ /workspace/components/
COPY --chown=dynamo: lib/ /workspace/lib/

# Setup launch banner in common directory accessible to all users
RUN --mount=type=bind,source=./container/launch_message/runtime.txt,target=/opt/dynamo/launch_message.txt \
    sed '/^#\s/d' /opt/dynamo/launch_message.txt > /opt/dynamo/.launch_screen

# Setup environment for all users
USER root
RUN chmod 755 /opt/dynamo/.launch_screen && \
    echo 'source /opt/dynamo/venv/bin/activate' >> /etc/bash.bashrc && \
    echo 'cat /opt/dynamo/.launch_screen' >> /etc/bash.bashrc

USER dynamo
ARG DYNAMO_COMMIT_SHA
ENV DYNAMO_COMMIT_SHA=$DYNAMO_COMMIT_SHA

ENTRYPOINT ["/opt/nvidia/nvidia_entrypoint.sh"]
CMD []

###########################################################
########## Development (run.sh, runs as root user) ########
###########################################################
#
# PURPOSE: Local development environment for use with run.sh (not Dev Container plug-in)
#
# This stage runs as root and provides:
# - Development tools and utilities for local debugging
# - Support for vscode/cursor development outside the Dev Container plug-in
#
# Use this stage if you need a full-featured development environment with extra tools,
# but do not use it with the Dev Container plug-in.

FROM runtime AS dev

# Don't want ubuntu to be editable, just change uid and gid.
ARG WORKSPACE_DIR=/workspace

USER root
# Install utilities as root
RUN apt-get update -y && \
    apt-get install -y --no-install-recommends  \
    # Install utilities
    nvtop \
    wget \
    tmux \
    vim \
    git \
    openssh-client \
    iproute2 \
    rsync \
    zip \
    unzip \
    htop \
    # Build Dependencies
    autoconf \
    automake \
    cmake \
    libtool \
    meson \
    net-tools \
    pybind11-dev \
    # Rust build dependencies
    clang \
    libclang-dev \
    protobuf-compiler && \
    rm -rf /var/lib/apt/lists/*

# Set workspace directory variable
ENV WORKSPACE_DIR=${WORKSPACE_DIR} \
    DYNAMO_HOME=${WORKSPACE_DIR} \
    RUSTUP_HOME=/usr/local/rustup \
    CARGO_HOME=/usr/local/cargo \
    CARGO_TARGET_DIR=/workspace/target \
    VIRTUAL_ENV=/opt/dynamo/venv \
    PATH=/usr/local/cargo/bin:$PATH

COPY --from=dynamo_base /usr/local/rustup /usr/local/rustup
COPY --from=dynamo_base /usr/local/cargo /usr/local/cargo

# Install maturin, for maturin develop
# Editable install of dynamo
COPY pyproject.toml README.md hatch_build.py /workspace/
RUN uv pip install maturin[patchelf] && \
    uv pip install --no-deps -e .

ENTRYPOINT ["/opt/nvidia/nvidia_entrypoint.sh"]
CMD []