Dockerfile.sglang

# syntax=docker/dockerfile:1.10.0
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

ARG CUDA_VERSION=12.9.1

# Runtime image and build-time configuration (aligned with other backends)
# TODO: OPS-<number>: Use the same runtime image as the other backends
ARG RUNTIME_IMAGE="nvcr.io/nvidia/cuda"
ARG RUNTIME_IMAGE_TAG="12.9.1-cudnn-runtime-ubuntu24.04"

ARG PYTHON_VERSION=3.10
ARG ARCH=amd64
ARG ARCH_ALT=x86_64
ARG CARGO_BUILD_JOBS

# sccache configuration - inherit from base build
ARG USE_SCCACHE
ARG SCCACHE_BUCKET=""
ARG SCCACHE_REGION=""

ARG DYNAMO_BASE_IMAGE="dynamo:latest-none"
FROM ${DYNAMO_BASE_IMAGE} AS dynamo_base

########################################################
########## Framework Development Image ################
########################################################
#
# PURPOSE: Framework development and SGLang/DeepEP/NVSHMEM compilation
#
# This stage builds and compiles framework dependencies including:
# - SGLang inference engine with CUDA support
# - DeepEP and NVSHMEM
# - All necessary build tools and compilation dependencies
# - Framework-level Python packages and extensions
#
# Use this stage when you need to:
# - Build SGLang from source with custom modifications
# - Develop or debug framework-level components
# - Create custom builds with specific optimization flags
#
FROM nvcr.io/nvidia/cuda:${CUDA_VERSION}-cudnn-devel-ubuntu24.04 AS framework

# Declare all ARGs
ARG BUILD_TYPE=all
ARG DEEPEP_COMMIT=9af0e0d0e74f3577af1979c9b9e1ac2cad0104ee
ARG DEEPEP_GB_COMMIT=1b14ad661c7640137fcfe93cccb2694ede1220b0
ARG CMAKE_BUILD_PARALLEL_LEVEL=2
ARG FLASHMLA_COMMIT=1408756a88e52a25196b759eaf8db89d2b51b5a1
ARG SGL_KERNEL_VERSION=0.3.15
ARG SGLANG_COMMIT=0.5.4.post1
ARG GDRCOPY_COMMIT=v2.4.4
ARG NVSHMEM_VERSION=3.3.9
ARG GRACE_BLACKWELL=false
ARG ARCH
ARG ARCH_ALT
ARG PYTHON_VERSION
ARG USE_SCCACHE
ARG SCCACHE_BUCKET
ARG SCCACHE_REGION
ARG CARGO_BUILD_JOBS
ARG CUDA_VERSION

# Set all environment variables
ENV DEBIAN_FRONTEND=noninteractive \
    TZ=America/Los_Angeles \
    CUDA_HOME=/usr/local/cuda \
    GDRCOPY_HOME=/usr/src/gdrdrv-2.4.4/ \
    NVSHMEM_DIR=/sgl-workspace/nvshmem/install \
    PATH="${PATH}:/usr/local/nvidia/bin" \
    LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/usr/local/nvidia/lib:/usr/local/nvidia/lib64" \
    LANG=en_US.UTF-8 \
    LANGUAGE=en_US:en \
    LC_ALL=en_US.UTF-8

# Combined: Python setup, locale, and all package installation
RUN apt-get update \
    && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends software-properties-common \
    && add-apt-repository ppa:deadsnakes/ppa -y \
    && apt-get update \
    && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
        # Python (using other python versions as needed)
        python${PYTHON_VERSION}-dev \
        python${PYTHON_VERSION}-venv \
        python${PYTHON_VERSION}-distutils \
        python3-pip \
        # Build essentials
        build-essential \
        cmake \
        ninja-build \
        ccache \
        patchelf \
        git \
        git-lfs \
        # Core system utilities
        tzdata \
        locales \
        ca-certificates \
        dkms \
        kmod \
        # Command line tools
        wget \
        curl \
        jq \
        unzip \
        # Network utilities
        netcat-openbsd \
        # SSL and pkg-config
        libssl-dev \
        pkg-config \
        # MPI and NUMA
        libopenmpi-dev \
        libnuma1 \
        libnuma-dev \
        numactl \
        # InfiniBand/RDMA
        libibverbs-dev \
        libibverbs1 \
        libibumad3 \
        librdmacm1 \
        libnl-3-200 \
        libnl-route-3-200 \
        libnl-route-3-dev \
        libnl-3-dev \
        ibverbs-providers \
        infiniband-diags \
        perftest \
        # Development libraries
        libgoogle-glog-dev \
        libgtest-dev \
        libjsoncpp-dev \
        libunwind-dev \
        libboost-all-dev \
        libgrpc-dev \
        libgrpc++-dev \
        libprotobuf-dev \
        protobuf-compiler \
        protobuf-compiler-grpc \
        pybind11-dev \
        libhiredis-dev \
        libcurl4-openssl-dev \
        libczmq4 \
        libczmq-dev \
        libfabric-dev \
        # Package building tools
        devscripts \
        debhelper \
        fakeroot \
        check \
        libsubunit0 \
        libsubunit-dev \
    # Set Python alternatives
    && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \
    && update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \
    && update-alternatives --install /usr/bin/python python /usr/bin/python${PYTHON_VERSION} 1 \
    && update-alternatives --set python /usr/bin/python${PYTHON_VERSION} \
    # Set up locale
    && locale-gen en_US.UTF-8 \
    # Cleanup
    && rm -rf /var/lib/apt/lists/* \
    && apt-get clean

# Install sccache if requested
COPY container/use-sccache.sh /tmp/use-sccache.sh
RUN if [ "$USE_SCCACHE" = "true" ]; then \
    /tmp/use-sccache.sh install; \
fi

# Set environment variables - they'll be empty strings if USE_SCCACHE=false
ENV SCCACHE_BUCKET=${USE_SCCACHE:+${SCCACHE_BUCKET}} \
    SCCACHE_REGION=${USE_SCCACHE:+${SCCACHE_REGION}} \
    SCCACHE_S3_KEY_PREFIX=${USE_SCCACHE:+${ARCH}} \
    RUSTC_WRAPPER=${USE_SCCACHE:+sccache} \
    CMAKE_C_COMPILER_LAUNCHER=${USE_SCCACHE:+sccache} \
    CMAKE_CXX_COMPILER_LAUNCHER=${USE_SCCACHE:+sccache} \
    CMAKE_CUDA_COMPILER_LAUNCHER=${USE_SCCACHE:+sccache}

WORKDIR /sgl-workspace

# GDRCopy installation
RUN git clone --depth 1 --branch ${GDRCOPY_COMMIT} https://github.com/NVIDIA/gdrcopy.git \
    && cd gdrcopy/packages \
    && export CUDA=${CUDA_HOME} \
    && ./build-deb-packages.sh \
    && dpkg -i gdrdrv-dkms_*.deb libgdrapi_*.deb gdrcopy-tests_*.deb gdrcopy_*.deb

# Fix DeepEP IBGDA symlink
RUN ln -sf /usr/lib/$(uname -m)-linux-gnu/libmlx5.so.1 /usr/lib/$(uname -m)-linux-gnu/libmlx5.so

# Install SGLang (requires CUDA 12.8.1 or 12.9.1)
RUN python3 -m pip install --no-cache-dir --ignore-installed pip==25.3 setuptools==80.9.0 wheel==0.45.1 html5lib==1.1 six==1.17.0 \
    && git clone --depth 1 --branch v${SGLANG_COMMIT} https://github.com/sgl-project/sglang.git \
    && cd sglang \
    && case "$CUDA_VERSION" in \
        12.8.1) CUINDEX=128 ;; \
        12.9.1) CUINDEX=129 ;; \
        *) echo "Error: Unsupported CUDA version for sglang: $CUDA_VERSION (requires 12.8.1 or 12.9.1)" && exit 1 ;; \
    esac \
    && python3 -m pip install --no-cache-dir sgl-kernel==${SGL_KERNEL_VERSION} \
    && python3 -m pip install --no-cache-dir -e "python[${BUILD_TYPE}]" --extra-index-url https://download.pytorch.org/whl/cu${CUINDEX} \
    && python3 -m pip install --no-cache-dir nvidia-nccl-cu12==2.27.6 --force-reinstall --no-deps \
    && FLASHINFER_LOGGING_LEVEL=warning python3 -m flashinfer --download-cubin

# Download and extract NVSHMEM source, clone DeepEP (use Tom's fork for GB200)
RUN --mount=type=cache,target=/var/cache/curl \
    curl --retry 3 --retry-delay 2 -fsSL -o /var/cache/curl/nvshmem_src_cuda12-all-all-${NVSHMEM_VERSION}.tar.gz https://developer.download.nvidia.com/compute/redist/nvshmem/${NVSHMEM_VERSION}/source/nvshmem_src_cuda12-all-all-${NVSHMEM_VERSION}.tar.gz \
    && tar -xf /var/cache/curl/nvshmem_src_cuda12-all-all-${NVSHMEM_VERSION}.tar.gz \
    && mv nvshmem_src nvshmem \
    && rm -f /var/cache/curl/nvshmem_src_cuda12-all-all-${NVSHMEM_VERSION}.tar.gz \
    && if [ "$GRACE_BLACKWELL" = true ]; then \
        git clone --depth 1 https://github.com/fzyzcjy/DeepEP.git \
        && cd DeepEP \
        && git fetch --depth 1 origin ${DEEPEP_GB_COMMIT} \
        && git checkout ${DEEPEP_GB_COMMIT}; \
    else \
        git clone --depth 1 https://github.com/deepseek-ai/DeepEP.git \
        && cd DeepEP \
        && git fetch --depth 1 origin ${DEEPEP_COMMIT} \
        && git checkout ${DEEPEP_COMMIT}; \
    fi \
    && sed -i 's/#define NUM_CPU_TIMEOUT_SECS 100/#define NUM_CPU_TIMEOUT_SECS 1000/' csrc/kernels/configs.cuh

# Build and install NVSHMEM library only (without python library)
RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \
    --mount=type=secret,id=aws-secret-id,env=AWS_SECRET_ACCESS_KEY \
    export SCCACHE_S3_KEY_PREFIX=${SCCACHE_S3_KEY_PREFIX:-${ARCH}} && \
    cd /sgl-workspace/nvshmem && \
    if [ "$GRACE_BLACKWELL" = true ]; then CUDA_ARCH="90;100;120"; else CUDA_ARCH="90"; fi && \
    NVSHMEM_SHMEM_SUPPORT=0 \
    NVSHMEM_UCX_SUPPORT=0 \
    NVSHMEM_USE_NCCL=0 \
    NVSHMEM_MPI_SUPPORT=0 \
    NVSHMEM_IBGDA_SUPPORT=1 \
    NVSHMEM_PMIX_SUPPORT=0 \
    NVSHMEM_TIMEOUT_DEVICE_POLLING=0 \
    NVSHMEM_USE_GDRCOPY=1 \
    cmake -S . -B build/ -DCMAKE_INSTALL_PREFIX=${NVSHMEM_DIR} -DCMAKE_CUDA_ARCHITECTURES=${CUDA_ARCH} -DNVSHMEM_BUILD_PYTHON_LIB=OFF && \
    cmake --build build --target install -j${CMAKE_BUILD_PARALLEL_LEVEL} && \
    /tmp/use-sccache.sh show-stats "NVSHMEM"

# Build nvshmem4py wheels separately (Python 3.10, CUDA 12) to avoid building the python library twice for multiple python versions
# Need to reconfigure with PYTHON_LIB=ON to add the nvshmem4py subdirectory
RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \
    --mount=type=secret,id=aws-secret-id,env=AWS_SECRET_ACCESS_KEY \
    export SCCACHE_S3_KEY_PREFIX=${SCCACHE_S3_KEY_PREFIX:-${ARCH}} && \
    cd /sgl-workspace/nvshmem && \
    if [ "$GRACE_BLACKWELL" = true ]; then CUDA_ARCH="90;100;120"; else CUDA_ARCH="90"; fi && \
    NVSHMEM_SHMEM_SUPPORT=0 \
    NVSHMEM_UCX_SUPPORT=0 \
    NVSHMEM_USE_NCCL=0 \
    NVSHMEM_MPI_SUPPORT=0 \
    NVSHMEM_IBGDA_SUPPORT=1 \
    NVSHMEM_PMIX_SUPPORT=0 \
    NVSHMEM_TIMEOUT_DEVICE_POLLING=0 \
    NVSHMEM_USE_GDRCOPY=1 \
    cmake -S . -B build/ -DCMAKE_INSTALL_PREFIX=${NVSHMEM_DIR} -DCMAKE_CUDA_ARCHITECTURES=${CUDA_ARCH} -DNVSHMEM_BUILD_PYTHON_LIB=ON && \
    cmake --build build --target build_nvshmem4py_wheel_cu12_${PYTHON_VERSION} -j${CMAKE_BUILD_PARALLEL_LEVEL} && \
    /tmp/use-sccache.sh show-stats "NVSHMEM4PY"

# Install DeepEP
RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \
    --mount=type=secret,id=aws-secret-id,env=AWS_SECRET_ACCESS_KEY \
    export SCCACHE_S3_KEY_PREFIX=${SCCACHE_S3_KEY_PREFIX:-${ARCH}} && \
    cd /sgl-workspace/DeepEP && \
    NVSHMEM_DIR=${NVSHMEM_DIR} TORCH_CUDA_ARCH_LIST="9.0;10.0" pip install --no-build-isolation .

# Install flashmla
RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \
    --mount=type=secret,id=aws-secret-id,env=AWS_SECRET_ACCESS_KEY \
    export SCCACHE_S3_KEY_PREFIX=${SCCACHE_S3_KEY_PREFIX:-${ARCH}} && \
    if [ "${ARCH}" = "amd64" ]; then \
        git clone https://github.com/deepseek-ai/FlashMLA.git flash-mla \
        && cd flash-mla \
        && git checkout ${FLASHMLA_COMMIT} \
        && git submodule update --init --recursive \
        && export FLASH_MLA_DISABLE_SM100=1 \
        && pip install --no-build-isolation -v . ;\
    fi

# Copy rust installation from dynamo_base to avoid duplication efforts
COPY --from=dynamo_base /usr/local/rustup /usr/local/rustup
COPY --from=dynamo_base /usr/local/cargo /usr/local/cargo

ENV RUSTUP_HOME=/usr/local/rustup \
    CARGO_HOME=/usr/local/cargo \
    CARGO_TARGET_DIR=/workspace/target \
    PATH=/usr/local/cargo/bin:$PATH \
    CARGO_BUILD_JOBS=${CARGO_BUILD_JOBS:-16}

# Install essential Python build tools
RUN python3 -m pip install --no-cache-dir \
    mooncake-transfer-engine==0.3.6.post1 \
    scikit-build-core==0.11.6 \
    setuptools-rust==1.12.0

# Build and install sgl-router
RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \
    --mount=type=secret,id=aws-secret-id,env=AWS_SECRET_ACCESS_KEY \
    export SCCACHE_S3_KEY_PREFIX=${SCCACHE_S3_KEY_PREFIX:-${ARCH}} \
    && cd /sgl-workspace/sglang/sgl-router \
    && cargo build --release \
    && python3 -m pip install --no-cache-dir .

##################################################
########## Runtime Image ########################
##################################################
#
# PURPOSE: Production runtime environment
#
# This stage creates a production-ready image containing:
# - Pre-compiled SGLang, DeepEP, and NVSHMEM components
# - Dynamo runtime libraries and Python packages
# - Essential runtime dependencies and configurations
# - Optimized for inference workloads and deployment
#
# Use this stage when you need:
# - Production deployment of Dynamo with SGLang + DeepEP
# - Minimal runtime footprint without build tools
# - Ready-to-run inference server environment
#
FROM framework AS runtime

WORKDIR /workspace

ARG ARCH
ARG ARCH_ALT
ARG PYTHON_VERSION

ENV DYNAMO_HOME=/opt/dynamo
ENV NVSHMEM_DIR=/sgl-workspace/nvshmem/install
ENV NIXL_PREFIX=/opt/nvidia/nvda_nixl
ENV NIXL_LIB_DIR=${NIXL_PREFIX}/lib/${ARCH_ALT}-linux-gnu
ENV NIXL_PLUGIN_DIR=${NIXL_LIB_DIR}/plugins
ENV LD_LIBRARY_PATH=\
${NVSHMEM_DIR}/lib:\
${NIXL_LIB_DIR}:\
${NIXL_PLUGIN_DIR}:\
/usr/local/ucx/lib:\
/usr/local/ucx/lib/ucx:\
/usr/local/nvidia/lib64:\
${LD_LIBRARY_PATH}

# Copy NATS and ETCD from dynamo_base, and UCX/NIXL
COPY --from=dynamo_base /usr/bin/nats-server /usr/bin/nats-server
COPY --from=dynamo_base /usr/local/bin/etcd/ /usr/local/bin/etcd/
COPY --from=dynamo_base /usr/local/ucx /usr/local/ucx
COPY --from=dynamo_base $NIXL_PREFIX $NIXL_PREFIX
ENV PATH=/usr/local/bin/etcd/:/usr/local/cuda/nvvm/bin:$PATH

# Install Dynamo wheels from dynamo_base wheelhouse
COPY benchmarks/ /opt/dynamo/benchmarks/
COPY --from=dynamo_base /opt/dynamo/wheelhouse/ /opt/dynamo/wheelhouse/
RUN pip install \
    /opt/dynamo/wheelhouse/ai_dynamo_runtime*.whl \
    /opt/dynamo/wheelhouse/ai_dynamo*any.whl \
    /opt/dynamo/wheelhouse/nixl/nixl*.whl \
    && cd /opt/dynamo/benchmarks \
    && pip install --no-cache . \
    && cd - \
    && rm -rf /opt/dynamo/benchmarks

# Install common and test dependencies
RUN --mount=type=bind,source=./container/deps/requirements.txt,target=/tmp/requirements.txt \
    --mount=type=bind,source=./container/deps/requirements.test.txt,target=/tmp/requirements.test.txt \
    pip install \
        --no-cache \
        --requirement /tmp/requirements.txt \
        --requirement /tmp/requirements.test.txt

## Copy attribution files and launch banner
COPY ATTRIBUTION* LICENSE /workspace/
COPY container/launch_message.txt /workspace/launch_message.txt
RUN sed '/^#\s/d' /workspace/launch_message.txt > ~/.launch_screen && \
    echo "cat ~/.launch_screen" >> ~/.bashrc

# Copy tests, benchmarks, deploy and components for CI
COPY tests /workspace/tests
COPY examples /workspace/examples
COPY benchmarks /workspace/benchmarks
COPY deploy /workspace/deploy
COPY components/ /workspace/components/

ENTRYPOINT ["/opt/nvidia/nvidia_entrypoint.sh"]
CMD []

###########################################################
########## Development (run.sh, runs as root user) ########
###########################################################
#
# PURPOSE: Local development environment for use with run.sh (not Dev Container plug-in)
#
# This stage runs as root and provides:
# - Development tools and utilities for local debugging
# - Support for vscode/cursor development outside the Dev Container plug-in
#
# Use this stage if you need a full-featured development environment with extra tools,
# but do not use it with the Dev Container plug-in.

FROM runtime AS dev

ARG WORKSPACE_DIR=/sgl-workspace/dynamo
ARG PYTHON_VERSION

# NOTE: SGLang uses system Python (not a virtualenv in framework/runtime stages) to align with
# upstream SGLang Dockerfile: https://github.com/sgl-project/sglang/blob/main/docker/Dockerfile
# For dev stage, we create a lightweight venv with --system-site-packages to satisfy maturin develop
# requirements while still accessing all system-installed packages (sglang, torch, deepep, etc.)
COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/
RUN mkdir -p /opt/dynamo/venv && \
    uv venv /opt/dynamo/venv --python $PYTHON_VERSION --system-site-packages

ENV VIRTUAL_ENV=/opt/dynamo/venv \
    PATH="/opt/dynamo/venv/bin:${PATH}"

# Install development tools and utilities
RUN apt-get update -y && \
    apt-get install -y --no-install-recommends  \
    # System monitoring and debugging tools
    nvtop \
    htop \
    gdb \
    # Network and system utilities
    wget \
    iproute2 \
    net-tools \
    openssh-client \
    rsync \
    lsof \
    # File and archive utilities
    zip \
    tree \
    # Development and build tools
    vim \
    tmux \
    git \
    git-lfs \
    autoconf \
    automake \
    cmake \
    libtool \
    meson \
    bear \
    ccache \
    less \
    # Language and development support
    clang \
    libclang-dev \
    # Shell and productivity tools
    zsh \
    silversearcher-ag \
    cloc \
    locales \
    # NVIDIA tools dependencies
    gnupg && \
    echo "deb https://developer.download.nvidia.com/devtools/repos/ubuntu2004/amd64 /" | tee /etc/apt/sources.list.d/nvidia-devtools.list && \
    apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/7fa2af80.pub && \
    apt-get update -y && \
    apt-get install -y nsight-systems-cli && \
    rm -rf /var/lib/apt/lists/*

# Install clang-format and clangd
RUN curl --retry 3 --retry-delay 2 -LSso /usr/local/bin/clang-format https://github.com/muttleyxd/clang-tools-static-binaries/releases/download/master-32d3ac78/clang-format-16_linux-amd64 \
    && chmod +x /usr/local/bin/clang-format \
    && curl --retry 3 --retry-delay 2 -L https://github.com/clangd/clangd/releases/download/18.1.3/clangd-linux-18.1.3.zip -o clangd.zip \
    && unzip clangd.zip \
    && cp -r clangd_18.1.3/bin/* /usr/local/bin/ \
    && cp -r clangd_18.1.3/lib/* /usr/local/lib/ \
    && rm -rf clangd_18.1.3 clangd.zip

# Editable install of dynamo
COPY pyproject.toml README.md hatch_build.py /workspace/
RUN pip install --no-deps -e .

# Install Python development packages
RUN pip install --no-cache-dir \
    maturin[patchelf] \
    pytest \
    black \
    isort \
    icdiff \
    scikit_build_core \
    uv \
    pre-commit \
    pandas \
    matplotlib \
    tabulate

ENTRYPOINT ["/opt/nvidia/nvidia_entrypoint.sh"]
CMD []