Commit 3e0459fb (unverified), authored by ishandhanani, committed by GitHub
Browse files

feat: bump sglang to `0.5.6.post2` and swap to upstream runtime container (#4762)


Signed-off-by: Dillon Cullinan <dcullinan@nvidia.com>
Signed-off-by: Dmitry Tokarev <dtokarev@nvidia.com>
Co-authored-by: Dillon Cullinan <dcullinan@nvidia.com>
Co-authored-by: Dmitry Tokarev <dtokarev@nvidia.com>
parent f4245c99
...@@ -164,8 +164,10 @@ runs: ...@@ -164,8 +164,10 @@ runs:
# Run the sanity check script inside the container # Run the sanity check script inside the container
# The script is located in /workspace/deploy/sanity_check.py in runtime containers # The script is located in /workspace/deploy/sanity_check.py in runtime containers
export WORKSPACE=/workspace
set +e set +e
docker run --rm "$IMAGE_TAG" python /workspace/deploy/sanity_check.py --runtime-check --no-gpu-check docker run --rm "$IMAGE_TAG" python ${WORKSPACE}/deploy/sanity_check.py --runtime-check --no-gpu-check
SANITY_CHECK_EXIT_CODE=$? SANITY_CHECK_EXIT_CODE=$?
set -e set -e
if [ ${SANITY_CHECK_EXIT_CODE} -ne 0 ]; then if [ ${SANITY_CHECK_EXIT_CODE} -ne 0 ]; then
......
...@@ -46,7 +46,7 @@ dependencies = [ ...@@ -46,7 +46,7 @@ dependencies = [
"pydantic>=2", "pydantic>=2",
"tabulate", "tabulate",
"types-tabulate", "types-tabulate",
# Satisfies vLLM 0.11.0 (>=4.55.2), vLLM 0.11.2 (>=4.56.0,<5), TRT-LLM 1.2.0rc5 (==4.56.0), SGLang 0.5.6 (==4.57.1) # Satisfies vLLM 0.11.0 (>=4.55.2), vLLM 0.11.2 (>=4.56.0,<5), TRT-LLM 1.2.0rc5 (==4.56.0), SGLang 0.5.6.post2 (==4.57.1)
"transformers>=4.56.0,<=4.57.1", "transformers>=4.56.0,<=4.57.1",
"pytest-mypy", "pytest-mypy",
] ]
......
# syntax=docker/dockerfile:1.10.0
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# #
...@@ -19,7 +18,7 @@ ...@@ -19,7 +18,7 @@
# properly without needing slow chown -R operations (which can add 2-10 extra # properly without needing slow chown -R operations (which can add 2-10 extra
# minutes). # minutes).
# #
# DEVELOPMENT PATHS THAT MUST BE GROUP-WRITABLE (for non-virtualenv containers): # DEVELOPMENT PATHS THAT MUST BE GROUP-WRITABLE (for virtualenv containers):
# /workspace - Users create/modify project files # /workspace - Users create/modify project files
# /home/dynamo - Users create config/cache files # /home/dynamo - Users create config/cache files
# /home/dynamo/.local - SGLang uses $HOME/.local/lib/python3.10/site-packages for pip install # /home/dynamo/.local - SGLang uses $HOME/.local/lib/python3.10/site-packages for pip install
...@@ -31,23 +30,19 @@ ...@@ -31,23 +30,19 @@
# This section contains build arguments that are common and shared with # This section contains build arguments that are common and shared with
# the plain Dockerfile, so they should NOT have a default. The source of truth is from build.sh. # the plain Dockerfile, so they should NOT have a default. The source of truth is from build.sh.
ARG BASE_IMAGE ARG BASE_IMAGE
ARG BASE_IMAGE_TAG ARG BASE_IMAGE_TAG
ARG FRAMEWORK_IMAGE
ARG FRAMEWORK_IMAGE_TAG
ARG PYTHON_VERSION ARG PYTHON_VERSION
ARG ENABLE_KVBM ARG ENABLE_KVBM
ARG ENABLE_MEDIA_NIXL ARG ENABLE_MEDIA_NIXL
ARG ENABLE_MEDIA_FFMPEG ARG ENABLE_MEDIA_FFMPEG
ARG CARGO_BUILD_JOBS ARG CARGO_BUILD_JOBS
ARG CUDA_VERSION
ARG ARCH=amd64 ARG RUNTIME_IMAGE="lmsysorg/sglang"
ARG ARCH_ALT=x86_64 ARG RUNTIME_IMAGE_TAG="v0.5.6.post2-runtime"
# sccache configuration - inherit from base build # SCCACHE configuration
ARG USE_SCCACHE ARG USE_SCCACHE
ARG SCCACHE_BUCKET="" ARG SCCACHE_BUCKET=""
ARG SCCACHE_REGION="" ARG SCCACHE_REGION=""
...@@ -58,6 +53,21 @@ ARG NIXL_REF ...@@ -58,6 +53,21 @@ ARG NIXL_REF
ARG NIXL_GDRCOPY_REF ARG NIXL_GDRCOPY_REF
ARG NIXL_LIBFABRIC_REF ARG NIXL_LIBFABRIC_REF
# Define general architecture ARGs for supporting both x86 and aarch64 builds.
# ARCH: Used for package suffixes (e.g., amd64, arm64)
# ARCH_ALT: Used for Rust targets, manylinux suffix (e.g., x86_64, aarch64)
#
# Default values are for x86/amd64:
# --build-arg ARCH=amd64 --build-arg ARCH_ALT=x86_64
#
# For arm64/aarch64, build with:
# --build-arg ARCH=arm64 --build-arg ARCH_ALT=aarch64
#
# NOTE: There isn't an easy way to define one of these values based on the other value
# without adding if statements everywhere, so just define both as ARGs for now.
ARG ARCH=amd64
ARG ARCH_ALT=x86_64
################################## ##################################
########## Base Image ############ ########## Base Image ############
################################## ##################################
...@@ -217,8 +227,7 @@ RUN if [ "$USE_SCCACHE" = "true" ]; then \ ...@@ -217,8 +227,7 @@ RUN if [ "$USE_SCCACHE" = "true" ]; then \
# Set SCCACHE environment variables # Set SCCACHE environment variables
ENV SCCACHE_BUCKET=${USE_SCCACHE:+${SCCACHE_BUCKET}} \ ENV SCCACHE_BUCKET=${USE_SCCACHE:+${SCCACHE_BUCKET}} \
SCCACHE_REGION=${USE_SCCACHE:+${SCCACHE_REGION}} \ SCCACHE_REGION=${USE_SCCACHE:+${SCCACHE_REGION}}
RUSTC_WRAPPER=${USE_SCCACHE:+sccache}
# Build FFmpeg from source # Build FFmpeg from source
# Do not delete the source tarball for legal reasons # Do not delete the source tarball for legal reasons
...@@ -415,304 +424,41 @@ RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \ ...@@ -415,304 +424,41 @@ RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \
fi && \ fi && \
/tmp/use-sccache.sh show-stats "Dynamo" /tmp/use-sccache.sh show-stats "Dynamo"
######################################################## ##################################
########## Framework Development Image ################ ########## Runtime Image #########
######################################################## ##################################
#
# PURPOSE: Framework development and SGLang/DeepEP/NVSHMEM compilation FROM ${RUNTIME_IMAGE}:${RUNTIME_IMAGE_TAG} AS runtime
#
# This stage builds and compiles framework dependencies including: # cleanup unnecessary libs
# - SGLang inference engine with CUDA support RUN apt remove -y python3-apt &&\
# - DeepEP and NVSHMEM pip uninstall -y termplotlib
# - All necessary build tools and compilation dependencies
# - Framework-level Python packages and extensions # This ARG is still utilized for SGLANG Version extraction
# ARG RUNTIME_IMAGE_TAG
# Use this stage when you need to: WORKDIR /workspace
# - Build SGLang from source with custom modifications
# - Develop or debug framework-level components # Install NATS and ETCD
# - Create custom builds with specific optimization flags COPY --from=dynamo_base /usr/bin/nats-server /usr/bin/nats-server
# COPY --from=dynamo_base /usr/local/bin/etcd/ /usr/local/bin/etcd/
#FROM nvcr.io/nvidia/cuda:${CUDA_VERSION}-cudnn-devel-ubuntu24.04 AS framework
FROM ${FRAMEWORK_IMAGE}:${FRAMEWORK_IMAGE_TAG} AS framework ENV PATH=/usr/local/bin/etcd:$PATH
# Declare all ARGs
ARG BUILD_TYPE=all
ARG DEEPEP_COMMIT=9af0e0d0e74f3577af1979c9b9e1ac2cad0104ee
ARG DEEPEP_GB_COMMIT=1b14ad661c7640137fcfe93cccb2694ede1220b0
ARG CMAKE_BUILD_PARALLEL_LEVEL=2
ARG SGL_KERNEL_VERSION=0.3.16.post5
ARG SGLANG_COMMIT=0.5.6
ARG GDRCOPY_COMMIT=v2.4.4
ARG NVSHMEM_VERSION=3.3.9
ARG GRACE_BLACKWELL=false
ARG ARCH
ARG ARCH_ALT
ARG PYTHON_VERSION
ARG CARGO_BUILD_JOBS
ARG CUDA_VERSION
# Set all environment variables
ENV DEBIAN_FRONTEND=noninteractive \
TZ=America/Los_Angeles \
CUDA_HOME=/usr/local/cuda \
GDRCOPY_HOME=/usr/src/gdrdrv-2.4.4/ \
NVSHMEM_DIR=/sgl-workspace/nvshmem/install \
PATH="${PATH}:/usr/local/nvidia/bin" \
LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/usr/local/nvidia/lib:/usr/local/nvidia/lib64" \
LANG=en_US.UTF-8 \
LANGUAGE=en_US:en \
LC_ALL=en_US.UTF-8
# Combined: Python setup, locale, and all package installation
RUN apt-get update \
&& DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends software-properties-common \
&& add-apt-repository ppa:deadsnakes/ppa -y \
&& apt-get update \
&& DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
# Python (using other python versions as needed)
python${PYTHON_VERSION}-dev \
python${PYTHON_VERSION}-venv \
python${PYTHON_VERSION}-distutils \
python3-pip \
# Build essentials
build-essential \
cmake \
ninja-build \
ccache \
patchelf \
git \
git-lfs \
# Core system utilities
tzdata \
locales \
ca-certificates \
dkms \
kmod \
# Command line tools
wget \
curl \
jq \
unzip \
# Network utilities
netcat-openbsd \
# SSL and pkg-config
libssl-dev \
pkg-config \
# MPI and NUMA
libopenmpi-dev \
libnuma1 \
libnuma-dev \
numactl \
# InfiniBand/RDMA
libibverbs-dev \
libibverbs1 \
libibumad3 \
librdmacm1 \
libnl-3-200 \
libnl-route-3-200 \
libnl-route-3-dev \
libnl-3-dev \
ibverbs-providers \
infiniband-diags \
perftest \
# Development libraries
libgoogle-glog-dev \
libgtest-dev \
libjsoncpp-dev \
libunwind-dev \
libboost-all-dev \
libgrpc-dev \
libgrpc++-dev \
libprotobuf-dev \
protobuf-compiler \
protobuf-compiler-grpc \
pybind11-dev \
libhiredis-dev \
libcurl4-openssl-dev \
libczmq4 \
libczmq-dev \
libfabric-dev \
# Package building tools
devscripts \
debhelper \
fakeroot \
check \
libsubunit0 \
libsubunit-dev \
# Set Python alternatives
&& update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \
&& update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \
&& update-alternatives --install /usr/bin/python python /usr/bin/python${PYTHON_VERSION} 1 \
&& update-alternatives --set python /usr/bin/python${PYTHON_VERSION} \
# Set up locale
&& locale-gen en_US.UTF-8 \
# Cleanup
&& rm -rf /var/lib/apt/lists/* \
&& apt-get clean
WORKDIR /sgl-workspace
# GDRCopy installation
# Clones the ${GDRCOPY_COMMIT} ref of NVIDIA/gdrcopy (shallow), runs its Debian
# packaging script with CUDA pointing at ${CUDA_HOME}, and installs the four
# resulting .deb packages (kernel module via DKMS, libgdrapi, tests, meta package).
# Relies on the preceding WORKDIR (/sgl-workspace) as the clone location.
RUN git clone --depth 1 --branch ${GDRCOPY_COMMIT} https://github.com/NVIDIA/gdrcopy.git \
&& cd gdrcopy/packages \
&& export CUDA=${CUDA_HOME} \
&& ./build-deb-packages.sh \
&& dpkg -i gdrdrv-dkms_*.deb libgdrapi_*.deb gdrcopy-tests_*.deb gdrcopy_*.deb
# Fix DeepEP IBGDA symlink
RUN ln -sf /usr/lib/$(uname -m)-linux-gnu/libmlx5.so.1 /usr/lib/$(uname -m)-linux-gnu/libmlx5.so
# Create dynamo user with group 0 for OpenShift compatibility # Create dynamo user with group 0 for OpenShift compatibility
RUN userdel -r ubuntu > /dev/null 2>&1 || true \ RUN userdel -r ubuntu > /dev/null 2>&1 || true \
&& useradd -m -s /bin/bash -g 0 dynamo \ && useradd -m -s /bin/bash -g 0 dynamo \
&& [ `id -u dynamo` -eq 1000 ] \ && [ `id -u dynamo` -eq 1000 ] \
&& mkdir -p /workspace /home/dynamo/.cache /opt/dynamo \ && mkdir -p /home/dynamo/.cache /opt/dynamo \
# Non-recursive chown - only the directories themselves, not contents # Non-recursive chown - only the directories themselves, not contents
&& chown dynamo:0 /sgl-workspace /workspace /home/dynamo /home/dynamo/.cache /opt/dynamo \ && chown dynamo:0 /home/dynamo /home/dynamo/.cache /opt/dynamo /workspace \
# No chmod needed: umask 002 handles new files, COPY --chmod handles copied content # No chmod needed: umask 002 handles new files, COPY --chmod handles copied content
# Set umask globally for all subsequent RUN commands (must be done as root before USER dynamo) # Set umask globally for all subsequent RUN commands (must be done as root before USER dynamo)
# NOTE: Setting ENV UMASK=002 does NOT work - umask is a shell builtin, not an environment variable # NOTE: Setting ENV UMASK=002 does NOT work - umask is a shell builtin, not an environment variable
&& mkdir -p /etc/profile.d && echo 'umask 002' > /etc/profile.d/00-umask.sh && mkdir -p /etc/profile.d && echo 'umask 002' > /etc/profile.d/00-umask.sh
USER dynamo USER dynamo
ENV HOME=/home/dynamo # Copy attribution files
# This picks up the umask 002 from the /etc/profile.d/00-umask.sh file for subsequent RUN commands COPY --chmod=664 --chown=dynamo:0 ATTRIBUTION* LICENSE /workspace/
SHELL ["/bin/bash", "-l", "-o", "pipefail", "-c"]
# Install SGLang from source (requires CUDA 12.8.1 or 12.9.1; any other CUDA_VERSION
# prints an error and fails the build).
# Note that the system-wide site-packages directory is not writable here, so pip
# installs into ~/.local/lib/python<version>/site-packages instead.
# Steps, in order:
#   1. Pin pip/setuptools/wheel (plus html5lib and six) with --ignore-installed.
#   2. Shallow-clone the v${SGLANG_COMMIT} ref of sgl-project/sglang.
#   3. Map CUDA_VERSION to CUINDEX (128/129), used to pick the PyTorch wheel index.
#   4. Install sgl-kernel==${SGL_KERNEL_VERSION}, then an editable sglang with the
#      ${BUILD_TYPE} extras.
#   5. Force-reinstall nvidia-nccl-cu12==2.27.6 without touching its dependencies.
#   6. Pre-download the FlashInfer cubins.
RUN python3 -m pip install --no-cache-dir --ignore-installed pip==25.3 setuptools==80.9.0 wheel==0.45.1 html5lib==1.1 six==1.17.0 \
&& git clone --depth 1 --branch v${SGLANG_COMMIT} https://github.com/sgl-project/sglang.git \
&& cd sglang \
&& case "$CUDA_VERSION" in \
12.8.1) CUINDEX=128 ;; \
12.9.1) CUINDEX=129 ;; \
*) echo "Error: Unsupported CUDA version for sglang: $CUDA_VERSION (requires 12.8.1 or 12.9.1)" && exit 1 ;; \
esac \
&& python3 -m pip install --no-cache-dir sgl-kernel==${SGL_KERNEL_VERSION} \
&& python3 -m pip install --no-cache-dir -e "python[${BUILD_TYPE}]" --extra-index-url https://download.pytorch.org/whl/cu${CUINDEX} \
&& python3 -m pip install --no-cache-dir nvidia-nccl-cu12==2.27.6 --force-reinstall --no-deps \
&& FLASHINFER_LOGGING_LEVEL=warning python3 -m flashinfer --download-cubin
# Download and extract NVSHMEM source, clone DeepEP (use Tom's fork for GB200)
# - The NVSHMEM ${NVSHMEM_VERSION} source tarball is fetched into the curl cache mount,
#   unpacked into the current directory, and the unpacked tree is renamed to ./nvshmem.
# - GRACE_BLACKWELL=true checks out ${DEEPEP_GB_COMMIT} from fzyzcjy/DeepEP; any other
#   value checks out ${DEEPEP_COMMIT} from deepseek-ai/DeepEP.
# - The final sed runs inside the DeepEP checkout (both branches end after `cd DeepEP`)
#   and raises NUM_CPU_TIMEOUT_SECS from 100 to 1000 in csrc/kernels/configs.cuh.
# NOTE(review): the tarball is deleted from the cache mount right after extraction, so
# the cache presumably never serves a later build — confirm this is intended.
RUN --mount=type=cache,target=/var/cache/curl,uid=1000,gid=0 \
curl --retry 3 --retry-delay 2 -fsSL -o /var/cache/curl/nvshmem_src_cuda12-all-all-${NVSHMEM_VERSION}.tar.gz https://developer.download.nvidia.com/compute/redist/nvshmem/${NVSHMEM_VERSION}/source/nvshmem_src_cuda12-all-all-${NVSHMEM_VERSION}.tar.gz \
&& tar -xf /var/cache/curl/nvshmem_src_cuda12-all-all-${NVSHMEM_VERSION}.tar.gz \
&& mv nvshmem_src nvshmem \
&& rm -f /var/cache/curl/nvshmem_src_cuda12-all-all-${NVSHMEM_VERSION}.tar.gz \
&& if [ "$GRACE_BLACKWELL" = true ]; then \
git clone --depth 1 https://github.com/fzyzcjy/DeepEP.git \
&& cd DeepEP \
&& git fetch --depth 1 origin ${DEEPEP_GB_COMMIT} \
&& git checkout ${DEEPEP_GB_COMMIT}; \
else \
git clone --depth 1 https://github.com/deepseek-ai/DeepEP.git \
&& cd DeepEP \
&& git fetch --depth 1 origin ${DEEPEP_COMMIT} \
&& git checkout ${DEEPEP_COMMIT}; \
fi \
&& sed -i 's/#define NUM_CPU_TIMEOUT_SECS 100/#define NUM_CPU_TIMEOUT_SECS 1000/' csrc/kernels/configs.cuh
# Build and install NVSHMEM library only (without python library)
# - CUDA architectures: "90;100;120" when GRACE_BLACKWELL=true, otherwise "90".
# - The NVSHMEM_* assignments are prefixed to the cmake configure command, so they
#   apply to that command's environment only: NVSHMEM_IBGDA_SUPPORT and
#   NVSHMEM_USE_GDRCOPY are set to 1, every other listed option to 0.
# - Installs into ${NVSHMEM_DIR} using ${CMAKE_BUILD_PARALLEL_LEVEL} parallel jobs.
RUN cd /sgl-workspace/nvshmem && \
if [ "$GRACE_BLACKWELL" = true ]; then CUDA_ARCH="90;100;120"; else CUDA_ARCH="90"; fi && \
NVSHMEM_SHMEM_SUPPORT=0 \
NVSHMEM_UCX_SUPPORT=0 \
NVSHMEM_USE_NCCL=0 \
NVSHMEM_MPI_SUPPORT=0 \
NVSHMEM_IBGDA_SUPPORT=1 \
NVSHMEM_PMIX_SUPPORT=0 \
NVSHMEM_TIMEOUT_DEVICE_POLLING=0 \
NVSHMEM_USE_GDRCOPY=1 \
cmake -S . -B build/ -DCMAKE_INSTALL_PREFIX=${NVSHMEM_DIR} -DCMAKE_CUDA_ARCHITECTURES=${CUDA_ARCH} -DNVSHMEM_BUILD_PYTHON_LIB=OFF && \
cmake --build build --target install -j${CMAKE_BUILD_PARALLEL_LEVEL}
# Build nvshmem4py wheels separately (Python 3.10, CUDA 12) to avoid building the python library twice for multiple python versions
# Need to reconfigure with PYTHON_LIB=ON to add the nvshmem4py subdirectory
# The configure step repeats the same NVSHMEM_* environment and CUDA_ARCH selection as
# the library build; only -DNVSHMEM_BUILD_PYTHON_LIB differs (ON instead of OFF).
# The wheel target name is derived from ${PYTHON_VERSION}
# (build_nvshmem4py_wheel_cu12_<version>).
# NOTE(review): the comment above says "Python 3.10" but the target follows
# ${PYTHON_VERSION} — confirm the two stay in sync.
RUN cd /sgl-workspace/nvshmem && \
if [ "$GRACE_BLACKWELL" = true ]; then CUDA_ARCH="90;100;120"; else CUDA_ARCH="90"; fi && \
NVSHMEM_SHMEM_SUPPORT=0 \
NVSHMEM_UCX_SUPPORT=0 \
NVSHMEM_USE_NCCL=0 \
NVSHMEM_MPI_SUPPORT=0 \
NVSHMEM_IBGDA_SUPPORT=1 \
NVSHMEM_PMIX_SUPPORT=0 \
NVSHMEM_TIMEOUT_DEVICE_POLLING=0 \
NVSHMEM_USE_GDRCOPY=1 \
cmake -S . -B build/ -DCMAKE_INSTALL_PREFIX=${NVSHMEM_DIR} -DCMAKE_CUDA_ARCHITECTURES=${CUDA_ARCH} -DNVSHMEM_BUILD_PYTHON_LIB=ON && \
cmake --build build --target build_nvshmem4py_wheel_cu12_${PYTHON_VERSION} -j${CMAKE_BUILD_PARALLEL_LEVEL}
# Install DeepEP
RUN cd /sgl-workspace/DeepEP && \
NVSHMEM_DIR=${NVSHMEM_DIR} TORCH_CUDA_ARCH_LIST="9.0;10.0" pip install --no-build-isolation .
# Copy rust installation from dynamo_base to avoid duplication efforts
# Pattern: COPY --chmod=775 <path>; RUN chmod g+w <path> because COPY --chmod only affects <path>/*, not <path>
COPY --from=dynamo_base --chown=dynamo:0 --chmod=775 /usr/local/rustup /usr/local/rustup
COPY --from=dynamo_base --chown=dynamo:0 --chmod=775 /usr/local/cargo /usr/local/cargo
RUN chmod g+w /usr/local/rustup /usr/local/cargo
ENV RUSTUP_HOME=/usr/local/rustup \
CARGO_HOME=/usr/local/cargo \
CARGO_TARGET_DIR=/workspace/target \
PATH=/usr/local/cargo/bin:$PATH \
CARGO_BUILD_JOBS=${CARGO_BUILD_JOBS:-16}
# Install essential Python build tools
RUN python3 -m pip install --no-cache-dir \
mooncake-transfer-engine==0.3.6.post1 \
scikit-build-core==0.11.6 \
setuptools-rust==1.12.0
##################################################
########## Runtime Image ########################
##################################################
#
# PURPOSE: Production runtime environment
#
# This stage creates a production-ready image containing:
# - Pre-compiled SGLang, DeepEP, and NVSHMEM components
# - Dynamo runtime libraries and Python packages
# - Essential runtime dependencies and configurations
# - Optimized for inference workloads and deployment
#
# Use this stage when you need:
# - Production deployment of Dynamo with SGLang + DeepEP
# - Minimal runtime footprint without build tools
# - Ready-to-run inference server environment
#
FROM framework AS runtime
WORKDIR /workspace
ARG ARCH
ARG ARCH_ALT
ARG PYTHON_VERSION
ENV DYNAMO_HOME=/opt/dynamo
ENV NVSHMEM_DIR=/sgl-workspace/nvshmem/install
ENV NIXL_PREFIX=/opt/nvidia/nvda_nixl
ENV NIXL_LIB_DIR=${NIXL_PREFIX}/lib/${ARCH_ALT}-linux-gnu
ENV NIXL_PLUGIN_DIR=${NIXL_LIB_DIR}/plugins
ENV LD_LIBRARY_PATH=\
${NVSHMEM_DIR}/lib:\
${NIXL_LIB_DIR}:\
${NIXL_PLUGIN_DIR}:\
/usr/local/ucx/lib:\
/usr/local/ucx/lib/ucx:\
/usr/local/nvidia/lib64:\
${LD_LIBRARY_PATH}
ENV NVIDIA_DRIVER_CAPABILITIES=video,compute,utility
# Copy NATS and ETCD from dynamo_base, and UCX/NIXL from wheel_builder
COPY --from=dynamo_base /usr/bin/nats-server /usr/bin/nats-server
COPY --from=dynamo_base /usr/local/bin/etcd/ /usr/local/bin/etcd/
COPY --from=wheel_builder /usr/local/ucx /usr/local/ucx
COPY --chown=dynamo: --from=wheel_builder $NIXL_PREFIX $NIXL_PREFIX
COPY --chown=dynamo: --from=wheel_builder /opt/nvidia/nvda_nixl/lib64/. ${NIXL_LIB_DIR}/
COPY --chown=dynamo: --from=wheel_builder /opt/dynamo/dist/nixl/ /opt/dynamo/wheelhouse/nixl/
COPY --chown=dynamo: --from=wheel_builder /workspace/nixl/build/src/bindings/python/nixl-meta/nixl-*.whl /opt/dynamo/wheelhouse/nixl/
ENV PATH=/usr/local/bin/etcd/:/usr/local/cuda/nvvm/bin:${HOME}/.local/bin:$PATH
# Copy ffmpeg # Copy ffmpeg
RUN --mount=type=bind,from=wheel_builder,source=/usr/local/,target=/tmp/usr/local/ \ RUN --mount=type=bind,from=wheel_builder,source=/usr/local/,target=/tmp/usr/local/ \
...@@ -722,60 +468,56 @@ RUN --mount=type=bind,from=wheel_builder,source=/usr/local/,target=/tmp/usr/loca ...@@ -722,60 +468,56 @@ RUN --mount=type=bind,from=wheel_builder,source=/usr/local/,target=/tmp/usr/loca
cp -r /tmp/usr/local/src/ffmpeg /usr/local/src/; \ cp -r /tmp/usr/local/src/ffmpeg /usr/local/src/; \
true # in case ffmpeg not enabled true # in case ffmpeg not enabled
# Install Dynamo wheels copied from the wheel_builder stage into /opt/dynamo/wheelhouse
# Pattern: COPY --chmod=775 <path>; chmod g+w <path> done later as root because COPY --chmod only affects <path>/*, not <path> # Pattern: COPY --chmod=775 <path>; chmod g+w <path> done later as root because COPY --chmod only affects <path>/*, not <path>
COPY --chmod=775 --chown=dynamo:0 benchmarks/ /opt/dynamo/benchmarks/ COPY --chmod=775 --chown=dynamo:0 benchmarks/ /workspace/benchmarks/
COPY --chmod=775 --chown=dynamo:0 --from=wheel_builder /opt/dynamo/dist/*.whl /opt/dynamo/wheelhouse/ COPY --chmod=775 --chown=dynamo:0 --from=wheel_builder /opt/dynamo/dist/*.whl /opt/dynamo/wheelhouse/
RUN python3 -m pip install \
ENV SGLANG_VERSION="${RUNTIME_IMAGE_TAG%%-*}"
RUN --mount=type=bind,source=.,target=/mnt/local_src \
pip install --no-cache-dir --break-system-packages \
/opt/dynamo/wheelhouse/ai_dynamo_runtime*.whl \ /opt/dynamo/wheelhouse/ai_dynamo_runtime*.whl \
/opt/dynamo/wheelhouse/ai_dynamo*any.whl \ /opt/dynamo/wheelhouse/ai_dynamo*any.whl \
/opt/dynamo/wheelhouse/nixl/nixl*.whl \ sglang==${SGLANG_VERSION}
&& cd /opt/dynamo/benchmarks \
&& python3 -m pip install --no-cache . \
&& cd - \
&& rm -rf /opt/dynamo/benchmarks
# Install common and test dependencies # Install common and test dependencies
RUN --mount=type=bind,source=./container/deps/requirements.txt,target=/tmp/requirements.txt \ RUN --mount=type=bind,source=.,target=/mnt/local_src \
--mount=type=bind,source=./container/deps/requirements.test.txt,target=/tmp/requirements.test.txt \ pip install --no-cache-dir --break-system-packages \
python3 -m pip install \ --requirement /mnt/local_src/container/deps/requirements.txt \
--no-cache \ --requirement /mnt/local_src/container/deps/requirements.test.txt \
--requirement /tmp/requirements.txt \ sglang==${SGLANG_VERSION} && \
--requirement /tmp/requirements.test.txt cd /workspace/benchmarks && \
pip install --break-system-packages --no-cache . && \
## Copy attribution files and launch banner with correct ownership # pip/uv bypasses umask when creating .egg-info files, but chmod -R is fast here (small directory)
COPY --chmod=664 --chown=dynamo:0 ATTRIBUTION* LICENSE /workspace/ chmod -R g+w /workspace/benchmarks && \
# Install NVIDIA packages that are needed for DeepEP to work properly
# Copy tests, benchmarks, deploy and components for CI with correct ownership # This is done in the upstream runtime image too, but we overrode these packages earlier
pip install --no-cache-dir --break-system-packages --force-reinstall --no-deps \
nvidia-nccl-cu12==2.28.3 \
nvidia-cudnn-cu12==9.16.0.29 \
nvidia-cutlass-dsl==4.3.0
# Copy tests, deploy and components for CI with correct ownership
# Pattern: COPY --chmod=775 <path>; chmod g+w <path> done later as root because COPY --chmod only affects <path>/*, not <path> # Pattern: COPY --chmod=775 <path>; chmod g+w <path> done later as root because COPY --chmod only affects <path>/*, not <path>
COPY --chmod=775 --chown=dynamo:0 tests /workspace/tests COPY --chmod=775 --chown=dynamo:0 tests /workspace/tests
COPY --chmod=775 --chown=dynamo:0 examples /workspace/examples COPY --chmod=775 --chown=dynamo:0 examples /workspace/examples
COPY --chmod=775 --chown=dynamo:0 benchmarks /workspace/benchmarks
COPY --chmod=775 --chown=dynamo:0 deploy /workspace/deploy COPY --chmod=775 --chown=dynamo:0 deploy /workspace/deploy
COPY --chmod=775 --chown=dynamo:0 components/ /workspace/components/ COPY --chmod=775 --chown=dynamo:0 components/ /workspace/components/
COPY --chmod=775 --chown=dynamo:0 recipes/ /workspace/recipes/ COPY --chmod=775 --chown=dynamo:0 recipes/ /workspace/recipes/
# Setup launch banner in common directory accessible to all users # Enable forceful shutdown of inflight requests
RUN --mount=type=bind,source=./container/launch_message/runtime.txt,target=/opt/dynamo/launch_message.txt \ ENV SGLANG_FORCE_SHUTDOWN=1
sed '/^#\s/d' /opt/dynamo/launch_message.txt > /opt/dynamo/.launch_screen
# Setup environment for all users # Our scripting assumes /workspace is where dynamo is located
# In order to maintain the ability to have sglang and dynamo
# in the same workspace, symlink /workspace to /sgl-workspace/dynamo
USER root USER root
# Fix directory permissions: COPY --chmod only affects contents, not the directory itself RUN ln -s /workspace /sgl-workspace/dynamo
RUN chmod g+w /workspace /workspace/* /opt/dynamo /opt/dynamo/* && \
chown dynamo:0 /workspace /opt/dynamo/ && \
chmod 755 /opt/dynamo/.launch_screen && \
echo 'source /opt/dynamo/venv/bin/activate' >> /etc/bash.bashrc && \
echo 'cat /opt/dynamo/.launch_screen' >> /etc/bash.bashrc
USER dynamo USER dynamo
# Record the Dynamo commit SHA (passed in as a build arg) in the image environment
ARG DYNAMO_COMMIT_SHA ARG DYNAMO_COMMIT_SHA
ENV DYNAMO_COMMIT_SHA=$DYNAMO_COMMIT_SHA ENV DYNAMO_COMMIT_SHA=${DYNAMO_COMMIT_SHA}
ENTRYPOINT ["/opt/nvidia/nvidia_entrypoint.sh"] ENV PATH=/home/dynamo/.local/bin:$PATH
CMD []
########################################################### ###########################################################
########## Development (run.sh, runs as root user) ######## ########## Development (run.sh, runs as root user) ########
...@@ -792,98 +534,63 @@ CMD [] ...@@ -792,98 +534,63 @@ CMD []
FROM runtime AS dev FROM runtime AS dev
ARG WORKSPACE_DIR=/sgl-workspace/dynamo # Don't want ubuntu to be editable, just change uid and gid.
ARG PYTHON_VERSION ARG WORKSPACE_DIR=/workspace
# NOTE: SGLang uses system Python (not a virtualenv in framework/runtime stages) to align with
# upstream SGLang Dockerfile: https://github.com/sgl-project/sglang/blob/main/docker/Dockerfile
# For dev stage, we create a lightweight venv with --system-site-packages to satisfy maturin develop
# requirements while still accessing all system-installed packages (sglang, torch, deepep, etc.)
COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/
RUN mkdir -p /opt/dynamo/venv && \
uv venv /opt/dynamo/venv --python $PYTHON_VERSION --system-site-packages
ENV VIRTUAL_ENV=/opt/dynamo/venv \
PATH="/opt/dynamo/venv/bin:${PATH}"
USER root USER root
# venv permissions are handled by umask 002 set earlier # Install utilities as root
# Install development tools and utilities
RUN apt-get update -y && \ RUN apt-get update -y && \
apt-get install -y --no-install-recommends \ apt-get install -y --no-install-recommends \
# System monitoring and debugging tools # Install utilities
nvtop \ nvtop \
htop \
gdb \
# Network and system utilities
wget \ wget \
iproute2 \ tmux \
net-tools \ vim \
git \
openssh-client \ openssh-client \
iproute2 \
rsync \ rsync \
lsof \
# File and archive utilities
zip \ zip \
tree \ unzip \
# Development and build tools htop \
vim \ # Build Dependencies
tmux \
git \
git-lfs \
autoconf \ autoconf \
automake \ automake \
cmake \ cmake \
libtool \ libtool \
meson \ meson \
bear \ net-tools \
ccache \ pybind11-dev \
less \ # Rust build dependencies
pkg-config \
# Language and development support
clang \ clang \
libclang-dev \ libclang-dev \
# Shell and productivity tools protobuf-compiler \
zsh \ pkg-config && \
silversearcher-ag \
cloc \
locales \
# sudo for dev stage
sudo \
# NVIDIA tools dependencies
gnupg && \
echo "deb https://developer.download.nvidia.com/devtools/repos/ubuntu2004/amd64 /" | tee /etc/apt/sources.list.d/nvidia-devtools.list && \
apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/7fa2af80.pub && \
apt-get update -y && \
apt-get install -y nsight-systems-cli && \
rm -rf /var/lib/apt/lists/* rm -rf /var/lib/apt/lists/*
# Install clang-format and clangd # Set umask for group-writable files in dev stage (runs as root)
RUN curl --retry 3 --retry-delay 2 -LSso /usr/local/bin/clang-format https://github.com/muttleyxd/clang-tools-static-binaries/releases/download/master-32d3ac78/clang-format-16_linux-amd64 \ RUN mkdir -p /etc/profile.d && echo 'umask 002' > /etc/profile.d/00-umask.sh
&& chmod +x /usr/local/bin/clang-format \ SHELL ["/bin/bash", "-l", "-o", "pipefail", "-c"]
&& curl --retry 3 --retry-delay 2 -L https://github.com/clangd/clangd/releases/download/18.1.3/clangd-linux-18.1.3.zip -o clangd.zip \
&& unzip clangd.zip \ # Set workspace directory variable
&& cp -r clangd_18.1.3/bin/* /usr/local/bin/ \ ENV WORKSPACE_DIR=${WORKSPACE_DIR} \
&& cp -r clangd_18.1.3/lib/* /usr/local/lib/ \ DYNAMO_HOME=${WORKSPACE_DIR} \
&& rm -rf clangd_18.1.3 clangd.zip RUSTUP_HOME=/usr/local/rustup \
CARGO_HOME=/usr/local/cargo \
CARGO_TARGET_DIR=/workspace/target \
PATH=/usr/local/cargo/bin:$PATH
# Copy rust installation from dynamo_base to avoid duplication efforts
# Pattern: COPY --chmod=775 <path>; chmod g+w <path> because COPY --chmod only affects <path>/*, not <path>
COPY --from=dynamo_base --chmod=775 /usr/local/rustup /usr/local/rustup
COPY --from=dynamo_base --chmod=775 /usr/local/cargo /usr/local/cargo
RUN chmod g+w /usr/local/rustup /usr/local/cargo
# Install maturin, for maturin develop
# Editable install of dynamo # Editable install of dynamo
COPY --chmod=664 pyproject.toml README.md hatch_build.py /workspace/ COPY pyproject.toml README.md hatch_build.py /workspace/
RUN python3 -m pip install --no-deps -e . RUN pip install maturin[patchelf] && \
pip install --no-deps -e .
# Install Python development packages
RUN python3 -m pip install --no-cache-dir \
maturin[patchelf] \
pytest \
black \
isort \
icdiff \
scikit_build_core \
uv \
pre-commit \
pandas \
matplotlib \
tabulate
ENTRYPOINT ["/opt/nvidia/nvidia_entrypoint.sh"] ENTRYPOINT ["/opt/nvidia/nvidia_entrypoint.sh"]
CMD [] CMD []
\ No newline at end of file
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
ARG SGLANG_IMAGE_TAG="v0.5.3.post2"
ARG BRANCH_TYPE
ARG CARGO_BUILD_JOBS
FROM scratch AS local_src
COPY . /src
FROM lmsysorg/sglang:${SGLANG_IMAGE_TAG}
WORKDIR /sgl-workspace
ARG DYNAMO_COMMIT_SHA
ENV DYNAMO_COMMIT_SHA=$DYNAMO_COMMIT_SHA
# Install jq for JSON processing
RUN apt-get update -y \
&& DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
jq \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/*
# Install dynamo
# Providing --build-arg BRANCH_TYPE=local will editable install the local dynamo repo
# Providing --build-arg BRANCH_TYPE=remote will editable install the remote dynamo repo
# Default is to install the latest published dynamo version
ARG BRANCH_TYPE
ARG CARGO_BUILD_JOBS
COPY --from=local_src /src /tmp/local_src
# Place the dynamo sources at /sgl-workspace/dynamo:
#   BRANCH_TYPE=local  -> copy the build context staged in /tmp/local_src
#   BRANCH_TYPE=remote -> clone the upstream repository
#   anything else      -> nothing to do (the published package is installed later)
RUN case "$BRANCH_TYPE" in \
      local) \
        cp -r /tmp/local_src /sgl-workspace/dynamo ;; \
      remote) \
        git clone https://github.com/ai-dynamo/dynamo.git /sgl-workspace/dynamo ;; \
    esac
ENV CARGO_BUILD_JOBS=${CARGO_BUILD_JOBS:-16}
# The upstream SGLang image installs into the system Python (no venv), which is why
# every pip call below passes --break-system-packages.
#
# BRANCH_TYPE=local|remote: build the Python bindings wheel with maturin from the
# checkout under ./dynamo (relative to the current WORKDIR), install it, then
# editable-install dynamo together with its requirements file. The two modes differ
# only in where the requirements file is read from.
# Any other BRANCH_TYPE installs the published ai-dynamo package.
# The staged build context in /tmp/local_src is removed afterwards in every case.
RUN case "$BRANCH_TYPE" in \
      local)  DYNAMO_REQS=/tmp/local_src/container/deps/requirements.txt ;; \
      remote) DYNAMO_REQS=/sgl-workspace/dynamo/container/deps/requirements.txt ;; \
      *)      DYNAMO_REQS= ;; \
    esac; \
    if [ -n "$DYNAMO_REQS" ]; then \
      cd dynamo/lib/bindings/python && \
      pip install --break-system-packages maturin && \
      maturin build --release && \
      pip install --break-system-packages target/wheels/*.whl && \
      cd /sgl-workspace/dynamo && \
      pip install --break-system-packages -e . && \
      pip install --break-system-packages --requirement "$DYNAMO_REQS" ; \
    else \
      pip install --break-system-packages ai-dynamo ; \
    fi \
    && rm -rf /tmp/local_src
# Install NATS and ETCD
# NATS: translate the kernel machine name into the Debian architecture suffix used by
# the release assets, download the v2.10.28 .deb into the current directory, install
# it, and delete the downloaded package.
RUN case "$(uname -m)" in \
      aarch64) ARCH=arm64 ;; \
      x86_64)  ARCH=amd64 ;; \
      *) echo "Unsupported architecture: $(uname -m)" && exit 1 ;; \
    esac && \
    NATS_DEB="nats-server-v2.10.28-${ARCH}.deb" && \
    wget --tries=3 --waitretry=5 \
      "https://github.com/nats-io/nats-server/releases/download/v2.10.28/${NATS_DEB}" && \
    dpkg -i "${NATS_DEB}" && \
    rm "${NATS_DEB}"
ENV ETCD_VERSION="v3.5.21"
# etcd: pick the release asset matching the machine architecture, download it to
# /tmp, unpack it (dropping the top-level directory) into /usr/local/bin/etcd, and
# remove the tarball.
RUN case "$(uname -m)" in \
      aarch64) ARCH=arm64 ;; \
      x86_64)  ARCH=amd64 ;; \
      *) echo "Unsupported architecture: $(uname -m)" && exit 1 ;; \
    esac && \
    ETCD_URL="https://github.com/etcd-io/etcd/releases/download/${ETCD_VERSION}/etcd-${ETCD_VERSION}-linux-${ARCH}.tar.gz" && \
    wget --tries=3 --waitretry=5 "${ETCD_URL}" -O /tmp/etcd.tar.gz && \
    mkdir -p /usr/local/bin/etcd && \
    tar -xzf /tmp/etcd.tar.gz -C /usr/local/bin/etcd --strip-components=1 && \
    rm /tmp/etcd.tar.gz
ENV PATH=/usr/local/bin/etcd:$PATH
# Enable forceful shutdown of inflight requests
ENV SGL_FORCE_SHUTDOWN=1
WORKDIR /sgl-workspace/dynamo/examples/backends/sglang
...@@ -115,13 +115,13 @@ VLLM_RUNTIME_IMAGE_TAG_CU13="13.0.2-runtime-ubuntu24.04" ...@@ -115,13 +115,13 @@ VLLM_RUNTIME_IMAGE_TAG_CU13="13.0.2-runtime-ubuntu24.04"
NONE_BASE_IMAGE="nvcr.io/nvidia/cuda-dl-base" NONE_BASE_IMAGE="nvcr.io/nvidia/cuda-dl-base"
NONE_BASE_IMAGE_TAG="25.01-cuda12.8-devel-ubuntu24.04" NONE_BASE_IMAGE_TAG="25.01-cuda12.8-devel-ubuntu24.04"
SGLANG_CUDA_VERSION="12.9.1"
# This is for Dockerfile
SGLANG_BASE_IMAGE="nvcr.io/nvidia/cuda-dl-base" SGLANG_BASE_IMAGE="nvcr.io/nvidia/cuda-dl-base"
SGLANG_BASE_IMAGE_TAG="25.01-cuda12.8-devel-ubuntu24.04" SGLANG_BASE_IMAGE_TAG="25.06-cuda12.9-devel-ubuntu24.04"
# This is for Dockerfile.sglang. Unlike the other frameworks, it is using a different base image SGLANG_CUDA_VERSION="12.9.1"
SGLANG_FRAMEWORK_IMAGE="nvcr.io/nvidia/cuda" SGLANG_PYTHON_VERSION="3.10"
SGLANG_FRAMEWORK_IMAGE_TAG="${SGLANG_CUDA_VERSION}-cudnn-devel-ubuntu24.04"
PYTHON_VERSION="3.12"
NIXL_REF=0.8.0 NIXL_REF=0.8.0
NIXL_UCX_REF=v1.20.0-rc1 NIXL_UCX_REF=v1.20.0-rc1
...@@ -909,13 +909,10 @@ fi ...@@ -909,13 +909,10 @@ fi
if [[ $FRAMEWORK == "SGLANG" ]]; then if [[ $FRAMEWORK == "SGLANG" ]]; then
echo "Customizing Python, CUDA, and framework images for sglang images" echo "Customizing Python, CUDA, and framework images for sglang images"
BUILD_ARGS+=" --build-arg PYTHON_VERSION=3.10" BUILD_ARGS+=" --build-arg PYTHON_VERSION=${SGLANG_PYTHON_VERSION}"
BUILD_ARGS+=" --build-arg CUDA_VERSION=${SGLANG_CUDA_VERSION}" BUILD_ARGS+=" --build-arg CUDA_VERSION=${SGLANG_CUDA_VERSION}"
# Unlike the other two frameworks, SGLang's framework image is different from the base image, so we need to set it explicitly.
BUILD_ARGS+=" --build-arg FRAMEWORK_IMAGE=${SGLANG_FRAMEWORK_IMAGE}"
BUILD_ARGS+=" --build-arg FRAMEWORK_IMAGE_TAG=${SGLANG_FRAMEWORK_IMAGE_TAG}"
else else
BUILD_ARGS+=" --build-arg PYTHON_VERSION=3.12" BUILD_ARGS+=" --build-arg PYTHON_VERSION=${PYTHON_VERSION}"
fi fi
# Add sccache build arguments # Add sccache build arguments
if [ "$USE_SCCACHE" = true ]; then if [ "$USE_SCCACHE" = true ]; then
......
...@@ -19,7 +19,7 @@ av==15.0.0 ...@@ -19,7 +19,7 @@ av==15.0.0
fastapi==0.120.1 fastapi==0.120.1
ftfy==6.3.1 ftfy==6.3.1
genai-perf==0.0.15 genai-perf==0.0.15
grpcio-tools<=1.66.0 # May have platform-specific builds grpcio-tools<=1.76.0 # May have platform-specific builds
httpx==0.28.1 httpx==0.28.1
kr8s==0.20.13 kr8s==0.20.13
kubernetes==32.0.1 kubernetes==32.0.1
...@@ -38,7 +38,7 @@ pre-commit==4.5.0 ...@@ -38,7 +38,7 @@ pre-commit==4.5.0
prometheus-api-client==0.6.0 prometheus-api-client==0.6.0
prometheus_client==0.23.1 prometheus_client==0.23.1
prophet==1.2.1 prophet==1.2.1
protobuf==5.29.5 protobuf>=5.29.5,<7.0.0
pydantic>=2.11.4,<2.13 # vllm==0.12.0 depends on pydantic>=2.12.0 pydantic>=2.11.4,<2.13 # vllm==0.12.0 depends on pydantic>=2.12.0
pyright==1.1.407 pyright==1.1.407
PyYAML==6.0.3 PyYAML==6.0.3
...@@ -48,12 +48,12 @@ sentencepiece==0.2.1 ...@@ -48,12 +48,12 @@ sentencepiece==0.2.1
# Required by kr8s # Required by kr8s
# https://github.com/kr8s-org/kr8s/blob/750022c3ebbb7988cddb5a979aca2ee8074a1069/examples/kubectl-ng/uv.lock#L988 # https://github.com/kr8s-org/kr8s/blob/750022c3ebbb7988cddb5a979aca2ee8074a1069/examples/kubectl-ng/uv.lock#L988
sniffio==1.3.1 sniffio==1.3.1
tensorboard==2.19.0 tensorboard>=2.19.0,<2.21.0
tensorboardX==2.6.2.2 tensorboardX==2.6.2.2
# Transformers version constraint for container builds # Transformers version constraint for container builds
# - vLLM 0.11.0: >=4.55.2, vLLM 0.11.2: >=4.56.0,<5 # - vLLM 0.11.0: >=4.55.2, vLLM 0.11.2: >=4.56.0,<5
# - TensorRT-LLM 1.2.0rc5: ==4.56.0 # - TensorRT-LLM 1.2.0rc5: ==4.56.0
# - SGLang 0.5.6: ==4.57.1 # - SGLang 0.5.6.post2: ==4.57.1
# Using >=4.56.0 and <=4.57.1 to satisfy all frameworks # Using >=4.56.0 and <=4.57.1 to satisfy all frameworks
transformers>=4.56.0,<=4.57.1 transformers>=4.56.0,<=4.57.1
types-aiofiles==25.1.0.20251011 types-aiofiles==25.1.0.20251011
......
...@@ -62,7 +62,7 @@ The following table shows the dependency versions included with each Dynamo rele ...@@ -62,7 +62,7 @@ The following table shows the dependency versions included with each Dynamo rele
| **Dependency** | **main (ToT)** | **v0.8.0 (unreleased)** | **v0.7.1** | **v0.7.0.post1** | **v0.7.0** | | **Dependency** | **main (ToT)** | **v0.8.0 (unreleased)** | **v0.7.1** | **v0.7.0.post1** | **v0.7.0** |
| :------------- | :------------- | :---------------------- | :--------- | :--------------- | :--------- | | :------------- | :------------- | :---------------------- | :--------- | :--------------- | :--------- |
| SGLang | 0.5.6 | 0.5.6.post1 | 0.5.3.post4| 0.5.3.post4 | 0.5.3.post4| | SGLang | 0.5.6.post2 | 0.5.6.post2 | 0.5.3.post4| 0.5.3.post4 | 0.5.3.post4|
| TensorRT-LLM | 1.2.0rc5 | 1.2.0rc6 | 1.2.0rc3 | 1.2.0rc3 | 1.2.0rc2 | | TensorRT-LLM | 1.2.0rc5 | 1.2.0rc6 | 1.2.0rc3 | 1.2.0rc3 | 1.2.0rc2 |
| vLLM | 0.12.0 | 0.12.0 | 0.11.0 | 0.11.0 | 0.11.0 | | vLLM | 0.12.0 | 0.12.0 | 0.11.0 | 0.11.0 | 0.11.0 |
| NIXL | 0.8.0 | 0.8.0 | 0.8.0 | 0.8.0 | 0.8.0 | | NIXL | 0.8.0 | 0.8.0 | 0.8.0 | 0.8.0 | 0.8.0 |
......
...@@ -61,8 +61,8 @@ vllm = [ ...@@ -61,8 +61,8 @@ vllm = [
sglang = [ sglang = [
"uvloop", "uvloop",
"sglang==0.5.6.post2",
"nixl[cu12]<=0.8.0", "nixl[cu12]<=0.8.0",
"sglang==0.5.6",
] ]
[project.entry-points.pytest11] [project.entry-points.pytest11]
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment