build(sglang): slim runtime image (#7850)

2075eb67 · Alec · GitHub · 5bd30719 · 2075eb67 · 2075eb67
Unverified commit 2075eb67, authored Apr 03, 2026 by Alec; committed by GitHub Apr 03, 2026
4 changed files
--- a/components/src/dynamo/common/multimodal/embedding_transfer.py
+++ b/components/src/dynamo/common/multimodal/embedding_transfer.py
@@ -13,7 +13,7 @@ from abc import ABC, abstractmethod
 from queue import Queue
 from typing import Any, Awaitable, List, Optional
-import msgpack
+import msgspec
 import torch
 from nixl._api import nixl_agent, nixl_agent_config
 from pydantic import BaseModel
@@ -522,7 +522,7 @@ class NixlWriteEmbeddingSender(AbstractEmbeddingSender):
                    (target_buffer, target_byte_size, target_device_id, target_mem_str),
                    write_done_id,
                    remote_agent_metadata,
-                ) = msgpack.unpackb(notif)
+                ) = msgspec.msgpack.decode(notif)
                write_requests.append(
                    (
                        # receiver contact
@@ -703,7 +703,7 @@ class NixlWriteEmbeddingReceiver(AbstractEmbeddingReceiver):
        # Request for transfer
        tensor_id = self.id_counter.get_next_id()
-        notif_msg = msgpack.packb(
+        notif_msg = msgspec.msgpack.encode(
            (
                nixl_request.tensor_id,
                (

--- a/container/context.yaml
+++ b/container/context.yaml
@@ -66,6 +66,7 @@ vllm:
  flashinf_ref: v0.6.6
  lmcache_ref: 0.4.2
  vllm_omni_ref: "v0.18.0"
+  nixl_ref: 0.10.1
  max_jobs: "10"
  enable_media_ffmpeg: "false"
  enable_gpu_memory_service: "true"
@@ -84,6 +85,7 @@ sglang:
    runtime_image: lmsysorg/sglang
    base_image_tag: 25.11-cuda13.0-devel-ubuntu24.04
    runtime_image_tag: v0.5.9-cu130-runtime
+  nixl_ref: 0.10.0
  enable_media_ffmpeg: "false"
  enable_gpu_memory_service: "true"
  enable_kvbm: "false"
@@ -94,6 +96,7 @@ trtllm:
    runtime_image: nvcr.io/nvidia/cuda-dl-base
    base_image_tag: 25.12-py3
    runtime_image_tag: 25.12-cuda13.1-runtime-ubuntu24.04
+  nixl_ref: 0.10.1
  enable_media_ffmpeg: "false"
  enable_gpu_memory_service: "false"
  enable_kvbm: "true"

--- a/container/templates/args.Dockerfile
+++ b/container/templates/args.Dockerfile
@@ -66,7 +66,7 @@ ARG SCCACHE_REGION=""
 # NIXL configuration
 ARG NIXL_UCX_REF={{ context.dynamo.nixl_ucx_ref }}
-ARG NIXL_REF={{ context.dynamo.nixl_ref }}
+ARG NIXL_REF={{ context[framework].nixl_ref }}
 {% if device == "cuda" %}
 ARG NIXL_GDRCOPY_REF={{ context.dynamo.nixl_gdrcopy_ref }}
 ARG NIXL_LIBFABRIC_REF={{ context.dynamo.nixl_libfabric_ref }}

--- a/container/templates/sglang_runtime.Dockerfile
+++ b/container/templates/sglang_runtime.Dockerfile
@@ -9,16 +9,6 @@
 FROM ${RUNTIME_IMAGE}:${RUNTIME_IMAGE_TAG} AS runtime
-# NOTE: Unlike vLLM/TRTLLM, the SGLang upstream runtime image already ships with the full CUDA
-# toolkit (nvcc, nvlink, ptxas, etc.), so no selective COPY of CUDA binaries is needed here.
-# cleanup unnecessary libs (python3-blinker conflicts with pip-installed blinker from Flask/dash)
-RUN apt remove -y python3-apt python3-blinker && \
-    pip uninstall -y termplotlib
-# This ARG is still utilized for SGLANG Version extraction
-ARG RUNTIME_IMAGE_TAG
-ARG TARGETARCH
 WORKDIR /workspace
 # Install NATS and ETCD
@@ -39,18 +29,6 @@ RUN userdel -r ubuntu > /dev/null 2>&1 || true \
    # NOTE: Setting ENV UMASK=002 does NOT work - umask is a shell builtin, not an environment variable
    && mkdir -p /etc/profile.d && echo 'umask 002' > /etc/profile.d/00-umask.sh
-# Cache apt downloads; sharing=locked avoids apt/dpkg races with concurrent builds.
-RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
-    apt-get update && \
-    DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
-        # required for verification of GPG keys
-        gnupg2 \
-    && apt-get clean \
-    && rm -rf /var/lib/apt/lists/*
-# Copy attribution files
-COPY --chmod=664 --chown=dynamo:0 ATTRIBUTION* LICENSE /workspace/
 {% if context.sglang.enable_media_ffmpeg == "true" %}
 # Copy ffmpeg
 RUN --mount=type=bind,from=wheel_builder,source=/usr/local/,target=/tmp/usr/local/ \
@@ -61,49 +39,16 @@ RUN --mount=type=bind,from=wheel_builder,source=/usr/local/,target=/tmp/usr/loca
    cp -r /tmp/usr/local/src/ffmpeg /usr/local/src/
 {% endif %}
-# Copy wheels first (separate from benchmarks to avoid unnecessary cache invalidation)
+{% if target not in ("dev", "local-dev") %}
+# Runtime target installs the prebuilt Dynamo wheels. Dev/local-dev build from
+# source later in the shared dev stage after the workspace is bind-mounted.
 COPY --chmod=775 --chown=dynamo:0 --from=wheel_builder /opt/dynamo/dist/*.whl /opt/dynamo/wheelhouse/
-COPY --chmod=775 --chown=dynamo:0 --from=wheel_builder /opt/dynamo/dist/nixl/ /opt/dynamo/wheelhouse/nixl/
-COPY --chmod=775 --chown=dynamo:0 --from=wheel_builder /workspace/nixl/build/src/bindings/python/nixl-meta/nixl-*.whl /opt/dynamo/wheelhouse/nixl/
-# NIXL environment and native libraries
-ENV NIXL_PREFIX=/opt/nvidia/nvda_nixl
-ENV NIXL_LIB_DIR=$NIXL_PREFIX/lib64
-ENV NIXL_PLUGIN_DIR=$NIXL_LIB_DIR/plugins
-# Copy UCX and NIXL native libraries to system directories
-COPY --from=wheel_builder /usr/local/ucx /usr/local/ucx
-COPY --chown=dynamo:0 --from=wheel_builder $NIXL_PREFIX $NIXL_PREFIX
-ENV PATH=/usr/local/ucx/bin:$PATH
-ENV LD_LIBRARY_PATH=\
-$NIXL_LIB_DIR:\
-$NIXL_PLUGIN_DIR:\
-/usr/local/ucx/lib:\
-/usr/local/ucx/lib/ucx:\
-$LD_LIBRARY_PATH
-ENV SGLANG_VERSION="${RUNTIME_IMAGE_TAG%%-*}"
-{% if target not in ("dev", "local-dev") %}
-# Install packages as root to ensure they go to system location (/usr/local/lib/python3.12/dist-packages)
 RUN --mount=type=cache,target=/root/.cache/pip,sharing=locked \
    export PIP_CACHE_DIR=/root/.cache/pip && \
-    pip install --break-system-packages \
+    pip install --break-system-packages --no-deps \
        /opt/dynamo/wheelhouse/ai_dynamo_runtime*.whl \
-        /opt/dynamo/wheelhouse/ai_dynamo*any.whl \
+        /opt/dynamo/wheelhouse/ai_dynamo*any.whl
-        /opt/dynamo/wheelhouse/nixl/nixl*.whl \
-        sglang==${SGLANG_VERSION}
-{% else %}
-# Dev/local-dev: skip dynamo wheel install (users build from source via cargo build + maturin develop).
-# Install NIXL wheel (pre-built C++ binary, not buildable from source) and sglang.
-RUN --mount=type=cache,target=/root/.cache/pip,sharing=locked \
-    export PIP_CACHE_DIR=/root/.cache/pip && \
-    pip install --break-system-packages \
-        /opt/dynamo/wheelhouse/nixl/nixl*.whl \
-        sglang==${SGLANG_VERSION}
-{% endif %}
 # Install gpu_memory_service wheel if enabled (skipped for dev/local-dev targets, which do not get the prebuilt wheelhouse)
 ARG ENABLE_GPU_MEMORY_SERVICE
@@ -113,60 +58,9 @@ RUN --mount=type=cache,target=/root/.cache/pip,sharing=locked \
        GMS_WHEEL=$(ls /opt/dynamo/wheelhouse/gpu_memory_service*.whl 2>/dev/null | head -1); \
        if [ -n "$GMS_WHEEL" ]; then pip install --no-cache-dir --break-system-packages "$GMS_WHEEL"; fi; \
    fi
-{% if target not in ("dev", "local-dev") %}
-# Copy benchmarks after wheel install so benchmarks changes don't invalidate the layer above
-# Pattern: COPY --chmod=775 <path>; chmod g+w <path> done later as root because COPY --chmod only affects <path>/*, not <path>
-COPY --chmod=775 --chown=dynamo:0 benchmarks/ /workspace/benchmarks/
-{% endif %}
-# Install runtime dependencies (common + benchmarks) as root.
-# Test and dev dependencies are NOT installed here — they go in the test and dev images.
-RUN --mount=type=bind,source=container/deps/requirements.common.txt,target=/tmp/deps/requirements.common.txt \
-    --mount=type=bind,source=container/deps/requirements.benchmark.txt,target=/tmp/deps/requirements.benchmark.txt \
-    --mount=type=cache,target=/root/.cache/pip,sharing=locked \
-    export PIP_CACHE_DIR=/root/.cache/pip && \
-    pip install --break-system-packages \
-        --requirement /tmp/deps/requirements.common.txt \
-        --requirement /tmp/deps/requirements.benchmark.txt \
-        sglang==${SGLANG_VERSION} && \
-    #TODO: Temporary change until upstream sglang runtime image is updated
-    pip install --break-system-packages "urllib3>=2.6.3"
-{% if target not in ("dev", "local-dev") %}
-# Install benchmarks and fix permissions (dev/local-dev install from bind-mounted source if needed)
-RUN --mount=type=cache,target=/root/.cache/pip,sharing=locked \
-    export PIP_CACHE_DIR=/root/.cache/pip && \
-    cd /workspace/benchmarks && \
-    pip install --break-system-packages . && \
-    chmod -R g+w /workspace/benchmarks
 {% endif %}
-# Force-reinstall NVIDIA packages in a separate layer so requirements changes don't trigger re-download
+# Copy tests, examples, deploy, components, and recipes for CI with correct ownership
-RUN --mount=type=cache,target=/root/.cache/pip,sharing=locked \
-    export PIP_CACHE_DIR=/root/.cache/pip && \
-    CUDA_MAJOR=$(nvcc --version | egrep -o 'cuda_[0-9]+' | cut -d_ -f2) && \
-    if [ "$CUDA_MAJOR" = "12" ]; then \
-        # Install NVIDIA packages that are needed for DeepEP to work properly
-        # This is done in the upstream runtime image too, but these packages are overridden in earlier commands
-        pip install --break-system-packages --force-reinstall --no-deps \
-            nvidia-nccl-cu12==2.28.3 \
-            nvidia-cudnn-cu12==9.16.0.29 \
-            nvidia-cutlass-dsl==4.3.5; \
-    elif [ "$CUDA_MAJOR" = "13" ]; then \
-        # CUDA 13: Install CuDNN for PyTorch 2.9.1 compatibility
-        pip install --break-system-packages --force-reinstall --no-deps \
-            nvidia-nccl-cu13==2.28.3 \
-            nvidia-cublas==13.1.0.3 \
-            nvidia-cutlass-dsl==4.3.1 \
-            nvidia-cudnn-cu13==9.16.0.29; \
-    fi
-# Switch back to dynamo user after package installations
-USER dynamo
-# Copy tests, deploy, and the sglang/common/mocker component subtrees for CI.
-# Pattern: COPY --chmod=775 <path>; chmod g+w <path> done later as root because COPY --chmod only affects <path>/*, not <path>
 COPY --chmod=775 --chown=dynamo:0 tests /workspace/tests
 COPY --chmod=775 --chown=dynamo:0 examples /workspace/examples
 COPY --chmod=775 --chown=dynamo:0 deploy /workspace/deploy
@@ -174,6 +68,7 @@ COPY --chmod=775 --chown=dynamo:0 components/src/dynamo/common /workspace/compon
 COPY --chmod=775 --chown=dynamo:0 components/src/dynamo/sglang /workspace/components/src/dynamo/sglang
 COPY --chmod=775 --chown=dynamo:0 components/src/dynamo/mocker /workspace/components/src/dynamo/mocker
 COPY --chmod=775 --chown=dynamo:0 recipes/ /workspace/recipes/
+COPY --chmod=664 --chown=dynamo:0 ATTRIBUTION* LICENSE /workspace/
 # Enable forceful shutdown of inflight requests
 ENV SGLANG_FORCE_SHUTDOWN=1
@@ -182,12 +77,6 @@ ENV SGLANG_FORCE_SHUTDOWN=1
 RUN --mount=type=bind,source=./container/launch_message/runtime.txt,target=/opt/dynamo/launch_message.txt \
    sed '/^#\s/d' /opt/dynamo/launch_message.txt > /opt/dynamo/.launch_screen
-# Our scripting assumes /workspace is where dynamo is located
-# In order to maintain the ability to have sglang and dynamo
-# in the same workspace, symlink /workspace to /sgl-workspace/dynamo
-USER root
-# Fix directory permissions: COPY --chmod only affects contents, not the directory itself
 RUN chmod 755 /opt/dynamo/.launch_screen && \
    echo 'cat /opt/dynamo/.launch_screen' >> /etc/bash.bashrc && \
    ln -s /workspace /sgl-workspace/dynamo