Unverified commit 2075eb67, authored by Alec, committed by GitHub
Browse files

build(sglang): slim runtime image (#7850)

parent 5bd30719
......@@ -13,7 +13,7 @@ from abc import ABC, abstractmethod
from queue import Queue
from typing import Any, Awaitable, List, Optional
import msgpack
import msgspec
import torch
from nixl._api import nixl_agent, nixl_agent_config
from pydantic import BaseModel
......@@ -522,7 +522,7 @@ class NixlWriteEmbeddingSender(AbstractEmbeddingSender):
(target_buffer, target_byte_size, target_device_id, target_mem_str),
write_done_id,
remote_agent_metadata,
) = msgpack.unpackb(notif)
) = msgspec.msgpack.decode(notif)
write_requests.append(
(
# receiver contact
......@@ -703,7 +703,7 @@ class NixlWriteEmbeddingReceiver(AbstractEmbeddingReceiver):
# Request for transfer
tensor_id = self.id_counter.get_next_id()
notif_msg = msgpack.packb(
notif_msg = msgspec.msgpack.encode(
(
nixl_request.tensor_id,
(
......
......@@ -66,6 +66,7 @@ vllm:
flashinf_ref: v0.6.6
lmcache_ref: 0.4.2
vllm_omni_ref: "v0.18.0"
nixl_ref: 0.10.1
max_jobs: "10"
enable_media_ffmpeg: "false"
enable_gpu_memory_service: "true"
......@@ -84,6 +85,7 @@ sglang:
runtime_image: lmsysorg/sglang
base_image_tag: 25.11-cuda13.0-devel-ubuntu24.04
runtime_image_tag: v0.5.9-cu130-runtime
nixl_ref: 0.10.0
enable_media_ffmpeg: "false"
enable_gpu_memory_service: "true"
enable_kvbm: "false"
......@@ -94,6 +96,7 @@ trtllm:
runtime_image: nvcr.io/nvidia/cuda-dl-base
base_image_tag: 25.12-py3
runtime_image_tag: 25.12-cuda13.1-runtime-ubuntu24.04
nixl_ref: 0.10.1
enable_media_ffmpeg: "false"
enable_gpu_memory_service: "false"
enable_kvbm: "true"
......
......@@ -66,7 +66,7 @@ ARG SCCACHE_REGION=""
# NIXL configuration
ARG NIXL_UCX_REF={{ context.dynamo.nixl_ucx_ref }}
ARG NIXL_REF={{ context.dynamo.nixl_ref }}
ARG NIXL_REF={{ context[framework].nixl_ref }}
{% if device == "cuda" %}
ARG NIXL_GDRCOPY_REF={{ context.dynamo.nixl_gdrcopy_ref }}
ARG NIXL_LIBFABRIC_REF={{ context.dynamo.nixl_libfabric_ref }}
......
......@@ -9,16 +9,6 @@
FROM ${RUNTIME_IMAGE}:${RUNTIME_IMAGE_TAG} AS runtime
# NOTE: Unlike vLLM/TRTLLM, the SGLang upstream runtime image already ships with the full CUDA
# toolkit (nvcc, nvlink, ptxas, etc.), so no selective COPY of CUDA binaries is needed here.
# Clean up unnecessary libs (python3-blinker conflicts with pip-installed blinker from Flask/dash)
RUN apt remove -y python3-apt python3-blinker && \
pip uninstall -y termplotlib
# This ARG is still used to extract the SGLang version
ARG RUNTIME_IMAGE_TAG
ARG TARGETARCH
WORKDIR /workspace
# Install NATS and ETCD
......@@ -39,18 +29,6 @@ RUN userdel -r ubuntu > /dev/null 2>&1 || true \
# NOTE: Setting ENV UMASK=002 does NOT work - umask is a shell builtin, not an environment variable
&& mkdir -p /etc/profile.d && echo 'umask 002' > /etc/profile.d/00-umask.sh
# Cache apt downloads; sharing=locked avoids apt/dpkg races with concurrent builds.
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
apt-get update && \
DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
# required for verification of GPG keys
gnupg2 \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/*
# Copy attribution files
COPY --chmod=664 --chown=dynamo:0 ATTRIBUTION* LICENSE /workspace/
{% if context.sglang.enable_media_ffmpeg == "true" %}
# Copy ffmpeg
RUN --mount=type=bind,from=wheel_builder,source=/usr/local/,target=/tmp/usr/local/ \
......@@ -61,49 +39,16 @@ RUN --mount=type=bind,from=wheel_builder,source=/usr/local/,target=/tmp/usr/loca
cp -r /tmp/usr/local/src/ffmpeg /usr/local/src/
{% endif %}
# Copy wheels first (separate from benchmarks to avoid unnecessary cache invalidation)
{% if target not in ("dev", "local-dev") %}
# Runtime target installs the prebuilt Dynamo wheels. Dev/local-dev build from
# source later in the shared dev stage after the workspace is bind-mounted.
COPY --chmod=775 --chown=dynamo:0 --from=wheel_builder /opt/dynamo/dist/*.whl /opt/dynamo/wheelhouse/
COPY --chmod=775 --chown=dynamo:0 --from=wheel_builder /opt/dynamo/dist/nixl/ /opt/dynamo/wheelhouse/nixl/
COPY --chmod=775 --chown=dynamo:0 --from=wheel_builder /workspace/nixl/build/src/bindings/python/nixl-meta/nixl-*.whl /opt/dynamo/wheelhouse/nixl/
# NIXL environment and native libraries
ENV NIXL_PREFIX=/opt/nvidia/nvda_nixl
ENV NIXL_LIB_DIR=$NIXL_PREFIX/lib64
ENV NIXL_PLUGIN_DIR=$NIXL_LIB_DIR/plugins
# Copy UCX and NIXL native libraries to system directories
COPY --from=wheel_builder /usr/local/ucx /usr/local/ucx
COPY --chown=dynamo:0 --from=wheel_builder $NIXL_PREFIX $NIXL_PREFIX
ENV PATH=/usr/local/ucx/bin:$PATH
ENV LD_LIBRARY_PATH=\
$NIXL_LIB_DIR:\
$NIXL_PLUGIN_DIR:\
/usr/local/ucx/lib:\
/usr/local/ucx/lib/ucx:\
$LD_LIBRARY_PATH
ENV SGLANG_VERSION="${RUNTIME_IMAGE_TAG%%-*}"
{% if target not in ("dev", "local-dev") %}
# Install packages as root to ensure they go to system location (/usr/local/lib/python3.12/dist-packages)
RUN --mount=type=cache,target=/root/.cache/pip,sharing=locked \
export PIP_CACHE_DIR=/root/.cache/pip && \
pip install --break-system-packages \
pip install --break-system-packages --no-deps \
/opt/dynamo/wheelhouse/ai_dynamo_runtime*.whl \
/opt/dynamo/wheelhouse/ai_dynamo*any.whl \
/opt/dynamo/wheelhouse/nixl/nixl*.whl \
sglang==${SGLANG_VERSION}
{% else %}
# Dev/local-dev: skip dynamo wheel install (users build from source via cargo build + maturin develop).
# Install NIXL wheel (pre-built C++ binary, not buildable from source) and sglang.
RUN --mount=type=cache,target=/root/.cache/pip,sharing=locked \
export PIP_CACHE_DIR=/root/.cache/pip && \
pip install --break-system-packages \
/opt/dynamo/wheelhouse/nixl/nixl*.whl \
sglang==${SGLANG_VERSION}
{% endif %}
/opt/dynamo/wheelhouse/ai_dynamo*any.whl
# Install gpu_memory_service wheel if enabled (all targets)
ARG ENABLE_GPU_MEMORY_SERVICE
......@@ -113,60 +58,9 @@ RUN --mount=type=cache,target=/root/.cache/pip,sharing=locked \
GMS_WHEEL=$(ls /opt/dynamo/wheelhouse/gpu_memory_service*.whl 2>/dev/null | head -1); \
if [ -n "$GMS_WHEEL" ]; then pip install --no-cache-dir --break-system-packages "$GMS_WHEEL"; fi; \
fi
{% if target not in ("dev", "local-dev") %}
# Copy benchmarks after wheel install so benchmarks changes don't invalidate the layer above
# Pattern: COPY --chmod=775 <path>; chmod g+w <path> done later as root because COPY --chmod only affects <path>/*, not <path>
COPY --chmod=775 --chown=dynamo:0 benchmarks/ /workspace/benchmarks/
{% endif %}
# Install runtime dependencies (common + benchmarks) as root.
# Test and dev dependencies are NOT installed here — they go in the test and dev images.
RUN --mount=type=bind,source=container/deps/requirements.common.txt,target=/tmp/deps/requirements.common.txt \
--mount=type=bind,source=container/deps/requirements.benchmark.txt,target=/tmp/deps/requirements.benchmark.txt \
--mount=type=cache,target=/root/.cache/pip,sharing=locked \
export PIP_CACHE_DIR=/root/.cache/pip && \
pip install --break-system-packages \
--requirement /tmp/deps/requirements.common.txt \
--requirement /tmp/deps/requirements.benchmark.txt \
sglang==${SGLANG_VERSION} && \
#TODO: Temporary change until upstream sglang runtime image is updated
pip install --break-system-packages "urllib3>=2.6.3"
{% if target not in ("dev", "local-dev") %}
# Install benchmarks and fix permissions (dev/local-dev install from bind-mounted source if needed)
RUN --mount=type=cache,target=/root/.cache/pip,sharing=locked \
export PIP_CACHE_DIR=/root/.cache/pip && \
cd /workspace/benchmarks && \
pip install --break-system-packages . && \
chmod -R g+w /workspace/benchmarks
{% endif %}
# Force-reinstall NVIDIA packages in a separate layer so requirements changes don't trigger re-download
RUN --mount=type=cache,target=/root/.cache/pip,sharing=locked \
export PIP_CACHE_DIR=/root/.cache/pip && \
CUDA_MAJOR=$(nvcc --version | egrep -o 'cuda_[0-9]+' | cut -d_ -f2) && \
if [ "$CUDA_MAJOR" = "12" ]; then \
# Install NVIDIA packages that are needed for DeepEP to work properly
# This is done in the upstream runtime image too, but these packages are overridden in earlier commands
pip install --break-system-packages --force-reinstall --no-deps \
nvidia-nccl-cu12==2.28.3 \
nvidia-cudnn-cu12==9.16.0.29 \
nvidia-cutlass-dsl==4.3.5; \
elif [ "$CUDA_MAJOR" = "13" ]; then \
# CUDA 13: Install cuDNN for PyTorch 2.9.1 compatibility
pip install --break-system-packages --force-reinstall --no-deps \
nvidia-nccl-cu13==2.28.3 \
nvidia-cublas==13.1.0.3 \
nvidia-cutlass-dsl==4.3.1 \
nvidia-cudnn-cu13==9.16.0.29; \
fi
# Switch back to dynamo user after package installations
USER dynamo
# Copy tests, deploy, and the sglang/common/mocker component subtrees for CI.
# Pattern: COPY --chmod=775 <path>; chmod g+w <path> done later as root because COPY --chmod only affects <path>/*, not <path>
# Copy tests, deploy and components for CI with correct ownership
COPY --chmod=775 --chown=dynamo:0 tests /workspace/tests
COPY --chmod=775 --chown=dynamo:0 examples /workspace/examples
COPY --chmod=775 --chown=dynamo:0 deploy /workspace/deploy
......@@ -174,6 +68,7 @@ COPY --chmod=775 --chown=dynamo:0 components/src/dynamo/common /workspace/compon
COPY --chmod=775 --chown=dynamo:0 components/src/dynamo/sglang /workspace/components/src/dynamo/sglang
COPY --chmod=775 --chown=dynamo:0 components/src/dynamo/mocker /workspace/components/src/dynamo/mocker
COPY --chmod=775 --chown=dynamo:0 recipes/ /workspace/recipes/
COPY --chmod=664 --chown=dynamo:0 ATTRIBUTION* LICENSE /workspace/
# Enable forceful shutdown of inflight requests
ENV SGLANG_FORCE_SHUTDOWN=1
......@@ -182,12 +77,6 @@ ENV SGLANG_FORCE_SHUTDOWN=1
RUN --mount=type=bind,source=./container/launch_message/runtime.txt,target=/opt/dynamo/launch_message.txt \
sed '/^#\s/d' /opt/dynamo/launch_message.txt > /opt/dynamo/.launch_screen
# Our scripting assumes /workspace is where dynamo is located.
# To keep sglang and dynamo reachable from the same workspace, create
# /sgl-workspace/dynamo as a symlink that points to /workspace.
USER root
# Fix directory permissions: COPY --chmod only affects contents, not the directory itself
RUN chmod 755 /opt/dynamo/.launch_screen && \
echo 'cat /opt/dynamo/.launch_screen' >> /etc/bash.bashrc && \
ln -s /workspace /sgl-workspace/dynamo
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment