Unverified commit 2075eb67, authored by Alec and committed by GitHub
Browse files

build(sglang): slim runtime image (#7850)

parent 5bd30719
...@@ -13,7 +13,7 @@ from abc import ABC, abstractmethod ...@@ -13,7 +13,7 @@ from abc import ABC, abstractmethod
from queue import Queue from queue import Queue
from typing import Any, Awaitable, List, Optional from typing import Any, Awaitable, List, Optional
import msgpack import msgspec
import torch import torch
from nixl._api import nixl_agent, nixl_agent_config from nixl._api import nixl_agent, nixl_agent_config
from pydantic import BaseModel from pydantic import BaseModel
...@@ -522,7 +522,7 @@ class NixlWriteEmbeddingSender(AbstractEmbeddingSender): ...@@ -522,7 +522,7 @@ class NixlWriteEmbeddingSender(AbstractEmbeddingSender):
(target_buffer, target_byte_size, target_device_id, target_mem_str), (target_buffer, target_byte_size, target_device_id, target_mem_str),
write_done_id, write_done_id,
remote_agent_metadata, remote_agent_metadata,
) = msgpack.unpackb(notif) ) = msgspec.msgpack.decode(notif)
write_requests.append( write_requests.append(
( (
# receiver contact # receiver contact
...@@ -703,7 +703,7 @@ class NixlWriteEmbeddingReceiver(AbstractEmbeddingReceiver): ...@@ -703,7 +703,7 @@ class NixlWriteEmbeddingReceiver(AbstractEmbeddingReceiver):
# Request for transfer # Request for transfer
tensor_id = self.id_counter.get_next_id() tensor_id = self.id_counter.get_next_id()
notif_msg = msgpack.packb( notif_msg = msgspec.msgpack.encode(
( (
nixl_request.tensor_id, nixl_request.tensor_id,
( (
......
...@@ -66,6 +66,7 @@ vllm: ...@@ -66,6 +66,7 @@ vllm:
flashinf_ref: v0.6.6 flashinf_ref: v0.6.6
lmcache_ref: 0.4.2 lmcache_ref: 0.4.2
vllm_omni_ref: "v0.18.0" vllm_omni_ref: "v0.18.0"
nixl_ref: 0.10.1
max_jobs: "10" max_jobs: "10"
enable_media_ffmpeg: "false" enable_media_ffmpeg: "false"
enable_gpu_memory_service: "true" enable_gpu_memory_service: "true"
...@@ -84,6 +85,7 @@ sglang: ...@@ -84,6 +85,7 @@ sglang:
runtime_image: lmsysorg/sglang runtime_image: lmsysorg/sglang
base_image_tag: 25.11-cuda13.0-devel-ubuntu24.04 base_image_tag: 25.11-cuda13.0-devel-ubuntu24.04
runtime_image_tag: v0.5.9-cu130-runtime runtime_image_tag: v0.5.9-cu130-runtime
nixl_ref: 0.10.0
enable_media_ffmpeg: "false" enable_media_ffmpeg: "false"
enable_gpu_memory_service: "true" enable_gpu_memory_service: "true"
enable_kvbm: "false" enable_kvbm: "false"
...@@ -94,6 +96,7 @@ trtllm: ...@@ -94,6 +96,7 @@ trtllm:
runtime_image: nvcr.io/nvidia/cuda-dl-base runtime_image: nvcr.io/nvidia/cuda-dl-base
base_image_tag: 25.12-py3 base_image_tag: 25.12-py3
runtime_image_tag: 25.12-cuda13.1-runtime-ubuntu24.04 runtime_image_tag: 25.12-cuda13.1-runtime-ubuntu24.04
nixl_ref: 0.10.1
enable_media_ffmpeg: "false" enable_media_ffmpeg: "false"
enable_gpu_memory_service: "false" enable_gpu_memory_service: "false"
enable_kvbm: "true" enable_kvbm: "true"
......
...@@ -66,7 +66,7 @@ ARG SCCACHE_REGION="" ...@@ -66,7 +66,7 @@ ARG SCCACHE_REGION=""
# NIXL configuration # NIXL configuration
ARG NIXL_UCX_REF={{ context.dynamo.nixl_ucx_ref }} ARG NIXL_UCX_REF={{ context.dynamo.nixl_ucx_ref }}
ARG NIXL_REF={{ context.dynamo.nixl_ref }} ARG NIXL_REF={{ context[framework].nixl_ref }}
{% if device == "cuda" %} {% if device == "cuda" %}
ARG NIXL_GDRCOPY_REF={{ context.dynamo.nixl_gdrcopy_ref }} ARG NIXL_GDRCOPY_REF={{ context.dynamo.nixl_gdrcopy_ref }}
ARG NIXL_LIBFABRIC_REF={{ context.dynamo.nixl_libfabric_ref }} ARG NIXL_LIBFABRIC_REF={{ context.dynamo.nixl_libfabric_ref }}
......
...@@ -9,16 +9,6 @@ ...@@ -9,16 +9,6 @@
FROM ${RUNTIME_IMAGE}:${RUNTIME_IMAGE_TAG} AS runtime FROM ${RUNTIME_IMAGE}:${RUNTIME_IMAGE_TAG} AS runtime
# NOTE: Unlike vLLM/TRTLLM, the SGLang upstream runtime image already ships with the full CUDA
# toolkit (nvcc, nvlink, ptxas, etc.), so no selective COPY of CUDA binaries is needed here.
# cleanup unnecessary libs (python3-blinker conflicts with pip-installed blinker from Flask/dash)
RUN apt remove -y python3-apt python3-blinker && \
pip uninstall -y termplotlib
# This ARG is still utilized for SGLANG Version extraction
ARG RUNTIME_IMAGE_TAG
ARG TARGETARCH
WORKDIR /workspace WORKDIR /workspace
# Install NATS and ETCD # Install NATS and ETCD
...@@ -39,18 +29,6 @@ RUN userdel -r ubuntu > /dev/null 2>&1 || true \ ...@@ -39,18 +29,6 @@ RUN userdel -r ubuntu > /dev/null 2>&1 || true \
# NOTE: Setting ENV UMASK=002 does NOT work - umask is a shell builtin, not an environment variable # NOTE: Setting ENV UMASK=002 does NOT work - umask is a shell builtin, not an environment variable
&& mkdir -p /etc/profile.d && echo 'umask 002' > /etc/profile.d/00-umask.sh && mkdir -p /etc/profile.d && echo 'umask 002' > /etc/profile.d/00-umask.sh
# Cache apt downloads; sharing=locked avoids apt/dpkg races with concurrent builds.
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
apt-get update && \
DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
# required for verification of GPG keys
gnupg2 \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/*
# Copy attribution files
COPY --chmod=664 --chown=dynamo:0 ATTRIBUTION* LICENSE /workspace/
{% if context.sglang.enable_media_ffmpeg == "true" %} {% if context.sglang.enable_media_ffmpeg == "true" %}
# Copy ffmpeg # Copy ffmpeg
RUN --mount=type=bind,from=wheel_builder,source=/usr/local/,target=/tmp/usr/local/ \ RUN --mount=type=bind,from=wheel_builder,source=/usr/local/,target=/tmp/usr/local/ \
...@@ -61,49 +39,16 @@ RUN --mount=type=bind,from=wheel_builder,source=/usr/local/,target=/tmp/usr/loca ...@@ -61,49 +39,16 @@ RUN --mount=type=bind,from=wheel_builder,source=/usr/local/,target=/tmp/usr/loca
cp -r /tmp/usr/local/src/ffmpeg /usr/local/src/ cp -r /tmp/usr/local/src/ffmpeg /usr/local/src/
{% endif %} {% endif %}
# Copy wheels first (separate from benchmarks to avoid unnecessary cache invalidation) {% if target not in ("dev", "local-dev") %}
# Runtime target installs the prebuilt Dynamo wheels. Dev/local-dev build from
# source later in the shared dev stage after the workspace is bind-mounted.
COPY --chmod=775 --chown=dynamo:0 --from=wheel_builder /opt/dynamo/dist/*.whl /opt/dynamo/wheelhouse/ COPY --chmod=775 --chown=dynamo:0 --from=wheel_builder /opt/dynamo/dist/*.whl /opt/dynamo/wheelhouse/
COPY --chmod=775 --chown=dynamo:0 --from=wheel_builder /opt/dynamo/dist/nixl/ /opt/dynamo/wheelhouse/nixl/
COPY --chmod=775 --chown=dynamo:0 --from=wheel_builder /workspace/nixl/build/src/bindings/python/nixl-meta/nixl-*.whl /opt/dynamo/wheelhouse/nixl/
# NIXL environment and native libraries
ENV NIXL_PREFIX=/opt/nvidia/nvda_nixl
ENV NIXL_LIB_DIR=$NIXL_PREFIX/lib64
ENV NIXL_PLUGIN_DIR=$NIXL_LIB_DIR/plugins
# Copy UCX and NIXL native libraries to system directories
COPY --from=wheel_builder /usr/local/ucx /usr/local/ucx
COPY --chown=dynamo:0 --from=wheel_builder $NIXL_PREFIX $NIXL_PREFIX
ENV PATH=/usr/local/ucx/bin:$PATH
ENV LD_LIBRARY_PATH=\
$NIXL_LIB_DIR:\
$NIXL_PLUGIN_DIR:\
/usr/local/ucx/lib:\
/usr/local/ucx/lib/ucx:\
$LD_LIBRARY_PATH
ENV SGLANG_VERSION="${RUNTIME_IMAGE_TAG%%-*}"
{% if target not in ("dev", "local-dev") %}
# Install packages as root to ensure they go to system location (/usr/local/lib/python3.12/dist-packages)
RUN --mount=type=cache,target=/root/.cache/pip,sharing=locked \ RUN --mount=type=cache,target=/root/.cache/pip,sharing=locked \
export PIP_CACHE_DIR=/root/.cache/pip && \ export PIP_CACHE_DIR=/root/.cache/pip && \
pip install --break-system-packages \ pip install --break-system-packages --no-deps \
/opt/dynamo/wheelhouse/ai_dynamo_runtime*.whl \ /opt/dynamo/wheelhouse/ai_dynamo_runtime*.whl \
/opt/dynamo/wheelhouse/ai_dynamo*any.whl \ /opt/dynamo/wheelhouse/ai_dynamo*any.whl
/opt/dynamo/wheelhouse/nixl/nixl*.whl \
sglang==${SGLANG_VERSION}
{% else %}
# Dev/local-dev: skip dynamo wheel install (users build from source via cargo build + maturin develop).
# Install NIXL wheel (pre-built C++ binary, not buildable from source) and sglang.
RUN --mount=type=cache,target=/root/.cache/pip,sharing=locked \
export PIP_CACHE_DIR=/root/.cache/pip && \
pip install --break-system-packages \
/opt/dynamo/wheelhouse/nixl/nixl*.whl \
sglang==${SGLANG_VERSION}
{% endif %}
# Install gpu_memory_service wheel if enabled (all targets) # Install gpu_memory_service wheel if enabled (all targets)
ARG ENABLE_GPU_MEMORY_SERVICE ARG ENABLE_GPU_MEMORY_SERVICE
...@@ -113,60 +58,9 @@ RUN --mount=type=cache,target=/root/.cache/pip,sharing=locked \ ...@@ -113,60 +58,9 @@ RUN --mount=type=cache,target=/root/.cache/pip,sharing=locked \
GMS_WHEEL=$(ls /opt/dynamo/wheelhouse/gpu_memory_service*.whl 2>/dev/null | head -1); \ GMS_WHEEL=$(ls /opt/dynamo/wheelhouse/gpu_memory_service*.whl 2>/dev/null | head -1); \
if [ -n "$GMS_WHEEL" ]; then pip install --no-cache-dir --break-system-packages "$GMS_WHEEL"; fi; \ if [ -n "$GMS_WHEEL" ]; then pip install --no-cache-dir --break-system-packages "$GMS_WHEEL"; fi; \
fi fi
{% if target not in ("dev", "local-dev") %}
# Copy benchmarks after wheel install so benchmarks changes don't invalidate the layer above
# Pattern: COPY --chmod=775 <path>; chmod g+w <path> done later as root because COPY --chmod only affects <path>/*, not <path>
COPY --chmod=775 --chown=dynamo:0 benchmarks/ /workspace/benchmarks/
{% endif %}
# Install runtime dependencies (common + benchmarks) as root.
# Test and dev dependencies are NOT installed here — they go in the test and dev images.
RUN --mount=type=bind,source=container/deps/requirements.common.txt,target=/tmp/deps/requirements.common.txt \
--mount=type=bind,source=container/deps/requirements.benchmark.txt,target=/tmp/deps/requirements.benchmark.txt \
--mount=type=cache,target=/root/.cache/pip,sharing=locked \
export PIP_CACHE_DIR=/root/.cache/pip && \
pip install --break-system-packages \
--requirement /tmp/deps/requirements.common.txt \
--requirement /tmp/deps/requirements.benchmark.txt \
sglang==${SGLANG_VERSION} && \
#TODO: Temporary change until upstream sglang runtime image is updated
pip install --break-system-packages "urllib3>=2.6.3"
{% if target not in ("dev", "local-dev") %}
# Install benchmarks and fix permissions (dev/local-dev install from bind-mounted source if needed)
RUN --mount=type=cache,target=/root/.cache/pip,sharing=locked \
export PIP_CACHE_DIR=/root/.cache/pip && \
cd /workspace/benchmarks && \
pip install --break-system-packages . && \
chmod -R g+w /workspace/benchmarks
{% endif %} {% endif %}
# Force-reinstall NVIDIA packages in a separate layer so requirements changes don't trigger re-download # Copy tests, deploy and components for CI with correct ownership
RUN --mount=type=cache,target=/root/.cache/pip,sharing=locked \
export PIP_CACHE_DIR=/root/.cache/pip && \
CUDA_MAJOR=$(nvcc --version | egrep -o 'cuda_[0-9]+' | cut -d_ -f2) && \
if [ "$CUDA_MAJOR" = "12" ]; then \
# Install NVIDIA packages that are needed for DeepEP to work properly
# This is done in the upstream runtime image too, but these packages are overridden in earlier commands
pip install --break-system-packages --force-reinstall --no-deps \
nvidia-nccl-cu12==2.28.3 \
nvidia-cudnn-cu12==9.16.0.29 \
nvidia-cutlass-dsl==4.3.5; \
elif [ "$CUDA_MAJOR" = "13" ]; then \
# CUDA 13: Install CuDNN for PyTorch 2.9.1 compatibility
pip install --break-system-packages --force-reinstall --no-deps \
nvidia-nccl-cu13==2.28.3 \
nvidia-cublas==13.1.0.3 \
nvidia-cutlass-dsl==4.3.1 \
nvidia-cudnn-cu13==9.16.0.29; \
fi
# Switch back to dynamo user after package installations
USER dynamo
# Copy tests, deploy, and the sglang/common/mocker component subtrees for CI.
# Pattern: COPY --chmod=775 <path>; chmod g+w <path> done later as root because COPY --chmod only affects <path>/*, not <path>
COPY --chmod=775 --chown=dynamo:0 tests /workspace/tests COPY --chmod=775 --chown=dynamo:0 tests /workspace/tests
COPY --chmod=775 --chown=dynamo:0 examples /workspace/examples COPY --chmod=775 --chown=dynamo:0 examples /workspace/examples
COPY --chmod=775 --chown=dynamo:0 deploy /workspace/deploy COPY --chmod=775 --chown=dynamo:0 deploy /workspace/deploy
...@@ -174,6 +68,7 @@ COPY --chmod=775 --chown=dynamo:0 components/src/dynamo/common /workspace/compon ...@@ -174,6 +68,7 @@ COPY --chmod=775 --chown=dynamo:0 components/src/dynamo/common /workspace/compon
COPY --chmod=775 --chown=dynamo:0 components/src/dynamo/sglang /workspace/components/src/dynamo/sglang COPY --chmod=775 --chown=dynamo:0 components/src/dynamo/sglang /workspace/components/src/dynamo/sglang
COPY --chmod=775 --chown=dynamo:0 components/src/dynamo/mocker /workspace/components/src/dynamo/mocker COPY --chmod=775 --chown=dynamo:0 components/src/dynamo/mocker /workspace/components/src/dynamo/mocker
COPY --chmod=775 --chown=dynamo:0 recipes/ /workspace/recipes/ COPY --chmod=775 --chown=dynamo:0 recipes/ /workspace/recipes/
COPY --chmod=664 --chown=dynamo:0 ATTRIBUTION* LICENSE /workspace/
# Enable forceful shutdown of inflight requests # Enable forceful shutdown of inflight requests
ENV SGLANG_FORCE_SHUTDOWN=1 ENV SGLANG_FORCE_SHUTDOWN=1
...@@ -182,12 +77,6 @@ ENV SGLANG_FORCE_SHUTDOWN=1 ...@@ -182,12 +77,6 @@ ENV SGLANG_FORCE_SHUTDOWN=1
RUN --mount=type=bind,source=./container/launch_message/runtime.txt,target=/opt/dynamo/launch_message.txt \ RUN --mount=type=bind,source=./container/launch_message/runtime.txt,target=/opt/dynamo/launch_message.txt \
sed '/^#\s/d' /opt/dynamo/launch_message.txt > /opt/dynamo/.launch_screen sed '/^#\s/d' /opt/dynamo/launch_message.txt > /opt/dynamo/.launch_screen
# Our scripting assumes /workspace is where dynamo is located
# In order to maintain the ability to have sglang and dynamo
# in the same workspace, symlink /workspace to /sgl-workspace/dynamo
USER root
# Fix directory permissions: COPY --chmod only affects contents, not the directory itself
RUN chmod 755 /opt/dynamo/.launch_screen && \ RUN chmod 755 /opt/dynamo/.launch_screen && \
echo 'cat /opt/dynamo/.launch_screen' >> /etc/bash.bashrc && \ echo 'cat /opt/dynamo/.launch_screen' >> /etc/bash.bashrc && \
ln -s /workspace /sgl-workspace/dynamo ln -s /workspace /sgl-workspace/dynamo
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment