wheel_builder.Dockerfile 23.3 KB
Newer Older
1
2
{#
# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
# SPDX-License-Identifier: Apache-2.0
4
#}
5
# === BEGIN templates/wheel_builder.Dockerfile ===
6
7
8
9
##################################
##### Wheel Build Image ##########
##################################

10
11
12
13
14
15
16
17
18
{% if platform == "multi" and device == "cuda" %}
# Multi-arch: declare both manylinux base images with explicit --platform so each is
# always pulled as the correct native arch regardless of the current TARGETPLATFORM.
# BuildKit only fetches and builds the stage that TARGETARCH resolves to; the other
# is a no-op for each sub-build.
FROM --platform=linux/amd64 quay.io/pypa/manylinux_2_28_x86_64 AS manylinux_amd64
FROM --platform=linux/arm64 quay.io/pypa/manylinux_2_28_aarch64 AS manylinux_arm64
{% endif %}

19
20
21
22
23
##################################
##### wheel_builder_base #########
##################################
# Shared base for all wheel builds: tools, system deps, and native libraries (except nixl).

24
25
26
{% if platform == "multi" and device == "cuda" %}
FROM manylinux_${TARGETARCH} AS wheel_builder_base
{% else %}
27
FROM ${WHEEL_BUILDER_IMAGE} AS wheel_builder_base
28
{% endif %}
29
30

# Redeclare ARGs for this stage
31
ARG TARGETARCH
32
ARG CARGO_BUILD_JOBS
33
ARG DEVICE
34
35

WORKDIR /workspace
36
37
38
39
40
41
42
43
44
45
46
{% if device == "xpu" or device == "cpu" %}
RUN apt clean && apt-get update -y && \
    apt-get install -y --no-install-recommends --fix-missing \
    curl ca-certificates zip unzip git lsb-release numactl wget vim \
    libsndfile1 \
    libsm6 \
    libxext6 \
    libgl1 \
    libaio-dev \
    linux-libc-dev
{% endif %}
47

48
{% if device == "cuda" %}
49
50
51
# Copy CUDA from base stage
COPY --from=dynamo_base /usr/local/cuda /usr/local/cuda
COPY --from=dynamo_base /etc/ld.so.conf.d/hpcx.conf /etc/ld.so.conf.d/hpcx.conf
52
{% endif %}
53
54
55
56
57
58
59
60

# Set environment variables first so they can be used in COPY commands
ENV CARGO_BUILD_JOBS=${CARGO_BUILD_JOBS:-16} \
    RUSTUP_HOME=/usr/local/rustup \
    CARGO_HOME=/usr/local/cargo \
    CARGO_TARGET_DIR=/opt/dynamo/target \
    PATH=/usr/local/cargo/bin:$PATH

61
62


63
64
65
# Copy artifacts from base stage
COPY --from=dynamo_base $RUSTUP_HOME $RUSTUP_HOME
COPY --from=dynamo_base $CARGO_HOME $CARGO_HOME
66

67
68
69
70
71
{% if device == "xpu" %}
RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null && \
    echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | tee /etc/apt/sources.list.d/oneAPI.list && \
    add-apt-repository -y ppa:kobuk-team/intel-graphics

72
# Fetch UCX patch
73
74
RUN wget --tries=3 --waitretry=5 https://raw.githubusercontent.com/intel/llm-scaler/35a14cbc08d714f460a29b7a7328df5620c8530f/vllm/patches/ai-dynamo-xpu/patches/ucx-v1.12.0.patch -O /tmp/ucx.patch

75
76
# Install Intel GPU runtime packages
RUN apt update -y && apt upgrade -y && \
77
78
79
    apt-get install -y libze1 libze-dev libze-intel-gpu1 intel-opencl-icd  \
    libze-intel-gpu-raytracing intel-ocloc intel-oneapi-compiler-dpcpp-cpp-2025.3 && \
    apt-get clean && rm -rf /var/lib/apt/lists/*
80
{% endif %}
81

82
83
{% if device == "xpu" or device == "cpu" %}
SHELL ["/bin/bash", "-o", "pipefail", "-c"]
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
RUN apt-get update -y \
    && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
        # NIXL build dependencies
        autoconf \
        automake \
        cmake \
        git-lfs \
        libtool \
        meson \
        net-tools \
        ninja-build \
        pybind11-dev \
        # Rust build dependencies
        clang \
        libclang-dev \
        protobuf-compiler \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

RUN apt-get update -y \
    && DEBIAN_FRONTEND=noninteractive apt-get -y install --reinstall --no-install-recommends \
        libibverbs-dev \
        rdma-core \
        ibverbs-utils \
        libibumad-dev \
        libnuma-dev \
        librdmacm-dev \
        ibverbs-providers \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*
{% endif %}

{% if device == "cuda" %}
117
# Install system dependencies
118
119
120
# Cache dnf downloads; sharing=locked avoids dnf/rpm races with concurrent builds.
RUN --mount=type=cache,target=/var/cache/dnf,sharing=locked \
    dnf install -y almalinux-release-synergy && \
121
122
    dnf config-manager --set-enabled powertools && \
    dnf install -y \
123
124
125
126
127
        # Autotools (required for UCX, libfabric ./autogen.sh and ./configure)
        autoconf \
        automake \
        libtool \
        make \
128
        # RPM build tools (required for gdrcopy's build-rpm-packages.sh)
129
130
        rpm-build \
        rpm-sign \
131
132
133
134
        # Build tools
        cmake \
        ninja-build \
        clang-devel \
135
136
137
138
        # Install GCC toolset 14 (CUDA compatible, max version 14)
        gcc-toolset-14-gcc \
        gcc-toolset-14-gcc-c++ \
        gcc-toolset-14-binutils \
139
140
141
142
143
144
145
146
147
148
149
150
151
152
        flex \
        wget \
        # Kernel module build dependencies
        dkms \
        # Protobuf support
        protobuf-compiler \
        # RDMA/InfiniBand support (required for UCX build with --with-verbs)
        libibverbs \
        libibverbs-devel \
        rdma-core \
        rdma-core-devel \
        libibumad \
        libibumad-devel \
        librdmacm-devel \
Joe Chandler's avatar
Joe Chandler committed
153
154
155
        numactl-devel \
        # Libfabric support
        hwloc \
156
157
158
159
        hwloc-devel \
        libcurl-devel \
        openssl-devel \
        libuuid-devel \
160
        zlib-devel
161

162
163
164
165
166
# Set GCC toolset 14 as the default compiler (CUDA requires GCC <= 14)
ENV PATH="/opt/rh/gcc-toolset-14/root/usr/bin:${PATH}" \
    LD_LIBRARY_PATH="/opt/rh/gcc-toolset-14/root/usr/lib64:${LD_LIBRARY_PATH}" \
    CC="/opt/rh/gcc-toolset-14/root/usr/bin/gcc" \
    CXX="/opt/rh/gcc-toolset-14/root/usr/bin/g++"
167
{% endif %}
168

169
170
# Ensure a modern protoc is available (required for --experimental_allow_proto3_optional)
RUN set -eux; \
171
    ARCH_ALT=$([ "${TARGETARCH}" = "amd64" ] && echo "x86_64" || echo "aarch64"); \
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
    PROTOC_VERSION=25.3; \
    case "${ARCH_ALT}" in \
      x86_64) PROTOC_ZIP="protoc-${PROTOC_VERSION}-linux-x86_64.zip" ;; \
      aarch64) PROTOC_ZIP="protoc-${PROTOC_VERSION}-linux-aarch_64.zip" ;; \
      *) echo "Unsupported architecture: ${ARCH_ALT}" >&2; exit 1 ;; \
    esac; \
    wget --tries=3 --waitretry=5 -O /tmp/protoc.zip "https://github.com/protocolbuffers/protobuf/releases/download/v${PROTOC_VERSION}/${PROTOC_ZIP}"; \
    rm -f /usr/local/bin/protoc /usr/bin/protoc; \
    unzip -o /tmp/protoc.zip -d /usr/local bin/protoc include/*; \
    chmod +x /usr/local/bin/protoc; \
    ln -s /usr/local/bin/protoc /usr/bin/protoc; \
    protoc --version

# Point build tools explicitly at the modern protoc
ENV PROTOC=/usr/local/bin/protoc

188
{% if device == "xpu" or device == "cpu" %}
189
190
# Install uv package manager
COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/
191
ENV LD_LIBRARY_PATH=/usr/local/lib:/usr/local/lib64:${LD_LIBRARY_PATH:-}
192
{% else %}
193
194
ENV CUDA_PATH=/usr/local/cuda \
    PATH=/usr/local/cuda/bin:$PATH \
195
    LD_LIBRARY_PATH=/usr/local/cuda/lib64:/usr/local/lib:/usr/local/lib64:${LD_LIBRARY_PATH:-} \
196
    NVIDIA_DRIVER_CAPABILITIES=video,compute,utility
197
{% endif %}
198
199
200
201

# Create virtual environment for building wheels
ARG PYTHON_VERSION
ENV VIRTUAL_ENV=/workspace/.venv
202
203
204
205
# Cache uv downloads; uv handles its own locking for this cache.
RUN --mount=type=cache,target=/root/.cache/uv \
    export UV_CACHE_DIR=/root/.cache/uv UV_HTTP_TIMEOUT=300 UV_HTTP_RETRIES=5 && \
    uv venv ${VIRTUAL_ENV} --python $PYTHON_VERSION && \
206
    uv pip install --upgrade meson pybind11 patchelf maturin[patchelf] tomlkit
207
208

ARG NIXL_UCX_REF
209
210

{% if device == "cuda" %}
211
212
213
ARG NIXL_GDRCOPY_REF

# Build and install gdrcopy
214
215
RUN ARCH_ALT=$([ "${TARGETARCH}" = "amd64" ] && echo "x86_64" || echo "aarch64") && \
    git clone --depth 1 --branch ${NIXL_GDRCOPY_REF} https://github.com/NVIDIA/gdrcopy.git && \
216
217
218
219
220
    cd gdrcopy/packages && \
    CUDA=/usr/local/cuda ./build-rpm-packages.sh && \
    rpm -Uvh gdrcopy-kmod-*.el8.noarch.rpm && \
    rpm -Uvh gdrcopy-*.el8.${ARCH_ALT}.rpm && \
    rpm -Uvh gdrcopy-devel-*.el8.noarch.rpm
221
{% endif %}
222

223
224
225
226
227
# sccache binary is pre-installed in dynamo_base; stage it off-PATH so
# Meson doesn't auto-detect it as a CUDA compiler launcher
# (https://github.com/mesonbuild/meson/issues/11118).
# When USE_SCCACHE=true the RUN below symlinks it onto PATH before install.
COPY --from=dynamo_base /usr/local/bin/sccache /opt/sccache/sccache
228

229
230
231
232
233
ARG USE_SCCACHE
ARG SCCACHE_BUCKET
ARG SCCACHE_REGION
COPY container/use-sccache.sh /tmp/use-sccache.sh
RUN if [ "$USE_SCCACHE" = "true" ]; then \
234
        ln -s /opt/sccache/sccache /usr/local/bin/sccache && \
235
236
237
        /tmp/use-sccache.sh install; \
    fi

238
239
# Set SCCACHE environment variables (RUSTC_WRAPPER is set dynamically by
# setup-env only when the sccache server starts successfully)
240
ENV SCCACHE_BUCKET=${USE_SCCACHE:+${SCCACHE_BUCKET}} \
241
    SCCACHE_REGION=${USE_SCCACHE:+${SCCACHE_REGION}}
242

243
# Always build FFmpeg so libs are available for Rust checks in CI
milesial's avatar
milesial committed
244
# Do not delete the source tarball for legal reasons
245
ARG FFMPEG_VERSION
246
247
RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \
    --mount=type=secret,id=aws-secret-id,env=AWS_SECRET_ACCESS_KEY \
248
    export SCCACHE_S3_KEY_PREFIX=${SCCACHE_S3_KEY_PREFIX:-${TARGETARCH}} && \
249
    if [ "$USE_SCCACHE" = "true" ]; then \
250
        eval $(/tmp/use-sccache.sh setup-env); \
251
    fi && \
252
253
    if [ "$DEVICE" = "xpu" ] || [ "$DEVICE" = "cpu" ]; then \
    apt-get update -y && apt-get install -y build-essential pkg-config xz-utils; \
254
255
    apt-get clean && rm -rf /var/lib/apt/lists/*; \
    elif [ "$DEVICE" = "cuda" ]; then \
256
    dnf install -y pkg-config xz; \
257
    fi && \
258
    cd /tmp && \
259
    curl --retry 5 --retry-delay 3 -LO https://ffmpeg.org/releases/ffmpeg-${FFMPEG_VERSION}.tar.xz && \
260
261
262
263
    tar xf ffmpeg-${FFMPEG_VERSION}.tar.xz && \
    cd ffmpeg-${FFMPEG_VERSION} && \
    ./configure \
        --prefix=/usr/local \
milesial's avatar
milesial committed
264
265
        --disable-gpl \
        --disable-nonfree \
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
        --disable-programs \
        --disable-doc \
        --disable-static \
        --disable-x86asm \
        --disable-postproc \
        --disable-network \
        --disable-encoders \
        --disable-muxers \
        --disable-bsfs \
        --disable-devices \
        --disable-libdrm \
        --enable-shared && \
    make -j$(nproc) && \
    make install && \
    /tmp/use-sccache.sh show-stats "FFMPEG" && \
    ldconfig && \
milesial's avatar
milesial committed
282
    mkdir -p /usr/local/src/ffmpeg && \
283
    mv /tmp/ffmpeg-${FFMPEG_VERSION}* /usr/local/src/ffmpeg/
284

285
286
287
# Build and install UCX
RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \
    --mount=type=secret,id=aws-secret-id,env=AWS_SECRET_ACCESS_KEY \
288
    export SCCACHE_S3_KEY_PREFIX="${SCCACHE_S3_KEY_PREFIX:-${TARGETARCH}}" && \
289
    if [ "$USE_SCCACHE" = "true" ]; then \
290
        eval $(/tmp/use-sccache.sh setup-env); \
291
    fi && \
292
    cd /usr/local/src && \
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
    git clone https://github.com/openucx/ucx.git && \
    cd ucx &&  \
    git checkout $NIXL_UCX_REF &&	 \
    if [ "$DEVICE" = "xpu" ]; then \
    git apply --ignore-whitespace /tmp/ucx.patch; \
    fi && \
    ./autogen.sh &&      \
    if [ "$DEVICE" = "xpu" ]; then \
     ./contrib/configure-release     \
        --prefix=/usr/local/ucx     \
        --with-ze                   \
        --enable-shared             \
        --disable-static            \
        --disable-doxygen-doc       \
        --enable-optimizations      \
        --enable-cma                \
        --enable-devel-headers      \
        --with-verbs                \
        --with-dm                   \
        --with-efa                  \
        --without-cuda              \
        --enable-mt;                 \
    elif [ "$DEVICE" = "cuda" ]; then \
     ./contrib/configure-release     \
317
318
319
320
321
322
323
324
325
326
327
328
        --prefix=/usr/local/ucx     \
        --enable-shared             \
        --disable-static            \
        --disable-doxygen-doc       \
        --enable-optimizations      \
        --enable-cma                \
        --enable-devel-headers      \
        --with-cuda=/usr/local/cuda \
        --with-verbs                \
        --with-dm                   \
        --with-gdrcopy=/usr/local   \
        --with-efa                  \
329
        --enable-mt;                 \
330
331
332
333
334
335
336
337
338
339
340
341
    elif [ "$DEVICE" = "cpu" ]; then  \
     ./contrib/configure-release     \
        --prefix=/usr/local/ucx     \
        --enable-shared             \
        --disable-static            \
        --disable-doxygen-doc       \
        --enable-optimizations      \
        --enable-cma                \
        --enable-devel-headers      \
        --with-verbs                \
        --without-cuda              \
        --enable-mt;                 \
342
     fi && \
343
344
345
346
347
348
349
     make -j &&                      \
     make -j install-strip &&        \
     /tmp/use-sccache.sh show-stats "UCX" && \
     echo "/usr/local/ucx/lib" > /etc/ld.so.conf.d/ucx.conf && \
     echo "/usr/local/ucx/lib/ucx" >> /etc/ld.so.conf.d/ucx.conf && \
     ldconfig

350
{% if device == "cuda" %}
Joe Chandler's avatar
Joe Chandler committed
351
352
353
ARG NIXL_LIBFABRIC_REF
RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \
    --mount=type=secret,id=aws-secret-id,env=AWS_SECRET_ACCESS_KEY \
354
    export SCCACHE_S3_KEY_PREFIX="${SCCACHE_S3_KEY_PREFIX:-${TARGETARCH}}" && \
Joe Chandler's avatar
Joe Chandler committed
355
    if [ "$USE_SCCACHE" = "true" ]; then \
356
        eval $(/tmp/use-sccache.sh setup-env); \
Joe Chandler's avatar
Joe Chandler committed
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
    fi && \
    cd /usr/local/src && \
    git clone https://github.com/ofiwg/libfabric.git && \
    cd libfabric && \
    git checkout $NIXL_LIBFABRIC_REF && \
    ./autogen.sh && \
    ./configure --prefix="/usr/local/libfabric" \
                --disable-verbs \
                --disable-psm3 \
                --disable-opx \
                --disable-usnic \
                --disable-rstream \
                --enable-efa \
                --with-cuda=/usr/local/cuda \
                --enable-cuda-dlopen \
                --with-gdrcopy \
                --enable-gdrcopy-dlopen && \
    make -j$(nproc) && \
    make install && \
    /tmp/use-sccache.sh show-stats "LIBFABRIC" && \
    echo "/usr/local/libfabric/lib" > /etc/ld.so.conf.d/libfabric.conf && \
    ldconfig
379
{% endif %}
Joe Chandler's avatar
Joe Chandler committed
380

381
{% if framework == "vllm" and device == "cuda" %}
382
# Build and install AWS SDK C++ (required for NIXL OBJ backend / S3 support)
383
ARG AWS_SDK_CPP_VERSION=1.11.760
384
385
RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \
    --mount=type=secret,id=aws-secret-id,env=AWS_SECRET_ACCESS_KEY \
386
    export SCCACHE_S3_KEY_PREFIX="${SCCACHE_S3_KEY_PREFIX:-${TARGETARCH}}" && \
387
388
389
    if [ "$USE_SCCACHE" = "true" ]; then \
        eval $(/tmp/use-sccache.sh setup-env cmake); \
    fi && \
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
    git clone --recurse-submodules --depth 1 --branch ${AWS_SDK_CPP_VERSION} \
        https://github.com/aws/aws-sdk-cpp.git /tmp/aws-sdk-cpp && \
    mkdir -p /tmp/aws-sdk-cpp/build && \
    cd /tmp/aws-sdk-cpp/build && \
    cmake .. \
        -DCMAKE_BUILD_TYPE=Release \
        -DBUILD_ONLY="s3" \
        -DENABLE_TESTING=OFF \
        -DCMAKE_INSTALL_PREFIX=/usr/local \
        -DBUILD_SHARED_LIBS=ON && \
    make -j$(nproc) && \
    make install && \
    cd / && \
    rm -rf /tmp/aws-sdk-cpp && \
    ldconfig && \
    /tmp/use-sccache.sh show-stats "AWS SDK C++"
{% endif %}

408
409
410
411
412
413
414
415
416
417

##################################
##### runtime_wheel_builder ######
##################################
# Builds ai-dynamo, ai-dynamo-runtime, and gpu_memory_service wheels, sans nixl.

FROM wheel_builder_base AS runtime_wheel_builder

{% if target not in ("dev", "local-dev") %}
# Copy source code (order matters for layer caching)
418
COPY .cargo/ /opt/dynamo/.cargo/
419
420
421
422
423
424
425
426
427
428
429
430
431
COPY pyproject.toml README.md LICENSE Cargo.toml Cargo.lock rust-toolchain.toml hatch_build.py /opt/dynamo/
COPY lib/ /opt/dynamo/lib/
COPY components/ /opt/dynamo/components/

# Build ai-dynamo (pure Python) and ai-dynamo-runtime (maturin) wheels
ARG USE_SCCACHE
ARG ENABLE_MEDIA_FFMPEG
RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \
    --mount=type=secret,id=aws-secret-id,env=AWS_SECRET_ACCESS_KEY \
    --mount=type=cache,target=/root/.cargo/registry \
    --mount=type=cache,target=/root/.cargo/git \
    --mount=type=cache,target=/root/.cache/uv \
    export UV_CACHE_DIR=/root/.cache/uv && \
432
    export SCCACHE_S3_KEY_PREFIX=${SCCACHE_S3_KEY_PREFIX:-${TARGETARCH}} && \
433
434
435
436
437
438
439
440
441
    if [ "$USE_SCCACHE" = "true" ]; then \
        eval $(/tmp/use-sccache.sh setup-env cmake); \
    fi && \
    mkdir -p ${CARGO_TARGET_DIR} && \
    source ${VIRTUAL_ENV}/bin/activate && \
    cd /opt/dynamo && \
    uv build --wheel --out-dir /opt/dynamo/dist && \
    cd /opt/dynamo/lib/bindings/python && \
    if [ "$ENABLE_MEDIA_FFMPEG" = "true" ]; then \
442
        maturin build --release --features "media-ffmpeg,kv-indexer" --out /opt/dynamo/dist; \
443
    else \
444
        maturin build --release --features "kv-indexer" --out /opt/dynamo/dist; \
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
    fi && \
    /tmp/use-sccache.sh show-stats "Dynamo Runtime"

{% else %}
# Dev/local-dev targets do not have pre-built wheels or /workspace source code.
# After you start the local-dev/dev container, you will need to build from source:
#   cargo build --features dynamo-llm/block-manager
#   cd /workspace/lib/bindings/python && maturin develop --uv && cd /workspace
#   uv pip install --no-deps -e /workspace
# See container/launch_message/dev.txt for the full setup steps.

# Create dist dir with a placeholder so downstream COPY --from=wheel_builder /opt/dynamo/dist/*.whl always has a match.
RUN mkdir -p /opt/dynamo/dist ${CARGO_TARGET_DIR} && \
    touch /opt/dynamo/dist/.placeholder.whl

# Dev/local-dev skip the full COPY lib/ above, so copy gpu_memory_service source explicitly for the wheel build below
COPY lib/gpu_memory_service/ /opt/dynamo/lib/gpu_memory_service/
{% endif %}

# Build gpu-memory-service wheel → /opt/dynamo/dist/gpu_memory_service*.whl (small C++ extension, fast build -- all targets, all frameworks)
{% if device == "cuda" %}
# Build gpu_memory_service wheel (C++ extension only needs Python headers, no CUDA/torch)
ARG ENABLE_GPU_MEMORY_SERVICE
RUN --mount=type=cache,target=/root/.cache/uv \
    if [ "$ENABLE_GPU_MEMORY_SERVICE" = "true" ]; then \
        export UV_CACHE_DIR=/root/.cache/uv && \
        source ${VIRTUAL_ENV}/bin/activate && \
        uv build --wheel --out-dir /opt/dynamo/dist /opt/dynamo/lib/gpu_memory_service; \
    fi
{% endif %}


##################################
##### wheel_builder ##############
##################################
# Builds nixl (native + Python wheel) and kvbm wheel, then consolidates all wheels.
# Runtime templates COPY from this stage.

FROM wheel_builder_base AS wheel_builder

# Build and install nixl
486
ARG TARGETARCH
487
488
489
ARG DEVICE
ARG NIXL_REF
ARG USE_SCCACHE
490
{% if device == "cuda" %}
491
ARG CUDA_MAJOR
492
493
{% endif %}

494
495
RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \
    --mount=type=secret,id=aws-secret-id,env=AWS_SECRET_ACCESS_KEY \
496
    export SCCACHE_S3_KEY_PREFIX="${SCCACHE_S3_KEY_PREFIX:-${TARGETARCH}}" && \
497
    if [ "$USE_SCCACHE" = "true" ]; then \
498
        eval $(/tmp/use-sccache.sh setup-env); \
499
    fi && \
500
    source ${VIRTUAL_ENV}/bin/activate && \
501
    git clone "https://github.com/ai-dynamo/nixl.git" && \
502
    cd nixl && \
503
    git checkout ${NIXL_REF} && \
504
505
    if [ "$DEVICE" = "cuda" ]; then \
        PKG_NAME="nixl-cu${CUDA_MAJOR}"; \
506
507
    else \
        PKG_NAME="nixl-${DEVICE}"; \
508
    fi && \
509
    ./contrib/tomlutil.py --wheel-name $PKG_NAME pyproject.toml && \
510
    mkdir build && \
511
512
513
514
515
516
517
518
519
    if [ "$DEVICE" = "cuda" ]; then \
        meson setup build/ --prefix=/opt/nvidia/nvda_nixl --buildtype=release \
            -Dcudapath_lib="/usr/local/cuda/lib64" \
            -Dcudapath_inc="/usr/local/cuda/include" \
            -Ducx_path="/usr/local/ucx" \
            -Dlibfabric_path="/usr/local/libfabric"; \
    elif [ "$DEVICE" = "xpu" ]; then \
        meson setup build/ --prefix=/opt/intel/intel_nixl --buildtype=release \
            -Ducx_path="/usr/local/ucx"; \
520
521
522
    elif [ "$DEVICE" = "cpu" ]; then \
        meson setup build/ --prefix=/opt/nvidia/nvda_nixl --buildtype=release \
            -Ducx_path="/usr/local/ucx"; \
523
    fi && \
524
525
526
527
528
    cd build && \
    ninja && \
    ninja install && \
    /tmp/use-sccache.sh show-stats "NIXL"

529
{% if device == "xpu" %}
530
531
532
{# XPU only supports x86_64; no ARCH_ALT ARG needed #}
ENV NIXL_LIB_DIR=/opt/intel/intel_nixl/lib/x86_64-linux-gnu \
    NIXL_PLUGIN_DIR=/opt/intel/intel_nixl/lib/x86_64-linux-gnu/plugins \
533
    NIXL_PREFIX=/opt/intel/intel_nixl
534
535
536
537
{% elif device == "cpu" %}
ENV NIXL_LIB_DIR=/opt/nvidia/nvda_nixl/lib/x86_64-linux-gnu \
    NIXL_PLUGIN_DIR=/opt/nvidia/nvda_nixl/lib/x86_64-linux-gnu/plugins \
    NIXL_PREFIX=/opt/nvidia/nvda_nixl
538
{% else %}
539
ENV NIXL_LIB_DIR=/opt/nvidia/nvda_nixl/lib64 \
540
541
    NIXL_PLUGIN_DIR=/opt/nvidia/nvda_nixl/lib64/plugins \
    NIXL_PREFIX=/opt/nvidia/nvda_nixl
542
543
{% endif %}

544
545
546
547
548
549
ENV LD_LIBRARY_PATH=${NIXL_LIB_DIR}:${NIXL_PLUGIN_DIR}:/usr/local/ucx/lib:/usr/local/ucx/lib/ucx:${LD_LIBRARY_PATH}

RUN echo "$NIXL_LIB_DIR" > /etc/ld.so.conf.d/nixl.conf && \
    echo "$NIXL_PLUGIN_DIR" >> /etc/ld.so.conf.d/nixl.conf && \
    ldconfig

550
# Build NIXL wheel → /opt/dynamo/dist/nixl/nixl*.whl (C++ transport library, all targets)
551
ARG PYTHON_VERSION
552
553
RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \
    --mount=type=secret,id=aws-secret-id,env=AWS_SECRET_ACCESS_KEY \
554
555
    --mount=type=cache,target=/root/.cache/uv \
    export UV_CACHE_DIR=/root/.cache/uv && \
556
    export SCCACHE_S3_KEY_PREFIX="${SCCACHE_S3_KEY_PREFIX:-${TARGETARCH}}" && \
557
    if [ "$USE_SCCACHE" = "true" ]; then \
558
        eval $(/tmp/use-sccache.sh setup-env); \
559
    fi && \
560
    cd /workspace/nixl && \
561
    uv build . --wheel --out-dir /opt/dynamo/dist/nixl --python $PYTHON_VERSION
562

563
{% if target not in ("dev", "local-dev") %}
564
# Copy source code (order matters for layer caching)
565
COPY .cargo/ /opt/dynamo/.cargo/
566
567
568
569
COPY pyproject.toml README.md LICENSE Cargo.toml Cargo.lock rust-toolchain.toml hatch_build.py /opt/dynamo/
COPY lib/ /opt/dynamo/lib/
COPY components/ /opt/dynamo/components/

570
# Build kvbm wheel (with nixl linkage via auditwheel repair)
571
572
573
ARG ENABLE_KVBM
RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \
    --mount=type=secret,id=aws-secret-id,env=AWS_SECRET_ACCESS_KEY \
574
575
576
577
    --mount=type=cache,target=/root/.cargo/registry \
    --mount=type=cache,target=/root/.cargo/git \
    --mount=type=cache,target=/root/.cache/uv \
    export UV_CACHE_DIR=/root/.cache/uv && \
578
579
    export SCCACHE_S3_KEY_PREFIX=${SCCACHE_S3_KEY_PREFIX:-${TARGETARCH}} && \
    ARCH_ALT=$([ "${TARGETARCH}" = "amd64" ] && echo "x86_64" || echo "aarch64") && \
580
    if [ "$USE_SCCACHE" = "true" ]; then \
581
        eval $(/tmp/use-sccache.sh setup-env cmake); \
582
    fi && \
583
    mkdir -p ${CARGO_TARGET_DIR} && \
584
    source ${VIRTUAL_ENV}/bin/activate && \
585
    if [ "$ENABLE_KVBM" = "true" ]; then \
586
        cd /opt/dynamo/lib/bindings/kvbm && \
587
588
589
        KVBM_FEATURES=""; \
        if [ "$DEVICE" = "cuda" ]; then KVBM_FEATURES="--features nccl"; fi && \
        maturin build --release ${KVBM_FEATURES} --out target/wheels && \
590
591
592
593
594
595
596
597
598
        if [ "$DEVICE" = "cuda" ]; then \
            auditwheel repair \
                --exclude libnixl.so \
                --exclude libnixl_build.so \
                --exclude libnixl_common.so \
                --exclude 'lib*.so*' \
                --plat manylinux_2_28_${ARCH_ALT} \
                --wheel-dir /opt/dynamo/dist \
                target/wheels/*.whl; \
599
        elif [ "$DEVICE" = "xpu" ] || [ "$DEVICE" = "cpu" ]; then \
600
601
            cp target/wheels/*.whl /opt/dynamo/dist/; \
        fi; \
602
    fi && \
603
    /tmp/use-sccache.sh show-stats "Dynamo KVBM"
604
605
{% endif %}

606
607
# Consolidate all wheels from the runtime wheel builder stage
COPY --from=runtime_wheel_builder /opt/dynamo/dist/ /opt/dynamo/dist/