"lib/llm/src/entrypoint/input/common.rs" did not exist on "b82e7327a0c96bb9174c9564db02e969860f6afe"
Dockerfile 13.4 KB
Newer Older
1
# syntax=docker/dockerfile:1.10.0
2
3
4
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

5
6
7
8
9
##################################
########## Build Arguments ########
##################################

# Base image configuration
10
11
12
13
14
15
# Base image, split into repository and tag so either half can be overridden
# independently with --build-arg.
ARG BASE_IMAGE=nvcr.io/nvidia/cuda-dl-base
# TODO OPS-612: NCCL hangs with the 25.03 image, so stay on 25.01 for now.
# See https://github.com/ai-dynamo/dynamo/pull/1065 for details and a
# reproducer to run manually before moving to a later tag.
ARG BASE_IMAGE_TAG=25.01-cuda12.8-devel-ubuntu24.04
16
17

# Build configuration
#   CARGO_BUILD_JOBS: no default here; the wheel builder falls back to 16.
#   ENABLE_KVBM: compared against "true" when the Rust bindings are built.
ARG CARGO_BUILD_JOBS
ARG ENABLE_KVBM=false
20
21
22
23
24
25
26
27
28
29
30
31
32
33

# Define general architecture ARGs for supporting both x86 and aarch64 builds.
#   ARCH: Used for package suffixes (e.g., amd64, arm64)
#   ARCH_ALT: Used for Rust targets, manylinux suffix (e.g., x86_64, aarch64)
#
# The two values must describe the same architecture: nothing in this file
# derives one from the other, so always override them together.
#
# Default values are for x86/amd64:
#   --build-arg ARCH=amd64 --build-arg ARCH_ALT=x86_64
#
# For arm64/aarch64, build with:
#   --build-arg ARCH=arm64 --build-arg ARCH_ALT=aarch64
# TODO OPS-592: Leverage uname -m to determine ARCH instead of passing it as an arg
ARG ARCH=amd64
ARG ARCH_ALT=x86_64

34
35
36
37
38
39
40
# SCCACHE configuration
# USE_SCCACHE is tested in two different ways further down: the install steps
# run only when it equals "true", while the ENV wrappers use ${USE_SCCACHE:+...}
# and are therefore set for ANY non-empty value. Pass "true" or leave it unset.
ARG USE_SCCACHE
ARG SCCACHE_BUCKET=""
ARG SCCACHE_REGION=""

# NIXL configuration: git refs used by the NIXL and UCX build steps.
ARG NIXL_REF=0.7.0
ARG NIXL_UCX_REF=v1.19.0
42
43
44

# Python configuration
# Handed to `uv venv` and `uv build` through their --python option.
ARG PYTHON_VERSION=3.12
45
46
47
48
49
50
51

##################################
########## Base Image ############
##################################

FROM ${BASE_IMAGE}:${BASE_IMAGE_TAG} AS base

# Global ARGs are not visible inside a stage until they are redeclared there.
ARG ARCH
ARG ARCH_ALT
ARG NIXL_REF
ARG NIXL_UCX_REF
ARG PYTHON_VERSION
ARG SCCACHE_BUCKET
ARG SCCACHE_REGION
ARG USE_SCCACHE

# The remaining steps of this stage run as root from /opt/dynamo.
USER root
WORKDIR /opt/dynamo

##################################
########## Tool Installation #####
##################################
68

69
# Install uv package manager from its published image.
# Pinned to a fixed release instead of `latest` so that rebuilding the same
# commit yields the same tool; bump the tag deliberately when a newer uv is
# required.
COPY --from=ghcr.io/astral-sh/uv:0.8.0 /uv /uvx /bin/

72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
# Stage the sccache helper script; later build steps call it again for
# `show-stats`, so it is copied unconditionally.
COPY container/use-sccache.sh /tmp/use-sccache.sh
# Run the installer only when USE_SCCACHE is exactly "true".
RUN case "$USE_SCCACHE" in \
        true) /tmp/use-sccache.sh install ;; \
    esac

# Set SCCACHE environment variables
# Every value is empty unless USE_SCCACHE is non-empty (":+" expansion); in that
# case the bucket/region ARGs are forwarded and sccache becomes the rustc
# wrapper and the CMake launcher for C, C++ and CUDA.
# NOTE(review): ":+" is satisfied by any non-empty value, including "false",
# whereas the install step above only runs for "true" — confirm that callers
# never pass USE_SCCACHE=false.
ENV SCCACHE_BUCKET=${USE_SCCACHE:+${SCCACHE_BUCKET}} \
    SCCACHE_REGION=${USE_SCCACHE:+${SCCACHE_REGION}} \
    RUSTC_WRAPPER=${USE_SCCACHE:+sccache} \
    CMAKE_C_COMPILER_LAUNCHER=${USE_SCCACHE:+sccache} \
    CMAKE_CXX_COMPILER_LAUNCHER=${USE_SCCACHE:+sccache} \
    CMAKE_CUDA_COMPILER_LAUNCHER=${USE_SCCACHE:+sccache}

##################################
########## Rust Setup ############
##################################

90
91
92
93
# Rust toolchain version and install locations; cargo's bin directory is
# placed at the front of PATH.
ENV RUST_VERSION=1.90.0 \
    RUSTUP_HOME=/usr/local/rustup \
    CARGO_HOME=/usr/local/cargo \
    PATH=/usr/local/cargo/bin:${PATH}
95
96
97
98

# Define Rust target based on ARCH_ALT ARG
# (x86_64-unknown-linux-gnu with the default ARCH_ALT). The value selects the
# rustup-init download below and is passed to it as --default-host.
ARG RUSTARCH=${ARCH_ALT}-unknown-linux-gnu

99
# Install Rust: fetch the rustup 1.28.1 installer for the target triple, install
# the minimal profile of $RUST_VERSION without touching shell profiles, delete
# the installer, then make both toolchain directories writable by every user.
RUN wget --tries=3 --waitretry=5 \
        "https://static.rust-lang.org/rustup/archive/1.28.1/${RUSTARCH}/rustup-init" \
    && chmod +x rustup-init \
    && ./rustup-init -y \
        --no-modify-path \
        --profile minimal \
        --default-toolchain $RUST_VERSION \
        --default-host ${RUSTARCH} \
    && rm rustup-init \
    && chmod -R a+w $RUSTUP_HOME $CARGO_HOME

106
107
108
109
110
##################################
########## System Dependencies ###
##################################

# Install system packages needed to build NIXL and the Rust crates. The apt
# index is refreshed and removed again within the same layer.
RUN apt-get update -y && \
    DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
        # Rust build dependencies
        clang \
        libclang-dev \
        protobuf-compiler \
        # NIXL build dependencies
        autoconf \
        automake \
        cmake \
        git \
        git-lfs \
        libtool \
        meson \
        net-tools \
        ninja-build \
        pybind11-dev && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*

131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
# The hpcx installer does not provide these headers, which UCX needs in order
# to build against and use RDMA devices. --reinstall is deliberate: it recreates
# the .so -> .so.1 symlinks even when a package is already present.
RUN apt-get update -y && \
    DEBIAN_FRONTEND=noninteractive apt-get install -y --reinstall --no-install-recommends \
        ibverbs-providers \
        ibverbs-utils \
        libibumad-dev \
        libibverbs-dev \
        libnuma-dev \
        librdmacm-dev \
        rdma-core && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*

146
147
148
149
150
##################################
########## External Services #####
##################################

# Install NATS server from the upstream .deb for the target architecture.
# NATS_VERSION is an ENV, so it also remains set in images built from this stage.
ENV NATS_VERSION="v2.10.28"
# No cache mount is used: this step only runs wget and `dpkg -i` on a local
# file, so a /var/cache/apt mount would never be read or written. The package
# is downloaded to /tmp (as the etcd step does) instead of the working
# directory, and removed in the same layer.
RUN wget --tries=3 --waitretry=5 -O /tmp/nats-server.deb \
        "https://github.com/nats-io/nats-server/releases/download/${NATS_VERSION}/nats-server-${NATS_VERSION}-${ARCH}.deb" && \
    dpkg -i /tmp/nats-server.deb && \
    rm /tmp/nats-server.deb

156
# Install etcd: unpack the release tarball into the directory
# /usr/local/bin/etcd (top-level archive folder stripped), delete the tarball,
# and put that directory on PATH.
ENV ETCD_VERSION="v3.5.21"
RUN mkdir -p /usr/local/bin/etcd \
    && wget --tries=3 --waitretry=5 -O /tmp/etcd.tar.gz \
        https://github.com/etcd-io/etcd/releases/download/${ETCD_VERSION}/etcd-${ETCD_VERSION}-linux-${ARCH}.tar.gz \
    && tar -xvf /tmp/etcd.tar.gz -C /usr/local/bin/etcd --strip-components=1 \
    && rm /tmp/etcd.tar.gz
ENV PATH=/usr/local/bin/etcd/:${PATH}

164
165
166
167
168
##################################
########## UCX Build #############
##################################

# Build and install UCX at $NIXL_UCX_REF into /usr/local/ucx, register its
# library directories with the dynamic linker, and delete the source tree.
# The AWS credentials are exposed to this step only, as BuildKit secrets
# (presumably for sccache's S3 backend — confirm against use-sccache.sh).
RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \
    --mount=type=secret,id=aws-secret-id,env=AWS_SECRET_ACCESS_KEY \
    export SCCACHE_S3_KEY_PREFIX="${SCCACHE_S3_KEY_PREFIX:-${ARCH}}" && \
    # Remove any existing UCX trees before building the pinned reference.
    rm -rf /opt/hpcx/ucx && \
    rm -rf /usr/local/ucx && \
    echo "Building UCX with reference $NIXL_UCX_REF" && \
    cd /usr/local/src && \
    git clone https://github.com/openucx/ucx.git && \
    cd ucx && git checkout "$NIXL_UCX_REF" && \
    # CC/CXX become "sccache gcc" / "sccache g++" when USE_SCCACHE is non-empty
    # and are exported empty otherwise. The expansions are quoted because the
    # value contains a space: an unquoted `export CC=${CC}` is word-split by
    # shells that do not treat `export` as a declaration utility.
    export CC="${USE_SCCACHE:+sccache gcc}" && \
    export CXX="${USE_SCCACHE:+sccache g++}" && \
    ./autogen.sh && \
    ./configure \
      --prefix=/usr/local/ucx \
      --enable-shared \
      --disable-static \
      --disable-doxygen-doc \
      --enable-optimizations \
      --enable-cma \
      --enable-devel-headers \
      --with-cuda=/usr/local/cuda \
      --with-verbs \
      --with-efa \
      --with-dm \
      --with-gdrcopy=/usr/local \
      --enable-mt && \
    make -j$(nproc) && \
    make -j$(nproc) install-strip && \
    /tmp/use-sccache.sh show-stats "UCX" && \
    echo "/usr/local/ucx/lib" > /etc/ld.so.conf.d/ucx.conf && \
    echo "/usr/local/ucx/lib/ucx" >> /etc/ld.so.conf.d/ucx.conf && \
    ldconfig && \
    cd /usr/local/src && \
    rm -rf ucx

# UCX environment variables: header and pkg-config search paths, plus /usr/bin
# and the UCX tools directory at the front of PATH.
ENV PKG_CONFIG_PATH=/usr/lib/pkgconfig \
    CPATH=/usr/include \
    PATH=/usr/bin:/usr/local/ucx/bin:${PATH}
210

211
212
213
214
215
216
217
218
219
220
221
##################################
########## NIXL Setup ############
##################################

# NIXL environment setup
#   NIXL_SRC_DIR:    where the NIXL repository is cloned
#   NIXL_PREFIX:     meson install prefix
#   NIXL_LIB_DIR:    architecture-specific library directory under the prefix
#   NIXL_PLUGIN_DIR: plugins directory beneath NIXL_LIB_DIR
# The last two are written to /etc/ld.so.conf.d/nixl.conf by the build step.
ENV NIXL_SRC_DIR=/opt/nixl \
    NIXL_PREFIX=/opt/nvidia/nvda_nixl \
    NIXL_LIB_DIR=/opt/nvidia/nvda_nixl/lib/${ARCH_ALT}-linux-gnu \
    NIXL_PLUGIN_DIR=/opt/nvidia/nvda_nixl/lib/${ARCH_ALT}-linux-gnu/plugins

# Build and install NIXL: shallow-clone the ${NIXL_REF} ref, configure a release
# build with meson, build and install with ninja, then register the library and
# plugin directories with the dynamic linker.
RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \
    --mount=type=secret,id=aws-secret-id,env=AWS_SECRET_ACCESS_KEY \
    git clone --depth 1 --branch ${NIXL_REF} "https://github.com/ai-dynamo/nixl.git" ${NIXL_SRC_DIR} && \
    cd ${NIXL_SRC_DIR} && \
    # The GDS backend is disabled on arm64 only; other architectures pass no extra option.
    nixl_build_args="" && \
    if [ "$ARCH" = "arm64" ]; then nixl_build_args="-Ddisable_gds_backend=true"; fi && \
    meson setup build/ --buildtype=release --prefix=$NIXL_PREFIX $nixl_build_args && \
    ninja -C build/ -j$(nproc) && \
    ninja -C build/ install && \
    /tmp/use-sccache.sh show-stats "NIXL" && \
    printf '%s\n' "$NIXL_LIB_DIR" "$NIXL_PLUGIN_DIR" > /etc/ld.so.conf.d/nixl.conf && \
    ldconfig

238
# Build the NIXL Python wheel into /opt/dynamo/wheelhouse/nixl.
# TODO OPS-590: Move gds_path selection based on arch into NIXL build and re-enable gds backend for arm64
RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \
    --mount=type=secret,id=aws-secret-id,env=AWS_SECRET_ACCESS_KEY \
    cd ${NIXL_SRC_DIR} && \
    # The positional parameters carry the arm64-only meson setting and stay
    # empty on every other architecture.
    set -- && \
    if [ "$ARCH" = "arm64" ]; then \
        set -- --config-settings=setup-args="-Ddisable_gds_backend=true"; \
    fi && \
    uv build . --out-dir /opt/dynamo/wheelhouse/nixl --python $PYTHON_VERSION "$@"

249
250
251
252
253
254
##################################
########## Python Environment ####
##################################

# Create the virtual environment at /opt/dynamo/venv with the requested Python
# version; the ENV that follows puts it on PATH.
ARG PYTHON_VERSION
RUN mkdir -p /opt/dynamo/venv \
    && uv venv /opt/dynamo/venv --python $PYTHON_VERSION
257
258
259
260
261
262
263

# "Activate" the virtual environment for all later instructions and for the
# resulting image: VIRTUAL_ENV points at it and its bin directory leads PATH.
ENV VIRTUAL_ENV=/opt/dynamo/venv \
    PATH="/opt/dynamo/venv/bin:${PATH}"

# Install common and test dependencies. The requirement files are bind-mounted
# from the build context for this step only, so they never land in a layer.
RUN --mount=type=bind,source=./container/deps/requirements.txt,target=/tmp/requirements.txt \
    --mount=type=bind,source=./container/deps/requirements.test.txt,target=/tmp/requirements.test.txt \
    export UV_GIT_LFS=1 && \
    uv pip install --no-cache \
        -r /tmp/requirements.txt \
        -r /tmp/requirements.test.txt
268
269
270
271
272
273
274
275
276
277

##################################
##### Wheel Build Image ##########
##################################

# ARCH_ALT is interpolated here from the global ARG declared before the first
# FROM; global ARGs are available to every FROM instruction. A redeclaration
# inside the preceding stage has no effect on this line (and ARCH_ALT is already
# redeclared at the top of that stage), so none is made here.
FROM quay.io/pypa/manylinux_2_28_${ARCH_ALT} AS wheel_builder

278
279
280
# Bring the global build arguments used by this stage into scope.
ARG ARCH
ARG ARCH_ALT
ARG CARGO_BUILD_JOBS
ARG ENABLE_KVBM
ARG SCCACHE_BUCKET
ARG SCCACHE_REGION
ARG USE_SCCACHE

WORKDIR /opt/dynamo

289
290
291
# Build environment for the wheel stage. CARGO_BUILD_JOBS falls back to 16 when
# the build argument is unset or empty; the toolchain, NIXL and venv paths match
# the locations used in the base stage.
ENV CARGO_BUILD_JOBS=${CARGO_BUILD_JOBS:-16} \
    CARGO_HOME=/usr/local/cargo \
    CARGO_TARGET_DIR=/opt/dynamo/target \
    RUSTUP_HOME=/usr/local/rustup \
    NIXL_PREFIX=/opt/nvidia/nvda_nixl \
    VIRTUAL_ENV=/opt/dynamo/venv \
    PATH=/usr/local/cargo/bin:/opt/dynamo/venv/bin:${PATH}

# Install system dependencies for the wheel build and clear the dnf caches in
# the same layer.
RUN dnf update -y && \
    dnf install -y \
        llvm-toolset \
        protobuf-compiler \
        unzip \
        wget && \
    dnf clean all && \
    rm -rf /var/cache/dnf
303

304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
# Ensure a modern protoc is available (required for --experimental_allow_proto3_optional).
# Downloads the upstream release archive for ARCH_ALT, replaces any existing
# protoc binaries, installs the binary and bundled includes under /usr/local,
# and links /usr/bin/protoc to the new binary. The build fails for any
# architecture other than x86_64 or aarch64.
RUN set -eux; \
    PROTOC_VERSION=25.3; \
    case "${ARCH_ALT}" in \
      x86_64) PROTOC_ZIP="protoc-${PROTOC_VERSION}-linux-x86_64.zip" ;; \
      aarch64) PROTOC_ZIP="protoc-${PROTOC_VERSION}-linux-aarch_64.zip" ;; \
      *) echo "Unsupported architecture: ${ARCH_ALT}" >&2; exit 1 ;; \
    esac; \
    wget --tries=3 --waitretry=5 -O /tmp/protoc.zip "https://github.com/protocolbuffers/protobuf/releases/download/v${PROTOC_VERSION}/${PROTOC_ZIP}"; \
    rm -f /usr/local/bin/protoc /usr/bin/protoc; \
    # The member pattern is quoted so that unzip, not the shell, expands it;
    # unquoted, it would be globbed against the working directory first.
    unzip -o /tmp/protoc.zip -d /usr/local bin/protoc 'include/*'; \
    # Remove the archive in the same layer so it does not stay in the image.
    rm -f /tmp/protoc.zip; \
    chmod +x /usr/local/bin/protoc; \
    ln -s /usr/local/bin/protoc /usr/bin/protoc; \
    protoc --version

# Point build tools explicitly at the modern protoc
ENV PROTOC=/usr/local/bin/protoc

322
# Bring the Rust toolchain (rustup + cargo homes) and the installed NIXL prefix
# over from the base stage, keeping the same paths.
COPY --from=base ${RUSTUP_HOME} ${RUSTUP_HOME}
COPY --from=base ${CARGO_HOME} ${CARGO_HOME}
COPY --from=base ${NIXL_PREFIX} ${NIXL_PREFIX}
326
327
328
329
330
331
332
333

# Create this stage's own virtual environment with the requested Python version.
ARG PYTHON_VERSION
RUN mkdir -p /opt/dynamo/venv \
    && uv venv /opt/dynamo/venv --python $PYTHON_VERSION

# Same VIRTUAL_ENV value and PATH entry as the stage-level ENV earlier in this
# stage; the venv bin directory is prepended to PATH once more here.
ENV PATH="/opt/dynamo/venv/bin:${PATH}" \
    VIRTUAL_ENV=/opt/dynamo/venv

334

335
336
337
338
339
340
341
342
343
344
# Install SCCACHE if requested
# The helper script is copied unconditionally because the wheel build step
# calls it for `show-stats`; the installer runs only for USE_SCCACHE="true".
COPY container/use-sccache.sh /tmp/use-sccache.sh
RUN if [ "$USE_SCCACHE" = "true" ]; then \
        /tmp/use-sccache.sh install; \
    fi

# Set SCCACHE environment variables
# Values are empty unless USE_SCCACHE is non-empty (":+" expansion). Only the
# rustc wrapper is set in this stage; no CMake launchers are configured here.
# NOTE(review): ":+" also fires for USE_SCCACHE=false, while the install above
# requires "true" — confirm callers never pass "false".
ENV SCCACHE_BUCKET=${USE_SCCACHE:+${SCCACHE_BUCKET}} \
    SCCACHE_REGION=${USE_SCCACHE:+${SCCACHE_REGION}} \
    RUSTC_WRAPPER=${USE_SCCACHE:+sccache}
345

346
347
# Copy source code (order matters for layer caching): project metadata and
# manifests first, then the lib/ and components/ source trees.
COPY pyproject.toml \
     README.md \
     LICENSE \
     Cargo.toml \
     Cargo.lock \
     rust-toolchain.toml \
     hatch_build.py \
     /opt/dynamo/
COPY lib/ /opt/dynamo/lib/
COPY components/ /opt/dynamo/components/

351
# Build wheels into /opt/dynamo/dist:
#   1. the top-level project wheel via `uv build`;
#   2. the Rust Python bindings via maturin, with the `block-manager` feature
#      when ENABLE_KVBM is "true".
# AWS credentials are exposed to this step only, as BuildKit secrets.
RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \
    --mount=type=secret,id=aws-secret-id,env=AWS_SECRET_ACCESS_KEY \
    export SCCACHE_S3_KEY_PREFIX=${SCCACHE_S3_KEY_PREFIX:-${ARCH}} && \
    uv build --wheel --out-dir /opt/dynamo/dist && \
    cd /opt/dynamo/lib/bindings/python && \
    # Quoted: unquoted, `maturin[patchelf]` is a shell glob (bracket expression)
    # and could expand to a matching file name instead of the requirement.
    uv pip install "maturin[patchelf]" && \
    if [ "$ENABLE_KVBM" = "true" ]; then \
        maturin build --release --features block-manager --out /opt/dynamo/dist; \
    else \
        maturin build --release --out /opt/dynamo/dist; \
    fi && \
    /tmp/use-sccache.sh show-stats "Dynamo"
364
365
366
367

##############################################
########## Dev entrypoint image ##############
##############################################
368

369
370
371
372
FROM base AS dev

# Application environment variables: the project home and the cargo target
# directory that receives the build output copied from wheel_builder.
ENV CARGO_TARGET_DIR=/opt/dynamo/target \
    DYNAMO_HOME=/opt/dynamo

WORKDIR /opt/dynamo

377
# Copy built artifacts from the wheel builder: the wheels go to the wheelhouse,
# and the cargo target directory and cargo home keep their paths.
COPY --from=wheel_builder /opt/dynamo/dist/*.whl /opt/dynamo/wheelhouse/
COPY --from=wheel_builder ${CARGO_TARGET_DIR} ${CARGO_TARGET_DIR}
COPY --from=wheel_builder ${CARGO_HOME} ${CARGO_HOME}

382
# Install Python packages: the runtime, dynamo and NIXL wheels from the
# wheelhouse, then the benchmarks project from source. The benchmarks sources
# are deleted in the same RUN once installed.
COPY benchmarks/ /opt/dynamo/benchmarks/
RUN uv pip install \
        /opt/dynamo/wheelhouse/ai_dynamo_runtime*.whl \
        /opt/dynamo/wheelhouse/ai_dynamo*any.whl \
        /opt/dynamo/wheelhouse/nixl/nixl*.whl && \
    cd /opt/dynamo/benchmarks && \
    export UV_GIT_LFS=1 && \
    uv pip install --no-cache . && \
    cd - && \
    rm -rf /opt/dynamo/benchmarks
392

393
# Setup launch banner: write the launch message, minus lines that start with
# "#" followed by whitespace, to ~/.launch_screen and have ~/.bashrc print it.
# The message file is bind-mounted for this step only.
RUN --mount=type=bind,source=./container/launch_message.txt,target=/opt/dynamo/launch_message.txt \
    sed '/^#\s/d' /opt/dynamo/launch_message.txt > ~/.launch_screen \
    && echo "cat ~/.launch_screen" >> ~/.bashrc

# Start through the entrypoint script at /opt/nvidia/nvidia_entrypoint.sh
# (exec form), with no default arguments; `docker run` arguments are passed to
# it unchanged.
ENTRYPOINT ["/opt/nvidia/nvidia_entrypoint.sh"]
CMD []