Dockerfile.trtllm 38.6 KB
Newer Older
1
# syntax=docker/dockerfile:1.10.0
2
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
# SPDX-License-Identifier: Apache-2.0
4
#
5
6
7
8
9
10
11
12
13
14
15
16
# NOTE FOR dynamo_base AND wheel_builder STAGES:
#
# All changes to dynamo_base and wheel_builder stages should be replicated across
# Dockerfile and Dockerfile.<framework> images.:
#   - Dockerfile
#   - Dockerfile.vllm
#   - Dockerfile.sglang
#   - Dockerfile.trtllm
# This duplication was introduced purposely to quickly enable Docker layer caching and
# deduplication. Please ensure these stages stay in sync until the duplication can be
# addressed.
#
17
18
19
20
21
22
23
24
25
26
27
28
29
30
# Throughout this file, we make certain paths group-writable because this allows
# both the dynamo user (UID 1000) and Dev Container users (UID != 1000) to work
# properly without needing slow chown -R operations (which can add 2-10 extra
# minutes).
#
# DEVELOPMENT PATHS THAT MUST BE GROUP-WRITABLE (for virtualenv containers):
#   /workspace            - Users create/modify project files
#   /home/dynamo          - Users create config/cache files
#   /opt/dynamo/venv      - TensorRT-LLM uses venv, so entire venv must be writable for pip install
#
# HOW TO ACHIEVE GROUP-WRITABLE PERMISSIONS:
# 1. SHELL + /etc/profile.d - Login shell sources umask 002 globally for all RUN commands (775/664)
# 2. COPY --chmod=775       - Sets permissions on copied children (not destination)
# 3. chmod g+w (no -R)      - Fixes destination dirs only (milliseconds vs minutes)
31

32
33
34
35
36
37
38
# This section contains build arguments that are common and shared with
# the plain Dockerfile, so they should NOT have a default. The source of truth is from build.sh.
ARG BASE_IMAGE
ARG BASE_IMAGE_TAG

ARG PYTHON_VERSION
ARG ENABLE_KVBM
39
ARG ENABLE_MEDIA_NIXL
40
ARG ENABLE_MEDIA_FFMPEG
41
ARG CARGO_BUILD_JOBS
42
43

ARG PYTORCH_BASE_IMAGE="nvcr.io/nvidia/pytorch"
44
45
46
ARG PYTORCH_BASE_IMAGE_TAG="25.10-py3"
ARG RUNTIME_IMAGE="nvcr.io/nvidia/cuda-dl-base"
ARG RUNTIME_IMAGE_TAG="25.10-cuda13.0-runtime-ubuntu24.04"
47

48
49
50
# TensorRT-LLM specific configuration
ARG HAS_TRTLLM_CONTEXT=0
ARG TENSORRTLLM_PIP_WHEEL="tensorrt-llm"
51
ARG TENSORRTLLM_INDEX_URL="https://pypi.nvidia.com/"
52
ARG GITHUB_TRTLLM_COMMIT
53

54
55
56
57
58
59
60
61
62
# SCCACHE configuration
ARG USE_SCCACHE
ARG SCCACHE_BUCKET=""
ARG SCCACHE_REGION=""

# NIXL configuration
ARG NIXL_UCX_REF
ARG NIXL_REF
ARG NIXL_GDRCOPY_REF
Joe Chandler's avatar
Joe Chandler committed
63
ARG NIXL_LIBFABRIC_REF
64

65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
# Define general architecture ARGs for supporting both x86 and aarch64 builds.
#   ARCH: Used for package suffixes (e.g., amd64, arm64)
#   ARCH_ALT: Used for Rust targets, manylinux suffix (e.g., x86_64, aarch64)
#
# Default values are for x86/amd64:
#   --build-arg ARCH=amd64 --build-arg ARCH_ALT=x86_64
#
# For arm64/aarch64, build with:
#   --build-arg ARCH=arm64 --build-arg ARCH_ALT=aarch64
#
# NOTE: There isn't an easy way to define one of these values based on the other value
# without adding if statements everywhere, so just define both as ARGs for now.
ARG ARCH=amd64
ARG ARCH_ALT=x86_64

80
# Copy artifacts from NGC PyTorch image
81
FROM ${PYTORCH_BASE_IMAGE}:${PYTORCH_BASE_IMAGE_TAG} AS pytorch_base
82

83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
##################################
########## Base Image ############
##################################

FROM ${BASE_IMAGE}:${BASE_IMAGE_TAG} AS dynamo_base

ARG ARCH
ARG ARCH_ALT

USER root
WORKDIR /opt/dynamo

# Install uv package manager
COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/

# Install NATS server
ENV NATS_VERSION="v2.10.28"
RUN --mount=type=cache,target=/var/cache/apt \
    wget --tries=3 --waitretry=5 https://github.com/nats-io/nats-server/releases/download/${NATS_VERSION}/nats-server-${NATS_VERSION}-${ARCH}.deb && \
    dpkg -i nats-server-${NATS_VERSION}-${ARCH}.deb && rm nats-server-${NATS_VERSION}-${ARCH}.deb

# Install etcd
ENV ETCD_VERSION="v3.5.21"
RUN wget --tries=3 --waitretry=5 https://github.com/etcd-io/etcd/releases/download/$ETCD_VERSION/etcd-$ETCD_VERSION-linux-${ARCH}.tar.gz -O /tmp/etcd.tar.gz && \
    mkdir -p /usr/local/bin/etcd && \
    tar -xvf /tmp/etcd.tar.gz -C /usr/local/bin/etcd --strip-components=1 && \
    rm /tmp/etcd.tar.gz
ENV PATH=/usr/local/bin/etcd/:$PATH

# Rust Setup
# Rust environment setup
ENV RUSTUP_HOME=/usr/local/rustup \
    CARGO_HOME=/usr/local/cargo \
    PATH=/usr/local/cargo/bin:$PATH \
    RUST_VERSION=1.90.0

# Define Rust target based on ARCH_ALT ARG
ARG RUSTARCH=${ARCH_ALT}-unknown-linux-gnu

# Install Rust
RUN wget --tries=3 --waitretry=5 "https://static.rust-lang.org/rustup/archive/1.28.1/${RUSTARCH}/rustup-init" && \
    chmod +x rustup-init && \
    ./rustup-init -y --no-modify-path --profile minimal --default-toolchain $RUST_VERSION --default-host ${RUSTARCH} && \
    rm rustup-init && \
    chmod -R a+w $RUSTUP_HOME $CARGO_HOME


##################################
##### Wheel Build Image ##########
##################################

# Redeclare ARCH_ALT ARG so it's available for interpolation in the FROM instruction
ARG ARCH_ALT

FROM quay.io/pypa/manylinux_2_28_${ARCH_ALT} AS wheel_builder

# Redeclare ARGs for this stage
ARG ARCH
ARG ARCH_ALT
ARG CARGO_BUILD_JOBS

WORKDIR /workspace

# Copy CUDA from base stage
COPY --from=dynamo_base /usr/local/cuda /usr/local/cuda
COPY --from=dynamo_base /etc/ld.so.conf.d/hpcx.conf /etc/ld.so.conf.d/hpcx.conf

# Set environment variables first so they can be used in COPY commands
ENV CARGO_BUILD_JOBS=${CARGO_BUILD_JOBS:-16} \
    RUSTUP_HOME=/usr/local/rustup \
    CARGO_HOME=/usr/local/cargo \
    CARGO_TARGET_DIR=/opt/dynamo/target \
    PATH=/usr/local/cargo/bin:$PATH

# Copy artifacts from base stage
COPY --from=dynamo_base $RUSTUP_HOME $RUSTUP_HOME
COPY --from=dynamo_base $CARGO_HOME $CARGO_HOME
# Install system dependencies
RUN yum groupinstall -y 'Development Tools' &&  \
    dnf install -y almalinux-release-synergy && \
    dnf config-manager --set-enabled powertools && \
    dnf install -y \
        # Build tools
        cmake \
        ninja-build \
        clang-devel \
169
170
171
172
        # Install GCC toolset 14 (CUDA compatible, max version 14)
        gcc-toolset-14-gcc \
        gcc-toolset-14-gcc-c++ \
        gcc-toolset-14-binutils \
173
174
175
176
177
178
179
180
181
182
183
184
185
186
        flex \
        wget \
        # Kernel module build dependencies
        dkms \
        # Protobuf support
        protobuf-compiler \
        # RDMA/InfiniBand support (required for UCX build with --with-verbs)
        libibverbs \
        libibverbs-devel \
        rdma-core \
        rdma-core-devel \
        libibumad \
        libibumad-devel \
        librdmacm-devel \
Joe Chandler's avatar
Joe Chandler committed
187
188
189
190
        numactl-devel \
        # Libfabric support
        hwloc \
        hwloc-devel
191

192
193
194
195
196
197
# Set GCC toolset 14 as the default compiler (CUDA requires GCC <= 14)
ENV PATH="/opt/rh/gcc-toolset-14/root/usr/bin:${PATH}" \
    LD_LIBRARY_PATH="/opt/rh/gcc-toolset-14/root/usr/lib64:${LD_LIBRARY_PATH}" \
    CC="/opt/rh/gcc-toolset-14/root/usr/bin/gcc" \
    CXX="/opt/rh/gcc-toolset-14/root/usr/bin/g++"

198

199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
# Ensure a modern protoc is available (required for --experimental_allow_proto3_optional)
RUN set -eux; \
    PROTOC_VERSION=25.3; \
    case "${ARCH_ALT}" in \
      x86_64) PROTOC_ZIP="protoc-${PROTOC_VERSION}-linux-x86_64.zip" ;; \
      aarch64) PROTOC_ZIP="protoc-${PROTOC_VERSION}-linux-aarch_64.zip" ;; \
      *) echo "Unsupported architecture: ${ARCH_ALT}" >&2; exit 1 ;; \
    esac; \
    wget --tries=3 --waitretry=5 -O /tmp/protoc.zip "https://github.com/protocolbuffers/protobuf/releases/download/v${PROTOC_VERSION}/${PROTOC_ZIP}"; \
    rm -f /usr/local/bin/protoc /usr/bin/protoc; \
    unzip -o /tmp/protoc.zip -d /usr/local bin/protoc include/*; \
    chmod +x /usr/local/bin/protoc; \
    ln -s /usr/local/bin/protoc /usr/bin/protoc; \
    protoc --version

# Point build tools explicitly at the modern protoc
ENV PROTOC=/usr/local/bin/protoc

ENV CUDA_PATH=/usr/local/cuda \
    PATH=/usr/local/cuda/bin:$PATH \
    LD_LIBRARY_PATH=/usr/local/cuda/lib64:/usr/local/lib:/usr/local/lib64:$LD_LIBRARY_PATH \
    NVIDIA_DRIVER_CAPABILITIES=video,compute,utility

# Create virtual environment for building wheels
ARG PYTHON_VERSION
ENV VIRTUAL_ENV=/workspace/.venv
RUN uv venv ${VIRTUAL_ENV} --python $PYTHON_VERSION && \
226
    uv pip install --upgrade meson pybind11 patchelf maturin[patchelf] tomlkit
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250

ARG NIXL_UCX_REF
ARG NIXL_REF
ARG NIXL_GDRCOPY_REF

# Build and install gdrcopy
RUN git clone --depth 1 --branch ${NIXL_GDRCOPY_REF} https://github.com/NVIDIA/gdrcopy.git && \
    cd gdrcopy/packages && \
    CUDA=/usr/local/cuda ./build-rpm-packages.sh && \
    rpm -Uvh gdrcopy-kmod-*.el8.noarch.rpm && \
    rpm -Uvh gdrcopy-*.el8.${ARCH_ALT}.rpm && \
    rpm -Uvh gdrcopy-devel-*.el8.noarch.rpm

# Install SCCACHE if requested
ARG USE_SCCACHE
ARG SCCACHE_BUCKET
ARG SCCACHE_REGION
COPY container/use-sccache.sh /tmp/use-sccache.sh
RUN if [ "$USE_SCCACHE" = "true" ]; then \
        /tmp/use-sccache.sh install; \
    fi

# Set SCCACHE environment variables
ENV SCCACHE_BUCKET=${USE_SCCACHE:+${SCCACHE_BUCKET}} \
251
    SCCACHE_REGION=${USE_SCCACHE:+${SCCACHE_REGION}}
252

253
# Build FFmpeg from source
milesial's avatar
milesial committed
254
# Do not delete the source tarball for legal reasons
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
ARG FFMPEG_VERSION=7.1
RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \
    --mount=type=secret,id=aws-secret-id,env=AWS_SECRET_ACCESS_KEY \
if [ "$ENABLE_MEDIA_FFMPEG" = "true" ]; then \
    export SCCACHE_S3_KEY_PREFIX=${SCCACHE_S3_KEY_PREFIX:-${ARCH}} && \
    if [ "$USE_SCCACHE" = "true" ]; then \
        export CMAKE_C_COMPILER_LAUNCHER="sccache" && \
        export CMAKE_CXX_COMPILER_LAUNCHER="sccache" && \
        export RUSTC_WRAPPER="sccache"; \
    fi && \
    dnf install -y pkg-config && \
    cd /tmp && \
    curl -LO https://ffmpeg.org/releases/ffmpeg-${FFMPEG_VERSION}.tar.xz && \
    tar xf ffmpeg-${FFMPEG_VERSION}.tar.xz && \
    cd ffmpeg-${FFMPEG_VERSION} && \
    ./configure \
        --prefix=/usr/local \
milesial's avatar
milesial committed
272
273
        --disable-gpl \
        --disable-nonfree \
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
        --disable-programs \
        --disable-doc \
        --disable-static \
        --disable-x86asm \
        --disable-postproc \
        --disable-network \
        --disable-encoders \
        --disable-muxers \
        --disable-bsfs \
        --disable-devices \
        --disable-libdrm \
        --enable-shared && \
    make -j$(nproc) && \
    make install && \
    /tmp/use-sccache.sh show-stats "FFMPEG" && \
    ldconfig && \
milesial's avatar
milesial committed
290
291
    mkdir -p /usr/local/src/ffmpeg && \
    mv /tmp/ffmpeg-${FFMPEG_VERSION}* /usr/local/src/ffmpeg/; \
292
293
fi

294
295
296
297
# Build and install UCX
RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \
    --mount=type=secret,id=aws-secret-id,env=AWS_SECRET_ACCESS_KEY \
    export SCCACHE_S3_KEY_PREFIX="${SCCACHE_S3_KEY_PREFIX:-${ARCH}}" && \
298
299
300
301
302
    if [ "$USE_SCCACHE" = "true" ]; then \
        export CMAKE_C_COMPILER_LAUNCHER="sccache" && \
        export CMAKE_CXX_COMPILER_LAUNCHER="sccache" && \
        export CMAKE_CUDA_COMPILER_LAUNCHER="sccache"; \
    fi && \
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
    cd /usr/local/src && \
     git clone https://github.com/openucx/ucx.git && \
     cd ucx && 			     \
     git checkout $NIXL_UCX_REF &&	 \
     ./autogen.sh &&      \
     ./contrib/configure-release    \
        --prefix=/usr/local/ucx     \
        --enable-shared             \
        --disable-static            \
        --disable-doxygen-doc       \
        --enable-optimizations      \
        --enable-cma                \
        --enable-devel-headers      \
        --with-cuda=/usr/local/cuda \
        --with-verbs                \
        --with-dm                   \
        --with-gdrcopy=/usr/local   \
        --with-efa                  \
        --enable-mt &&              \
     make -j &&                      \
     make -j install-strip &&        \
     /tmp/use-sccache.sh show-stats "UCX" && \
     echo "/usr/local/ucx/lib" > /etc/ld.so.conf.d/ucx.conf && \
     echo "/usr/local/ucx/lib/ucx" >> /etc/ld.so.conf.d/ucx.conf && \
     ldconfig

Joe Chandler's avatar
Joe Chandler committed
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
ARG NIXL_LIBFABRIC_REF
RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \
    --mount=type=secret,id=aws-secret-id,env=AWS_SECRET_ACCESS_KEY \
    export SCCACHE_S3_KEY_PREFIX="${SCCACHE_S3_KEY_PREFIX:-${ARCH}}" && \
    if [ "$USE_SCCACHE" = "true" ]; then \
        export CMAKE_C_COMPILER_LAUNCHER="sccache" && \
        export CMAKE_CXX_COMPILER_LAUNCHER="sccache" && \
        export CMAKE_CUDA_COMPILER_LAUNCHER="sccache"; \
    fi && \
    cd /usr/local/src && \
    git clone https://github.com/ofiwg/libfabric.git && \
    cd libfabric && \
    git checkout $NIXL_LIBFABRIC_REF && \
    ./autogen.sh && \
    ./configure --prefix="/usr/local/libfabric" \
                --disable-verbs \
                --disable-psm3 \
                --disable-opx \
                --disable-usnic \
                --disable-rstream \
                --enable-efa \
                --with-cuda=/usr/local/cuda \
                --enable-cuda-dlopen \
                --with-gdrcopy \
                --enable-gdrcopy-dlopen && \
    make -j$(nproc) && \
    make install && \
    /tmp/use-sccache.sh show-stats "LIBFABRIC" && \
    echo "/usr/local/libfabric/lib" > /etc/ld.so.conf.d/libfabric.conf && \
    ldconfig

360
361
362
363
# build and install nixl
RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \
    --mount=type=secret,id=aws-secret-id,env=AWS_SECRET_ACCESS_KEY \
    export SCCACHE_S3_KEY_PREFIX="${SCCACHE_S3_KEY_PREFIX:-${ARCH}}" && \
364
365
366
367
368
    if [ "$USE_SCCACHE" = "true" ]; then \
        export CMAKE_C_COMPILER_LAUNCHER="sccache" && \
        export CMAKE_CXX_COMPILER_LAUNCHER="sccache" && \
        export CMAKE_CUDA_COMPILER_LAUNCHER="sccache"; \
    fi && \
369
    source ${VIRTUAL_ENV}/bin/activate && \
370
    git clone "https://github.com/ai-dynamo/nixl.git" && \
371
    cd nixl && \
372
373
374
375
376
377
378
379
    git checkout ${NIXL_REF} && \
    CUDA_MAJOR=$(nvcc --version | grep -Eo 'release [0-9]+\.[0-9]+' | cut -d' ' -f2 | cut -d'.' -f1) && \
    if [ "$CUDA_MAJOR" -ne 12 ] && [ "$CUDA_MAJOR" -ne 13 ]; then \
        echo "Invalid CUDA_MAJOR: '$CUDA_MAJOR'" && \
        exit 1; \
    fi && \
    PKG_NAME="nixl-cu${CUDA_MAJOR}" && \
    ./contrib/tomlutil.py --wheel-name $PKG_NAME pyproject.toml && \
380
381
382
383
    mkdir build && \
    meson setup build/ --prefix=/opt/nvidia/nvda_nixl --buildtype=release \
    -Dcudapath_lib="/usr/local/cuda/lib64" \
    -Dcudapath_inc="/usr/local/cuda/include" \
Joe Chandler's avatar
Joe Chandler committed
384
385
    -Ducx_path="/usr/local/ucx" \
    -Dlibfabric_path="/usr/local/libfabric" && \
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
    cd build && \
    ninja && \
    ninja install && \
    /tmp/use-sccache.sh show-stats "NIXL"

ENV NIXL_LIB_DIR=/opt/nvidia/nvda_nixl/lib64  \
    NIXL_PLUGIN_DIR=/opt/nvidia/nvda_nixl/lib64/plugins \
    NIXL_PREFIX=/opt/nvidia/nvda_nixl
ENV LD_LIBRARY_PATH=${NIXL_LIB_DIR}:${NIXL_PLUGIN_DIR}:/usr/local/ucx/lib:/usr/local/ucx/lib/ucx:${LD_LIBRARY_PATH}

RUN echo "$NIXL_LIB_DIR" > /etc/ld.so.conf.d/nixl.conf && \
    echo "$NIXL_PLUGIN_DIR" >> /etc/ld.so.conf.d/nixl.conf && \
    ldconfig

RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \
    --mount=type=secret,id=aws-secret-id,env=AWS_SECRET_ACCESS_KEY \
    export SCCACHE_S3_KEY_PREFIX="${SCCACHE_S3_KEY_PREFIX:-${ARCH}}" && \
403
404
405
406
407
    if [ "$USE_SCCACHE" = "true" ]; then \
        export CMAKE_C_COMPILER_LAUNCHER="sccache" && \
        export CMAKE_CXX_COMPILER_LAUNCHER="sccache" && \
        export CMAKE_CUDA_COMPILER_LAUNCHER="sccache"; \
    fi && \
408
    cd /workspace/nixl && \
409
    uv build . --wheel --out-dir /opt/dynamo/dist/nixl --python $PYTHON_VERSION
410
411
412
413
414
415
416
417
418

# Copy source code (order matters for layer caching)
COPY pyproject.toml README.md LICENSE Cargo.toml Cargo.lock rust-toolchain.toml hatch_build.py /opt/dynamo/
COPY launch/ /opt/dynamo/launch/
COPY lib/ /opt/dynamo/lib/
COPY components/ /opt/dynamo/components/

# Build dynamo wheels
ARG ENABLE_KVBM
419
ARG USE_SCCACHE
420
421
422
RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \
    --mount=type=secret,id=aws-secret-id,env=AWS_SECRET_ACCESS_KEY \
    export SCCACHE_S3_KEY_PREFIX=${SCCACHE_S3_KEY_PREFIX:-${ARCH}} && \
423
424
425
426
427
    if [ "$USE_SCCACHE" = "true" ]; then \
        export CMAKE_C_COMPILER_LAUNCHER="sccache" && \
        export CMAKE_CXX_COMPILER_LAUNCHER="sccache" && \
        export RUSTC_WRAPPER="sccache"; \
    fi && \
428
429
430
431
    source ${VIRTUAL_ENV}/bin/activate && \
    cd /opt/dynamo && \
    uv build --wheel --out-dir /opt/dynamo/dist && \
    cd /opt/dynamo/lib/bindings/python && \
432
    FEATURES=""; \
433
    if [ "$ENABLE_MEDIA_NIXL" = "true" ]; then \
434
435
436
437
438
439
440
        FEATURES="$FEATURES dynamo-llm/media-nixl"; \
    fi; \
    if [ "$ENABLE_MEDIA_FFMPEG" = "true" ]; then \
        FEATURES="$FEATURES media-ffmpeg"; \
    fi; \
    if [ -n "$FEATURES" ]; then \
        maturin build --release --features "$FEATURES" --out /opt/dynamo/dist; \
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
    else \
        maturin build --release --out /opt/dynamo/dist; \
    fi && \
    if [ "$ENABLE_KVBM" = "true" ]; then \
        cd /opt/dynamo/lib/bindings/kvbm && \
        maturin build --release --out target/wheels && \
        auditwheel repair \
            --exclude libnixl.so \
            --exclude libnixl_build.so \
            --exclude libnixl_common.so \
            --plat manylinux_2_28_${ARCH_ALT} \
            --wheel-dir /opt/dynamo/dist \
            target/wheels/*.whl; \
    fi && \
    /tmp/use-sccache.sh show-stats "Dynamo"

457
##################################################
458
########## Framework Builder Stage ##############
459
460
##################################################
#
461
# PURPOSE: Build TensorRT-LLM with root privileges
462
#
463
464
465
466
# This stage handles TensorRT-LLM installation which requires:
# - Root access for apt operations (CUDA repos, TensorRT installation)
# - System-level modifications in install_tensorrt.sh
# - Virtual environment population with PyTorch and TensorRT-LLM
467
#
468
# The completed venv is then copied to runtime stage with dynamo ownership
469

470
FROM ${BASE_IMAGE}:${BASE_IMAGE_TAG} AS framework
471

472
ARG ARCH_ALT
473
COPY --from=dynamo_base /bin/uv /bin/uvx /bin/
474

475
# Install minimal dependencies needed for TensorRT-LLM installation
476
ARG PYTHON_VERSION
477
478
RUN apt-get update && \
    DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
479
        python${PYTHON_VERSION}-dev \
480
        python3-pip \
481
        curl \
482
483
484
        git \
        git-lfs \
        ca-certificates && \
485
    apt-get clean && \
486
487
    rm -rf /var/lib/apt/lists/*

488
489
490
# Create virtual environment
RUN mkdir -p /opt/dynamo/venv && \
    uv venv /opt/dynamo/venv --python $PYTHON_VERSION
491

492
493
494
ENV VIRTUAL_ENV=/opt/dynamo/venv \
    PATH="/opt/dynamo/venv/bin:${PATH}"

495
# Copy pytorch installation from NGC PyTorch
496
497
498
ARG TORCH_VER=2.9.0a0+145a3a7bda.nv25.10
ARG TORCH_TENSORRT_VER=2.9.0a0
ARG TORCHVISION_VER=0.24.0a0+094e7af5
499
500
ARG JINJA2_VER=3.1.6
ARG SYMPY_VER=1.14.0
501
ARG FLASH_ATTN_VER=2.7.4.post1+25.10
502

503
504
505
506
507
508
509
510
511
512
513
514
515
516
COPY --from=pytorch_base /usr/local/lib/python${PYTHON_VERSION}/dist-packages/torch ${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/torch
COPY --from=pytorch_base /usr/local/lib/python${PYTHON_VERSION}/dist-packages/torch-${TORCH_VER}.dist-info ${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/torch-${TORCH_VER}.dist-info
COPY --from=pytorch_base /usr/local/lib/python${PYTHON_VERSION}/dist-packages/torchgen ${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/torchgen
COPY --from=pytorch_base /usr/local/lib/python${PYTHON_VERSION}/dist-packages/torchvision ${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/torchvision
COPY --from=pytorch_base /usr/local/lib/python${PYTHON_VERSION}/dist-packages/torchvision-${TORCHVISION_VER}.dist-info ${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/torchvision-${TORCHVISION_VER}.dist-info
COPY --from=pytorch_base /usr/local/lib/python${PYTHON_VERSION}/dist-packages/torchvision.libs ${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/torchvision.libs
COPY --from=pytorch_base /usr/local/lib/python${PYTHON_VERSION}/dist-packages/functorch ${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/functorch
COPY --from=pytorch_base /usr/local/lib/python${PYTHON_VERSION}/dist-packages/jinja2 ${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/jinja2
COPY --from=pytorch_base /usr/local/lib/python${PYTHON_VERSION}/dist-packages/jinja2-${JINJA2_VER}.dist-info ${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/jinja2-${JINJA2_VER}.dist-info
COPY --from=pytorch_base /usr/local/lib/python${PYTHON_VERSION}/dist-packages/sympy ${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/sympy
COPY --from=pytorch_base /usr/local/lib/python${PYTHON_VERSION}/dist-packages/sympy-${SYMPY_VER}.dist-info ${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/sympy-${SYMPY_VER}.dist-info
COPY --from=pytorch_base /usr/local/lib/python${PYTHON_VERSION}/dist-packages/flash_attn ${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/flash_attn
COPY --from=pytorch_base /usr/local/lib/python${PYTHON_VERSION}/dist-packages/flash_attn-${FLASH_ATTN_VER}.dist-info ${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/flash_attn-${FLASH_ATTN_VER}.dist-info
COPY --from=pytorch_base /usr/local/lib/python${PYTHON_VERSION}/dist-packages/flash_attn_2_cuda.cpython-*-*-linux-gnu.so ${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/
517
518
COPY --from=pytorch_base /usr/local/lib/python${PYTHON_VERSION}/dist-packages/torch_tensorrt ${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/torch_tensorrt
COPY --from=pytorch_base /usr/local/lib/python${PYTHON_VERSION}/dist-packages/torch_tensorrt-${TORCH_TENSORRT_VER}.dist-info ${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/torch_tensorrt-${TORCH_TENSORRT_VER}.dist-info
519

520
521
522
523
# Install TensorRT-LLM and related dependencies
ARG HAS_TRTLLM_CONTEXT
ARG TENSORRTLLM_PIP_WHEEL
ARG TENSORRTLLM_INDEX_URL
524
ARG GITHUB_TRTLLM_COMMIT
525

526
# Copy only wheel files and commit info from trtllm_wheel stage from build_context
527
COPY --from=trtllm_wheel /*.whl /trtllm_wheel/
528
529
COPY --from=trtllm_wheel /*.txt /trtllm_wheel/

530
RUN uv pip install --no-cache "cuda-python==13.0.2"
531

532
533
534
535
# Note: TensorRT needs to be uninstalled before installing the TRTLLM wheel
# because there might be mismatched versions of TensorRT between the NGC PyTorch
# and the TRTLLM wheel.
RUN [ -f /etc/pip/constraint.txt ] && : > /etc/pip/constraint.txt || true && \
536
537
538
    # Clean up any existing conflicting CUDA repository configurations and GPG keys
    rm -f /etc/apt/sources.list.d/cuda*.list && \
    rm -f /usr/share/keyrings/cuda-archive-keyring.gpg && \
539
540
541
    rm -f /etc/apt/trusted.gpg.d/cuda*.gpg

RUN if [ "$HAS_TRTLLM_CONTEXT" = "1" ]; then \
542
        # Download and run install_tensorrt.sh from TensorRT-LLM GitHub before installing the wheel
543
        curl -fsSL --retry 5 --retry-delay 10 --max-time 1800 -o /tmp/install_tensorrt.sh "https://github.com/NVIDIA/TensorRT-LLM/raw/${GITHUB_TRTLLM_COMMIT}/docker/common/install_tensorrt.sh" && \
544
545
546
        # Modify the script to use virtual environment pip instead of system pip3
        sed -i 's/pip3 install/uv pip install/g' /tmp/install_tensorrt.sh && \
        bash /tmp/install_tensorrt.sh && \
547
        # Install from local wheel directory in build context
548
        WHEEL_FILE="$(find /trtllm_wheel -name "*.whl" | head -n 1)"; \
549
        if [ -n "$WHEEL_FILE" ]; then \
550
            uv pip install --no-cache "$WHEEL_FILE" triton==3.5.0; \
551
552
553
554
555
        else \
            echo "No wheel file found in /trtllm_wheel directory."; \
            exit 1; \
        fi; \
    else \
556
        # Download and run install_tensorrt.sh from TensorRT-LLM GitHub before installing the wheel
557
        TRTLLM_VERSION=$(echo "${TENSORRTLLM_PIP_WHEEL}" | sed -E 's/.*==([0-9a-zA-Z.+-]+).*/\1/') && \
558
559
560
561
562
        (curl -fsSL --retry 5 --retry-delay 10 --max-time 1800 -o /tmp/install_tensorrt.sh "https://github.com/NVIDIA/TensorRT-LLM/raw/v${TRTLLM_VERSION}/docker/common/install_tensorrt.sh" || \
         curl -fsSL --retry 5 --retry-delay 10 --max-time 1800 -o /tmp/install_tensorrt.sh "https://github.com/NVIDIA/TensorRT-LLM/raw/${GITHUB_TRTLLM_COMMIT}/docker/common/install_tensorrt.sh") && \
        # Modify the script to use virtual environment pip instead of system pip3
        sed -i 's/pip3 install/uv pip install/g' /tmp/install_tensorrt.sh && \
        bash /tmp/install_tensorrt.sh && \
563
        # Install TensorRT-LLM wheel from the provided index URL, allow dependencies from PyPI
Nate Mailhot's avatar
Nate Mailhot committed
564
        # TRTLLM 1.2.0rc6 has issues installing from pypi with uv, installing from direct wheel link works best
565
        # explicitly installing triton 3.5.0 as trtllm only lists triton as dependency on x64_64 for some reason
566
567
568
569
570
571
572
573
        if echo "${TENSORRTLLM_PIP_WHEEL}" | grep -q '^tensorrt-llm=='; then \
            TRTLLM_VERSION=$(echo "${TENSORRTLLM_PIP_WHEEL}" | sed -E 's/tensorrt-llm==([0-9a-zA-Z.+-]+).*/\1/'); \
            PYTHON_TAG="cp$(echo ${PYTHON_VERSION} | tr -d '.')"; \
            DIRECT_URL="https://pypi.nvidia.com/tensorrt-llm/tensorrt_llm-${TRTLLM_VERSION}-${PYTHON_TAG}-${PYTHON_TAG}-linux_${ARCH_ALT}.whl"; \
            uv pip install --no-cache --index-strategy=unsafe-best-match --extra-index-url "${TENSORRTLLM_INDEX_URL}" "${DIRECT_URL}" triton==3.5.0; \
        else \
            uv pip install --no-cache --index-strategy=unsafe-best-match --extra-index-url "${TENSORRTLLM_INDEX_URL}" "${TENSORRTLLM_PIP_WHEEL}" triton==3.5.0; \
        fi; \
574
575
    fi

576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
##################################################
########## Runtime Image ########################
##################################################
#
# PURPOSE: Production runtime environment
#
# This stage creates a lightweight production-ready image containing:
# - Pre-compiled TensorRT-LLM and framework dependencies
# - Dynamo runtime libraries and Python packages
# - Essential runtime dependencies and configurations
# - Optimized for inference workloads and deployment
#
# Use this stage when you need:
# - Production deployment of Dynamo with TensorRT-LLM
# - Minimal runtime footprint without build tools
# - Ready-to-run inference server environment
# - Base for custom application containers
#

FROM ${RUNTIME_IMAGE}:${RUNTIME_IMAGE_TAG} AS runtime

ARG ARCH_ALT
WORKDIR /workspace
ENV ENV=${ENV:-/etc/shinit_v2}
ENV VIRTUAL_ENV=/opt/dynamo/venv
601
ENV PATH="${VIRTUAL_ENV}/bin:${PATH}"
602
603
# workaround for pickle lib issue
ENV OMPI_MCA_coll_ucc_enable=0
604

605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
# Copy CUDA development tools (nvcc, headers, dependencies, etc.) from PyTorch base image
COPY --from=pytorch_base /usr/local/cuda/bin/nvcc /usr/local/cuda/bin/nvcc
COPY --from=pytorch_base /usr/local/cuda/bin/cudafe++ /usr/local/cuda/bin/cudafe++
COPY --from=pytorch_base /usr/local/cuda/bin/ptxas /usr/local/cuda/bin/ptxas
COPY --from=pytorch_base /usr/local/cuda/bin/fatbinary /usr/local/cuda/bin/fatbinary
COPY --from=pytorch_base /usr/local/cuda/include/ /usr/local/cuda/include/
COPY --from=pytorch_base /usr/local/cuda/nvvm /usr/local/cuda/nvvm
COPY --from=pytorch_base /usr/local/cuda/lib64/libcudart.so* /usr/local/cuda/lib64/
COPY --from=pytorch_base /usr/local/cuda/lib64/libcupti* /usr/local/cuda/lib64/
COPY --from=pytorch_base /usr/local/lib/lib* /usr/local/lib/
COPY --from=pytorch_base /usr/local/cuda/bin/cuobjdump /usr/local/cuda/bin/cuobjdump
COPY --from=pytorch_base /usr/local/cuda/bin/nvdisasm /usr/local/cuda/bin/nvdisasm

ENV CUDA_HOME=/usr/local/cuda \
    TRITON_CUPTI_PATH=/usr/local/cuda/include \
    TRITON_CUDACRT_PATH=/usr/local/cuda/include \
    TRITON_CUOBJDUMP_PATH=/usr/local/cuda/bin/cuobjdump \
    TRITON_NVDISASM_PATH=/usr/local/cuda/bin/nvdisasm \
    TRITON_PTXAS_PATH=/usr/local/cuda/bin/ptxas \
    TRITON_CUDART_PATH=/usr/local/cuda/include

# Copy OpenMPI from PyTorch base image
COPY --from=pytorch_base /opt/hpcx/ompi /opt/hpcx/ompi
# Copy NUMA library from PyTorch base image
COPY --from=pytorch_base /usr/lib/${ARCH_ALT}-linux-gnu/libnuma.so* /usr/lib/${ARCH_ALT}-linux-gnu/

# Copy UCX libraries, libucc.so is needed by pytorch. May not need to copy whole hpcx dir but only /opt/hpcx/ucc/
COPY --from=pytorch_base /opt/hpcx /opt/hpcx
# This is needed to make libucc.so visible so pytorch can use it.
ENV LD_LIBRARY_PATH="/opt/hpcx/ucc/lib:${LD_LIBRARY_PATH}"
# Might not need to copy cusparseLt in the future once it's included in DLFW cuda container
# networkx, packaging, setuptools get overridden by trtllm installation, so not copying them
# pytorch-triton is copied after trtllm installation.
COPY --from=pytorch_base /usr/local/cuda/lib64/libcusparseLt* /usr/local/cuda/lib64/

# Copy nats and etcd from dynamo_base image
COPY --from=dynamo_base /usr/bin/nats-server /usr/bin/nats-server
COPY --from=dynamo_base /usr/local/bin/etcd/ /usr/local/bin/etcd/
# Add ETCD and CUDA binaries to PATH so cicc and other CUDA tools are accessible
ENV PATH=/usr/local/bin/etcd/:/usr/local/cuda/nvvm/bin:$PATH

# Copy uv to system /bin
COPY --from=dynamo_base /bin/uv /bin/uvx /bin/

# Create dynamo user with group 0 for OpenShift compatibility
RUN userdel -r ubuntu > /dev/null 2>&1 || true \
    && useradd -m -s /bin/bash -g 0 dynamo \
    && [ `id -u dynamo` -eq 1000 ] \
    && mkdir -p /home/dynamo/.cache /opt/dynamo \
654
655
656
657
658
659
    # Non-recursive chown - only the directories themselves, not contents
    && chown dynamo:0 /home/dynamo /home/dynamo/.cache /opt/dynamo /workspace \
    # No chmod needed: umask 002 handles new files, COPY --chmod handles copied content
    # Set umask globally for all subsequent RUN commands (must be done as root before USER dynamo)
    # NOTE: Setting ENV UMASK=002 does NOT work - umask is a shell builtin, not an environment variable
    && mkdir -p /etc/profile.d && echo 'umask 002' > /etc/profile.d/00-umask.sh
660

661
# Install Python, build-essential and python3-dev as apt dependencies
662
ARG PYTHON_VERSION
663
664
665
666
667
668
669
670
671
672
673
674
RUN if [ ${ARCH_ALT} = "x86_64" ]; then \
        ARCH_FOR_GPG=${ARCH_ALT}; \
    else \
        ARCH_FOR_GPG="sbsa"; \
    fi && \
    curl -fsSL \
        https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/${ARCH_FOR_GPG}/cuda-archive-keyring.gpg \
        -o /usr/share/keyrings/cuda-archive-keyring.gpg &&\
    echo "deb [signed-by=/usr/share/keyrings/cuda-archive-keyring.gpg] \
        https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/${ARCH_FOR_GPG} /" \
        | tee /etc/apt/sources.list.d/cuda.repo.list > /dev/null &&\
    apt-get update && \
675
676
677
678
679
680
681
682
683
684
    DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
        # Build tools
        build-essential \
        g++ \
        ninja-build \
        git \
        git-lfs \
        # Python runtime - CRITICAL for virtual environment to work
        python${PYTHON_VERSION}-dev \
        python3-pip \
685
686
        # jq for polling various endpoints and health checks
        jq \
687
        # CUDA/ML libraries
688
689
        libcudnn9-cuda-13 \
        libnvshmem3-cuda-13 \
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
        # Network and communication libraries
        libzmq3-dev \
        # RDMA/UCX libraries required to find RDMA devices
        ibverbs-providers \
        ibverbs-utils \
        libibumad3 \
        libibverbs1 \
        libnuma1 \
        librdmacm1 \
        rdma-core \
        # OpenMPI dependencies
        openssh-client \
        openssh-server \
        # System utilities and dependencies
        curl && \
    apt-get clean && \
706
707
708
    rm -rf /var/lib/apt/lists/* && \
    # Create libnccl.so symlink pointing to libnccl.so.2. TensorRT-LLM requires explicit libnccl.so
    ln -sf /usr/lib/${ARCH_ALT}-linux-gnu/libnccl.so.2 /usr/lib/${ARCH_ALT}-linux-gnu/libnccl.so
709

710
711
712
# Switch to dynamo user
USER dynamo
ENV HOME=/home/dynamo
713
714
715
# This picks up the umask 002 from the /etc/profile.d/00-umask.sh file for subsequent RUN commands
SHELL ["/bin/bash", "-l", "-o", "pipefail", "-c"]

716
717
718
719
ENV DYNAMO_HOME=/workspace
ENV NIXL_PREFIX=/opt/nvidia/nvda_nixl
ENV NIXL_LIB_DIR=$NIXL_PREFIX/lib/${ARCH_ALT}-linux-gnu
ENV NIXL_PLUGIN_DIR=$NIXL_LIB_DIR/plugins
720
721
722
723
724

# Copy libgomp.so from framework image
COPY --from=framework /usr/local/tensorrt /usr/local/tensorrt
COPY --from=framework /usr/lib/${ARCH_ALT}-linux-gnu/libgomp.so* /usr/lib/${ARCH_ALT}-linux-gnu/

725
# Copy pre-built venv with PyTorch and TensorRT-LLM from framework stage
726
727
# Pattern: COPY --chmod=775 <path>; chmod g+w <path> done later as root because COPY --chmod only affects <path>/*, not <path>
COPY --chmod=775 --chown=dynamo:0 --from=framework ${VIRTUAL_ENV} ${VIRTUAL_ENV}
728
729
730

# Copy UCX from framework image as plugin for NIXL
# Copy NIXL source from framework image
731
# Copy dynamo wheels for gitlab artifacts (read-only, no group-write needed)
732
733
734
735
736
COPY --chown=dynamo: --from=wheel_builder /usr/local/ucx /usr/local/ucx
COPY --chown=dynamo: --from=wheel_builder $NIXL_PREFIX $NIXL_PREFIX
COPY --chown=dynamo: --from=wheel_builder /opt/nvidia/nvda_nixl/lib64/. ${NIXL_LIB_DIR}/
COPY --chown=dynamo: --from=wheel_builder /opt/dynamo/dist/nixl/ /opt/dynamo/wheelhouse/nixl/
COPY --chown=dynamo: --from=wheel_builder /workspace/nixl/build/src/bindings/python/nixl-meta/nixl-*.whl /opt/dynamo/wheelhouse/nixl/
737

738
739
740
741
742
# Copy ffmpeg
RUN --mount=type=bind,from=wheel_builder,source=/usr/local/,target=/tmp/usr/local/ \
    cp -rnL /tmp/usr/local/include/libav* /tmp/usr/local/include/libsw* /usr/local/include/; \
    cp -nL /tmp/usr/local/lib/libav*.so /tmp/usr/local/lib/libsw*.so /usr/local/lib/; \
    cp -nL /tmp/usr/local/lib/pkgconfig/libav*.pc /tmp/usr/local/lib/pkgconfig/libsw*.pc /usr/lib/pkgconfig/; \
milesial's avatar
milesial committed
743
    cp -r /tmp/usr/local/src/ffmpeg /usr/local/src/; \
744
745
    true # in case ffmpeg not enabled

746
ENV TENSORRT_LIB_DIR=/usr/local/tensorrt/targets/${ARCH_ALT}-linux-gnu/lib
747
748
749
750
751
752
753
ENV PATH="/usr/local/ucx/bin:${VIRTUAL_ENV}/bin:/opt/hpcx/ompi/bin:/usr/local/bin/etcd/:/usr/local/cuda/bin:/usr/local/cuda/nvvm/bin:$PATH"
ENV LD_LIBRARY_PATH=\
$NIXL_LIB_DIR:\
$NIXL_PLUGIN_DIR:\
/usr/local/ucx/lib:\
/usr/local/ucx/lib/ucx:\
/opt/hpcx/ompi/lib:\
754
755
756
757
/usr/lib/${ARCH_ALT}-linux-gnu/nvshmem/13/:\
$TENSORRT_LIB_DIR:\
/opt/dynamo/venv/lib/python${PYTHON_VERSION}/site-packages/torch/lib:\
/opt/dynamo/venv/lib/python${PYTHON_VERSION}/site-packages/torch_tensorrt/lib:\
758
759
/usr/local/cuda/lib:\
/usr/local/cuda/lib64:\
760
$LD_LIBRARY_PATH
761
ENV NVIDIA_DRIVER_CAPABILITIES=video,compute,utility
762
763
ENV OPAL_PREFIX=/opt/hpcx/ompi

764
765
COPY --chmod=664 --chown=dynamo:0 ATTRIBUTION* LICENSE /workspace/
COPY --chmod=775 --chown=dynamo:0 benchmarks/ /workspace/benchmarks/
766

767
# Install dynamo, NIXL, and dynamo-specific dependencies
768
# Pattern: COPY --chmod=775 <path>; chmod g+w <path> done later as root because COPY --chmod only affects <path>/*, not <path>
769
ARG ENABLE_KVBM
770
COPY --chmod=775 --chown=dynamo:0 --from=wheel_builder /opt/dynamo/dist/*.whl /opt/dynamo/wheelhouse/
771
RUN uv pip install \
772
      --no-cache \
Richard Huo's avatar
Richard Huo committed
773
774
      /opt/dynamo/wheelhouse/ai_dynamo_runtime*.whl \
      /opt/dynamo/wheelhouse/ai_dynamo*any.whl \
775
776
777
778
779
780
781
782
783
784
      /opt/dynamo/wheelhouse/nixl/nixl*.whl && \
    if [ "${ENABLE_KVBM}" = "true" ]; then \
        KVBM_WHEEL=$(ls /opt/dynamo/wheelhouse/kvbm*.whl 2>/dev/null | head -1); \
        if [ -z "$KVBM_WHEEL" ]; then \
            echo "ERROR: ENABLE_KVBM is true but no KVBM wheel found in wheelhouse" >&2; \
            exit 1; \
        fi; \
        uv pip install --no-cache "$KVBM_WHEEL"; \
    fi && \
    cd /workspace/benchmarks && \
785
786
787
    UV_GIT_LFS=1 uv pip install --no-cache . && \
    # pip/uv bypasses umask when creating .egg-info files, but chmod -R is fast here (small directory)
    chmod -R g+w /workspace/benchmarks
788
789
790
791

# Install common and test dependencies
RUN --mount=type=bind,source=./container/deps/requirements.txt,target=/tmp/requirements.txt \
    --mount=type=bind,source=./container/deps/requirements.test.txt,target=/tmp/requirements.test.txt \
792
793
    UV_GIT_LFS=1 uv pip install \
        --no-cache \
794
795
        --index-strategy unsafe-best-match \
        --extra-index-url https://download.pytorch.org/whl/cu130 \
796
        --requirement /tmp/requirements.txt \
797
798
        --requirement /tmp/requirements.test.txt \
        cupy-cuda13x
799

800
801
802
803
804
805
806
# Copy tests, deploy and components for CI with correct ownership
# Pattern: COPY --chmod=775 <path>; chmod g+w <path> done later as root because COPY --chmod only affects <path>/*, not <path>
COPY --chmod=775 --chown=dynamo:0 tests /workspace/tests
COPY --chmod=775 --chown=dynamo:0 examples /workspace/examples
COPY --chmod=775 --chown=dynamo:0 deploy /workspace/deploy
COPY --chmod=775 --chown=dynamo:0 components/ /workspace/components/
COPY --chmod=775 --chown=dynamo:0 recipes/ /workspace/recipes/
807

808
# Setup launch banner in common directory accessible to all users
809
RUN --mount=type=bind,source=./container/launch_message/runtime.txt,target=/opt/dynamo/launch_message.txt \
810
    sed '/^#\s/d' /opt/dynamo/launch_message.txt > /opt/dynamo/.launch_screen
811

812
813
# Setup environment for all users
USER root
814
815
816
817
# Fix directory permissions: COPY --chmod only affects contents, not the directory itself
RUN chmod g+w ${VIRTUAL_ENV} /workspace /workspace/* /opt/dynamo /opt/dynamo/* && \
    chown dynamo:0 ${VIRTUAL_ENV} /workspace /opt/dynamo/ && \
    chmod 755 /opt/dynamo/.launch_screen && \
818
819
820
821
    echo 'source /opt/dynamo/venv/bin/activate' >> /etc/bash.bashrc && \
    echo 'cat /opt/dynamo/.launch_screen' >> /etc/bash.bashrc

USER dynamo
822
823
824
ARG DYNAMO_COMMIT_SHA
ENV DYNAMO_COMMIT_SHA=$DYNAMO_COMMIT_SHA

825
826
ENTRYPOINT ["/opt/nvidia/nvidia_entrypoint.sh"]
CMD []
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845

###########################################################
########## Development (run.sh, runs as root user) ########
###########################################################
#
# PURPOSE: Local development environment for use with run.sh (not Dev Container plug-in)
#
# This stage runs as root and provides:
# - Development tools and utilities for local debugging
# - Support for vscode/cursor development outside the Dev Container plug-in
#
# Use this stage if you need a full-featured development environment with extra tools,
# but do not use it with the Dev Container plug-in.

FROM runtime AS dev

# Don't want ubuntu to be editable, just change uid and gid.
ARG WORKSPACE_DIR=/workspace

846
847
848
# Switch to root for system package installation
USER root

849
850
851
# Install utilities as root
RUN apt-get update -y && \
    apt-get install -y --no-install-recommends  \
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
        # Install utilities
        nvtop \
        wget \
        tmux \
        vim \
        git \
        iproute2 \
        rsync \
        zip \
        unzip \
        htop \
        # Build Dependencies
        autoconf \
        automake \
        cmake \
        libtool \
        meson \
        net-tools \
        pybind11-dev \
        # Rust build dependencies
        clang \
        libclang-dev \
874
875
        protobuf-compiler \
        pkg-config && \
876
    apt-get clean && \
877
878
    rm -rf /var/lib/apt/lists/*

879
880
881
882
# Set umask for group-writable files in dev stage (runs as root)
RUN mkdir -p /etc/profile.d && echo 'umask 002' > /etc/profile.d/00-umask.sh
SHELL ["/bin/bash", "-l", "-o", "pipefail", "-c"]

883
884
885
886
887
888
889
890
891
# Set workspace directory variable
ENV WORKSPACE_DIR=${WORKSPACE_DIR} \
    DYNAMO_HOME=${WORKSPACE_DIR} \
    RUSTUP_HOME=/usr/local/rustup \
    CARGO_HOME=/usr/local/cargo \
    CARGO_TARGET_DIR=/workspace/target \
    VIRTUAL_ENV=/opt/dynamo/venv \
    PATH=/usr/local/cargo/bin:$PATH

892
893
894
895
896
# Copy rust installation from dynamo_base to avoid duplication efforts
# Pattern: COPY --chmod=775 <path>; chmod g+w <path> because COPY --chmod only affects <path>/*, not <path>
COPY --from=dynamo_base --chmod=775 /usr/local/rustup /usr/local/rustup
COPY --from=dynamo_base --chmod=775 /usr/local/cargo /usr/local/cargo
RUN chmod g+w /usr/local/rustup /usr/local/cargo
897

898
# Install maturin, for maturin develop
899
RUN uv pip install --no-cache maturin[patchelf]
900

901
# Editable install of dynamo
902
COPY --chown=dynamo:0 --chmod=664 pyproject.toml README.md hatch_build.py /workspace/
903
RUN uv pip install --no-cache --no-deps -e .
904

Alec's avatar
Alec committed
905
CMD []