Dockerfile.vllm 13.3 KB
Newer Older
1
2
3
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

4
ARG BASE_IMAGE="nvcr.io/nvidia/cuda-dl-base"
5
# TODO OPS-612: NCCL will hang with 25.03, so use 25.01 for now
6
7
8
9
# Please check https://github.com/ai-dynamo/dynamo/pull/1065
# for details and reproducer to manually test if the image
# can be updated to later versions.
ARG BASE_IMAGE_TAG="25.01-cuda12.8-devel-ubuntu24.04"
10
ARG RELEASE_BUILD
11
ARG ENABLE_KVBM=false
12
13
ARG RUNTIME_IMAGE="nvcr.io/nvidia/cuda"
ARG RUNTIME_IMAGE_TAG="12.8.1-runtime-ubuntu24.04"
14

15
# Make sure to update the dependency version in pyproject.toml when updating this
Dmitry Tokarev's avatar
Dmitry Tokarev committed
16
ARG VLLM_REF="1da94e673c257373280026f75ceb4effac80e892"  # from v0.10.1.1
17
18
ARG TORCH_BACKEND="cu128"

19
20
21
22
23
# sccache configuration - inherit from base build
ARG USE_SCCACHE
ARG SCCACHE_BUCKET=""
ARG SCCACHE_REGION=""

Dmitry Tokarev's avatar
Dmitry Tokarev committed
24
25
# Match 0.10.1.1 vLLM release
# https://github.com/vllm-project/vllm/releases/tag/v0.10.1.1
26
27
# Pinned to commit before https://github.com/deepseek-ai/DeepGEMM/pull/112 for DeepGEMM which seems to break on H100:
# "RuntimeError: Failed: CUDA runtime error csrc/jit/kernel_runtime.hpp:108 '98'"
28
ARG DEEPGEMM_REF="f85ec64"
29
ARG FLASHINF_REF="v0.2.11"
30

31
32
33
34
35
36
37
38
39
40
41
42
43
44
# Define general architecture ARGs for supporting both x86 and aarch64 builds.
#   ARCH: Used for package suffixes (e.g., amd64, arm64)
#   ARCH_ALT: Used for Rust targets, manylinux suffix (e.g., x86_64, aarch64)
#
# Default values are for x86/amd64:
#   --build-arg ARCH=amd64 --build-arg ARCH_ALT=x86_64
#
# For arm64/aarch64, build with:
#   --build-arg ARCH=arm64 --build-arg ARCH_ALT=aarch64
#
# NOTE: There isn't an easy way to define one of these values based on the other value
# without adding if statements everywhere, so just define both as ARGs for now.
ARG ARCH=amd64
ARG ARCH_ALT=x86_64
45
# Python configuration
46
ARG PYTHON_VERSION=3.12
47

48
49
ARG DYNAMO_BASE_IMAGE="dynamo:latest-none"
FROM ${DYNAMO_BASE_IMAGE} AS dynamo_base
50

51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
########################################################
########## Framework Development Image ################
########################################################
#
# PURPOSE: Framework development and vLLM compilation
#
# This stage builds and compiles framework dependencies including:
# - vLLM inference engine with CUDA support
# - DeepGEMM and FlashInfer optimizations
# - All necessary build tools and compilation dependencies
# - Framework-level Python packages and extensions
#
# Use this stage when you need to:
# - Build vLLM from source with custom modifications
# - Develop or debug framework-level components
# - Create custom builds with specific optimization flags
#
68

69
70
71
# Use dynamo base image (see /container/Dockerfile for more details)
FROM ${BASE_IMAGE}:${BASE_IMAGE_TAG} AS framework

72
73
ARG PYTHON_VERSION

74
75
RUN apt-get update -y \
    && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
76
77
78
        # Python runtime - CRITICAL for virtual environment to work
        python${PYTHON_VERSION}-dev \
        build-essential \
79
80
81
82
83
84
85
86
87
88
89
        # vLLM build dependencies
        cmake \
        ibverbs-providers \
        ibverbs-utils \
        libibumad-dev \
        libibverbs-dev \
        libnuma-dev \
        librdmacm-dev \
        rdma-core \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*
90
91

### VIRTUAL ENVIRONMENT SETUP ###
92
93

COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/
94
95
96
# Create virtual environment
RUN mkdir -p /opt/dynamo/venv && \
    uv venv /opt/dynamo/venv --python $PYTHON_VERSION
97
98

# Activate virtual environment
99
100
ENV VIRTUAL_ENV=/opt/dynamo/venv \
    PATH="/opt/dynamo/venv/bin:${PATH}"
101

102
ARG ARCH
103
# Install vllm - keep this early in Dockerfile to avoid
104
# rebuilds from unrelated source code changes
105
ARG VLLM_REF
106
ARG VLLM_GIT_URL
107
108
ARG DEEPGEMM_REF
ARG FLASHINF_REF
109
ARG TORCH_BACKEND
110

111
112
ARG MAX_JOBS=16
ENV MAX_JOBS=$MAX_JOBS
113
ENV CUDA_HOME=/usr/local/cuda
114

115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
# Install sccache if requested
COPY container/use-sccache.sh /tmp/use-sccache.sh
# Install sccache if requested
ARG USE_SCCACHE
ARG ARCH_ALT
ARG SCCACHE_BUCKET
ARG SCCACHE_REGION

ENV ARCH_ALT=${ARCH_ALT}
RUN if [ "$USE_SCCACHE" = "true" ]; then \
        /tmp/use-sccache.sh install; \
    fi

# Set environment variables - they'll be empty strings if USE_SCCACHE=false
ENV SCCACHE_BUCKET=${USE_SCCACHE:+${SCCACHE_BUCKET}} \
    SCCACHE_REGION=${USE_SCCACHE:+${SCCACHE_REGION}} \
    SCCACHE_S3_KEY_PREFIX=${USE_SCCACHE:+${ARCH}} \
    CMAKE_C_COMPILER_LAUNCHER=${USE_SCCACHE:+sccache} \
    CMAKE_CXX_COMPILER_LAUNCHER=${USE_SCCACHE:+sccache} \
    CMAKE_CUDA_COMPILER_LAUNCHER=${USE_SCCACHE:+sccache}
# Install VLLM and related dependencies
136
RUN --mount=type=bind,source=./container/deps/,target=/tmp/deps \
137
    --mount=type=cache,target=/root/.cache/uv \
138
139
140
141
        # TODO - split vllm, DeepEP, DeepGeMM, PPLX installs
        # Should be able to select how you want your build to go
        cp /tmp/deps/vllm/install_vllm.sh /tmp/install_vllm.sh && \
        chmod +x /tmp/install_vllm.sh && \
142
143
        /tmp/install_vllm.sh --editable --vllm-ref $VLLM_REF --max-jobs $MAX_JOBS --arch $ARCH --installation-dir /opt --deepgemm-ref $DEEPGEMM_REF --flashinf-ref $FLASHINF_REF --torch-backend $TORCH_BACKEND && \
        /tmp/use-sccache.sh show-stats "vLLM";
144
145
146
147

ENV LD_LIBRARY_PATH=\
/opt/vllm/tools/ep_kernels/ep_kernels_workspace/nvshmem_install/lib:\
$LD_LIBRARY_PATH
148

149

150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
##################################################
########## Runtime Image ########################
##################################################
#
# PURPOSE: Production runtime environment
#
# This stage creates a lightweight production-ready image containing:
# - Pre-compiled vLLM and framework dependencies
# - Dynamo runtime libraries and Python packages
# - Essential runtime dependencies and configurations
# - Optimized for inference workloads and deployment
#
# Use this stage when you need:
# - Production deployment of Dynamo with vLLM
# - Minimal runtime footprint without build tools
# - Ready-to-run inference server environment
# - Base for custom application containers
#

FROM ${RUNTIME_IMAGE}:${RUNTIME_IMAGE_TAG} AS runtime
170

171
172
173
174
175
176
WORKDIR /workspace
ENV DYNAMO_HOME=/opt/dynamo
ENV VIRTUAL_ENV=/opt/dynamo/venv
ENV PATH="${VIRTUAL_ENV}/bin:${PATH}"

ARG ARCH_ALT
177
ARG PYTHON_VERSION
178
179
180
181
182
183
184
185
ENV NIXL_PREFIX=/opt/nvidia/nvda_nixl
ENV NIXL_LIB_DIR=$NIXL_PREFIX/lib/${ARCH_ALT}-linux-gnu
ENV NIXL_PLUGIN_DIR=$NIXL_LIB_DIR/plugins

# Install Python, build-essential and python3-dev as apt dependencies
RUN apt-get update && \
    DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
        # Python runtime - CRITICAL for virtual environment to work
186
        python${PYTHON_VERSION}-dev \
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
        build-essential \
        # jq and curl for polling various endpoints and health checks
        jq \
        curl \
        # Libraries required by UCX to find RDMA devices
        libibverbs1 rdma-core ibverbs-utils libibumad3 \
        libnuma1 librdmacm1 ibverbs-providers \
        # JIT Kernel Compilation, flashinfer
        ninja-build \
        g++ \
        # prometheus dependencies
        ca-certificates && \
    rm -rf /var/lib/apt/lists/*

# Copy CUDA development tools (nvcc, headers, dependencies, etc.) from base devel image
COPY --from=framework /usr/local/cuda/bin/nvcc /usr/local/cuda/bin/nvcc
COPY --from=framework /usr/local/cuda/bin/cudafe++ /usr/local/cuda/bin/cudafe++
COPY --from=framework /usr/local/cuda/bin/ptxas /usr/local/cuda/bin/ptxas
COPY --from=framework /usr/local/cuda/bin/fatbinary /usr/local/cuda/bin/fatbinary
COPY --from=framework /usr/local/cuda/include/ /usr/local/cuda/include/
COPY --from=framework /usr/local/cuda/nvvm /usr/local/cuda/nvvm
COPY --from=framework /usr/local/cuda/lib64/libcudart.so* /usr/local/cuda/lib64/

### COPY NATS & ETCD ###
# Copy nats and etcd from dev image
COPY --from=dynamo_base /usr/bin/nats-server /usr/bin/nats-server
COPY --from=dynamo_base /usr/local/bin/etcd/ /usr/local/bin/etcd/
# Add ETCD and CUDA binaries to PATH so cicc and other CUDA tools are accessible
ENV PATH=/usr/local/bin/etcd/:/usr/local/cuda/nvvm/bin:$PATH
216

217
218
219
220
221
222
223
224
# Install prometheus
ARG PROM_VERSION=3.4.1
RUN ARCH=$(dpkg --print-architecture) && \
    case "$ARCH" in \
        amd64) PLATFORM=linux-amd64 ;; \
        arm64) PLATFORM=linux-arm64 ;; \
        *) echo "Unsupported architecture: $ARCH" && exit 1 ;; \
    esac && \
225
    curl -fsSL "https://github.com/prometheus/prometheus/releases/download/v${PROM_VERSION}/prometheus-${PROM_VERSION}.${PLATFORM}.tar.gz" \
226
    | tar -xz -C /tmp && \
227
    mv "/tmp/prometheus-${PROM_VERSION}.${PLATFORM}/prometheus" /usr/local/bin/ && \
228
    chmod +x /usr/local/bin/prometheus && \
229
    rm -rf "/tmp/prometheus-${PROM_VERSION}.${PLATFORM}"
230

231
232
233
234
235
# Copy UCX from dev image as plugin for NIXL
# Copy NIXL source from devr image
# Copy dynamo wheels for gitlab artifacts
COPY --from=dynamo_base /usr/local/ucx /usr/local/ucx
COPY --from=dynamo_base $NIXL_PREFIX $NIXL_PREFIX
236

237
238
239
240
241
242
243
244
245
246
247
248
249
# Copies vllm, DeepEP, DeepGEMM, PPLX repos (all editable installs) and nvshmem binaries
COPY --from=framework /opt/vllm /opt/vllm

ENV LD_LIBRARY_PATH=\
/opt/vllm/tools/ep_kernels/ep_kernels_workspace/nvshmem_install/lib:\
$NIXL_LIB_DIR:\
$NIXL_PLUGIN_DIR:\
/usr/local/ucx/lib:\
/usr/local/ucx/lib/ucx:\
$LD_LIBRARY_PATH

### VIRTUAL ENVIRONMENT SETUP ###

250
251
252
# Copy uv and entire virtual environment from framework container
COPY --from=framework /bin/uv /bin/uvx /bin/
COPY --from=framework ${VIRTUAL_ENV} ${VIRTUAL_ENV}
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281

# Install dynamo, NIXL, and dynamo-specific dependencies
COPY benchmarks/ /opt/dynamo/benchmarks/
COPY --from=dynamo_base /opt/dynamo/wheelhouse/ /opt/dynamo/wheelhouse/
RUN uv pip install \
    /opt/dynamo/wheelhouse/ai_dynamo_runtime*cp312*.whl \
    /opt/dynamo/wheelhouse/ai_dynamo*any.whl \
    /opt/dynamo/wheelhouse/nixl/nixl*.whl \
    /opt/dynamo/benchmarks && \
    rm -rf /opt/dynamo/benchmarks

# Install common and test dependencies
RUN --mount=type=bind,source=./container/deps/requirements.txt,target=/tmp/requirements.txt \
    --mount=type=bind,source=./container/deps/requirements.test.txt,target=/tmp/requirements.test.txt \
    uv pip install --requirement /tmp/requirements.txt --requirement /tmp/requirements.test.txt

# Copy benchmarks, examples, and tests for CI
COPY . /workspace/

# Copy attribution files
COPY ATTRIBUTION* LICENSE /workspace/
# Copy launch banner
RUN --mount=type=bind,source=./container/launch_message.txt,target=/workspace/launch_message.txt \
    sed '/^#\s/d' /workspace/launch_message.txt > ~/.launch_screen && \
    echo "cat ~/.launch_screen" >> ~/.bashrc && \
    echo "source $VIRTUAL_ENV/bin/activate" >> ~/.bashrc

ENTRYPOINT ["/opt/nvidia/nvidia_entrypoint.sh"]
CMD []
282

283
#######################################
284
########## Local Development #######
285
#######################################
286
287
288
289
290
291
292
293
294
295
#
# PURPOSE: Local development
#
# This stage adds development tools, utilities, and dependencies specifically
# needed for:
# - Local development and debugging
# - vscode/cursor development
#
# Use this stage when you need a full development environment with additional
# tooling beyond the base runtime image.
296

297
FROM runtime AS dev
298
299
300
301
302
303
304
305
306
307

# Install utilities
RUN apt-get update -y && \
    apt-get install -y --no-install-recommends  \
    # Install utilities
    nvtop \
    wget \
    tmux \
    vim \
    git \
308
309
310
311
312
313
    openssh-client \
    iproute2 \
    rsync \
    zip \
    unzip \
    htop \
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
    # Build Dependencies
    autoconf \
    automake \
    cmake \
    libtool \
    meson \
    net-tools \
    pybind11-dev \
    # Rust build dependencies
    clang \
    libclang-dev \
    protobuf-compiler && \
    rm -rf /var/lib/apt/lists/*

# Rust environment setup
ENV RUSTUP_HOME=/usr/local/rustup \
    CARGO_HOME=/usr/local/cargo \
    CARGO_TARGET_DIR=/opt/dynamo/target \
    PATH=/usr/local/cargo/bin:$PATH

COPY --from=dynamo_base $RUSTUP_HOME $RUSTUP_HOME
COPY --from=dynamo_base $CARGO_HOME $CARGO_HOME
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355

# https://code.visualstudio.com/remote/advancedcontainers/add-nonroot-user
# Will use the default ubuntu user, but give sudo access
# Needed so files permissions aren't set to root ownership when writing from inside container

# Don't want ubuntu to be editable, just change uid and gid. User ubuntu is hardcoded in .devcontainer
ENV USERNAME=ubuntu
ARG USER_UID=1000
ARG USER_GID=1000

RUN apt-get update && apt-get install -y sudo gnupg2 gnupg1 \
    && echo "$USERNAME ALL=(root) NOPASSWD:ALL" > /etc/sudoers.d/$USERNAME \
    && chmod 0440 /etc/sudoers.d/$USERNAME \
    && mkdir -p /home/$USERNAME \
    && chown -R $USERNAME:$USERNAME /home/$USERNAME \
    && rm -rf /var/lib/apt/lists/* \
    && chsh -s /bin/bash $USERNAME

# This is a slow operation (~40s on my cpu)
# Much better than chown -R $USERNAME:$USERNAME /opt/dynamo/venv (~10min on my cpu)
356
COPY --from=runtime --chown=$USER_UID:$USER_GID ${VIRTUAL_ENV} ${VIRTUAL_ENV}
357
RUN chown $USERNAME:$USERNAME ${VIRTUAL_ENV}
358
COPY --from=runtime --chown=$USERNAME:$USERNAME /usr/local/bin /usr/local/bin
359

360
# so we can use maturin develop
361
RUN uv pip install maturin[patchelf]
362

363
364
USER $USERNAME
ENV HOME=/home/$USERNAME
365
ENV PYTHONPATH=$PYTHONPATH:$HOME/dynamo/components/planner/src
366
ENV CARGO_TARGET_DIR=$HOME/dynamo/.build/target
367
368
369
370
371
372
373
374
375
376
WORKDIR $HOME

# https://code.visualstudio.com/remote/advancedcontainers/persist-bash-history
RUN SNIPPET="export PROMPT_COMMAND='history -a' && export HISTFILE=$HOME/.commandhistory/.bash_history" \
    && mkdir -p $HOME/.commandhistory \
    && touch $HOME/.commandhistory/.bash_history \
    && echo "$SNIPPET" >> "$HOME/.bashrc"

RUN mkdir -p /home/$USERNAME/.cache/

377
ENTRYPOINT ["/opt/nvidia/nvidia_entrypoint.sh"]
378
CMD []