"vscode:/vscode.git/clone" did not exist on "7ac48fd3577f35e235ef96e690c3fc9b847fd26a"
# Dockerfile.rocm_base
# Pinned sources for the components built in the stages below.
# Each *_REPO is passed to `git clone` and each *_BRANCH to `git checkout`
# in the matching build stage. These ARGs are declared before the first FROM,
# so a stage has to re-declare an ARG (without a value) before it can read it.
ARG BASE_IMAGE=rocm/dev-ubuntu-22.04:7.0-complete
ARG TRITON_BRANCH="57c693b6"
ARG TRITON_REPO="https://github.com/ROCm/triton.git"
ARG PYTORCH_BRANCH="89075173"
ARG PYTORCH_REPO="https://github.com/ROCm/pytorch.git"
ARG PYTORCH_VISION_BRANCH="v0.24.1"
ARG PYTORCH_VISION_REPO="https://github.com/pytorch/vision.git"
ARG PYTORCH_AUDIO_BRANCH="v2.9.0"
ARG PYTORCH_AUDIO_REPO="https://github.com/pytorch/audio.git"
ARG FA_BRANCH="0e60e394"
ARG FA_REPO="https://github.com/Dao-AILab/flash-attention.git"
ARG AITER_BRANCH="6af8b687"
ARG AITER_REPO="https://github.com/ROCm/aiter.git"
ARG MORI_BRANCH="2d02c6a9"
ARG MORI_REPO="https://github.com/ROCm/mori.git"

# Sccache configuration (only used in release pipeline)
# USE_SCCACHE: the RUN steps in this file compare it against the exact value "1".
# SCCACHE_DOWNLOAD_URL: optional override for the sccache tarball location.
# SCCACHE_ENDPOINT: declared here but not referenced by any instruction in the
#   part of the file shown here - NOTE(review): confirm whether it is still needed.
# The bucket/region/no-credentials values feed the SCCACHE_* ENV settings of
# the base stage.
ARG USE_SCCACHE
ARG SCCACHE_DOWNLOAD_URL
ARG SCCACHE_ENDPOINT
ARG SCCACHE_BUCKET_NAME=vllm-build-sccache
ARG SCCACHE_REGION_NAME=us-west-2
ARG SCCACHE_S3_NO_CREDENTIALS=0

# Common base stage: every build stage and the final image start from it.
FROM ${BASE_IMAGE} AS base

# Put the ROCm LLVM and ROCm tool directories ahead of the system directories.
ENV PATH=/opt/rocm/llvm/bin:/opt/rocm/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
ENV ROCM_PATH=/opt/rocm
# No trailing ':' here: an empty LD_LIBRARY_PATH entry is interpreted by the
# dynamic loader as the current working directory.
ENV LD_LIBRARY_PATH=/opt/rocm/lib:/usr/local/lib

# Semicolon-separated GPU targets. Exported as ENV so the shells of later
# RUN steps can read them (the FlashAttention and AITER builds do).
ARG PYTORCH_ROCM_ARCH=gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151
ENV PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH}
ENV AITER_ROCM_ARCH=gfx942;gfx950
ENV MORI_GPU_ARCHS=gfx942;gfx950

# Required for RCCL in ROCm7.1
ENV HSA_NO_SCRATCH_RECLAIM=1

# Python minor version installed below; overridable with --build-arg.
ARG PYTHON_VERSION=3.12
ENV PYTHON_VERSION=${PYTHON_VERSION}

# All later stages clone and build underneath /app.
RUN mkdir -p /app
WORKDIR /app
ENV DEBIAN_FRONTEND=noninteractive

# Install Python and other dependencies
# Steps, in order:
#   1. base tooling from the distribution archive
#   2. add the deadsnakes PPA, retrying up to three times with a 5s pause
#      (if all three attempts fail the loop still ends with status 0, so the
#      failure would only surface in a later step of this chain)
#   3. install the requested interpreter and make it the default python3
#   4. bootstrap pip for that interpreter and print both versions as a check
RUN apt-get update -y \
    && apt-get install -y software-properties-common git curl sudo vim less libgfortran5 libopenmpi-dev libpci-dev \
    && for i in 1 2 3; do \
        add-apt-repository -y ppa:deadsnakes/ppa && break || \
        { echo "Attempt $i failed, retrying in 5s..."; sleep 5; }; \
    done \
    && apt-get update -y \
    && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \
       python${PYTHON_VERSION}-lib2to3 python-is-python3  \
    && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \
    && update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \
    && ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \
    && curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \
    && python3 --version && python3 -m pip --version

# Python-side build tooling shared by the wheel builds below
# (cmake is held below 4 and setuptools below 80).
RUN pip install -U packaging 'cmake<4' ninja wheel 'setuptools<80' pybind11 Cython
# JPEG and SoX development packages; the apt lists are removed in the same
# layer so they do not stay in the image.
RUN apt-get update && apt-get install -y libjpeg-dev libsox-dev libsox-fmt-all sox && rm -rf /var/lib/apt/lists/*

# Install sccache if USE_SCCACHE is enabled (for release builds)
# The ARGs are re-declared so this stage can read the values declared before
# the first FROM.
ARG USE_SCCACHE
ARG SCCACHE_DOWNLOAD_URL
ARG SCCACHE_ENDPOINT
ARG SCCACHE_BUCKET_NAME
ARG SCCACHE_REGION_NAME
ARG SCCACHE_S3_NO_CREDENTIALS
# Downloads the release tarball (or SCCACHE_DOWNLOAD_URL when given), moves the
# binary to /usr/bin and removes the temporary files in the same layer.
# curl runs with --fail so an HTTP error aborts the step at the download
# instead of handing an error page to tar; the URL is quoted so it reaches
# curl as a single argument.
# NOTE(review): a custom SCCACHE_DOWNLOAD_URL must unpack to the same
# sccache-<version>-<arch>-unknown-linux-musl directory for the mv to work.
RUN if [ "$USE_SCCACHE" = "1" ]; then \
        echo "Installing sccache..." \
        && SCCACHE_ARCH="x86_64" \
        && SCCACHE_VERSION="v0.8.1" \
        && SCCACHE_DL_URL="${SCCACHE_DOWNLOAD_URL:-https://github.com/mozilla/sccache/releases/download/${SCCACHE_VERSION}/sccache-${SCCACHE_VERSION}-${SCCACHE_ARCH}-unknown-linux-musl.tar.gz}" \
        && curl -fL -o /tmp/sccache.tar.gz "${SCCACHE_DL_URL}" \
        && tar -xzf /tmp/sccache.tar.gz -C /tmp \
        && mv /tmp/sccache-${SCCACHE_VERSION}-${SCCACHE_ARCH}-unknown-linux-musl/sccache /usr/bin/sccache \
        && chmod +x /usr/bin/sccache \
        && rm -rf /tmp/sccache.tar.gz /tmp/sccache-${SCCACHE_VERSION}-${SCCACHE_ARCH}-unknown-linux-musl \
        && sccache --version; \
    fi

# sccache for HIP compilation, wired in through HIP_CLANG_PATH.
# Two small launcher scripts (clang++ and clang) are written to
# /opt/sccache-wrappers; each one re-executes the ROCm compiler of the same
# name through sccache. The ROCm binaries themselves are left untouched, since
# modifying them can break detection.
# HIP_CLANG_PATH is deliberately NOT set as ENV so downstream images
# (Dockerfile.rocm) are unaffected; a build step that wants the wrappers
# exports HIP_CLANG_PATH=/opt/sccache-wrappers itself when USE_SCCACHE=1.
RUN if [ "$USE_SCCACHE" = "1" ]; then \
        echo "Setting up sccache wrappers for HIP compilation..." \
        && mkdir -p /opt/sccache-wrappers \
        && for tool in clang++ clang; do \
               printf '#!/bin/bash\nexec sccache /opt/rocm/lib/llvm/bin/%s "$@"\n' "$tool" > "/opt/sccache-wrappers/$tool" \
               && chmod +x "/opt/sccache-wrappers/$tool" \
               || exit 1; \
           done \
        && echo "sccache wrappers created in /opt/sccache-wrappers"; \
    fi

# sccache runtime settings, populated only when USE_SCCACHE carries a value.
# ${USE_SCCACHE:+word} expands to word when USE_SCCACHE is set and non-empty
# and to the empty string otherwise, which keeps the S3 settings out of images
# built without sccache.
ARG USE_SCCACHE
ENV SCCACHE_BUCKET=${USE_SCCACHE:+${SCCACHE_BUCKET_NAME}} \
    SCCACHE_REGION=${USE_SCCACHE:+${SCCACHE_REGION_NAME}} \
    SCCACHE_S3_NO_CREDENTIALS=${USE_SCCACHE:+${SCCACHE_S3_NO_CREDENTIALS}} \
    SCCACHE_IDLE_TIMEOUT=${USE_SCCACHE:+0}


###
### Triton Build
###
FROM base AS build_triton
ARG TRITON_BRANCH
ARG TRITON_REPO
RUN git clone ${TRITON_REPO}
# Build the Triton wheel. setup.py is looked for at the repository root first
# and under python/ otherwise. The wheel is copied to /app/install.
RUN cd triton \
    && git checkout ${TRITON_BRANCH} \
    && if [ ! -f setup.py ]; then cd python; fi \
    && python3 setup.py bdist_wheel --dist-dir=dist \
    && mkdir -p /app/install && cp dist/*.whl /app/install
# triton_kernels is built as a second wheel only when the checkout contains it.
RUN if [ -d triton/python/triton_kernels ]; then pip install build && cd triton/python/triton_kernels \
    && python3 -m build --wheel && cp dist/*.whl /app/install; fi

###
### AMD SMI Build
###
FROM base AS build_amdsmi
# Build a wheel from the amd_smi sources found under /opt/rocm/share/amd_smi
# in the base image, then stage it in /app/install.
RUN cd /opt/rocm/share/amd_smi \
    && pip wheel . --wheel-dir=dist
RUN mkdir -p /app/install && cp /opt/rocm/share/amd_smi/dist/*.whl /app/install


###
### Pytorch build
###
FROM base AS build_pytorch
ARG PYTORCH_BRANCH
ARG PYTORCH_VISION_BRANCH
ARG PYTORCH_AUDIO_BRANCH
ARG PYTORCH_REPO
ARG PYTORCH_VISION_REPO
ARG PYTORCH_AUDIO_REPO
ARG USE_SCCACHE

RUN git clone ${PYTORCH_REPO} pytorch
# Build and install the torch wheel.
# When USE_SCCACHE=1 the HIP compiler is redirected to the sccache wrappers and
# sccache is used as the CMake C/C++ compiler launcher. The exports happen
# inside this RUN's shell, so they apply to the setup.py call that follows and
# do not persist into the image. Cache statistics are printed before and after.
RUN cd pytorch && git checkout ${PYTORCH_BRANCH} \
    && pip install -r requirements.txt && git submodule update --init --recursive \
    && python3 tools/amd_build/build_amd.py \
    && if [ "$USE_SCCACHE" = "1" ]; then \
           export HIP_CLANG_PATH=/opt/sccache-wrappers \
           && export CMAKE_C_COMPILER_LAUNCHER=sccache \
           && export CMAKE_CXX_COMPILER_LAUNCHER=sccache \
           && sccache --show-stats; \
       fi \
    && CMAKE_PREFIX_PATH=$(python3 -c 'import sys; print(sys.prefix)') python3 setup.py bdist_wheel --dist-dir=dist \
    && if [ "$USE_SCCACHE" = "1" ]; then sccache --show-stats; fi \
    && pip install dist/*.whl
RUN git clone ${PYTORCH_VISION_REPO} vision
# Build and install the torchvision wheel, with the same optional sccache
# setup as the torch build (exports are local to this RUN's shell).
RUN cd vision && git checkout ${PYTORCH_VISION_BRANCH} \
    && if [ "$USE_SCCACHE" = "1" ]; then \
           export HIP_CLANG_PATH=/opt/sccache-wrappers \
           && export CMAKE_C_COMPILER_LAUNCHER=sccache \
           && export CMAKE_CXX_COMPILER_LAUNCHER=sccache; \
       fi \
    && python3 setup.py bdist_wheel --dist-dir=dist \
    && if [ "$USE_SCCACHE" = "1" ]; then sccache --show-stats; fi \
    && pip install dist/*.whl
RUN git clone ${PYTORCH_AUDIO_REPO} audio
# Build and install the torchaudio wheel: check out the pinned ref, fetch
# submodules, install the Python requirements, then build with the same
# optional sccache setup as the torch build.
RUN cd audio && git checkout ${PYTORCH_AUDIO_BRANCH} \
    && git submodule update --init --recursive \
    && pip install -r requirements.txt \
    && if [ "$USE_SCCACHE" = "1" ]; then \
           export HIP_CLANG_PATH=/opt/sccache-wrappers \
           && export CMAKE_C_COMPILER_LAUNCHER=sccache \
           && export CMAKE_CXX_COMPILER_LAUNCHER=sccache; \
       fi \
    && python3 setup.py bdist_wheel --dist-dir=dist \
    && if [ "$USE_SCCACHE" = "1" ]; then sccache --show-stats; fi \
    && pip install dist/*.whl
# Stage the torch, torchvision and torchaudio wheels in /app/install, the
# directory that later stages bind-mount from this stage.
RUN mkdir -p /app/install && cp /app/pytorch/dist/*.whl /app/install \
    && cp /app/vision/dist/*.whl /app/install \
    && cp /app/audio/dist/*.whl /app/install

###
### MORI Build
###
FROM base AS build_mori
ARG MORI_BRANCH
ARG MORI_REPO
# Install the wheels produced by the PyTorch stage before building MORI.
RUN --mount=type=bind,from=build_pytorch,src=/app/install/,target=/install \
    pip install /install/*.whl
RUN git clone ${MORI_REPO}
# Check out the pinned ref, fetch submodules, build the wheel and list it so
# the step fails if no wheel was produced.
RUN git -C mori checkout ${MORI_BRANCH} \
    && git -C mori submodule update --init --recursive \
    && cd mori \
    && python3 setup.py bdist_wheel --dist-dir=dist \
    && ls /app/mori/dist/*.whl
RUN mkdir -p /app/install \
    && cp /app/mori/dist/*.whl /app/install


###
### RIXL Build
###
# Starts from build_pytorch, so the torch/vision/audio wheels are already
# installed and already present in /app/install.
FROM build_pytorch AS build_rixl
ARG RIXL_BRANCH
ARG RIXL_REPO
ARG ETCD_BRANCH
ARG ETCD_REPO
ARG UCX_BRANCH
ARG UCX_REPO

# Fail fast when a source pin is missing. The part of the file shown here
# declares no default for these six ARGs, and an empty *_BRANCH would turn the
# later `git checkout` into a no-op that silently builds the default branch.
RUN : "${RIXL_REPO:?RIXL_REPO build-arg must be set}" \
      "${RIXL_BRANCH:?RIXL_BRANCH build-arg must be set}" \
      "${ETCD_REPO:?ETCD_REPO build-arg must be set}" \
      "${ETCD_BRANCH:?ETCD_BRANCH build-arg must be set}" \
      "${UCX_REPO:?UCX_REPO build-arg must be set}" \
      "${UCX_BRANCH:?UCX_BRANCH build-arg must be set}"

# Install prefixes used by the UCX and RIXL builds below.
ENV ROCM_PATH=/opt/rocm
ENV UCX_HOME=/usr/local/ucx
ENV RIXL_HOME=/usr/local/rixl
ENV RIXL_BENCH_HOME=/usr/local/rixl_bench

# RIXL build system dependencies and RDMA support
RUN apt-get -y update && apt-get -y install autoconf libtool pkg-config \
    libgrpc-dev \
    libgrpc++-dev \
    libprotobuf-dev \
    protobuf-compiler-grpc \
    libcpprest-dev \
    libaio-dev \
    librdmacm1 \
    librdmacm-dev \
    libibverbs1 \
    libibverbs-dev \
    ibverbs-utils \
    rdmacm-utils \
    ibverbs-providers

# Python-side tools for the meson build and the wheel packaging script.
RUN pip install meson auditwheel patchelf tomlkit

WORKDIR /workspace

# etcd-cpp-apiv3: clone into the current WORKDIR, check out the pinned ref,
# then configure, build and install from an out-of-tree build directory.
RUN git clone ${ETCD_REPO} \
    && git -C etcd-cpp-apiv3 checkout ${ETCD_BRANCH} \
    && mkdir etcd-cpp-apiv3/build \
    && cd etcd-cpp-apiv3/build \
    && cmake .. -DCMAKE_POLICY_VERSION_MINIMUM=3.5 \
    && make -j$(nproc) \
    && make install

# UCX: build the pinned ref with ROCm, verbs and multi-threading support and
# install it under /usr/local/ucx.
# The job count is bounded with $(nproc), matching the etcd build; a bare
# `make -j` places no limit on the number of parallel jobs and can exhaust
# memory on the build host.
RUN cd /usr/local/src && \
    git clone ${UCX_REPO} &&  \
    cd ucx  && \
    git checkout ${UCX_BRANCH} && \
    ./autogen.sh && \
    mkdir build && cd build && \
    ../configure \
        --prefix=/usr/local/ucx \
        --enable-shared \
        --disable-static \
        --disable-doxygen-doc \
        --enable-optimizations \
        --enable-devel-headers \
        --with-rocm=/opt/rocm \
        --with-verbs \
        --with-dm \
        --enable-mt && \
    make -j$(nproc) && \
    make -j$(nproc) install

# Make the freshly installed UCX tools and libraries visible to the RIXL build.
ENV PATH=/usr/local/ucx/bin:$PATH
ENV LD_LIBRARY_PATH=${UCX_HOME}/lib:${LD_LIBRARY_PATH}

# RIXL: clone to /opt/rixl, check out the pinned ref, configure with meson
# against the UCX and ROCm prefixes, then build and install with ninja.
RUN git clone ${RIXL_REPO} /opt/rixl \
    && git -C /opt/rixl checkout ${RIXL_BRANCH} \
    && cd /opt/rixl \
    && meson setup build --prefix=${RIXL_HOME} \
                         -Ducx_path=${UCX_HOME} \
                         -Drocm_path=${ROCM_PATH} \
    && ninja -C build \
    && ninja -C build install

# Package RIXL as a wheel with the repository's own script, writing the result
# to /app/install where the debs stage collects it.
RUN mkdir -p /app/install \
    && cd /opt/rixl \
    && ./contrib/build-wheel.sh \
        --output-dir /app/install \
        --rocm-dir ${ROCM_PATH} \
        --ucx-plugins-dir ${UCX_HOME}/lib/ucx \
        --nixl-plugins-dir ${RIXL_HOME}/lib/x86_64-linux-gnu/plugins

###
### FlashAttention Build
###
FROM base AS build_fa
ARG FA_BRANCH
ARG FA_REPO
ARG USE_SCCACHE
# Install the wheels produced by the PyTorch stage before building.
RUN --mount=type=bind,from=build_pytorch,src=/app/install/,target=/install \
    pip install /install/*.whl
RUN git clone ${FA_REPO}
# Build the FlashAttention wheel.
# GPU_ARCHS is PYTORCH_ROCM_ARCH with every ";gfx1NNN" entry (gfx1 followed by
# three digits) stripped by sed. An entry of that form in first position has no
# leading ';' and would not be removed.
# With USE_SCCACHE=1 the HIP compiler is redirected to the sccache wrappers for
# this RUN only, and cache statistics are printed before and after the build.
RUN cd flash-attention \
    && git checkout ${FA_BRANCH} \
    && git submodule update --init \
    && if [ "$USE_SCCACHE" = "1" ]; then \
           export HIP_CLANG_PATH=/opt/sccache-wrappers \
           && sccache --show-stats; \
       fi \
    && GPU_ARCHS=$(echo ${PYTORCH_ROCM_ARCH} | sed -e 's/;gfx1[0-9]\{3\}//g') python3 setup.py bdist_wheel --dist-dir=dist \
    && if [ "$USE_SCCACHE" = "1" ]; then sccache --show-stats; fi
RUN mkdir -p /app/install && cp /app/flash-attention/dist/*.whl /app/install


###
### AITER Build
###
FROM base AS build_aiter
ARG AITER_BRANCH
ARG AITER_REPO
ARG USE_SCCACHE
# Install the wheels produced by the PyTorch stage before building.
RUN --mount=type=bind,from=build_pytorch,src=/app/install/,target=/install \
    pip install /install/*.whl
RUN git clone --recursive ${AITER_REPO}
# Check out the pinned ref, re-sync submodules to that ref and install the
# Python requirements in a layer of their own.
RUN cd aiter \
    && git checkout ${AITER_BRANCH} \
    && git submodule update --init --recursive \
    && pip install -r requirements.txt
# Build the AITER wheel with PREBUILD_KERNELS=1 for the targets listed in
# AITER_ROCM_ARCH. With USE_SCCACHE=1 the HIP compiler is redirected to the
# sccache wrappers for this RUN only. The final ls fails the step if no wheel
# was produced.
RUN pip install pyyaml && cd aiter \
    && if [ "$USE_SCCACHE" = "1" ]; then \
           export HIP_CLANG_PATH=/opt/sccache-wrappers \
           && sccache --show-stats; \
       fi \
    && PREBUILD_KERNELS=1 GPU_ARCHS=${AITER_ROCM_ARCH} python3 setup.py bdist_wheel --dist-dir=dist \
    && if [ "$USE_SCCACHE" = "1" ]; then sccache --show-stats; fi \
    && ls /app/aiter/dist/*.whl
RUN mkdir -p /app/install && cp /app/aiter/dist/*.whl /app/install


###
### Final Build
###

# Wheel release stage -
# only includes dependencies used by wheel release pipeline
# Collects the Triton, FlashAttention, AMD SMI, PyTorch and AITER wheels in
# /app/debs. Each build stage's /app/install is bind-mounted for the copy, so
# nothing else from that stage ends up in this one.
FROM base AS debs_wheel_release
RUN mkdir /app/debs
RUN --mount=type=bind,from=build_triton,src=/app/install/,target=/install \
    cp /install/*.whl /app/debs
RUN --mount=type=bind,from=build_fa,src=/app/install/,target=/install \
    cp /install/*.whl /app/debs
RUN --mount=type=bind,from=build_amdsmi,src=/app/install/,target=/install \
    cp /install/*.whl /app/debs
RUN --mount=type=bind,from=build_pytorch,src=/app/install/,target=/install \
    cp /install/*.whl /app/debs
RUN --mount=type=bind,from=build_aiter,src=/app/install/,target=/install \
    cp /install/*.whl /app/debs
# Full debs stage - includes Mori (used by Docker releases)
# Same collection as debs_wheel_release, plus the MORI and RIXL wheels.
FROM base AS debs
RUN mkdir /app/debs
RUN --mount=type=bind,from=build_triton,src=/app/install/,target=/install \
    cp /install/*.whl /app/debs
RUN --mount=type=bind,from=build_fa,src=/app/install/,target=/install \
    cp /install/*.whl /app/debs
RUN --mount=type=bind,from=build_amdsmi,src=/app/install/,target=/install \
    cp /install/*.whl /app/debs
RUN --mount=type=bind,from=build_pytorch,src=/app/install/,target=/install \
    cp /install/*.whl /app/debs
RUN --mount=type=bind,from=build_aiter,src=/app/install/,target=/install \
    cp /install/*.whl /app/debs
RUN --mount=type=bind,from=build_mori,src=/app/install/,target=/install \
    cp /install/*.whl /app/debs
RUN --mount=type=bind,from=build_rixl,src=/app/install/,target=/install \
    cp /install/*.whl /app/debs

# Final image: the base stage plus every wheel collected by the debs stage.
# The wheels are installed straight from a bind mount, so the .whl files
# themselves are not copied into this image.
FROM base AS final
RUN --mount=type=bind,from=debs,src=/app/debs,target=/install \
    pip install /install/*.whl

# Re-declare the source pins in this stage so their values can be recorded.
ARG BASE_IMAGE
ARG TRITON_BRANCH
ARG TRITON_REPO
ARG PYTORCH_BRANCH
ARG PYTORCH_VISION_BRANCH
ARG PYTORCH_REPO
ARG PYTORCH_VISION_REPO
ARG PYTORCH_AUDIO_BRANCH
ARG PYTORCH_AUDIO_REPO
ARG FA_BRANCH
ARG FA_REPO
ARG AITER_BRANCH
ARG AITER_REPO
ARG RIXL_BRANCH
ARG RIXL_REPO
ARG ETCD_BRANCH
ARG ETCD_REPO
ARG UCX_BRANCH
ARG UCX_REPO
ARG MORI_BRANCH
ARG MORI_REPO
# Write one "NAME: value" line per pin to /app/versions.txt. The echo commands
# are grouped so the file is created by a single redirect; the order of the
# lines is the order listed here.
RUN { \
        echo "BASE_IMAGE: ${BASE_IMAGE}"; \
        echo "TRITON_BRANCH: ${TRITON_BRANCH}"; \
        echo "TRITON_REPO: ${TRITON_REPO}"; \
        echo "PYTORCH_BRANCH: ${PYTORCH_BRANCH}"; \
        echo "PYTORCH_VISION_BRANCH: ${PYTORCH_VISION_BRANCH}"; \
        echo "PYTORCH_REPO: ${PYTORCH_REPO}"; \
        echo "PYTORCH_VISION_REPO: ${PYTORCH_VISION_REPO}"; \
        echo "PYTORCH_AUDIO_BRANCH: ${PYTORCH_AUDIO_BRANCH}"; \
        echo "PYTORCH_AUDIO_REPO: ${PYTORCH_AUDIO_REPO}"; \
        echo "FA_BRANCH: ${FA_BRANCH}"; \
        echo "FA_REPO: ${FA_REPO}"; \
        echo "AITER_BRANCH: ${AITER_BRANCH}"; \
        echo "AITER_REPO: ${AITER_REPO}"; \
        echo "RIXL_BRANCH: ${RIXL_BRANCH}"; \
        echo "RIXL_REPO: ${RIXL_REPO}"; \
        echo "ETCD_BRANCH: ${ETCD_BRANCH}"; \
        echo "ETCD_REPO: ${ETCD_REPO}"; \
        echo "UCX_BRANCH: ${UCX_BRANCH}"; \
        echo "UCX_REPO: ${UCX_REPO}"; \
        echo "MORI_BRANCH: ${MORI_BRANCH}"; \
        echo "MORI_REPO: ${MORI_REPO}"; \
    } > /app/versions.txt