Dockerfile.rocm_base 12.4 KB
Newer Older
raojy's avatar
raojy committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
ARG BASE_IMAGE=rocm/dev-ubuntu-22.04:7.0-complete
ARG TRITON_BRANCH="57c693b6"
ARG TRITON_REPO="https://github.com/ROCm/triton.git"
ARG PYTORCH_BRANCH="89075173"
ARG PYTORCH_REPO="https://github.com/ROCm/pytorch.git"
ARG PYTORCH_VISION_BRANCH="v0.24.1"
ARG PYTORCH_VISION_REPO="https://github.com/pytorch/vision.git"
ARG PYTORCH_AUDIO_BRANCH="v2.9.0"
ARG PYTORCH_AUDIO_REPO="https://github.com/pytorch/audio.git"
ARG FA_BRANCH="0e60e394"
ARG FA_REPO="https://github.com/Dao-AILab/flash-attention.git"
ARG AITER_BRANCH="v0.1.10.post2"
ARG AITER_REPO="https://github.com/ROCm/aiter.git"
ARG MORI_BRANCH="2d02c6a9"
ARG MORI_REPO="https://github.com/ROCm/mori.git"

# Sccache configuration (only used in release pipeline)
ARG USE_SCCACHE
ARG SCCACHE_DOWNLOAD_URL
ARG SCCACHE_ENDPOINT
ARG SCCACHE_BUCKET_NAME=vllm-build-sccache
ARG SCCACHE_REGION_NAME=us-west-2
ARG SCCACHE_S3_NO_CREDENTIALS=0

FROM ${BASE_IMAGE} AS base

ENV PATH=/opt/rocm/llvm/bin:/opt/rocm/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
ENV ROCM_PATH=/opt/rocm
ENV LD_LIBRARY_PATH=/opt/rocm/lib:/usr/local/lib:
ARG PYTORCH_ROCM_ARCH=gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151
ENV PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH}
ENV AITER_ROCM_ARCH=gfx942;gfx950
ENV MORI_GPU_ARCHS=gfx942;gfx950

# Required for RCCL in ROCm7.1
ENV HSA_NO_SCRATCH_RECLAIM=1

ARG PYTHON_VERSION=3.12
ENV PYTHON_VERSION=${PYTHON_VERSION}

RUN mkdir -p /app
WORKDIR /app
ENV DEBIAN_FRONTEND=noninteractive

# Install Python and other dependencies
RUN apt-get update -y \
    && apt-get install -y software-properties-common git curl sudo vim less libgfortran5 libopenmpi-dev libpci-dev \
    && for i in 1 2 3; do \
        add-apt-repository -y ppa:deadsnakes/ppa && break || \
        { echo "Attempt $i failed, retrying in 5s..."; sleep 5; }; \
    done \
    && apt-get update -y \
    && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \
       python${PYTHON_VERSION}-lib2to3 python-is-python3  \
    && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \
    && update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \
    && ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \
    && curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \
    && python3 --version && python3 -m pip --version

RUN pip install -U packaging 'cmake<4' ninja wheel 'setuptools<80' pybind11 Cython
RUN apt-get update && apt-get install -y libjpeg-dev libsox-dev libsox-fmt-all sox && rm -rf /var/lib/apt/lists/*

# Install sccache if USE_SCCACHE is enabled (for release builds)
ARG USE_SCCACHE
ARG SCCACHE_DOWNLOAD_URL
ARG SCCACHE_ENDPOINT
ARG SCCACHE_BUCKET_NAME
ARG SCCACHE_REGION_NAME
ARG SCCACHE_S3_NO_CREDENTIALS
RUN if [ "$USE_SCCACHE" = "1" ]; then \
        echo "Installing sccache..." \
        && SCCACHE_ARCH="x86_64" \
        && SCCACHE_VERSION="v0.8.1" \
        && SCCACHE_DL_URL="${SCCACHE_DOWNLOAD_URL:-https://github.com/mozilla/sccache/releases/download/${SCCACHE_VERSION}/sccache-${SCCACHE_VERSION}-${SCCACHE_ARCH}-unknown-linux-musl.tar.gz}" \
        && curl -L -o /tmp/sccache.tar.gz ${SCCACHE_DL_URL} \
        && tar -xzf /tmp/sccache.tar.gz -C /tmp \
        && mv /tmp/sccache-${SCCACHE_VERSION}-${SCCACHE_ARCH}-unknown-linux-musl/sccache /usr/bin/sccache \
        && chmod +x /usr/bin/sccache \
        && rm -rf /tmp/sccache.tar.gz /tmp/sccache-${SCCACHE_VERSION}-${SCCACHE_ARCH}-unknown-linux-musl \
        && sccache --version; \
    fi

# Setup sccache for HIP compilation via HIP_CLANG_PATH
# This creates wrapper scripts in a separate directory and points HIP to use them
# This avoids modifying the original ROCm binaries which can break detection
# NOTE: HIP_CLANG_PATH is NOT set as ENV to avoid affecting downstream images (Dockerfile.rocm)
# Instead, each build stage should export HIP_CLANG_PATH=/opt/sccache-wrappers if USE_SCCACHE=1
RUN if [ "$USE_SCCACHE" = "1" ]; then \
        echo "Setting up sccache wrappers for HIP compilation..." \
        && mkdir -p /opt/sccache-wrappers \
        && printf '#!/bin/bash\nexec sccache /opt/rocm/lib/llvm/bin/clang++ "$@"\n' > /opt/sccache-wrappers/clang++ \
        && chmod +x /opt/sccache-wrappers/clang++ \
        && printf '#!/bin/bash\nexec sccache /opt/rocm/lib/llvm/bin/clang "$@"\n' > /opt/sccache-wrappers/clang \
        && chmod +x /opt/sccache-wrappers/clang \
        && echo "sccache wrappers created in /opt/sccache-wrappers"; \
    fi

# Set sccache environment variables only when USE_SCCACHE=1
# This prevents S3 config from leaking into images when sccache is not used
ARG USE_SCCACHE
ENV SCCACHE_BUCKET=${USE_SCCACHE:+${SCCACHE_BUCKET_NAME}}
ENV SCCACHE_REGION=${USE_SCCACHE:+${SCCACHE_REGION_NAME}}
ENV SCCACHE_S3_NO_CREDENTIALS=${USE_SCCACHE:+${SCCACHE_S3_NO_CREDENTIALS}}
ENV SCCACHE_IDLE_TIMEOUT=${USE_SCCACHE:+0}


###
### Triton Build
###
FROM base AS build_triton
ARG TRITON_BRANCH
ARG TRITON_REPO
RUN git clone ${TRITON_REPO}
RUN cd triton \
    && git checkout ${TRITON_BRANCH} \
    && if [ ! -f setup.py ]; then cd python; fi \
    && python3 setup.py bdist_wheel --dist-dir=dist \
    && mkdir -p /app/install && cp dist/*.whl /app/install
RUN if [ -d triton/python/triton_kernels ]; then pip install build && cd triton/python/triton_kernels \
    && python3 -m build --wheel && cp dist/*.whl /app/install; fi


###
### AMD SMI Build
###
FROM base AS build_amdsmi
RUN cd /opt/rocm/share/amd_smi \
    && pip wheel . --wheel-dir=dist
RUN mkdir -p /app/install && cp /opt/rocm/share/amd_smi/dist/*.whl /app/install


###
### Pytorch build
###
FROM base AS build_pytorch
ARG PYTORCH_BRANCH
ARG PYTORCH_VISION_BRANCH
ARG PYTORCH_AUDIO_BRANCH
ARG PYTORCH_REPO
ARG PYTORCH_VISION_REPO
ARG PYTORCH_AUDIO_REPO
ARG USE_SCCACHE

RUN git clone ${PYTORCH_REPO} pytorch
RUN cd pytorch && git checkout ${PYTORCH_BRANCH} \
    && pip install -r requirements.txt && git submodule update --init --recursive \
    && python3 tools/amd_build/build_amd.py \
    && if [ "$USE_SCCACHE" = "1" ]; then \
           export HIP_CLANG_PATH=/opt/sccache-wrappers \
           && export CMAKE_C_COMPILER_LAUNCHER=sccache \
           && export CMAKE_CXX_COMPILER_LAUNCHER=sccache \
           && sccache --show-stats; \
       fi \
    && CMAKE_PREFIX_PATH=$(python3 -c 'import sys; print(sys.prefix)') python3 setup.py bdist_wheel --dist-dir=dist \
    && if [ "$USE_SCCACHE" = "1" ]; then sccache --show-stats; fi \
    && pip install dist/*.whl
RUN git clone ${PYTORCH_VISION_REPO} vision
RUN cd vision && git checkout ${PYTORCH_VISION_BRANCH} \
    && if [ "$USE_SCCACHE" = "1" ]; then \
           export HIP_CLANG_PATH=/opt/sccache-wrappers \
           && export CMAKE_C_COMPILER_LAUNCHER=sccache \
           && export CMAKE_CXX_COMPILER_LAUNCHER=sccache; \
       fi \
    && python3 setup.py bdist_wheel --dist-dir=dist \
    && if [ "$USE_SCCACHE" = "1" ]; then sccache --show-stats; fi \
    && pip install dist/*.whl
RUN git clone ${PYTORCH_AUDIO_REPO} audio
RUN cd audio && git checkout ${PYTORCH_AUDIO_BRANCH} \
    && git submodule update --init --recursive \
    && pip install -r requirements.txt \
    && if [ "$USE_SCCACHE" = "1" ]; then \
           export HIP_CLANG_PATH=/opt/sccache-wrappers \
           && export CMAKE_C_COMPILER_LAUNCHER=sccache \
           && export CMAKE_CXX_COMPILER_LAUNCHER=sccache; \
       fi \
    && python3 setup.py bdist_wheel --dist-dir=dist \
    && if [ "$USE_SCCACHE" = "1" ]; then sccache --show-stats; fi \
    && pip install dist/*.whl
RUN mkdir -p /app/install && cp /app/pytorch/dist/*.whl /app/install \
    && cp /app/vision/dist/*.whl /app/install \
    && cp /app/audio/dist/*.whl /app/install


###
### MORI Build
###
FROM base AS build_mori
ARG MORI_BRANCH
ARG MORI_REPO
RUN --mount=type=bind,from=build_pytorch,src=/app/install/,target=/install \
    pip install /install/*.whl
RUN git clone ${MORI_REPO}
RUN cd mori \
    && git checkout ${MORI_BRANCH} \
    && git submodule update --init --recursive \
    && python3 setup.py bdist_wheel --dist-dir=dist && ls /app/mori/dist/*.whl
RUN mkdir -p /app/install && cp /app/mori/dist/*.whl /app/install


###
### FlashAttention Build
###
FROM base AS build_fa
ARG FA_BRANCH
ARG FA_REPO
ARG USE_SCCACHE
RUN --mount=type=bind,from=build_pytorch,src=/app/install/,target=/install \
    pip install /install/*.whl
RUN git clone ${FA_REPO}
RUN cd flash-attention \
    && git checkout ${FA_BRANCH} \
    && git submodule update --init \
    && if [ "$USE_SCCACHE" = "1" ]; then \
           export HIP_CLANG_PATH=/opt/sccache-wrappers \
           && sccache --show-stats; \
       fi \
    && GPU_ARCHS=$(echo ${PYTORCH_ROCM_ARCH} | sed -e 's/;gfx1[0-9]\{3\}//g') python3 setup.py bdist_wheel --dist-dir=dist \
    && if [ "$USE_SCCACHE" = "1" ]; then sccache --show-stats; fi
RUN mkdir -p /app/install && cp /app/flash-attention/dist/*.whl /app/install


###
### AITER Build
###
FROM base AS build_aiter
ARG AITER_BRANCH
ARG AITER_REPO
ARG USE_SCCACHE
RUN --mount=type=bind,from=build_pytorch,src=/app/install/,target=/install \
    pip install /install/*.whl
RUN git clone --recursive ${AITER_REPO}
RUN cd aiter \
    && git checkout ${AITER_BRANCH} \
    && git submodule update --init --recursive \
    && pip install -r requirements.txt
RUN pip install pyyaml && cd aiter \
    && if [ "$USE_SCCACHE" = "1" ]; then \
           export HIP_CLANG_PATH=/opt/sccache-wrappers \
           && sccache --show-stats; \
       fi \
    && GPU_ARCHS=${AITER_ROCM_ARCH} python3 setup.py bdist_wheel --dist-dir=dist \
    && if [ "$USE_SCCACHE" = "1" ]; then sccache --show-stats; fi \
    && ls /app/aiter/dist/*.whl
RUN mkdir -p /app/install && cp /app/aiter/dist/*.whl /app/install


###
### Final Build
###

# Wheel release stage - 
# only includes dependencies used by wheel release pipeline
FROM base AS debs_wheel_release
RUN mkdir /app/debs
RUN --mount=type=bind,from=build_triton,src=/app/install/,target=/install \
    cp /install/*.whl /app/debs
RUN --mount=type=bind,from=build_fa,src=/app/install/,target=/install \
    cp /install/*.whl /app/debs
RUN --mount=type=bind,from=build_amdsmi,src=/app/install/,target=/install \
    cp /install/*.whl /app/debs
RUN --mount=type=bind,from=build_pytorch,src=/app/install/,target=/install \
    cp /install/*.whl /app/debs
RUN --mount=type=bind,from=build_aiter,src=/app/install/,target=/install \
    cp /install/*.whl /app/debs

# Full debs stage - includes Mori (used by Docker releases)
FROM base AS debs
RUN mkdir /app/debs
RUN --mount=type=bind,from=build_triton,src=/app/install/,target=/install \
    cp /install/*.whl /app/debs
RUN --mount=type=bind,from=build_fa,src=/app/install/,target=/install \
    cp /install/*.whl /app/debs
RUN --mount=type=bind,from=build_amdsmi,src=/app/install/,target=/install \
    cp /install/*.whl /app/debs
RUN --mount=type=bind,from=build_pytorch,src=/app/install/,target=/install \
    cp /install/*.whl /app/debs
RUN --mount=type=bind,from=build_aiter,src=/app/install/,target=/install \
    cp /install/*.whl /app/debs
RUN --mount=type=bind,from=build_mori,src=/app/install/,target=/install \
    cp /install/*.whl /app/debs

FROM base AS final
RUN --mount=type=bind,from=debs,src=/app/debs,target=/install \
    pip install /install/*.whl

ARG BASE_IMAGE
ARG TRITON_BRANCH
ARG TRITON_REPO
ARG PYTORCH_BRANCH
ARG PYTORCH_VISION_BRANCH
ARG PYTORCH_REPO
ARG PYTORCH_VISION_REPO
ARG PYTORCH_AUDIO_BRANCH
ARG PYTORCH_AUDIO_REPO
ARG FA_BRANCH
ARG FA_REPO
ARG AITER_BRANCH
ARG AITER_REPO
ARG MORI_BRANCH
ARG MORI_REPO
RUN echo "BASE_IMAGE: ${BASE_IMAGE}" > /app/versions.txt \
    && echo "TRITON_BRANCH: ${TRITON_BRANCH}" >> /app/versions.txt \
    && echo "TRITON_REPO: ${TRITON_REPO}" >> /app/versions.txt \
    && echo "PYTORCH_BRANCH: ${PYTORCH_BRANCH}" >> /app/versions.txt \
    && echo "PYTORCH_VISION_BRANCH: ${PYTORCH_VISION_BRANCH}" >> /app/versions.txt \
    && echo "PYTORCH_REPO: ${PYTORCH_REPO}" >> /app/versions.txt \
    && echo "PYTORCH_VISION_REPO: ${PYTORCH_VISION_REPO}" >> /app/versions.txt \
    && echo "PYTORCH_AUDIO_BRANCH: ${PYTORCH_AUDIO_BRANCH}" >> /app/versions.txt \
    && echo "PYTORCH_AUDIO_REPO: ${PYTORCH_AUDIO_REPO}" >> /app/versions.txt \
    && echo "FA_BRANCH: ${FA_BRANCH}" >> /app/versions.txt \
    && echo "FA_REPO: ${FA_REPO}" >> /app/versions.txt \
    && echo "AITER_BRANCH: ${AITER_BRANCH}" >> /app/versions.txt \
    && echo "AITER_REPO: ${AITER_REPO}" >> /app/versions.txt \
    && echo "MORI_BRANCH: ${MORI_BRANCH}" >> /app/versions.txt \
    && echo "MORI_REPO: ${MORI_REPO}" >> /app/versions.txt