Dockerfile_amd 6.65 KB
Newer Older
fxmarty's avatar
fxmarty committed
1
# Rust builder
2
FROM lukemathwalker/cargo-chef:latest-rust-1.79 AS chef
fxmarty's avatar
fxmarty committed
3
4
5
6
WORKDIR /usr/src

ARG CARGO_REGISTRIES_CRATES_IO_PROTOCOL=sparse

7
FROM chef AS planner
ur4t's avatar
ur4t committed
8
COPY Cargo.lock Cargo.lock
fxmarty's avatar
fxmarty committed
9
10
11
12
13
COPY Cargo.toml Cargo.toml
COPY rust-toolchain.toml rust-toolchain.toml
COPY proto proto
COPY benchmark benchmark
COPY router router
Nicolas Patry's avatar
Nicolas Patry committed
14
COPY backends backends
fxmarty's avatar
fxmarty committed
15
16
17
18
19
20
21
22
23
24
25
26
COPY launcher launcher
RUN cargo chef prepare --recipe-path recipe.json

FROM chef AS builder

# Install the protobuf compiler (protoc 21.12) and its bundled include files
# into /usr/local. `-f` makes curl exit non-zero on an HTTP error instead of
# saving the error page under the archive name; the archive is removed in the
# same layer so it does not stay in the image.
RUN PROTOC_ZIP=protoc-21.12-linux-x86_64.zip && \
    curl -fsSLO "https://github.com/protocolbuffers/protobuf/releases/download/v21.12/${PROTOC_ZIP}" && \
    unzip -o "${PROTOC_ZIP}" -d /usr/local bin/protoc && \
    unzip -o "${PROTOC_ZIP}" -d /usr/local 'include/*' && \
    rm -f "${PROTOC_ZIP}"

COPY --from=planner /usr/src/recipe.json recipe.json
27
RUN cargo chef cook --profile release-opt --recipe-path recipe.json
fxmarty's avatar
fxmarty committed
28

Nicolas Patry's avatar
Nicolas Patry committed
29
30
31
ARG GIT_SHA
ARG DOCKER_LABEL

fxmarty's avatar
fxmarty committed
32
33
34
35
36
COPY Cargo.toml Cargo.toml
COPY rust-toolchain.toml rust-toolchain.toml
COPY proto proto
COPY benchmark benchmark
COPY router router
Nicolas Patry's avatar
Nicolas Patry committed
37
COPY backends backends
fxmarty's avatar
fxmarty committed
38
COPY launcher launcher
39
RUN cargo build --profile release-opt
fxmarty's avatar
fxmarty committed
40
41

# Text Generation Inference base image for ROCm
42
FROM rocm/dev-ubuntu-22.04:6.1.1_hip_update AS base
fxmarty's avatar
fxmarty committed
43
44
45
46
47
48
49
50
51
52
53
54
55

# System build toolchain plus the ROCm/HIP development packages
# (the ROCm/HIP `-dev` libraries are needed to build vLLM and flash attention).
# Packages are listed alphabetically; the apt package lists are removed in the
# same layer that created them.
RUN apt-get update \
 && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
        build-essential \
        ca-certificates \
        ccache \
        cmake \
        curl \
        g++ \
        git \
        hipblas-dev \
        hipblaslt-dev \
        hipcub-dev \
        hipfft-dev \
        hiprand-dev \
        hipsolver-dev \
        hipsparse-dev \
        libssl-dev \
        make \
        miopen-hip-dev \
        python3-dev \
        rccl-dev \
        rocblas-dev \
        rocrand-dev \
        rocthrust-dev \
 && rm -rf /var/lib/apt/lists/*

# Keep in sync with `server/pyproject.toml`
ARG MAMBA_VERSION=23.1.0-1
fxmarty's avatar
fxmarty committed
72
73
ARG PYTORCH_VERSION='2.3.0'
ARG ROCM_VERSION='6.0.2'
fxmarty's avatar
fxmarty committed
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
ARG PYTHON_VERSION='3.10.10'
# Automatically set by buildx
ARG TARGETPLATFORM
# Put the conda installation's executables first on PATH.
# Uses the `key=value` form; the space-separated `ENV key value` form is deprecated.
ENV PATH=/opt/conda/bin:$PATH

# TGI seems to require libssl.so.1.1 instead of libssl.so.3 so we can't use ubuntu 22.04. Ubuntu 20.04 has python==3.8, and TGI requires python>=3.9, hence the need for miniconda.
# Install mamba
# translating Docker's TARGETPLATFORM into mamba arches
# Download, run and delete the Mambaforge installer in a single RUN so the
# installer script is never stored in an image layer. The installer is run
# through `bash`, so no executable bit is needed. curl writes to the explicit
# `-o` path only (no conflicting `-O`, no `-v` next to `-s`).
RUN case ${TARGETPLATFORM} in \
         "linux/arm64")  MAMBA_ARCH=aarch64  ;; \
         *)              MAMBA_ARCH=x86_64   ;; \
    esac && \
    curl -fsSL -o ~/mambaforge.sh "https://github.com/conda-forge/miniforge/releases/download/${MAMBA_VERSION}/Mambaforge-${MAMBA_VERSION}-Linux-${MAMBA_ARCH}.sh" && \
    bash ~/mambaforge.sh -b -p /opt/conda && \
    mamba init && \
    rm ~/mambaforge.sh

fxmarty's avatar
fxmarty committed
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
# Install flash-attention, torch dependencies
RUN pip install numpy einops ninja --no-cache-dir

# Install the static MKL libraries and headers from the `intel` channel.
# `--yes` answers the confirmation prompt explicitly (the build has no TTY), and
# the conda package cache is cleaned in the same layer so it is not kept in the image.
RUN conda install --yes intel::mkl-static intel::mkl-include && \
    conda clean --all --yes
# Replace any already-installed triton with a build from the ROCm fork
# (shallow clone of the default branch). `--no-cache-dir` matches the other
# pip installs in this file and keeps pip's cache out of the layer.
RUN pip uninstall -y triton && \
    git clone --depth 1 --single-branch https://github.com/ROCm/triton.git && \
    cd triton/python && \
    pip install . --no-cache-dir

RUN git clone --depth 1 --recursive --single-branch --branch 2.3-patched https://github.com/fxmarty/pytorch.git pytorch && cd pytorch && pip install -r requirements.txt --no-cache-dir

# Build-time settings, presumably consumed by the PyTorch source build
# (`python setup.py install`) that follows these declarations — TODO confirm
# against PyTorch's build scripts. They are declared with ARG rather than ENV,
# so they apply during the build only and are not part of the final image's
# runtime environment.
ARG _GLIBCXX_USE_CXX11_ABI="1"
ARG CMAKE_PREFIX_PATH="/opt/conda"
# Presumably the GPU architectures to compile kernels for — verify.
ARG PYTORCH_ROCM_ARCH="gfx90a;gfx942"
# Feature toggles ("0" = off, "1" = on): ROCm and flash attention are switched
# on; CUDA, Caffe2, tests and the listed optional backends are switched off.
ARG BUILD_CAFFE2="0" \
    BUILD_CAFFE2_OPS="0" \
    USE_CUDA="0" \
    USE_ROCM="1" \
    BUILD_TEST="0" \
    USE_FBGEMM="0" \
    USE_NNPACK="0" \
    USE_QNNPACK="0" \
    USE_XNNPACK="0" \
    USE_FLASH_ATTENTION="1" \
    USE_MEM_EFF_ATTENTION="0"

RUN cd pytorch && python tools/amd_build/build_amd.py && python setup.py install

120
# Set as recommended: https://github.com/ROCm/triton/wiki/A-script-to-set-program-execution-environment-in-ROCm
fxmarty's avatar
fxmarty committed
121
122
ENV HIP_FORCE_DEV_KERNARG=1

123
124
125
# On MI250 and MI300, flash attention performance with Triton FA is slightly better than with CK.
# However, Triton requires tuning for each prompt length, which is prohibitive.
ENV ROCM_USE_FLASH_ATTN_V2_TRITON=0
fxmarty's avatar
fxmarty committed
126
127
128

FROM base AS kernel-builder

fxmarty's avatar
fxmarty committed
129
# Build vllm kernels
fxmarty's avatar
fxmarty committed
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
FROM kernel-builder AS vllm-builder
WORKDIR /usr/src

COPY server/Makefile-vllm Makefile

# Build specific version of vllm
RUN make build-vllm-rocm

# Build Flash Attention v2 kernels
FROM kernel-builder AS flash-att-v2-builder
WORKDIR /usr/src

COPY server/Makefile-flash-att-v2 Makefile

# Build specific version of flash attention v2
RUN make build-flash-attention-v2-rocm

# Build Transformers CUDA kernels (gpt-neox and bloom)
148
FROM kernel-builder AS custom-kernels-builder
fxmarty's avatar
fxmarty committed
149
150
WORKDIR /usr/src
COPY server/custom_kernels/ .
fxmarty's avatar
fxmarty committed
151
RUN python setup.py build
fxmarty's avatar
fxmarty committed
152

fxmarty's avatar
fxmarty committed
153
# Build exllama kernels
154
FROM kernel-builder AS exllama-kernels-builder
fxmarty's avatar
fxmarty committed
155
156
157
WORKDIR /usr/src
COPY server/exllama_kernels/ .

fxmarty's avatar
fxmarty committed
158
RUN python setup.py build
fxmarty's avatar
fxmarty committed
159
160

# Build exllama v2 kernels
161
FROM kernel-builder AS exllamav2-kernels-builder
fxmarty's avatar
fxmarty committed
162
163
164
WORKDIR /usr/src
COPY server/exllamav2_kernels/ .

fxmarty's avatar
fxmarty committed
165
RUN python setup.py build
fxmarty's avatar
fxmarty committed
166

167
FROM base AS base-copy
fxmarty's avatar
fxmarty committed
168
169

# Text Generation Inference base env
170
# Runtime environment defaults, one variable per instruction.
ENV HF_HOME=/data
ENV HF_HUB_ENABLE_HF_TRANSFER=1
ENV PORT=80

# Copy build artifacts from vllm builder
COPY --from=vllm-builder /usr/src/vllm/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages

# Copy build artifacts from flash attention v2 builder
COPY --from=flash-att-v2-builder /usr/src/flash-attention-v2/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages

# Copy build artifacts from custom kernels builder
COPY --from=custom-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
fxmarty's avatar
fxmarty committed
182
183
184
185
186
187

# Copy build artifacts from exllama kernels builder
COPY --from=exllama-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages

# Copy build artifacts from exllamav2 kernels builder
COPY --from=exllamav2-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
fxmarty's avatar
fxmarty committed
188
189
190
191
192
193
194
195

# Install server
COPY proto proto
COPY server server
COPY server/Makefile server/Makefile
# Run the `gen-server` make target, then install the ROCm requirements and the
# server package with its `accelerate`, `peft` and `outlines` extras.
# `--no-cache-dir` is passed to both pip installs so pip's download cache is
# not stored in the image layer.
RUN cd server && \
    make gen-server && \
    pip install -r requirements_rocm.txt --no-cache-dir && \
    pip install ".[accelerate, peft, outlines]" --no-cache-dir
fxmarty's avatar
fxmarty committed
197
198

# Install benchmarker
199
COPY --from=builder /usr/src/target/release-opt/text-generation-benchmark /usr/local/bin/text-generation-benchmark
fxmarty's avatar
fxmarty committed
200
# Install router
201
COPY --from=builder /usr/src/target/release-opt/text-generation-router /usr/local/bin/text-generation-router
fxmarty's avatar
fxmarty committed
202
# Install launcher
203
COPY --from=builder /usr/src/target/release-opt/text-generation-launcher /usr/local/bin/text-generation-launcher
fxmarty's avatar
fxmarty committed
204
205

# AWS Sagemaker compatible image
206
# NOTE(review): this stage derives from `base`, not `base-copy`. The kernel
# artifacts, the Python server and the benchmark/router/launcher binaries are
# all installed in `base-copy`, so none of them are present in this image —
# confirm whether `FROM base-copy AS sagemaker` was intended.
FROM base AS sagemaker
fxmarty's avatar
fxmarty committed
207

fxmarty's avatar
fxmarty committed
208
209
210
211
212
213
214
215
COPY sagemaker-entrypoint.sh entrypoint.sh
RUN chmod +x entrypoint.sh

ENTRYPOINT ["./entrypoint.sh"]

# Final image
FROM base-copy

fxmarty's avatar
fxmarty committed
216
217
218
219
COPY ./tgi-entrypoint.sh /tgi-entrypoint.sh
RUN chmod +x /tgi-entrypoint.sh

ENTRYPOINT ["/tgi-entrypoint.sh"]
fxmarty's avatar
fxmarty committed
220
CMD ["--json-output"]