Dockerfile 10.2 KB
Newer Older
1
# Rust builder
# Base stage reused by the `planner` and `builder` stages: the cargo-chef image
# with the Rust 1.80 toolchain, working out of /usr/src.
FROM lukemathwalker/cargo-chef:latest-rust-1.80 AS chef
WORKDIR /usr/src

# Use the sparse protocol for the crates.io registry.
ARG CARGO_REGISTRIES_CRATES_IO_PROTOCOL=sparse

7
# Planner stage: copy the workspace manifests and member crates, then have
# cargo-chef write recipe.json, which the builder stage copies out of this stage.
FROM chef AS planner
COPY Cargo.lock Cargo.lock
COPY Cargo.toml Cargo.toml
COPY rust-toolchain.toml rust-toolchain.toml
COPY proto proto
COPY benchmark benchmark
COPY router router
COPY backends backends
COPY launcher launcher

RUN cargo chef prepare --recipe-path recipe.json

# Builder stage: compiles the Rust binaries with the `release-opt` profile.
FROM chef AS builder

# Python 3.11 development files.
# NOTE(review): presumably needed because the Rust binaries link libpython — confirm.
# The apt lists are removed in the same layer, like every other apt step in this file.
RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
    python3.11-dev \
    && rm -rf /var/lib/apt/lists/*

# Install protoc 21.12 (binary plus bundled includes) under /usr/local.
# `-f` makes curl fail on an HTTP error instead of saving the error page as the zip.
# NOTE(review): the archive is hard-coded to linux-x86_64 and is not checksum-verified.
RUN PROTOC_ZIP=protoc-21.12-linux-x86_64.zip && \
    curl -fOL https://github.com/protocolbuffers/protobuf/releases/download/v21.12/$PROTOC_ZIP && \
    unzip -o $PROTOC_ZIP -d /usr/local bin/protoc && \
    unzip -o $PROTOC_ZIP -d /usr/local 'include/*' && \
    rm -f $PROTOC_ZIP

# Cook the dependency recipe produced by the planner stage before any source is copied.
COPY --from=planner /usr/src/recipe.json recipe.json
RUN cargo chef cook --profile release-opt --recipe-path recipe.json

# Declared only after the dependency cook so that a new value does not
# invalidate the cooked-dependency layer above.
ARG GIT_SHA
ARG DOCKER_LABEL

COPY Cargo.toml Cargo.toml
COPY rust-toolchain.toml rust-toolchain.toml
COPY proto proto
COPY benchmark benchmark
COPY router router
COPY backends backends
COPY launcher launcher
RUN cargo build --profile release-opt
Olivier Dehaene's avatar
v0.1.0  
Olivier Dehaene committed
43

44
45
# Python builder
# Adapted from: https://github.com/pytorch/pytorch/blob/master/Dockerfile
FROM nvidia/cuda:12.4.1-devel-ubuntu22.04 AS pytorch-install

# NOTE: When updating PyTorch version, beware to remove `pip install nvidia-nccl-cu12==2.22.3` below in the Dockerfile. Context: https://github.com/huggingface/text-generation-inference/pull/2099
ARG PYTORCH_VERSION=2.4.0

ARG PYTHON_VERSION=3.11
# Keep in sync with `server/pyproject.toml`
ARG CUDA_VERSION=12.4
ARG MAMBA_VERSION=24.3.0-0
ARG CUDA_CHANNEL=nvidia
ARG INSTALL_CHANNEL=pytorch
# Automatically set by buildx
ARG TARGETPLATFORM

# Put the conda install location first on PATH (key=value form; the
# space-separated `ENV key value` form is deprecated).
ENV PATH=/opt/conda/bin:$PATH

# Compiler toolchain, CA certificates, ccache, curl and git for this stage.
# The apt lists are deleted in the same layer.
RUN apt-get update \
    && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
        build-essential \
        ca-certificates \
        ccache \
        curl \
        git \
    && rm -rf /var/lib/apt/lists/*

# Install conda
# Translate Docker's TARGETPLATFORM into the mamba arch name, then download,
# run and delete the installer within a single RUN so the installer script is
# not preserved in an earlier layer. Only `-o` is passed to curl: there is a
# single URL, so the extra `-O` had no URL to apply to.
RUN case ${TARGETPLATFORM} in \
         "linux/arm64")  MAMBA_ARCH=aarch64  ;; \
         *)              MAMBA_ARCH=x86_64   ;; \
    esac && \
    curl -fsSL -v -o ~/mambaforge.sh "https://github.com/conda-forge/miniforge/releases/download/${MAMBA_VERSION}/Mambaforge-${MAMBA_VERSION}-Linux-${MAMBA_ARCH}.sh" && \
    chmod +x ~/mambaforge.sh && \
    bash ~/mambaforge.sh -b -p /opt/conda && \
    rm ~/mambaforge.sh

# Install pytorch
# On arm64 we exit with an error code.
# On every other platform: update conda, install python, pytorch and the
# matching pytorch-cuda (CUDA_VERSION cut to its first two dot-separated fields)
# from the INSTALL_CHANNEL and CUDA_CHANNEL channels, then clean the conda caches.
RUN case ${TARGETPLATFORM} in \
         "linux/arm64")  exit 1 ;; \
         *)              /opt/conda/bin/conda update -y conda &&  \
                         /opt/conda/bin/conda install -c "${INSTALL_CHANNEL}" -c "${CUDA_CHANNEL}" -y "python=${PYTHON_VERSION}" "pytorch=$PYTORCH_VERSION" "pytorch-cuda=$(echo $CUDA_VERSION | cut -d'.' -f 1-2)"  ;; \
    esac && \
    /opt/conda/bin/conda clean -ya

# CUDA kernels builder image
# Common parent of every *-builder kernel stage below.
FROM pytorch-install AS kernel-builder

ARG MAX_JOBS=8
# Default CUDA architectures for the kernel builds; individual stages may override it.
ENV TORCH_CUDA_ARCH_LIST="8.0;8.6;9.0+PTX"

RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
        ninja-build cmake \
        && rm -rf /var/lib/apt/lists/*

# Build Flash Attention CUDA kernels
FROM kernel-builder AS flash-att-builder

WORKDIR /usr/src

COPY server/Makefile-flash-att Makefile

# Build specific version of flash attention
RUN make build-flash-attention

110
# Build Flash Attention v2 CUDA kernels
FROM kernel-builder AS flash-att-v2-builder

WORKDIR /usr/src

COPY server/Makefile-flash-att-v2 Makefile

# Build specific version of flash attention v2
RUN make build-flash-attention-v2-cuda
119

120
# Build Transformers exllama kernels
FROM kernel-builder AS exllama-kernels-builder
WORKDIR /usr/src
# The kernel sources ship with the server; build them in place with setup.py.
COPY server/exllama_kernels/ .

RUN python setup.py build
Nicolas Patry's avatar
Nicolas Patry committed
126
127

# Build Transformers exllamav2 kernels
FROM kernel-builder AS exllamav2-kernels-builder
WORKDIR /usr/src
# The source is a single Makefile, so it is named without a trailing slash.
COPY server/Makefile-exllamav2 Makefile

# Build exllamav2 through the Makefile target
RUN make build-exllamav2
134

135
# Build Transformers awq kernels
FROM kernel-builder AS awq-kernels-builder
WORKDIR /usr/src
COPY server/Makefile-awq Makefile
# Build the awq kernels through the Makefile target
RUN make build-awq
141

142
# Build eetq kernels
FROM kernel-builder AS eetq-kernels-builder
WORKDIR /usr/src
COPY server/Makefile-eetq Makefile
# Build the eetq kernels through the Makefile target
RUN make build-eetq
148

drbh's avatar
drbh committed
149
# Build Lorax Punica kernels
FROM kernel-builder AS lorax-punica-builder
WORKDIR /usr/src
COPY server/Makefile-lorax-punica Makefile
# Build the punica kernels through the Makefile target. The architecture list
# is overridden for this one command only (8.0 and 8.6+PTX; no 9.0).
RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" make build-lorax-punica

156
# Build Transformers CUDA kernels
FROM kernel-builder AS custom-kernels-builder
WORKDIR /usr/src
COPY server/custom_kernels/ .
# Build the custom kernels in place with setup.py
RUN python setup.py build
162

163
164
165
166
167
168
169
170
171
# FBGEMM CUDA kernels: run the `build-fbgemm` target of server/Makefile-fbgemm
# on top of the kernel-builder image.
FROM kernel-builder AS fbgemm-builder
WORKDIR /usr/src
COPY server/Makefile-fbgemm /usr/src/Makefile
RUN make build-fbgemm

172
# Build vllm CUDA kernels
FROM kernel-builder AS vllm-builder

WORKDIR /usr/src

# Architecture list for this stage, replacing the value set in kernel-builder.
ENV TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX"

COPY server/Makefile-vllm Makefile

# Build specific version of vllm
RUN make build-vllm-cuda
183

drbh's avatar
drbh committed
184
# Build mamba kernels
FROM kernel-builder AS mamba-builder
WORKDIR /usr/src
COPY server/Makefile-selective-scan Makefile
# Runs the Makefile's `build-all` target; the base stage later copies the
# mamba and causal-conv1d build outputs from this stage.
RUN make build-all

190
191
192
193
194
195
# flashinfer: run the `install-flashinfer` target of server/Makefile-flashinfer
# on top of the kernel-builder image.
FROM kernel-builder AS flashinfer-builder

WORKDIR /usr/src

COPY server/Makefile-flashinfer /usr/src/Makefile

RUN make install-flashinfer

196
# Text Generation Inference base image
# NOTE(review): this runtime base is CUDA 12.1.0 while the builder stages use
# nvidia/cuda:12.4.1 and CUDA_VERSION=12.4 — confirm the mismatch is intended.
FROM nvidia/cuda:12.1.0-base-ubuntu22.04 AS base

# Conda env
ENV PATH=/opt/conda/bin:$PATH \
    CONDA_PREFIX=/opt/conda

# Text Generation Inference base env
ENV HF_HOME=/data \
    HF_HUB_ENABLE_HF_TRANSFER=1 \
    PORT=80

WORKDIR /usr/src
209

210
211
212
213
# Runtime system packages; the apt lists are removed in the same layer.
RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
        libssl-dev \
        ca-certificates \
        make \
        curl \
        git \
        && rm -rf /var/lib/apt/lists/*
Olivier Dehaene's avatar
Olivier Dehaene committed
217

218
219
# Copy conda with PyTorch installed
COPY --from=pytorch-install /opt/conda /opt/conda

# Every artifact path below hard-codes linux-x86_64 and CPython 3.11; keep them
# in step with PYTHON_VERSION in the pytorch-install stage.

# Copy build artifacts from flash attention builder
COPY --from=flash-att-builder /usr/src/flash-attention/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages
COPY --from=flash-att-builder /usr/src/flash-attention/csrc/layer_norm/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages
COPY --from=flash-att-builder /usr/src/flash-attention/csrc/rotary/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages

# Copy build artifacts from flash attention v2 builder
COPY --from=flash-att-v2-builder /opt/conda/lib/python3.11/site-packages/flash_attn_2_cuda.cpython-311-x86_64-linux-gnu.so /opt/conda/lib/python3.11/site-packages

# Copy build artifacts from custom kernels builder
COPY --from=custom-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages
# Copy build artifacts from exllama kernels builder
COPY --from=exllama-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages
# Copy build artifacts from exllamav2 kernels builder
COPY --from=exllamav2-kernels-builder /usr/src/exllamav2/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages
# Copy build artifacts from awq kernels builder
COPY --from=awq-kernels-builder /usr/src/llm-awq/awq/kernels/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages
# Copy build artifacts from eetq kernels builder
COPY --from=eetq-kernels-builder /usr/src/eetq/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages
# Copy build artifacts from lorax punica kernels builder
COPY --from=lorax-punica-builder /usr/src/lorax-punica/server/punica_kernels/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages
# Copy build artifacts from fbgemm builder
COPY --from=fbgemm-builder /usr/src/fbgemm/fbgemm_gpu/_skbuild/linux-x86_64-3.11/cmake-install /opt/conda/lib/python3.11/site-packages
# Copy build artifacts from vllm builder
COPY --from=vllm-builder /usr/src/vllm/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages
# Copy build artifacts from mamba builder
COPY --from=mamba-builder /usr/src/mamba/build/lib.linux-x86_64-cpython-311/ /opt/conda/lib/python3.11/site-packages
COPY --from=mamba-builder /usr/src/causal-conv1d/build/lib.linux-x86_64-cpython-311/ /opt/conda/lib/python3.11/site-packages
# Copy the installed flashinfer package from the flashinfer builder
COPY --from=flashinfer-builder /opt/conda/lib/python3.11/site-packages/flashinfer/ /opt/conda/lib/python3.11/site-packages/flashinfer/
drbh's avatar
drbh committed
249

OlivierDehaene's avatar
OlivierDehaene committed
250
# Install flash-attention dependencies
RUN pip install einops --no-cache-dir

# Install server
COPY proto proto
COPY server server
COPY server/Makefile server/Makefile
# Generate the server code, then install the CUDA requirements, the server
# package with its extras, and the pinned NCCL wheel (see the NOTE next to
# PYTORCH_VERSION). Every pip call uses --no-cache-dir so that no pip download
# cache is stored in this runtime layer.
RUN cd server && \
    make gen-server && \
    pip install -r requirements_cuda.txt --no-cache-dir && \
    pip install ".[bnb, accelerate, marlin, moe, quantize, peft, outlines]" --no-cache-dir && \
    pip install nvidia-nccl-cu12==2.22.3 --no-cache-dir

Nicolas Patry's avatar
Nicolas Patry committed
263
264
265
# Preload libnccl from the site-packages nvidia/nccl directory.
# NOTE(review): presumably the library shipped by the pinned nvidia-nccl-cu12 wheel — confirm.
ENV LD_PRELOAD=/opt/conda/lib/python3.11/site-packages/nvidia/nccl/lib/libnccl.so.2
# Required to find libpython within the rust binaries
ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/opt/conda/lib/"
# This is needed because exl2 tries to load flash-attn
# And fails with our builds.
ENV EXLLAMA_NO_FLASH_ATTN=1
Olivier Dehaene's avatar
Olivier Dehaene committed
269

Nicolas Patry's avatar
Nicolas Patry committed
270
271
272
273
274
275
276
277
# System packages are installed ahead of the Rust binaries on purpose:
# the binaries differ on every build (the git SHA is burned into them),
# whereas these packages change far less often.
RUN apt-get update \
    && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
        build-essential \
        g++ \
    && rm -rf /var/lib/apt/lists/*

278
# The three binaries below come from the Rust `builder` stage (release-opt profile).
# Install benchmarker
COPY --from=builder /usr/src/target/release-opt/text-generation-benchmark /usr/local/bin/text-generation-benchmark
# Install router
COPY --from=builder /usr/src/target/release-opt/text-generation-router /usr/local/bin/text-generation-router
# Install launcher
COPY --from=builder /usr/src/target/release-opt/text-generation-launcher /usr/local/bin/text-generation-launcher
Olivier Dehaene's avatar
Olivier Dehaene committed
284

285

fxmarty's avatar
fxmarty committed
286
# AWS Sagemaker compatible image
FROM base AS sagemaker

# The script is copied relative to the WORKDIR inherited from `base`, and the
# entrypoint refers to it relative to that same directory.
COPY sagemaker-entrypoint.sh entrypoint.sh
RUN chmod +x entrypoint.sh

ENTRYPOINT ["./entrypoint.sh"]

294
# Final image
# Default build target: `base` plus the TGI entrypoint script.
FROM base

COPY ./tgi-entrypoint.sh /tgi-entrypoint.sh
RUN chmod +x /tgi-entrypoint.sh

ENTRYPOINT ["/tgi-entrypoint.sh"]
# CMD ["--json-output"]