# Rust builder
# Shared cargo-chef base image that the planner and builder stages start from.
FROM lukemathwalker/cargo-chef:latest-rust-1.80 AS chef
WORKDIR /usr/src

# Build argument selecting the `sparse` crates.io registry protocol.
ARG CARGO_REGISTRIES_CRATES_IO_PROTOCOL=sparse

# Planner stage: copies the workspace manifests and crates, then writes the
# cargo-chef recipe (recipe.json) that the builder stage consumes.
FROM chef AS planner
COPY Cargo.lock Cargo.lock
COPY Cargo.toml Cargo.toml
COPY rust-toolchain.toml rust-toolchain.toml
COPY proto proto
COPY benchmark benchmark
COPY router router
COPY backends backends
COPY launcher launcher

RUN cargo chef prepare --recipe-path recipe.json

# Builder stage: cooks the dependencies from the planner's recipe, then builds
# the workspace with the `release-opt` profile.
FROM chef AS builder

# Python 3.11 development files; apt lists are removed in the same layer.
RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
    python3.11-dev \
    && rm -rf /var/lib/apt/lists/*
# Install the protoc 21.12 binary and its bundled includes under /usr/local.
# `-f` makes curl fail on an HTTP error instead of saving the error page.
RUN PROTOC_ZIP=protoc-21.12-linux-x86_64.zip && \
    curl -fOL https://github.com/protocolbuffers/protobuf/releases/download/v21.12/$PROTOC_ZIP && \
    unzip -o $PROTOC_ZIP -d /usr/local bin/protoc && \
    unzip -o $PROTOC_ZIP -d /usr/local 'include/*' && \
    rm -f $PROTOC_ZIP

# Build the dependencies described by the planner's recipe in their own layer.
COPY --from=planner /usr/src/recipe.json recipe.json
RUN cargo chef cook --profile release-opt --recipe-path recipe.json

# Declared after the dependency layer so changing them does not rebuild it.
ARG GIT_SHA
ARG DOCKER_LABEL

COPY Cargo.toml Cargo.toml
COPY rust-toolchain.toml rust-toolchain.toml
COPY proto proto
COPY benchmark benchmark
COPY router router
COPY backends backends
COPY launcher launcher
RUN cargo build --profile release-opt

# Python builder
# Adapted from: https://github.com/pytorch/pytorch/blob/master/Dockerfile
FROM nvidia/cuda:12.4.1-devel-ubuntu22.04 AS pytorch-install

# NOTE: When updating the PyTorch version, remember to remove `pip install nvidia-nccl-cu12==2.22.3` below in the Dockerfile. Context: https://github.com/huggingface/text-generation-inference/pull/2099
ARG PYTORCH_VERSION=2.4.0

ARG PYTHON_VERSION=3.11
# Keep in sync with `server/pyproject.toml`
ARG CUDA_VERSION=12.4
ARG MAMBA_VERSION=24.3.0-0
ARG CUDA_CHANNEL=nvidia
ARG INSTALL_CHANNEL=pytorch
# Automatically set by buildx
ARG TARGETPLATFORM

# Put the conda installation first on PATH (key=value form; the
# space-separated `ENV key value` form is deprecated).
ENV PATH=/opt/conda/bin:$PATH

RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
        build-essential \
        ca-certificates \
        ccache \
        curl \
        git && \
        rm -rf /var/lib/apt/lists/*

# Install conda
# translating Docker's TARGETPLATFORM into mamba arches
RUN case ${TARGETPLATFORM} in \
         "linux/arm64")  MAMBA_ARCH=aarch64  ;; \
         *)              MAMBA_ARCH=x86_64   ;; \
    esac && \
    curl -fsSL -v -o ~/mambaforge.sh "https://github.com/conda-forge/miniforge/releases/download/${MAMBA_VERSION}/Mambaforge-${MAMBA_VERSION}-Linux-${MAMBA_ARCH}.sh"
RUN chmod +x ~/mambaforge.sh && \
    bash ~/mambaforge.sh -b -p /opt/conda && \
    rm ~/mambaforge.sh

# Install pytorch
# On arm64 we exit with an error code
# pytorch-cuda is pinned to the major.minor part of CUDA_VERSION.
RUN case ${TARGETPLATFORM} in \
         "linux/arm64")  exit 1 ;; \
         *)              /opt/conda/bin/conda update -y conda &&  \
                         /opt/conda/bin/conda install -c "${INSTALL_CHANNEL}" -c "${CUDA_CHANNEL}" -y "python=${PYTHON_VERSION}" "pytorch=$PYTORCH_VERSION" "pytorch-cuda=$(echo $CUDA_VERSION | cut -d'.' -f 1-2)"  ;; \
    esac && \
    /opt/conda/bin/conda clean -ya

# CUDA kernels builder image
# Common parent of the per-kernel builder stages: adds ninja and cmake on top
# of the PyTorch install and sets the default CUDA architecture list.
FROM pytorch-install AS kernel-builder

ARG MAX_JOBS=8
ENV TORCH_CUDA_ARCH_LIST="8.0;8.6;9.0+PTX"

RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
        ninja-build cmake \
        && rm -rf /var/lib/apt/lists/*

# Build Flash Attention CUDA kernels
FROM kernel-builder AS flash-att-builder

WORKDIR /usr/src

COPY server/Makefile-flash-att Makefile

# Build specific version of flash attention
RUN make build-flash-attention

# Build Flash Attention v2 CUDA kernels
FROM kernel-builder AS flash-att-v2-builder

WORKDIR /usr/src

COPY server/Makefile-flash-att-v2 Makefile

# Build specific version of flash attention v2
RUN make build-flash-attention-v2-cuda

# Build Transformers exllama kernels
FROM kernel-builder AS exllama-kernels-builder
WORKDIR /usr/src
COPY server/exllama_kernels/ .

# Build the kernels with the setup.py copied from server/exllama_kernels
RUN python setup.py build

# Build Transformers exllamav2 kernels
FROM kernel-builder AS exllamav2-kernels-builder
WORKDIR /usr/src
# The source is a regular file, so it is written without a trailing slash,
# matching the Makefile copies in the other kernel builder stages.
COPY server/Makefile-exllamav2 Makefile

# Build specific version of exllamav2
RUN make build-exllamav2

# Build Transformers awq kernels
FROM kernel-builder AS awq-kernels-builder
WORKDIR /usr/src
COPY server/Makefile-awq Makefile
# Build specific version of awq
RUN make build-awq

# Build eetq kernels
FROM kernel-builder AS eetq-kernels-builder
WORKDIR /usr/src
COPY server/Makefile-eetq Makefile
# Build specific version of eetq
RUN make build-eetq

# Build Lorax Punica kernels
FROM kernel-builder AS lorax-punica-builder
WORKDIR /usr/src
COPY server/Makefile-lorax-punica Makefile
# Build specific version of lorax punica
# TORCH_CUDA_ARCH_LIST is overridden for this command only.
RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" make build-lorax-punica

# Build Transformers CUDA kernels
FROM kernel-builder AS custom-kernels-builder
WORKDIR /usr/src
COPY server/custom_kernels/ .
# Build the kernels with the setup.py copied from server/custom_kernels
RUN python setup.py build

# Build FBGEMM CUDA kernels
FROM kernel-builder AS fbgemm-builder

WORKDIR /usr/src

COPY server/Makefile-fbgemm Makefile

# Build specific version of fbgemm
RUN make build-fbgemm

# Build vllm CUDA kernels
FROM kernel-builder AS vllm-builder

WORKDIR /usr/src

# Replaces the architecture list inherited from kernel-builder for this stage.
ENV TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX"

COPY server/Makefile-vllm Makefile

# Build specific version of vllm
RUN make build-vllm-cuda

# Build mamba kernels
FROM kernel-builder AS mamba-builder
WORKDIR /usr/src
COPY server/Makefile-selective-scan Makefile
# Run the `build-all` target of the selective-scan Makefile
RUN make build-all

# Build flashinfer
FROM kernel-builder AS flashinfer-builder
WORKDIR /usr/src
COPY server/Makefile-flashinfer Makefile
# Run the `install-flashinfer` target of the flashinfer Makefile
RUN make install-flashinfer

# Text Generation Inference base image
FROM nvidia/cuda:12.1.0-base-ubuntu22.04 AS base

# Conda env
ENV PATH=/opt/conda/bin:$PATH \
    CONDA_PREFIX=/opt/conda

# Text Generation Inference base env
ENV HF_HOME=/data \
    HF_HUB_ENABLE_HF_TRANSFER=1 \
    PORT=80

WORKDIR /usr/src

# OS packages for the runtime image; apt lists are removed in the same layer.
RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
        libssl-dev \
        ca-certificates \
        make \
        curl \
        git \
        && rm -rf /var/lib/apt/lists/*

# Copy conda with PyTorch installed
COPY --from=pytorch-install /opt/conda /opt/conda

# Copy build artifacts from flash attention builder
COPY --from=flash-att-builder /usr/src/flash-attention/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages
COPY --from=flash-att-builder /usr/src/flash-attention/csrc/layer_norm/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages
COPY --from=flash-att-builder /usr/src/flash-attention/csrc/rotary/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages

# Copy build artifacts from flash attention v2 builder
COPY --from=flash-att-v2-builder /opt/conda/lib/python3.11/site-packages/flash_attn_2_cuda.cpython-311-x86_64-linux-gnu.so /opt/conda/lib/python3.11/site-packages

# Copy build artifacts from custom kernels builder
COPY --from=custom-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages
# Copy build artifacts from exllama kernels builder
COPY --from=exllama-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages
# Copy build artifacts from exllamav2 kernels builder
COPY --from=exllamav2-kernels-builder /usr/src/exllamav2/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages
# Copy build artifacts from awq kernels builder
COPY --from=awq-kernels-builder /usr/src/llm-awq/awq/kernels/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages
# Copy build artifacts from eetq kernels builder
COPY --from=eetq-kernels-builder /usr/src/eetq/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages
# Copy build artifacts from lorax punica kernels builder
COPY --from=lorax-punica-builder /usr/src/lorax-punica/server/punica_kernels/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages
# Copy build artifacts from fbgemm builder
COPY --from=fbgemm-builder /usr/src/fbgemm/fbgemm_gpu/_skbuild/linux-x86_64-3.11/cmake-install /opt/conda/lib/python3.11/site-packages
# Copy build artifacts from vllm builder
COPY --from=vllm-builder /usr/src/vllm/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages
# Copy build artifacts from mamba builder
COPY --from=mamba-builder /usr/src/mamba/build/lib.linux-x86_64-cpython-311/ /opt/conda/lib/python3.11/site-packages
COPY --from=mamba-builder /usr/src/causal-conv1d/build/lib.linux-x86_64-cpython-311/ /opt/conda/lib/python3.11/site-packages
# Copy the installed flashinfer package from flashinfer builder
COPY --from=flashinfer-builder /opt/conda/lib/python3.11/site-packages/flashinfer/ /opt/conda/lib/python3.11/site-packages/flashinfer/

# Install flash-attention dependencies
RUN pip install einops --no-cache-dir

# Install server
COPY proto proto
COPY server server
COPY server/Makefile server/Makefile
# Every pip invocation passes --no-cache-dir so the download cache is not
# stored in this runtime layer.
RUN cd server && \
    make gen-server && \
    pip install -r requirements_cuda.txt --no-cache-dir && \
    pip install ".[bnb, accelerate, marlin, moe, quantize, peft, outlines]" --no-cache-dir && \
    pip install nvidia-nccl-cu12==2.22.3 --no-cache-dir

# Preload the NCCL library installed by the nvidia-nccl-cu12 pip package above.
ENV LD_PRELOAD=/opt/conda/lib/python3.11/site-packages/nvidia/nccl/lib/libnccl.so.2
# Required to find libpython within the rust binaries
ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/opt/conda/lib/"
# This is needed because exl2 tries to load flash-attn
# And fails with our builds.
ENV EXLLAMA_NO_FLASH_ATTN=1

# Deps before the binaries
# The binaries change on every build given we burn the SHA into them
# The deps change less often.
RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
        build-essential \
        g++ \
        && rm -rf /var/lib/apt/lists/*

# Install benchmarker
COPY --from=builder /usr/src/target/release-opt/text-generation-benchmark /usr/local/bin/text-generation-benchmark
# Install router
COPY --from=builder /usr/src/target/release-opt/text-generation-router /usr/local/bin/text-generation-router
# Install launcher
COPY --from=builder /usr/src/target/release-opt/text-generation-launcher /usr/local/bin/text-generation-launcher

# AWS Sagemaker compatible image
FROM base AS sagemaker

# Relative paths resolve against the WORKDIR inherited from the base stage.
COPY sagemaker-entrypoint.sh entrypoint.sh
RUN chmod +x entrypoint.sh

ENTRYPOINT ["./entrypoint.sh"]

# Final image
# Unnamed last stage: this is what a build without an explicit target produces.
FROM base

COPY ./tgi-entrypoint.sh /tgi-entrypoint.sh
RUN chmod +x /tgi-entrypoint.sh

ENTRYPOINT ["/tgi-entrypoint.sh"]
# CMD ["--json-output"]