# syntax=docker/dockerfile:1

# Rust builder.
# cargo-chef lets us build and cache workspace dependencies separately from
# the application sources: the "recipe" is a fingerprint of the manifests.
FROM lukemathwalker/cargo-chef:latest-rust-1.80 AS chef

WORKDIR /usr/src

# Use the sparse crates.io index protocol for faster registry updates.
ARG CARGO_REGISTRIES_CRATES_IO_PROTOCOL=sparse

# Planner stage: only computes the cargo-chef recipe from the manifests and
# sources. Nothing from this stage reaches the final image except recipe.json.
FROM chef AS planner
COPY Cargo.lock Cargo.lock
COPY Cargo.toml Cargo.toml
COPY rust-toolchain.toml rust-toolchain.toml
COPY proto proto
COPY benchmark benchmark
COPY router router
COPY backends backends
COPY launcher launcher
RUN cargo chef prepare --recipe-path recipe.json
FROM chef AS builder

# protoc is required to compile the gRPC .proto definitions.
# Pinned release; -f makes curl fail fast on HTTP errors instead of
# saving an error page as the zip.
RUN PROTOC_ZIP=protoc-21.12-linux-x86_64.zip && \
    curl -fOL https://github.com/protocolbuffers/protobuf/releases/download/v21.12/$PROTOC_ZIP && \
    unzip -o $PROTOC_ZIP -d /usr/local bin/protoc && \
    unzip -o $PROTOC_ZIP -d /usr/local 'include/*' && \
    rm -f $PROTOC_ZIP

# Build only the dependencies first; this layer is cached until the
# recipe (i.e. the dependency set) changes.
COPY --from=planner /usr/src/recipe.json recipe.json
RUN cargo chef cook --profile release-opt --recipe-path recipe.json

# These change on every build, so declare them below the dependency
# layers to avoid invalidating the cook cache.
ARG GIT_SHA
ARG DOCKER_LABEL

# Copy the lockfile too so `cargo build` uses the pinned dependency
# versions instead of regenerating Cargo.lock.
COPY Cargo.lock Cargo.lock
COPY Cargo.toml Cargo.toml
COPY rust-toolchain.toml rust-toolchain.toml
COPY proto proto
COPY benchmark benchmark
COPY router router
COPY backends backends
COPY launcher launcher
RUN cargo build --profile release-opt
# Python builder
# Adapted from: https://github.com/pytorch/pytorch/blob/master/Dockerfile
FROM nvidia/cuda:12.4.1-devel-ubuntu22.04 AS pytorch-install

# NOTE: When updating PyTorch version, beware to remove `pip install nvidia-nccl-cu12==2.22.3` below in the Dockerfile. Context: https://github.com/huggingface/text-generation-inference/pull/2099
ARG PYTORCH_VERSION=2.4.0

ARG PYTHON_VERSION=3.10
# Keep in sync with `server/pyproject.toml`
ARG CUDA_VERSION=12.4
ARG MAMBA_VERSION=24.3.0-0
ARG CUDA_CHANNEL=nvidia
ARG INSTALL_CHANNEL=pytorch
# Automatically set by buildx
ARG TARGETPLATFORM

ENV PATH=/opt/conda/bin:$PATH

RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
        build-essential \
        ca-certificates \
        ccache \
        curl \
        git && \
        rm -rf /var/lib/apt/lists/*

# Install conda
# translating Docker's TARGETPLATFORM into mamba arches
RUN case ${TARGETPLATFORM} in \
         "linux/arm64")  MAMBA_ARCH=aarch64  ;; \
         *)              MAMBA_ARCH=x86_64   ;; \
    esac && \
    curl -fsSL -o ~/mambaforge.sh "https://github.com/conda-forge/miniforge/releases/download/${MAMBA_VERSION}/Mambaforge-${MAMBA_VERSION}-Linux-${MAMBA_ARCH}.sh"
RUN chmod +x ~/mambaforge.sh && \
    bash ~/mambaforge.sh -b -p /opt/conda && \
    rm ~/mambaforge.sh

# Install pytorch
# On arm64 we exit with an error code
RUN case ${TARGETPLATFORM} in \
         "linux/arm64")  exit 1 ;; \
         *)              /opt/conda/bin/conda update -y conda &&  \
                         /opt/conda/bin/conda install -c "${INSTALL_CHANNEL}" -c "${CUDA_CHANNEL}" -y "python=${PYTHON_VERSION}" "pytorch=$PYTORCH_VERSION" "pytorch-cuda=$(echo $CUDA_VERSION | cut -d'.' -f 1-2)"  ;; \
    esac && \
    /opt/conda/bin/conda clean -ya
# CUDA kernels builder image: shared parent for all the per-kernel build
# stages below, so they inherit the toolchain and arch list once.
FROM pytorch-install AS kernel-builder

ARG MAX_JOBS=8
ENV TORCH_CUDA_ARCH_LIST="8.0;8.6;9.0+PTX"

RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
        ninja-build cmake \
        && rm -rf /var/lib/apt/lists/*
# Build Flash Attention CUDA kernels
FROM kernel-builder AS flash-att-builder

WORKDIR /usr/src

COPY server/Makefile-flash-att Makefile

# Build specific version of flash attention
RUN make build-flash-attention
# Build Flash Attention v2 CUDA kernels
FROM kernel-builder AS flash-att-v2-builder

WORKDIR /usr/src

COPY server/Makefile-flash-att-v2 Makefile

# Build specific version of flash attention v2
RUN make build-flash-attention-v2-cuda
# Build Transformers exllama kernels
FROM kernel-builder AS exllama-kernels-builder
WORKDIR /usr/src
COPY server/exllama_kernels/ .

RUN python setup.py build
# Build Transformers exllamav2 kernels
FROM kernel-builder AS exllamav2-kernels-builder
WORKDIR /usr/src
# Source is a single Makefile, so no trailing slash on the COPY source.
COPY server/Makefile-exllamav2 Makefile

# Build specific version of transformers
RUN make build-exllamav2
# Build Transformers awq kernels
FROM kernel-builder AS awq-kernels-builder
WORKDIR /usr/src
COPY server/Makefile-awq Makefile
# Build specific version of transformers
RUN make build-awq

139
# Build eetq kernels
140
FROM kernel-builder AS eetq-kernels-builder
141
142
143
WORKDIR /usr/src
COPY server/Makefile-eetq Makefile
# Build specific version of transformers
144
RUN make build-eetq
145

# Build Lorax Punica kernels
FROM kernel-builder AS lorax-punica-builder
WORKDIR /usr/src
COPY server/Makefile-lorax-punica Makefile
# Build specific version of transformers
# (narrower arch list than the kernel-builder default, overridden inline)
RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" make build-lorax-punica
# Build Transformers CUDA kernels
FROM kernel-builder AS custom-kernels-builder
WORKDIR /usr/src
COPY server/custom_kernels/ .
# Build specific version of transformers
RUN python setup.py build
# Build FBGEMM CUDA kernels
FROM kernel-builder AS fbgemm-builder

WORKDIR /usr/src

COPY server/Makefile-fbgemm Makefile

RUN make build-fbgemm
# Build vllm CUDA kernels
FROM kernel-builder AS vllm-builder

WORKDIR /usr/src

# vllm targets a wider set of architectures than the kernel-builder default.
ENV TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX"

COPY server/Makefile-vllm Makefile

# Build specific version of vllm
RUN make build-vllm-cuda
# Build mamba kernels
FROM kernel-builder AS mamba-builder
WORKDIR /usr/src
COPY server/Makefile-selective-scan Makefile
RUN make build-all
# Build flashinfer
FROM kernel-builder AS flashinfer-builder
WORKDIR /usr/src
COPY server/Makefile-flashinfer Makefile
RUN make install-flashinfer
# Text Generation Inference base image.
# Runtime uses the CUDA *base* flavour (driver libs only); all compiled
# artifacts are copied in from the builder stages above.
FROM nvidia/cuda:12.1.0-base-ubuntu22.04 AS base

# Conda env
ENV PATH=/opt/conda/bin:$PATH \
    CONDA_PREFIX=/opt/conda

# Text Generation Inference base env
ENV HF_HOME=/data \
    HF_HUB_ENABLE_HF_TRANSFER=1 \
    PORT=80

WORKDIR /usr/src

RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
        libssl-dev \
        ca-certificates \
        make \
        curl \
        git \
        && rm -rf /var/lib/apt/lists/*

# Copy conda with PyTorch installed
COPY --from=pytorch-install /opt/conda /opt/conda

218
# Copy build artifacts from flash attention builder
219
220
221
COPY --from=flash-att-builder /usr/src/flash-attention/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
COPY --from=flash-att-builder /usr/src/flash-attention/csrc/layer_norm/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
COPY --from=flash-att-builder /usr/src/flash-attention/csrc/rotary/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
222

223
# Copy build artifacts from flash attention v2 builder
224
COPY --from=flash-att-v2-builder /opt/conda/lib/python3.10/site-packages/flash_attn_2_cuda.cpython-310-x86_64-linux-gnu.so /opt/conda/lib/python3.10/site-packages
225

226
# Copy build artifacts from custom kernels builder
227
COPY --from=custom-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
228
# Copy build artifacts from exllama kernels builder
229
COPY --from=exllama-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
Nicolas Patry's avatar
Nicolas Patry committed
230
# Copy build artifacts from exllamav2 kernels builder
Nicolas Patry's avatar
Nicolas Patry committed
231
COPY --from=exllamav2-kernels-builder /usr/src/exllamav2/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
232
# Copy build artifacts from awq kernels builder
233
COPY --from=awq-kernels-builder /usr/src/llm-awq/awq/kernels/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
234
# Copy build artifacts from eetq kernels builder
235
COPY --from=eetq-kernels-builder /usr/src/eetq/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
236
237
# Copy build artifacts from lorax punica kernels builder
COPY --from=lorax-punica-builder /usr/src/lorax-punica/server/punica_kernels/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
238
239
240
# Copy build artifacts from fbgemm builder
COPY --from=fbgemm-builder /usr/src/fbgemm/fbgemm_gpu/_skbuild/linux-x86_64-3.10/cmake-install /opt/conda/lib/python3.10/site-packages
# Copy build artifacts from vllm builder
241
COPY --from=vllm-builder /usr/src/vllm/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
drbh's avatar
drbh committed
242
243
244
# Copy build artifacts from mamba builder
COPY --from=mamba-builder /usr/src/mamba/build/lib.linux-x86_64-cpython-310/ /opt/conda/lib/python3.10/site-packages
COPY --from=mamba-builder /usr/src/causal-conv1d/build/lib.linux-x86_64-cpython-310/ /opt/conda/lib/python3.10/site-packages
245
COPY --from=flashinfer-builder /opt/conda/lib/python3.10/site-packages/flashinfer/ /opt/conda/lib/python3.10/site-packages/flashinfer/
drbh's avatar
drbh committed
246

# Install flash-attention dependencies
RUN pip install einops --no-cache-dir

# Install server
COPY proto proto
COPY server server
COPY server/Makefile server/Makefile
RUN cd server && \
    make gen-server && \
    pip install -r requirements_cuda.txt --no-cache-dir && \
    pip install ".[bnb, accelerate, marlin, quantize, peft, outlines]" --no-cache-dir && \
    pip install nvidia-nccl-cu12==2.22.3 --no-cache-dir

# Force the pinned NCCL to be loaded ahead of whatever torch bundles.
ENV LD_PRELOAD=/opt/conda/lib/python3.10/site-packages/nvidia/nccl/lib/libnccl.so.2
# This is needed because exl2 tries to load flash-attn
# And fails with our builds.
ENV EXLLAMA_NO_FLASH_ATTN=1

# Deps before the binaries
# The binaries change on every build given we burn the SHA into them
# The deps change less often.
RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
        build-essential \
        g++ \
        && rm -rf /var/lib/apt/lists/*
# Install benchmarker
COPY --from=builder /usr/src/target/release-opt/text-generation-benchmark /usr/local/bin/text-generation-benchmark
# Install router
COPY --from=builder /usr/src/target/release-opt/text-generation-router /usr/local/bin/text-generation-router
# Install launcher
COPY --from=builder /usr/src/target/release-opt/text-generation-launcher /usr/local/bin/text-generation-launcher
# AWS Sagemaker compatible image
FROM base AS sagemaker

COPY sagemaker-entrypoint.sh entrypoint.sh
RUN chmod +x entrypoint.sh

ENTRYPOINT ["./entrypoint.sh"]
# Final image
FROM base

COPY ./tgi-entrypoint.sh /tgi-entrypoint.sh
RUN chmod +x /tgi-entrypoint.sh

ENTRYPOINT ["/tgi-entrypoint.sh"]
# CMD ["--json-output"]