# Rust builder
# Shared base for the Rust stages: both `planner` and `builder` below start FROM chef.
FROM lukemathwalker/cargo-chef:latest-rust-1.79 AS chef
WORKDIR /usr/src

# Intended to make Cargo use the sparse protocol for the crates.io registry.
# NOTE(review): confirm this ARG is visible to the cargo invocations in the derived stages.
ARG CARGO_REGISTRIES_CRATES_IO_PROTOCOL=sparse

# Planner stage: copies the workspace manifests and crates, then runs
# `cargo chef prepare` to write recipe.json for the builder stage.
FROM chef AS planner
COPY Cargo.lock Cargo.lock
COPY Cargo.toml Cargo.toml
COPY rust-toolchain.toml rust-toolchain.toml
COPY proto proto
COPY benchmark benchmark
COPY router router
COPY backends backends
COPY launcher launcher
RUN cargo chef prepare --recipe-path recipe.json

# Builder stage: compiles the Rust workspace with the `release-opt` profile.
FROM chef AS builder

# Install protoc 21.12: the binary and its bundled include files go under /usr/local.
RUN PROTOC_ZIP=protoc-21.12-linux-x86_64.zip && \
    curl -OL https://github.com/protocolbuffers/protobuf/releases/download/v21.12/$PROTOC_ZIP && \
    unzip -o $PROTOC_ZIP -d /usr/local bin/protoc && \
    unzip -o $PROTOC_ZIP -d /usr/local 'include/*' && \
    rm -f $PROTOC_ZIP

# Cook the recipe produced by the planner stage before copying the sources,
# so this layer can be reused while the recipe is unchanged.
COPY --from=planner /usr/src/recipe.json recipe.json
RUN cargo chef cook --profile release-opt --recipe-path recipe.json

# Build metadata arguments, declared after the dependency layer so that
# changing them does not invalidate it.
ARG GIT_SHA
ARG DOCKER_LABEL

COPY Cargo.toml Cargo.toml
COPY rust-toolchain.toml rust-toolchain.toml
COPY proto proto
COPY benchmark benchmark
COPY router router
COPY backends backends
COPY launcher launcher
RUN cargo build --profile release-opt

# Python builder
# Adapted from: https://github.com/pytorch/pytorch/blob/master/Dockerfile
FROM nvidia/cuda:12.4.1-devel-ubuntu22.04 AS pytorch-install

# NOTE: When updating PyTorch version, beware to remove `pip install nvidia-nccl-cu12==2.22.3` below in the Dockerfile. Context: https://github.com/huggingface/text-generation-inference/pull/2099
ARG PYTORCH_VERSION=2.4.0

ARG PYTHON_VERSION=3.10
# Keep in sync with `server/pyproject.toml`
ARG CUDA_VERSION=12.4
ARG MAMBA_VERSION=24.3.0-0
# Conda channels passed to `conda install` when installing PyTorch below.
ARG CUDA_CHANNEL=nvidia
ARG INSTALL_CHANNEL=pytorch
# Automatically set by buildx
ARG TARGETPLATFORM

# Put the conda installation first on PATH for the rest of this stage and
# for the kernel builder stages derived from it.
ENV PATH=/opt/conda/bin:$PATH

# OS-level build tooling for this stage; the apt lists are removed in the
# same layer so they are not kept in the image.
RUN apt-get update \
    && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
        build-essential \
        ca-certificates \
        ccache \
        curl \
        git \
    && rm -rf /var/lib/apt/lists/*

# Install conda
# Translate Docker's TARGETPLATFORM into the architecture name used by the
# Mambaforge release assets (anything other than linux/arm64 maps to x86_64).
# Download, install and delete the installer in a single RUN so the installer
# script is never committed to a layer.
RUN case ${TARGETPLATFORM} in \
         "linux/arm64")  MAMBA_ARCH=aarch64  ;; \
         *)              MAMBA_ARCH=x86_64   ;; \
    esac && \
    curl -fsSL -v -o ~/mambaforge.sh "https://github.com/conda-forge/miniforge/releases/download/${MAMBA_VERSION}/Mambaforge-${MAMBA_VERSION}-Linux-${MAMBA_ARCH}.sh" && \
    chmod +x ~/mambaforge.sh && \
    bash ~/mambaforge.sh -b -p /opt/conda && \
    rm ~/mambaforge.sh

# Install pytorch
# On arm64 we exit with an error code
# On every other platform: update conda, then install Python, PyTorch and the
# matching pytorch-cuda package. pytorch-cuda is pinned to the first two
# dot-separated fields (major.minor) of CUDA_VERSION.
RUN case ${TARGETPLATFORM} in \
         "linux/arm64")  exit 1 ;; \
         *)              /opt/conda/bin/conda update -y conda &&  \
                         /opt/conda/bin/conda install -c "${INSTALL_CHANNEL}" -c "${CUDA_CHANNEL}" -y "python=${PYTHON_VERSION}" "pytorch=$PYTORCH_VERSION" "pytorch-cuda=$(echo $CUDA_VERSION | cut -d'.' -f 1-2)"  ;; \
    esac && \
    /opt/conda/bin/conda clean -ya

# CUDA kernels builder image
# Common parent of the per-kernel builder stages below.
FROM pytorch-install AS kernel-builder

# NOTE(review): presumably caps parallel compile jobs in the kernel builds — confirm against the Makefiles.
ARG MAX_JOBS=8
# Default CUDA architecture list for kernel builds; some stages below override it.
ENV TORCH_CUDA_ARCH_LIST="8.0;8.6;9.0+PTX"

RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
        cmake \
        ninja-build \
        && rm -rf /var/lib/apt/lists/*

# Build Flash Attention CUDA kernels
FROM kernel-builder AS flash-att-builder

WORKDIR /usr/src

# The stage-specific Makefile is copied in as /usr/src/Makefile.
COPY server/Makefile-flash-att Makefile

# Build specific version of flash attention
RUN make build-flash-attention

# Build Flash Attention v2 CUDA kernels
FROM kernel-builder AS flash-att-v2-builder

WORKDIR /usr/src

# The stage-specific Makefile is copied in as /usr/src/Makefile.
COPY server/Makefile-flash-att-v2 Makefile

# Build specific version of flash attention v2
RUN make build-flash-attention-v2-cuda

# Build Transformers exllama kernels
FROM kernel-builder AS exllama-kernels-builder
WORKDIR /usr/src
# The kernel sources (including setup.py) are copied straight into the workdir.
COPY server/exllama_kernels/ .

RUN python setup.py build

# Build Transformers exllamav2 kernels
FROM kernel-builder AS exllamav2-kernels-builder
WORKDIR /usr/src
# The source is a single Makefile, so it is referenced without a trailing
# slash, matching the other kernel builder stages.
COPY server/Makefile-exllamav2 Makefile

# Build the exllamav2 kernels via the copied Makefile
RUN make build-exllamav2

# Build Transformers awq kernels
FROM kernel-builder AS awq-kernels-builder
WORKDIR /usr/src
COPY server/Makefile-awq Makefile
# Build the awq kernels via the copied Makefile
RUN make build-awq

# Build eetq kernels
FROM kernel-builder AS eetq-kernels-builder
WORKDIR /usr/src
COPY server/Makefile-eetq Makefile
# Build the eetq kernels via the copied Makefile
RUN make build-eetq

# Build Lorax Punica kernels
FROM kernel-builder AS lorax-punica-builder
WORKDIR /usr/src
COPY server/Makefile-lorax-punica Makefile
# Build the lorax punica kernels via the copied Makefile.
# TORCH_CUDA_ARCH_LIST is overridden for this command only; unlike the
# kernel-builder default it does not list 9.0.
RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" make build-lorax-punica

# Build Transformers CUDA kernels
FROM kernel-builder AS custom-kernels-builder
WORKDIR /usr/src
# The kernel sources (including setup.py) are copied straight into the workdir.
COPY server/custom_kernels/ .
# Build the custom kernels with setup.py
RUN python setup.py build

# Build FBGEMM CUDA kernels
FROM kernel-builder AS fbgemm-builder

WORKDIR /usr/src

# The stage-specific Makefile is copied in as /usr/src/Makefile.
COPY server/Makefile-fbgemm Makefile

RUN make build-fbgemm

# Build vllm CUDA kernels
FROM kernel-builder AS vllm-builder

WORKDIR /usr/src

# Replaces the kernel-builder default with a wider architecture list for this stage.
ENV TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX"

COPY server/Makefile-vllm Makefile

# Build specific version of vllm
RUN make build-vllm-cuda

# Build mamba kernels
FROM kernel-builder AS mamba-builder
WORKDIR /usr/src
# The selective-scan Makefile is copied in as /usr/src/Makefile; its
# `build-all` target is what this stage runs.
COPY server/Makefile-selective-scan Makefile
RUN make build-all

# Text Generation Inference base image
# NOTE(review): this runtime base is CUDA 12.1.0, while the build stages use
# nvidia/cuda:12.4.1-devel and CUDA_VERSION=12.4 — confirm the difference is intentional.
FROM nvidia/cuda:12.1.0-base-ubuntu22.04 AS base

# Conda env
ENV PATH=/opt/conda/bin:$PATH \
    CONDA_PREFIX=/opt/conda

# Text Generation Inference base env
ENV HF_HOME=/data \
    HF_HUB_ENABLE_HF_TRANSFER=1 \
    PORT=80

WORKDIR /usr/src

# Runtime OS packages; the apt lists are removed in the same layer.
RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
        libssl-dev \
        ca-certificates \
        make \
        curl \
        git \
        && rm -rf /var/lib/apt/lists/*

# Copy conda with PyTorch installed
COPY --from=pytorch-install /opt/conda /opt/conda

# The artifact paths below hard-code x86_64 and CPython 3.10; keep them in
# step with PYTHON_VERSION in the pytorch-install stage.

# Copy build artifacts from flash attention builder
COPY --from=flash-att-builder /usr/src/flash-attention/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
COPY --from=flash-att-builder /usr/src/flash-attention/csrc/layer_norm/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
COPY --from=flash-att-builder /usr/src/flash-attention/csrc/rotary/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages

# Copy build artifacts from flash attention v2 builder
COPY --from=flash-att-v2-builder /opt/conda/lib/python3.10/site-packages/flash_attn_2_cuda.cpython-310-x86_64-linux-gnu.so /opt/conda/lib/python3.10/site-packages

# Copy build artifacts from custom kernels builder
COPY --from=custom-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
# Copy build artifacts from exllama kernels builder
COPY --from=exllama-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
# Copy build artifacts from exllamav2 kernels builder
COPY --from=exllamav2-kernels-builder /usr/src/exllamav2/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
# Copy build artifacts from awq kernels builder
COPY --from=awq-kernels-builder /usr/src/llm-awq/awq/kernels/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
# Copy build artifacts from eetq kernels builder
COPY --from=eetq-kernels-builder /usr/src/eetq/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
# Copy build artifacts from lorax punica kernels builder
COPY --from=lorax-punica-builder /usr/src/lorax-punica/server/punica_kernels/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
# Copy build artifacts from fbgemm builder
COPY --from=fbgemm-builder /usr/src/fbgemm/fbgemm_gpu/_skbuild/linux-x86_64-3.10/cmake-install /opt/conda/lib/python3.10/site-packages
# Copy build artifacts from vllm builder
COPY --from=vllm-builder /usr/src/vllm/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
# Copy build artifacts from mamba builder
COPY --from=mamba-builder /usr/src/mamba/build/lib.linux-x86_64-cpython-310/ /opt/conda/lib/python3.10/site-packages
COPY --from=mamba-builder /usr/src/causal-conv1d/build/lib.linux-x86_64-cpython-310/ /opt/conda/lib/python3.10/site-packages

# Install flash-attention dependencies
RUN pip install einops --no-cache-dir

# Install server
COPY proto proto
COPY server server
COPY server/Makefile server/Makefile
# Run the server Makefile's `gen-server` target, then install the CUDA
# requirements, the server package with its extras, and the pinned NCCL wheel.
# Every pip call uses --no-cache-dir so no download cache is left in the layer.
RUN cd server && \
    make gen-server && \
    pip install -r requirements_cuda.txt --no-cache-dir && \
    pip install ".[bnb, accelerate, marlin, quantize, peft, outlines]" --no-cache-dir && \
    pip install nvidia-nccl-cu12==2.22.3 --no-cache-dir

# Preload libnccl from the Python site-packages tree.
# NOTE(review): presumably the copy shipped by the nvidia-nccl-cu12 wheel installed above — confirm.
ENV LD_PRELOAD=/opt/conda/lib/python3.10/site-packages/nvidia/nccl/lib/libnccl.so.2
# This is needed because exl2 tries to load flash-attn
# And fails with our builds.
ENV EXLLAMA_NO_FLASH_ATTN=1

# Deps before the binaries
# The binaries change on every build given we burn the SHA into them
# The deps change less often.
RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
        build-essential \
        g++ \
        && rm -rf /var/lib/apt/lists/*

# The three Rust binaries come from the builder stage's release-opt output
# directory and are placed in /usr/local/bin.
# Install benchmarker
COPY --from=builder /usr/src/target/release-opt/text-generation-benchmark /usr/local/bin/text-generation-benchmark
# Install router
COPY --from=builder /usr/src/target/release-opt/text-generation-router /usr/local/bin/text-generation-router
# Install launcher
COPY --from=builder /usr/src/target/release-opt/text-generation-launcher /usr/local/bin/text-generation-launcher

# AWS Sagemaker compatible image
# Same contents as `base`, with a Sagemaker-specific entrypoint script.
FROM base AS sagemaker

# The script lands in the inherited workdir (/usr/src), which is why the
# ENTRYPOINT below can use a relative path.
COPY sagemaker-entrypoint.sh entrypoint.sh
RUN chmod +x entrypoint.sh

ENTRYPOINT ["./entrypoint.sh"]

# Final image
# Last stage in the file, so it is the default build target.
FROM base

COPY ./tgi-entrypoint.sh /tgi-entrypoint.sh
RUN chmod +x /tgi-entrypoint.sh

ENTRYPOINT ["/tgi-entrypoint.sh"]
# CMD ["--json-output"]