# Rust builder
FROM lukemathwalker/cargo-chef:latest-rust-1.79 AS chef
WORKDIR /usr/src

ARG CARGO_REGISTRIES_CRATES_IO_PROTOCOL=sparse
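# The sparse protocol above lets cargo fetch the crates.io index lazily, speeding up dependency resolution.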

FROM chef AS planner
COPY Cargo.lock Cargo.lock
COPY Cargo.toml Cargo.toml
COPY rust-toolchain.toml rust-toolchain.toml
COPY proto proto
COPY benchmark benchmark
COPY router router
COPY backends backends
COPY launcher launcher
RUN cargo chef prepare --recipe-path recipe.json

FROM chef AS builder
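# protoc is needed to compile the gRPC definitions in `proto` during the Rust build.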

RUN PROTOC_ZIP=protoc-21.12-linux-x86_64.zip && \
    curl -OL https://github.com/protocolbuffers/protobuf/releases/download/v21.12/$PROTOC_ZIP && \
    unzip -o $PROTOC_ZIP -d /usr/local bin/protoc && \
    unzip -o $PROTOC_ZIP -d /usr/local 'include/*' && \
    rm -f $PROTOC_ZIP

COPY --from=planner /usr/src/recipe.json recipe.json
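# Cooking the recipe compiles only the dependencies recorded in recipe.json, so this layer stays cached until Cargo.toml/Cargo.lock change.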
RUN cargo chef cook --profile release-opt --recipe-path recipe.json

ARG GIT_SHA
ARG DOCKER_LABEL

COPY Cargo.toml Cargo.toml
COPY rust-toolchain.toml rust-toolchain.toml
COPY proto proto
COPY benchmark benchmark
COPY router router
COPY backends backends
COPY launcher launcher
RUN cargo build --profile release-opt

# Python builder
# Adapted from: https://github.com/pytorch/pytorch/blob/master/Dockerfile
FROM nvidia/cuda:12.1.0-devel-ubuntu22.04 AS pytorch-install

# NOTE: When updating PyTorch version, beware to remove `pip install nvidia-nccl-cu12==2.22.3` below in the Dockerfile. Context: https://github.com/huggingface/text-generation-inference/pull/2099
ARG PYTORCH_VERSION=2.4.0

ARG PYTHON_VERSION=3.10
# Keep in sync with `server/pyproject.toml`
ARG CUDA_VERSION=12.1
ARG MAMBA_VERSION=24.3.0-0
ARG CUDA_CHANNEL=nvidia
ARG INSTALL_CHANNEL=pytorch
# Automatically set by buildx
ARG TARGETPLATFORM

ENV PATH=/opt/conda/bin:$PATH

RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
        build-essential \
        ca-certificates \
        ccache \
        curl \
        git && \
        rm -rf /var/lib/apt/lists/*

# Install conda
# translating Docker's TARGETPLATFORM into mamba arches
RUN case ${TARGETPLATFORM} in \
         "linux/arm64")  MAMBA_ARCH=aarch64  ;; \
         *)              MAMBA_ARCH=x86_64   ;; \
    esac && \
    curl -fsSL -o ~/mambaforge.sh "https://github.com/conda-forge/miniforge/releases/download/${MAMBA_VERSION}/Mambaforge-${MAMBA_VERSION}-Linux-${MAMBA_ARCH}.sh"
RUN chmod +x ~/mambaforge.sh && \
    bash ~/mambaforge.sh -b -p /opt/conda && \
    rm ~/mambaforge.sh

# Install pytorch
# On arm64 we exit with an error code
RUN case ${TARGETPLATFORM} in \
         "linux/arm64")  exit 1 ;; \
         *)              /opt/conda/bin/conda update -y conda &&  \
                         /opt/conda/bin/conda install -c "${INSTALL_CHANNEL}" -c "${CUDA_CHANNEL}" -y "python=${PYTHON_VERSION}" "pytorch=$PYTORCH_VERSION" "pytorch-cuda=$(echo $CUDA_VERSION | cut -d'.' -f 1-2)"  ;; \
    esac && \
    /opt/conda/bin/conda clean -ya

# CUDA kernels builder image
FROM pytorch-install AS kernel-builder

ARG MAX_JOBS=8
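# MAX_JOBS is presumably picked up by the kernel builds below to cap parallel compile jobs; lower it if the build runs out of memory.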

RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
        ninja-build cmake \
        && rm -rf /var/lib/apt/lists/*

# Build Flash Attention CUDA kernels
FROM kernel-builder AS flash-att-builder

WORKDIR /usr/src

COPY server/Makefile-flash-att Makefile

# Build specific version of flash attention
RUN make build-flash-attention

# Build Flash Attention v2 CUDA kernels
FROM kernel-builder AS flash-att-v2-builder

WORKDIR /usr/src

COPY server/Makefile-flash-att-v2 Makefile

# Build specific version of flash attention v2
RUN make build-flash-attention-v2-cuda

# Build Transformers exllama kernels
FROM kernel-builder AS exllama-kernels-builder
WORKDIR /usr/src
COPY server/exllama_kernels/ .

RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" python setup.py build

# Build Transformers exllamav2 kernels
FROM kernel-builder AS exllamav2-kernels-builder
WORKDIR /usr/src
COPY server/exllamav2_kernels/ .

# Build exllamav2 kernels
RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" python setup.py build

# Build Transformers awq kernels
FROM kernel-builder AS awq-kernels-builder
WORKDIR /usr/src
COPY server/Makefile-awq Makefile
# Build specific version of awq kernels
RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" make build-awq

# Build eetq kernels
FROM kernel-builder AS eetq-kernels-builder
WORKDIR /usr/src
COPY server/Makefile-eetq Makefile
# Build specific version of eetq kernels
RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" make build-eetq

# Build Lorax Punica kernels
FROM kernel-builder AS lorax-punica-builder
WORKDIR /usr/src
COPY server/Makefile-lorax-punica Makefile
# Build specific version of lorax punica kernels
RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" make build-lorax-punica

# Build Transformers CUDA kernels
FROM kernel-builder AS custom-kernels-builder
WORKDIR /usr/src
COPY server/custom_kernels/ .
# Build custom CUDA kernels
RUN python setup.py build

# Build FBGEMM CUDA kernels
FROM kernel-builder AS fbgemm-builder

WORKDIR /usr/src

COPY server/Makefile-fbgemm Makefile

RUN make build-fbgemm

# Build vllm CUDA kernels
FROM kernel-builder AS vllm-builder

WORKDIR /usr/src

ENV TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX"
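# Restricts the GPU architectures the vllm kernels are compiled for, instead of relying on autodetection.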

COPY server/Makefile-vllm Makefile

# Build specific version of vllm
RUN make build-vllm-cuda

# Build mamba kernels
FROM kernel-builder AS mamba-builder
WORKDIR /usr/src
COPY server/Makefile-selective-scan Makefile
RUN make build-all

# Text Generation Inference base image
FROM nvidia/cuda:12.1.0-base-ubuntu22.04 AS base

# Conda env
ENV PATH=/opt/conda/bin:$PATH \
    CONDA_PREFIX=/opt/conda

# Text Generation Inference base env
ENV HF_HOME=/data \
    HF_HUB_ENABLE_HF_TRANSFER=1 \
    PORT=80

WORKDIR /usr/src

RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
        libssl-dev \
        ca-certificates \
        make \
        curl \
        git \
        && rm -rf /var/lib/apt/lists/*

# Copy conda with PyTorch installed
COPY --from=pytorch-install /opt/conda /opt/conda

# Copy build artifacts from flash attention builder
COPY --from=flash-att-builder /usr/src/flash-attention/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
COPY --from=flash-att-builder /usr/src/flash-attention/csrc/layer_norm/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
COPY --from=flash-att-builder /usr/src/flash-attention/csrc/rotary/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages

# Copy build artifacts from flash attention v2 builder
COPY --from=flash-att-v2-builder /opt/conda/lib/python3.10/site-packages/flash_attn_2_cuda.cpython-310-x86_64-linux-gnu.so /opt/conda/lib/python3.10/site-packages

# Copy build artifacts from custom kernels builder
COPY --from=custom-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
# Copy build artifacts from exllama kernels builder
COPY --from=exllama-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
# Copy build artifacts from exllamav2 kernels builder
COPY --from=exllamav2-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
# Copy build artifacts from awq kernels builder
COPY --from=awq-kernels-builder /usr/src/llm-awq/awq/kernels/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
# Copy build artifacts from eetq kernels builder
COPY --from=eetq-kernels-builder /usr/src/eetq/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
# Copy build artifacts from fbgemm builder
COPY --from=fbgemm-builder /usr/src/fbgemm/fbgemm_gpu/_skbuild/linux-x86_64-3.10/cmake-install /opt/conda/lib/python3.10/site-packages
# Copy build artifacts from vllm builder
COPY --from=vllm-builder /usr/src/vllm/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
# Copy build artifacts from mamba builder
COPY --from=mamba-builder /usr/src/mamba/build/lib.linux-x86_64-cpython-310/ /opt/conda/lib/python3.10/site-packages
COPY --from=mamba-builder /usr/src/causal-conv1d/build/lib.linux-x86_64-cpython-310/ /opt/conda/lib/python3.10/site-packages

# Install flash-attention dependencies
RUN pip install einops --no-cache-dir

# Install server
COPY proto proto
COPY server server
COPY server/Makefile server/Makefile
RUN cd server && \
    make gen-server && \
    pip install -r requirements_cuda.txt && \
    pip install ".[bnb, accelerate, marlin, quantize, peft, outlines]" --no-cache-dir && \
    pip install nvidia-nccl-cu12==2.22.3

ENV LD_PRELOAD=/opt/conda/lib/python3.10/site-packages/nvidia/nccl/lib/libnccl.so.2
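# Keep the pinned nvidia-nccl-cu12 wheel above and this LD_PRELOAD in sync; see the NOTE next to PYTORCH_VERSION and https://github.com/huggingface/text-generation-inference/pull/2099 before changing either.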

# Deps before the binaries
# The binaries change on every build given we burn the SHA into them
# The deps change less often.
RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
        build-essential \
        g++ \
        && rm -rf /var/lib/apt/lists/*

# Install benchmarker
COPY --from=builder /usr/src/target/release-opt/text-generation-benchmark /usr/local/bin/text-generation-benchmark
# Install router
COPY --from=builder /usr/src/target/release-opt/text-generation-router /usr/local/bin/text-generation-router
# Install launcher
COPY --from=builder /usr/src/target/release-opt/text-generation-launcher /usr/local/bin/text-generation-launcher

# AWS Sagemaker compatible image
FROM base AS sagemaker

COPY sagemaker-entrypoint.sh entrypoint.sh
RUN chmod +x entrypoint.sh

ENTRYPOINT ["./entrypoint.sh"]

# Final image
FROM base

COPY ./tgi-entrypoint.sh /tgi-entrypoint.sh
RUN chmod +x /tgi-entrypoint.sh

ENTRYPOINT ["/tgi-entrypoint.sh"]
# CMD ["--json-output"]
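# Illustrative builds (tags and argument values are examples only):
#   docker build --build-arg GIT_SHA=$(git rev-parse HEAD) --build-arg DOCKER_LABEL=local -t text-generation-inference:local .
#   docker build --target sagemaker -t text-generation-inference:sagemaker .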