# Rust builder
# cargo-chef caches Rust dependency builds across Docker layers.
FROM lukemathwalker/cargo-chef:latest-rust-1.79 AS chef

WORKDIR /usr/src

# Use the sparse crates.io protocol for faster index updates.
ARG CARGO_REGISTRIES_CRATES_IO_PROTOCOL=sparse

# Planner stage: compute the cargo-chef recipe (dependency graph) so the
# builder stage can cook dependencies in a cacheable layer.
FROM chef AS planner
COPY Cargo.lock Cargo.lock
COPY Cargo.toml Cargo.toml
COPY rust-toolchain.toml rust-toolchain.toml
COPY proto proto
COPY benchmark benchmark
COPY router router
COPY launcher launcher
RUN cargo chef prepare --recipe-path recipe.json

FROM chef AS builder

# Install protoc (needed to compile the gRPC proto definitions).
# `-f` makes curl fail fast on an HTTP error instead of saving an error page
# that unzip would then choke on.
RUN PROTOC_ZIP=protoc-21.12-linux-x86_64.zip && \
    curl -fOL https://github.com/protocolbuffers/protobuf/releases/download/v21.12/$PROTOC_ZIP && \
    unzip -o $PROTOC_ZIP -d /usr/local bin/protoc && \
    unzip -o $PROTOC_ZIP -d /usr/local 'include/*' && \
    rm -f $PROTOC_ZIP

# Cook (pre-build) dependencies from the planner's recipe; this layer is
# reused as long as the dependency set does not change.
COPY --from=planner /usr/src/recipe.json recipe.json
RUN cargo chef cook --profile release-opt --recipe-path recipe.json

# GIT_SHA / DOCKER_LABEL are burned into the binaries; declared late so they
# only invalidate the layers below.
ARG GIT_SHA
ARG DOCKER_LABEL

COPY Cargo.toml Cargo.toml
COPY rust-toolchain.toml rust-toolchain.toml
COPY proto proto
COPY benchmark benchmark
COPY router router
COPY launcher launcher
RUN cargo build --profile release-opt

# Python builder
# Adapted from: https://github.com/pytorch/pytorch/blob/master/Dockerfile
FROM nvidia/cuda:12.1.0-devel-ubuntu22.04 AS pytorch-install

# NOTE: When updating PyTorch version, beware to remove `pip install nvidia-nccl-cu12==2.22.3` below in the Dockerfile. Context: https://github.com/huggingface/text-generation-inference/pull/2099
ARG PYTORCH_VERSION=2.3.0

ARG PYTHON_VERSION=3.10
# Keep in sync with `server/pyproject.toml`
ARG CUDA_VERSION=12.1
ARG MAMBA_VERSION=24.3.0-0
ARG CUDA_CHANNEL=nvidia
ARG INSTALL_CHANNEL=pytorch
# Automatically set by buildx
ARG TARGETPLATFORM

# key=value form; the legacy space-separated ENV form is deprecated.
ENV PATH=/opt/conda/bin:$PATH

RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
        build-essential \
        ca-certificates \
        ccache \
        curl \
        git && \
        rm -rf /var/lib/apt/lists/*

# Install conda
# translating Docker's TARGETPLATFORM into mamba arches
# (drop the conflicting `-O` and verbose `-v`; `-o` alone names the output)
RUN case ${TARGETPLATFORM} in \
         "linux/arm64")  MAMBA_ARCH=aarch64  ;; \
         *)              MAMBA_ARCH=x86_64   ;; \
    esac && \
    curl -fsSL -o ~/mambaforge.sh "https://github.com/conda-forge/miniforge/releases/download/${MAMBA_VERSION}/Mambaforge-${MAMBA_VERSION}-Linux-${MAMBA_ARCH}.sh"
RUN chmod +x ~/mambaforge.sh && \
    bash ~/mambaforge.sh -b -p /opt/conda && \
    rm ~/mambaforge.sh

# Install pytorch
# On arm64 we exit with an error code
RUN case ${TARGETPLATFORM} in \
         "linux/arm64")  exit 1 ;; \
         *)              /opt/conda/bin/conda update -y conda &&  \
                         /opt/conda/bin/conda install -c "${INSTALL_CHANNEL}" -c "${CUDA_CHANNEL}" -y "python=${PYTHON_VERSION}" "pytorch=$PYTORCH_VERSION" "pytorch-cuda=$(echo $CUDA_VERSION | cut -d'.' -f 1-2)"  ;; \
    esac && \
    /opt/conda/bin/conda clean -ya

# CUDA kernels builder image
# Shared base for every custom-kernel build stage below.
FROM pytorch-install AS kernel-builder

# Cap parallel compile jobs to keep memory usage bounded.
ARG MAX_JOBS=8

RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
        ninja-build cmake \
        && rm -rf /var/lib/apt/lists/*

# Build Flash Attention CUDA kernels
FROM kernel-builder AS flash-att-builder

WORKDIR /usr/src

COPY server/Makefile-flash-att Makefile

# Build specific version of flash attention
RUN make build-flash-attention

# Build Flash Attention v2 CUDA kernels
FROM kernel-builder AS flash-att-v2-builder

WORKDIR /usr/src

COPY server/Makefile-flash-att-v2 Makefile

# Build specific version of flash attention v2
RUN make build-flash-attention-v2-cuda

# Build Transformers exllama kernels
FROM kernel-builder AS exllama-kernels-builder

WORKDIR /usr/src
COPY server/exllama_kernels/ .

RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" python setup.py build

# Build Transformers exllamav2 kernels
FROM kernel-builder AS exllamav2-kernels-builder

WORKDIR /usr/src
COPY server/exllamav2_kernels/ .

# Build specific version of exllamav2 kernels
RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" python setup.py build

# Build Transformers awq kernels
FROM kernel-builder AS awq-kernels-builder

WORKDIR /usr/src
COPY server/Makefile-awq Makefile
# Build specific version of awq kernels
RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" make build-awq

# Build eetq kernels
FROM kernel-builder AS eetq-kernels-builder

WORKDIR /usr/src
COPY server/Makefile-eetq Makefile
# Build specific version of eetq kernels
RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" make build-eetq

# Build marlin kernels
FROM kernel-builder AS marlin-kernels-builder

WORKDIR /usr/src
COPY server/marlin/ .
# Build specific version of marlin kernels
RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" python setup.py build

# Build Lorax Punica kernels
FROM kernel-builder AS lorax-punica-builder

WORKDIR /usr/src
COPY server/Makefile-lorax-punica Makefile
# Build specific version of lorax punica kernels
RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" make build-lorax-punica

# Build Transformers CUDA kernels
FROM kernel-builder AS custom-kernels-builder

WORKDIR /usr/src
COPY server/custom_kernels/ .
# Build specific version of transformers
RUN python setup.py build
163

# Build FBGEMM CUDA kernels
FROM kernel-builder AS fbgemm-builder

WORKDIR /usr/src

COPY server/Makefile-fbgemm Makefile
COPY server/fbgemm_remove_unused.patch fbgemm_remove_unused.patch
COPY server/fix_torch90a.sh fix_torch90a.sh

RUN make build-fbgemm

# Build vllm CUDA kernels
FROM kernel-builder AS vllm-builder

WORKDIR /usr/src

# Target all supported GPU architectures explicitly.
ENV TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX"

COPY server/Makefile-vllm Makefile

# Build specific version of vllm
RUN make build-vllm-cuda

# Build mamba kernels
FROM kernel-builder AS mamba-builder

WORKDIR /usr/src
COPY server/Makefile-selective-scan Makefile
RUN make build-all

# Text Generation Inference base image
# Runtime image: CUDA base (no devel toolchain) + artifacts copied from the
# builder stages above.
FROM nvidia/cuda:12.1.0-base-ubuntu22.04 AS base

# Conda env
ENV PATH=/opt/conda/bin:$PATH \
    CONDA_PREFIX=/opt/conda

# Text Generation Inference base env
ENV HUGGINGFACE_HUB_CACHE=/data \
    HF_HUB_ENABLE_HF_TRANSFER=1 \
    PORT=80

WORKDIR /usr/src

# Runtime OS dependencies (libssl for the router, git/make for server builds).
RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
        libssl-dev \
        ca-certificates \
        make \
        curl \
        git \
        && rm -rf /var/lib/apt/lists/*

# Copy conda with PyTorch installed
COPY --from=pytorch-install /opt/conda /opt/conda

# Copy build artifacts from flash attention builder
COPY --from=flash-att-builder /usr/src/flash-attention/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
COPY --from=flash-att-builder /usr/src/flash-attention/csrc/layer_norm/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
COPY --from=flash-att-builder /usr/src/flash-attention/csrc/rotary/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages

# Copy build artifacts from flash attention v2 builder
COPY --from=flash-att-v2-builder /opt/conda/lib/python3.10/site-packages/flash_attn_2_cuda.cpython-310-x86_64-linux-gnu.so /opt/conda/lib/python3.10/site-packages

# Copy build artifacts from custom kernels builder
COPY --from=custom-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
# Copy build artifacts from exllama kernels builder
COPY --from=exllama-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
# Copy build artifacts from exllamav2 kernels builder
COPY --from=exllamav2-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
# Copy build artifacts from awq kernels builder
COPY --from=awq-kernels-builder /usr/src/llm-awq/awq/kernels/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
# Copy build artifacts from eetq kernels builder
COPY --from=eetq-kernels-builder /usr/src/eetq/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
# Copy build artifacts from marlin kernels builder
COPY --from=marlin-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
# Copy build artifacts from lorax punica kernels builder
COPY --from=lorax-punica-builder /usr/src/lorax-punica/server/punica_kernels/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
# Copy build artifacts from fbgemm builder
COPY --from=fbgemm-builder /usr/src/fbgemm/fbgemm_gpu/_skbuild/linux-x86_64-3.10/cmake-install /opt/conda/lib/python3.10/site-packages
# Copy build artifacts from vllm builder
COPY --from=vllm-builder /usr/src/vllm/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
# Copy build artifacts from mamba builder
COPY --from=mamba-builder /usr/src/mamba/build/lib.linux-x86_64-cpython-310/ /opt/conda/lib/python3.10/site-packages
COPY --from=mamba-builder /usr/src/causal-conv1d/build/lib.linux-x86_64-cpython-310/ /opt/conda/lib/python3.10/site-packages

# Install flash-attention dependencies
RUN pip install einops --no-cache-dir

# Install server
COPY proto proto
COPY server server
COPY server/Makefile server/Makefile
# `--no-cache-dir` keeps pip's download cache out of the image layers.
RUN cd server && \
    make gen-server && \
    pip install -r requirements_cuda.txt --no-cache-dir && \
    pip install ".[bnb, accelerate, quantize, peft, outlines]" --no-cache-dir && \
    pip install nvidia-nccl-cu12==2.22.3 --no-cache-dir

# Preload the pip-installed NCCL ahead of the conda/pytorch one.
ENV LD_PRELOAD=/opt/conda/lib/python3.10/site-packages/nvidia/nccl/lib/libnccl.so.2

# Deps before the binaries
# The binaries change on every build given we burn the SHA into them
# The deps change less often.
RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
        build-essential \
        g++ \
        && rm -rf /var/lib/apt/lists/*

# Install benchmarker
COPY --from=builder /usr/src/target/release-opt/text-generation-benchmark /usr/local/bin/text-generation-benchmark
# Install router
COPY --from=builder /usr/src/target/release-opt/text-generation-router /usr/local/bin/text-generation-router
# Install launcher
COPY --from=builder /usr/src/target/release-opt/text-generation-launcher /usr/local/bin/text-generation-launcher


# AWS Sagemaker compatible image
FROM base AS sagemaker

COPY sagemaker-entrypoint.sh entrypoint.sh
RUN chmod +x entrypoint.sh

ENTRYPOINT ["./entrypoint.sh"]

# Final image
FROM base

COPY ./tgi-entrypoint.sh /tgi-entrypoint.sh
RUN chmod +x /tgi-entrypoint.sh

ENTRYPOINT ["/tgi-entrypoint.sh"]
# CMD ["--json-output"]