Dockerfile 8.93 KB
Newer Older
1
# Rust builder
# Common cargo-chef base stage; later stages in this file start from it via `FROM chef`.
FROM lukemathwalker/cargo-chef:latest-rust-1.78 AS chef

WORKDIR /usr/src

# Registry protocol for crates.io; defaults to sparse and can be overridden with --build-arg.
ARG CARGO_REGISTRIES_CRATES_IO_PROTOCOL=sparse

7
8
9
10
# Planner stage: copies the workspace manifests and crates, then runs
# `cargo chef prepare` to write recipe.json.
# `AS` is uppercase to match the casing of `FROM` (BuildKit FromAsCasing check).
FROM chef AS planner
COPY Cargo.toml Cargo.toml
COPY rust-toolchain.toml rust-toolchain.toml
COPY proto proto
COPY benchmark benchmark
COPY router router
COPY launcher launcher
RUN cargo chef prepare --recipe-path recipe.json

FROM chef AS builder

# Install protoc 21.12 (binary plus its include files) under /usr/local.
# --fail makes curl exit non-zero on an HTTP error instead of saving the
# error page as the archive; the archive name is quoted wherever it is used.
RUN PROTOC_ZIP=protoc-21.12-linux-x86_64.zip && \
    curl --fail -OL "https://github.com/protocolbuffers/protobuf/releases/download/v21.12/$PROTOC_ZIP" && \
    unzip -o "$PROTOC_ZIP" -d /usr/local bin/protoc && \
    unzip -o "$PROTOC_ZIP" -d /usr/local 'include/*' && \
    rm -f "$PROTOC_ZIP"

# Cook the planner's recipe with the release-opt profile before any workspace
# sources are copied into this stage.
COPY --from=planner /usr/src/recipe.json recipe.json
RUN cargo chef cook --profile release-opt --recipe-path recipe.json
Olivier Dehaene's avatar
Olivier Dehaene committed
26

Nicolas Patry's avatar
Nicolas Patry committed
27
28
29
# Build arguments without defaults; values come from --build-arg.
ARG GIT_SHA
ARG DOCKER_LABEL

# Workspace manifests (both land in the current WORKDIR).
COPY Cargo.toml rust-toolchain.toml ./
# Workspace crates and protobuf definitions.
COPY proto proto
COPY benchmark benchmark
COPY router router
COPY launcher launcher
# Compile the workspace with the release-opt profile.
RUN cargo build --profile release-opt
Olivier Dehaene's avatar
v0.1.0  
Olivier Dehaene committed
37

38
39
# Python builder
# Adapted from: https://github.com/pytorch/pytorch/blob/master/Dockerfile
# `AS` is uppercase to match the casing of `FROM` (BuildKit FromAsCasing check).
FROM nvidia/cuda:12.1.0-devel-ubuntu22.04 AS pytorch-install

ARG PYTORCH_VERSION=2.3.0
ARG PYTHON_VERSION=3.10
# Keep in sync with `server/pyproject.toml`
ARG CUDA_VERSION=12.1
ARG MAMBA_VERSION=24.3.0-0
ARG CUDA_CHANNEL=nvidia
ARG INSTALL_CHANNEL=pytorch
# Automatically set by buildx
ARG TARGETPLATFORM

# key=value form: the space-separated `ENV key value` syntax is deprecated.
ENV PATH=/opt/conda/bin:$PATH

# OS packages for this stage; the apt lists are removed in the same layer.
RUN apt-get update \
    && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
        build-essential \
        ca-certificates \
        ccache \
        curl \
        git \
    && rm -rf /var/lib/apt/lists/*

# Install conda
# Translate Docker's TARGETPLATFORM into the architecture suffix of the
# Mambaforge installer name (aarch64 for linux/arm64, x86_64 otherwise).
# Download, run and delete the installer in a single RUN so the script is
# never stored in an image layer. Only `-o` is passed: there is one URL, so
# an additional `-O` would be a second, unused output option.
RUN case "${TARGETPLATFORM}" in \
         "linux/arm64")  MAMBA_ARCH=aarch64  ;; \
         *)              MAMBA_ARCH=x86_64   ;; \
    esac && \
    curl -fsSL -v -o ~/mambaforge.sh "https://github.com/conda-forge/miniforge/releases/download/${MAMBA_VERSION}/Mambaforge-${MAMBA_VERSION}-Linux-${MAMBA_ARCH}.sh" && \
    chmod +x ~/mambaforge.sh && \
    bash ~/mambaforge.sh -b -p /opt/conda && \
    rm ~/mambaforge.sh

# Install pytorch
# linux/arm64 aborts the build with exit code 1; every other platform updates
# conda and installs python, pytorch and pytorch-cuda from the configured
# channels. The conda caches are cleaned afterwards in the same layer.
RUN case "${TARGETPLATFORM}" in \
        "linux/arm64") \
            exit 1 ;; \
        *) \
            /opt/conda/bin/conda update -y conda && \
            /opt/conda/bin/conda install -c "${INSTALL_CHANNEL}" -c "${CUDA_CHANNEL}" -y "python=${PYTHON_VERSION}" "pytorch=$PYTORCH_VERSION" "pytorch-cuda=$(echo $CUDA_VERSION | cut -d'.' -f 1-2)" ;; \
    esac && \
    /opt/conda/bin/conda clean -ya

# CUDA kernels builder image
# `AS` is uppercase to match the casing of `FROM` (BuildKit FromAsCasing check).
FROM pytorch-install AS kernel-builder

ARG MAX_JOBS=8

# Build tools added on top of pytorch-install; apt lists are removed in the same layer.
RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
        ninja-build cmake \
        && rm -rf /var/lib/apt/lists/*

# Build Flash Attention CUDA kernels
# `AS` is uppercase to match the casing of `FROM` (BuildKit FromAsCasing check).
FROM kernel-builder AS flash-att-builder

WORKDIR /usr/src

# The stage-specific makefile is copied in under the default name `Makefile`.
COPY server/Makefile-flash-att Makefile

# Build specific version of flash attention
RUN make build-flash-attention

101
102
103
104
105
106
107
108
# Build Flash Attention v2 CUDA kernels
# `AS` is uppercase to match the casing of `FROM` (BuildKit FromAsCasing check).
FROM kernel-builder AS flash-att-v2-builder

WORKDIR /usr/src

# The stage-specific makefile is copied in under the default name `Makefile`.
COPY server/Makefile-flash-att-v2 Makefile

# Build specific version of flash attention v2
RUN make build-flash-attention-v2-cuda
110

111
112
113
114
# Build Transformers exllama kernels
# `AS` is uppercase to match the casing of `FROM` (BuildKit FromAsCasing check).
FROM kernel-builder AS exllama-kernels-builder
WORKDIR /usr/src
COPY server/exllama_kernels/ .

# Run the kernel package's setup.py build with TORCH_CUDA_ARCH_LIST set for this command only.
RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" python setup.py build

# Build Transformers exllamav2 kernels
# `AS` is uppercase to match the casing of `FROM` (BuildKit FromAsCasing check).
FROM kernel-builder AS exllamav2-kernels-builder
WORKDIR /usr/src
COPY server/exllamav2_kernels/ .

# Build the exllamav2 kernels with TORCH_CUDA_ARCH_LIST set for this command only.
RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" python setup.py build

126
127
128
129
130
131
132
# Build Transformers awq kernels
# `AS` is uppercase to match the casing of `FROM` (BuildKit FromAsCasing check).
FROM kernel-builder AS awq-kernels-builder
WORKDIR /usr/src
COPY server/Makefile-awq Makefile
# Build the awq kernels through the makefile's build-awq target.
RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" make build-awq

133
134
135
136
137
138
139
# Build eetq kernels
# `AS` is uppercase to match the casing of `FROM` (BuildKit FromAsCasing check).
FROM kernel-builder AS eetq-kernels-builder
WORKDIR /usr/src
COPY server/Makefile-eetq Makefile
# Build the eetq kernels through the makefile's build-eetq target.
RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" make build-eetq

140
141
142
143
144
145
146
# Build marlin kernels
# `AS` is uppercase to match the casing of `FROM` (BuildKit FromAsCasing check).
FROM kernel-builder AS marlin-kernels-builder
WORKDIR /usr/src
COPY server/Makefile-marlin Makefile
# Build the marlin kernels through the makefile's build-marlin target.
RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" make build-marlin

147
# Build Transformers CUDA kernels
# `AS` is uppercase to match the casing of `FROM` (BuildKit FromAsCasing check).
FROM kernel-builder AS custom-kernels-builder
WORKDIR /usr/src
COPY server/custom_kernels/ .
# Build the custom kernels with the copied setup.py.
RUN python setup.py build
153

154
155
156
157
158
# Build vllm CUDA kernels
# `AS` is uppercase to match the casing of `FROM` (BuildKit FromAsCasing check).
FROM kernel-builder AS vllm-builder

WORKDIR /usr/src

# Set as ENV, so it applies to every later instruction in this stage.
ENV TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX"

COPY server/Makefile-vllm Makefile

# Build specific version of vllm
RUN make build-vllm-cuda
165

drbh's avatar
drbh committed
166
167
168
169
170
171
# Build mamba kernels
# `AS` is uppercase to match the casing of `FROM` (BuildKit FromAsCasing check).
FROM kernel-builder AS mamba-builder
WORKDIR /usr/src
# The selective-scan makefile is copied in under the default name `Makefile`.
COPY server/Makefile-selective-scan Makefile
RUN make build-all

172
# Text Generation Inference base image
# `AS` is uppercase to match the casing of `FROM` (BuildKit FromAsCasing check).
FROM nvidia/cuda:12.1.0-base-ubuntu22.04 AS base

# Conda env
ENV PATH=/opt/conda/bin:$PATH \
    CONDA_PREFIX=/opt/conda

# Text Generation Inference base env
ENV HUGGINGFACE_HUB_CACHE=/data \
    HF_HUB_ENABLE_HF_TRANSFER=1 \
    PORT=80

WORKDIR /usr/src

# OS packages for the runtime image; apt lists are removed in the same layer.
RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
        libssl-dev \
        ca-certificates \
        make \
        curl \
        git \
        && rm -rf /var/lib/apt/lists/*
Olivier Dehaene's avatar
Olivier Dehaene committed
193

194
195
# Conda prefix (with PyTorch) from the pytorch-install stage
COPY --from=pytorch-install /opt/conda /opt/conda

# Everything below is copied from the kernel builder stages into conda's
# python3.10 site-packages directory.

# flash attention: main build output plus the layer_norm and rotary extensions
COPY --from=flash-att-builder /usr/src/flash-attention/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
COPY --from=flash-att-builder /usr/src/flash-attention/csrc/layer_norm/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
COPY --from=flash-att-builder /usr/src/flash-attention/csrc/rotary/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages

# flash attention v2: a single shared object taken from the builder's site-packages
COPY --from=flash-att-v2-builder /opt/conda/lib/python3.10/site-packages/flash_attn_2_cuda.cpython-310-x86_64-linux-gnu.so /opt/conda/lib/python3.10/site-packages

# custom kernels
COPY --from=custom-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
# exllama kernels
COPY --from=exllama-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
# exllamav2 kernels
COPY --from=exllamav2-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
# awq kernels
COPY --from=awq-kernels-builder /usr/src/llm-awq/awq/kernels/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
# eetq kernels
COPY --from=eetq-kernels-builder /usr/src/eetq/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
# marlin kernels
COPY --from=marlin-kernels-builder /usr/src/marlin/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages

# Copy build artifacts from vllm builder
COPY --from=vllm-builder /usr/src/vllm/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages

# mamba builder: mamba and causal-conv1d build outputs
COPY --from=mamba-builder /usr/src/mamba/build/lib.linux-x86_64-cpython-310/ /opt/conda/lib/python3.10/site-packages
COPY --from=mamba-builder /usr/src/causal-conv1d/build/lib.linux-x86_64-cpython-310/ /opt/conda/lib/python3.10/site-packages

OlivierDehaene's avatar
OlivierDehaene committed
225
# Install flash-attention dependencies
RUN pip install einops --no-cache-dir

# Install server
COPY proto proto
COPY server server
# NOTE(review): server/Makefile looks already covered by `COPY server server` above — confirm before removing.
COPY server/Makefile server/Makefile
# Generate the server code, then install the pinned requirements and the
# server package with its extras. --no-cache-dir is passed to every pip
# install so pip's download cache is not written into this layer.
RUN cd server && \
    make gen-server && \
    pip install -r requirements_cuda.txt --no-cache-dir && \
    pip install ".[bnb, accelerate, quantize, peft, outlines]" --no-cache-dir
Olivier Dehaene's avatar
Olivier Dehaene committed
236

Nicolas Patry's avatar
Nicolas Patry committed
237
238
239
240
241
242
243
244
# Install these dependencies before copying the binaries:
# the binaries change on every build because the SHA is burned into them,
# whereas the dependencies change less often.
RUN apt-get update \
    && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
        build-essential \
        g++ \
    && rm -rf /var/lib/apt/lists/*

245
# Binaries built with the release-opt profile in the builder stage, installed into /usr/local/bin.
# Benchmarker
COPY --from=builder /usr/src/target/release-opt/text-generation-benchmark /usr/local/bin/text-generation-benchmark
# Router
COPY --from=builder /usr/src/target/release-opt/text-generation-router /usr/local/bin/text-generation-router
# Launcher
COPY --from=builder /usr/src/target/release-opt/text-generation-launcher /usr/local/bin/text-generation-launcher
Olivier Dehaene's avatar
Olivier Dehaene committed
251

252

fxmarty's avatar
fxmarty committed
253
# AWS Sagemaker compatible image
# `AS` is uppercase to match the casing of `FROM` (BuildKit FromAsCasing check).
FROM base AS sagemaker

# The entrypoint script is copied into the current WORKDIR and made executable.
COPY sagemaker-entrypoint.sh entrypoint.sh
RUN chmod +x entrypoint.sh

ENTRYPOINT ["./entrypoint.sh"]

261
# Final image
FROM base

# Entrypoint script, installed at the filesystem root and made executable.
COPY ./tgi-entrypoint.sh /tgi-entrypoint.sh
RUN chmod +x /tgi-entrypoint.sh

# Exec-form entrypoint; CMD supplies the default argument passed to it.
ENTRYPOINT ["/tgi-entrypoint.sh"]
CMD ["--json-output"]