Dockerfile 9.33 KB
Newer Older
1
# Rust builder
xuxzh1's avatar
last  
xuxzh1 committed
2
FROM lukemathwalker/cargo-chef:latest-rust-1.79 AS chef
3
4
WORKDIR /usr/src

5
6
ARG CARGO_REGISTRIES_CRATES_IO_PROTOCOL=sparse

xuxzh1's avatar
last  
xuxzh1 committed
7
8
FROM chef AS planner
COPY Cargo.lock Cargo.lock
9
10
11
COPY Cargo.toml Cargo.toml
COPY rust-toolchain.toml rust-toolchain.toml
COPY proto proto
12
COPY benchmark benchmark
13
14
15
16
17
COPY router router
COPY launcher launcher
# Write the cargo-chef recipe to recipe.json; the builder stage copies this
# file from the planner stage and passes it to `cargo chef cook`.
RUN cargo chef prepare --recipe-path recipe.json

FROM chef AS builder
18
19
20
21
22
23

# Install protoc 21.12: only bin/protoc and include/* are extracted into
# /usr/local, and the archive is removed in the same layer.
# `-f` makes curl fail on an HTTP error instead of saving the error page as
# the zip file.
RUN PROTOC_ZIP=protoc-21.12-linux-x86_64.zip && \
    curl -fOL "https://github.com/protocolbuffers/protobuf/releases/download/v21.12/${PROTOC_ZIP}" && \
    unzip -o "${PROTOC_ZIP}" -d /usr/local bin/protoc && \
    unzip -o "${PROTOC_ZIP}" -d /usr/local 'include/*' && \
    rm -f "${PROTOC_ZIP}"
Olivier Dehaene's avatar
Olivier Dehaene committed
24

25
COPY --from=planner /usr/src/recipe.json recipe.json
xuxzh1's avatar
last  
xuxzh1 committed
26
27
28
29
# Cook the recipe with the release-opt profile, the same profile the later
# `cargo build` in this stage uses.
RUN cargo chef cook --profile release-opt --recipe-path recipe.json

ARG GIT_SHA
ARG DOCKER_LABEL
Olivier Dehaene's avatar
Olivier Dehaene committed
30

31
COPY Cargo.toml Cargo.toml
32
COPY rust-toolchain.toml rust-toolchain.toml
Olivier Dehaene's avatar
Olivier Dehaene committed
33
COPY proto proto
34
COPY benchmark benchmark
Olivier Dehaene's avatar
Olivier Dehaene committed
35
COPY router router
Olivier Dehaene's avatar
v0.1.0  
Olivier Dehaene committed
36
COPY launcher launcher
xuxzh1's avatar
last  
xuxzh1 committed
37
RUN cargo build --profile release-opt
Olivier Dehaene's avatar
v0.1.0  
Olivier Dehaene committed
38

39
40
# Python builder
# Adapted from: https://github.com/pytorch/pytorch/blob/master/Dockerfile
xuxzh1's avatar
last  
xuxzh1 committed
41
FROM nvidia/cuda:12.1.0-devel-ubuntu22.04 AS pytorch-install
42

OlivierDehaene's avatar
OlivierDehaene committed
43
ARG PYTORCH_VERSION=2.3.0
44
ARG PYTHON_VERSION=3.10
45
# Keep in sync with `server/pyproject.toml`
46
ARG CUDA_VERSION=12.1
xuxzh1's avatar
last  
xuxzh1 committed
47
ARG MAMBA_VERSION=24.3.0-0
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
ARG CUDA_CHANNEL=nvidia
ARG INSTALL_CHANNEL=pytorch
# Automatically set by buildx
ARG TARGETPLATFORM

# Put /opt/conda/bin (the prefix conda is installed into below) first on PATH.
# Uses the key=value ENV form; the space-separated form is deprecated.
ENV PATH=/opt/conda/bin:$PATH

# OS packages for the PyTorch install stage; the apt lists are removed in the
# same layer that created them.
RUN apt-get update \
    && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
        build-essential \
        ca-certificates \
        ccache \
        curl \
        git \
    && rm -rf /var/lib/apt/lists/*

# Install conda
# translating Docker's TARGETPLATFORM into mamba arches
# Download, run and delete the installer in one RUN so the installer script is
# not persisted in an image layer. The script is executed through `bash`, so
# no executable bit is needed, and `-o` alone names the output file.
RUN case ${TARGETPLATFORM} in \
         "linux/arm64")  MAMBA_ARCH=aarch64  ;; \
         *)              MAMBA_ARCH=x86_64   ;; \
    esac && \
    curl -fsSL -v -o ~/mambaforge.sh "https://github.com/conda-forge/miniforge/releases/download/${MAMBA_VERSION}/Mambaforge-${MAMBA_VERSION}-Linux-${MAMBA_ARCH}.sh" && \
    bash ~/mambaforge.sh -b -p /opt/conda && \
    rm ~/mambaforge.sh

# Install pytorch
# On arm64 we exit with an error code
# `cut -d'.' -f 1-2` reduces CUDA_VERSION to major.minor for the pytorch-cuda
# package spec. `conda clean` runs in the same RUN as the install.
RUN case ${TARGETPLATFORM} in \
         "linux/arm64")  exit 1 ;; \
         *)              /opt/conda/bin/conda update -y conda &&  \
                         /opt/conda/bin/conda install -c "${INSTALL_CHANNEL}" -c "${CUDA_CHANNEL}" -y "python=${PYTHON_VERSION}" "pytorch=$PYTORCH_VERSION" "pytorch-cuda=$(echo $CUDA_VERSION | cut -d'.' -f 1-2)"  ;; \
    esac && \
    /opt/conda/bin/conda clean -ya

# CUDA kernels builder image
xuxzh1's avatar
last  
xuxzh1 committed
84
FROM pytorch-install AS kernel-builder
85

86
87
ARG MAX_JOBS=8

88
# Build tools shared by every kernel builder stage derived from this one;
# the apt lists are removed in the same layer.
RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
        cmake \
        ninja-build \
        && rm -rf /var/lib/apt/lists/*

# Build Flash Attention CUDA kernels
xuxzh1's avatar
last  
xuxzh1 committed
93
FROM kernel-builder AS flash-att-builder
94
95
96
97
98
99
100
101

WORKDIR /usr/src

COPY server/Makefile-flash-att Makefile

# Build specific version of flash attention
RUN make build-flash-attention

102
# Build Flash Attention v2 CUDA kernels
xuxzh1's avatar
last  
xuxzh1 committed
103
FROM kernel-builder AS flash-att-v2-builder
104
105
106
107
108
109

WORKDIR /usr/src

COPY server/Makefile-flash-att-v2 Makefile

# Build specific version of flash attention v2
fxmarty's avatar
fxmarty committed
110
RUN make build-flash-attention-v2-cuda
111

112
# Build Transformers exllama kernels
xuxzh1's avatar
last  
xuxzh1 committed
113
FROM kernel-builder AS exllama-kernels-builder
114
115
WORKDIR /usr/src
COPY server/exllama_kernels/ .
Nicolas Patry's avatar
Nicolas Patry committed
116
117
118
119

RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" python setup.py build

# Build Transformers exllamav2 kernels
xuxzh1's avatar
last  
xuxzh1 committed
120
FROM kernel-builder AS exllamav2-kernels-builder
Nicolas Patry's avatar
Nicolas Patry committed
121
122
123
WORKDIR /usr/src
COPY server/exllamav2_kernels/ .

124
125
126
# Build the exllamav2 kernels
RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" python setup.py build

127
# Build Transformers awq kernels
xuxzh1's avatar
last  
xuxzh1 committed
128
FROM kernel-builder AS awq-kernels-builder
129
130
131
132
133
WORKDIR /usr/src
COPY server/Makefile-awq Makefile
# Build the AWQ kernels via the Makefile target
RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" make build-awq

134
# Build eetq kernels
xuxzh1's avatar
last  
xuxzh1 committed
135
FROM kernel-builder AS eetq-kernels-builder
136
137
138
139
140
WORKDIR /usr/src
COPY server/Makefile-eetq Makefile
# Build the EETQ kernels via the Makefile target
RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" make build-eetq

xuxzh1's avatar
last  
xuxzh1 committed
141
142
143
144
145
146
147
148
149
150
151
152
153
154
# Build marlin kernels
FROM kernel-builder AS marlin-kernels-builder
WORKDIR /usr/src
COPY server/marlin/ .
# Build the marlin kernels
RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" python setup.py build

# Build Lorax Punica kernels
FROM kernel-builder AS lorax-punica-builder
WORKDIR /usr/src
COPY server/Makefile-lorax-punica Makefile
# Build the Lorax Punica kernels via the Makefile target
RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" make build-lorax-punica

155
# Build Transformers CUDA kernels
xuxzh1's avatar
last  
xuxzh1 committed
156
FROM kernel-builder AS custom-kernels-builder
157
WORKDIR /usr/src
158
COPY server/custom_kernels/ .
159
# Build the custom kernels
160
RUN python setup.py build
161

162
# Build vllm CUDA kernels
xuxzh1's avatar
last  
xuxzh1 committed
163
FROM kernel-builder AS vllm-builder
164
165
166

WORKDIR /usr/src

OlivierDehaene's avatar
OlivierDehaene committed
167
168
ENV TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX"

169
170
171
COPY server/Makefile-vllm Makefile

# Build specific version of vllm
fxmarty's avatar
fxmarty committed
172
RUN make build-vllm-cuda
173

drbh's avatar
drbh committed
174
# Build mamba kernels
xuxzh1's avatar
last  
xuxzh1 committed
175
FROM kernel-builder AS mamba-builder
drbh's avatar
drbh committed
176
177
178
179
WORKDIR /usr/src
COPY server/Makefile-selective-scan Makefile
RUN make build-all

180
# Text Generation Inference base image
xuxzh1's avatar
last  
xuxzh1 committed
181
FROM nvidia/cuda:12.1.0-base-ubuntu22.04 AS base
182
183
184
185
186
187
188

# Conda env
ENV PATH=/opt/conda/bin:$PATH \
    CONDA_PREFIX=/opt/conda

# Text Generation Inference base env
# Set with ENV, so these values persist into the running container.
ENV HUGGINGFACE_HUB_CACHE=/data \
    HF_HUB_ENABLE_HF_TRANSFER=1 \
    PORT=80
Olivier Dehaene's avatar
Olivier Dehaene committed
191

192
WORKDIR /usr/src
193

194
195
196
197
# OS packages for the base image, one per line in alphabetical order;
# the apt lists are removed in the same layer.
RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
        ca-certificates \
        curl \
        git \
        libssl-dev \
        make \
        && rm -rf /var/lib/apt/lists/*
Olivier Dehaene's avatar
Olivier Dehaene committed
201

202
203
# Copy conda with PyTorch installed
COPY --from=pytorch-install /opt/conda /opt/conda
Olivier Dehaene's avatar
Olivier Dehaene committed
204

205
# Copy build artifacts from flash attention builder
206
207
208
# The source and destination paths hard-code the x86_64 / CPython 3.10 build
# directory names, so they must stay in line with PYTHON_VERSION (3.10) in the
# pytorch-install stage. Three build trees are copied: the main package, the
# layer_norm extension and the rotary extension.
COPY --from=flash-att-builder /usr/src/flash-attention/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
COPY --from=flash-att-builder /usr/src/flash-attention/csrc/layer_norm/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
COPY --from=flash-att-builder /usr/src/flash-attention/csrc/rotary/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
209

210
# Copy build artifacts from flash attention v2 builder
xuxzh1's avatar
last  
xuxzh1 committed
211
COPY --from=flash-att-v2-builder /opt/conda/lib/python3.10/site-packages/flash_attn_2_cuda.cpython-310-x86_64-linux-gnu.so /opt/conda/lib/python3.10/site-packages
212

213
# Copy build artifacts from custom kernels builder
214
COPY --from=custom-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
215
# Copy build artifacts from exllama kernels builder
216
COPY --from=exllama-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
Nicolas Patry's avatar
Nicolas Patry committed
217
218
# Copy build artifacts from exllamav2 kernels builder
COPY --from=exllamav2-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
219
# Copy build artifacts from awq kernels builder
220
COPY --from=awq-kernels-builder /usr/src/llm-awq/awq/kernels/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
221
# Copy build artifacts from eetq kernels builder
222
COPY --from=eetq-kernels-builder /usr/src/eetq/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
xuxzh1's avatar
last  
xuxzh1 committed
223
224
225
# Copy build artifacts from marlin kernels builder
COPY --from=marlin-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
COPY --from=lorax-punica-builder /usr/src/lorax-punica/server/punica_kernels/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
226

227
# Copy build artifacts from vllm builder
228
COPY --from=vllm-builder /usr/src/vllm/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
229

drbh's avatar
drbh committed
230
231
232
233
# Copy build artifacts from mamba builder
COPY --from=mamba-builder /usr/src/mamba/build/lib.linux-x86_64-cpython-310/ /opt/conda/lib/python3.10/site-packages
COPY --from=mamba-builder /usr/src/causal-conv1d/build/lib.linux-x86_64-cpython-310/ /opt/conda/lib/python3.10/site-packages

OlivierDehaene's avatar
OlivierDehaene committed
234
# Install flash-attention dependencies
235
RUN pip install einops --no-cache-dir
236

Olivier Dehaene's avatar
Olivier Dehaene committed
237
# Install server
Nicolas Patry's avatar
Nicolas Patry committed
238
COPY proto proto
Olivier Dehaene's avatar
Olivier Dehaene committed
239
COPY server server
240
COPY server/Makefile server/Makefile
Olivier Dehaene's avatar
Olivier Dehaene committed
241
# Run `make gen-server` in server/ (presumably generates server code from the
# copied proto/ directory; confirm against server/Makefile), then install
# requirements_cuda.txt and the server package with its extras.
# Both pip invocations pass --no-cache-dir so pip's download cache is not
# stored in the layer.
RUN cd server && \
    make gen-server && \
    pip install -r requirements_cuda.txt --no-cache-dir && \
    pip install ".[bnb, accelerate, quantize, peft, outlines]" --no-cache-dir
Olivier Dehaene's avatar
Olivier Dehaene committed
245

xuxzh1's avatar
last  
xuxzh1 committed
246
247
248
# Deps before the binaries
# The binaries change on every build given we burn the SHA into them
# The deps change less often.
249
250
251
252
253
# Install the C/C++ toolchain (build-essential, g++) into the base image;
# the apt lists are removed in the same layer.
RUN apt-get update \
    && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
        build-essential \
        g++ \
    && rm -rf /var/lib/apt/lists/*

xuxzh1's avatar
last  
xuxzh1 committed
254
255
256
257
258
259
260
261
# The three binaries below come from the builder stage's release-opt target
# directory (the builder runs `cargo build --profile release-opt`).
# Install benchmarker
COPY --from=builder /usr/src/target/release-opt/text-generation-benchmark /usr/local/bin/text-generation-benchmark
# Install router
COPY --from=builder /usr/src/target/release-opt/text-generation-router /usr/local/bin/text-generation-router
# Install launcher
COPY --from=builder /usr/src/target/release-opt/text-generation-launcher /usr/local/bin/text-generation-launcher


fxmarty's avatar
fxmarty committed
262
# AWS Sagemaker compatible image
xuxzh1's avatar
last  
xuxzh1 committed
263
FROM base AS sagemaker
264
265
266
267
268
269

# Copy the SageMaker entrypoint into the working directory already marked
# executable, instead of adding a separate `RUN chmod` layer.
# `--chmod` needs BuildKit, which this file already assumes (TARGETPLATFORM).
COPY --chmod=755 sagemaker-entrypoint.sh entrypoint.sh

ENTRYPOINT ["./entrypoint.sh"]

270
# Final image
271
272
FROM base

oOraph's avatar
oOraph committed
273
# Copy the TGI entrypoint to / already marked executable, instead of adding a
# separate `RUN chmod` layer.
# `--chmod` needs BuildKit, which this file already assumes (TARGETPLATFORM).
COPY --chmod=755 ./tgi-entrypoint.sh /tgi-entrypoint.sh
oOraph's avatar
oOraph committed
275
276

ENTRYPOINT ["/tgi-entrypoint.sh"]
xuxzh1's avatar
last  
xuxzh1 committed
277
# CMD ["--json-output"]