Dockerfile 8.94 KB
Newer Older
1
# Rust builder
2
FROM lukemathwalker/cargo-chef:latest-rust-1.79 AS chef
3
4
WORKDIR /usr/src

5
6
ARG CARGO_REGISTRIES_CRATES_IO_PROTOCOL=sparse

7
# Planner stage: runs `cargo chef prepare` to produce recipe.json from the workspace
FROM chef AS planner
ur4t's avatar
ur4t committed
8
COPY Cargo.lock Cargo.lock
9
10
11
COPY Cargo.toml Cargo.toml
COPY rust-toolchain.toml rust-toolchain.toml
COPY proto proto
12
COPY benchmark benchmark
13
14
15
16
17
COPY router router
COPY launcher launcher
RUN cargo chef prepare --recipe-path recipe.json

FROM chef AS builder
18
19
20
21
22
23

# Install the protobuf compiler (bin/protoc) and its bundled include files into /usr/local.
# The version is defined once and reused for both the archive name and the release URL.
# -f makes curl fail on an HTTP error instead of saving the error page as the archive.
RUN PROTOC_VERSION=21.12 && \
    PROTOC_ZIP="protoc-${PROTOC_VERSION}-linux-x86_64.zip" && \
    curl -fOL "https://github.com/protocolbuffers/protobuf/releases/download/v${PROTOC_VERSION}/${PROTOC_ZIP}" && \
    unzip -o "${PROTOC_ZIP}" -d /usr/local bin/protoc && \
    unzip -o "${PROTOC_ZIP}" -d /usr/local 'include/*' && \
    rm -f "${PROTOC_ZIP}"
Olivier Dehaene's avatar
Olivier Dehaene committed
24

25
COPY --from=planner /usr/src/recipe.json recipe.json
26
RUN cargo chef cook --profile release-opt --recipe-path recipe.json
Olivier Dehaene's avatar
Olivier Dehaene committed
27

Nicolas Patry's avatar
Nicolas Patry committed
28
29
30
ARG GIT_SHA
ARG DOCKER_LABEL

31
COPY Cargo.toml Cargo.toml
32
COPY rust-toolchain.toml rust-toolchain.toml
Olivier Dehaene's avatar
Olivier Dehaene committed
33
COPY proto proto
34
COPY benchmark benchmark
Olivier Dehaene's avatar
Olivier Dehaene committed
35
COPY router router
Olivier Dehaene's avatar
v0.1.0  
Olivier Dehaene committed
36
COPY launcher launcher
37
RUN cargo build --profile release-opt
Olivier Dehaene's avatar
v0.1.0  
Olivier Dehaene committed
38

39
40
# Python builder
# Adapted from: https://github.com/pytorch/pytorch/blob/master/Dockerfile
41
# Stage that installs conda and PyTorch under /opt/conda on top of the CUDA devel image
FROM nvidia/cuda:12.1.0-devel-ubuntu22.04 AS pytorch-install
42

OlivierDehaene's avatar
OlivierDehaene committed
43
ARG PYTORCH_VERSION=2.3.0
44
ARG PYTHON_VERSION=3.10
45
# Keep in sync with `server/pyproject.toml`
46
ARG CUDA_VERSION=12.1
drbh's avatar
drbh committed
47
ARG MAMBA_VERSION=24.3.0-0
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
# Conda channels passed to `conda install` when installing PyTorch and pytorch-cuda
ARG CUDA_CHANNEL=nvidia
ARG INSTALL_CHANNEL=pytorch
# Automatically set by buildx
ARG TARGETPLATFORM

# Put the conda installation first on PATH.
# key=value form is used because the space-separated `ENV key value` form is deprecated.
ENV PATH=/opt/conda/bin:$PATH

# Base tooling for this stage: compiler toolchain, CA certificates, ccache, curl and git.
# The apt package lists are removed in the same layer that created them.
RUN apt-get update \
    && DEBIAN_FRONTEND=noninteractive apt-get install --yes --no-install-recommends \
        build-essential \
        ca-certificates \
        ccache \
        curl \
        git \
    && rm -rf /var/lib/apt/lists/*

# Install conda (Mambaforge)
# Translate Docker's TARGETPLATFORM into the architecture name used by the
# Mambaforge release assets; anything other than linux/arm64 falls back to x86_64.
# Download, install and delete the installer within a single RUN so the
# installer script is never persisted in a layer of its own.
RUN case "${TARGETPLATFORM}" in \
         "linux/arm64")  MAMBA_ARCH=aarch64  ;; \
         *)              MAMBA_ARCH=x86_64   ;; \
    esac && \
    curl -fsSL -v -o ~/mambaforge.sh "https://github.com/conda-forge/miniforge/releases/download/${MAMBA_VERSION}/Mambaforge-${MAMBA_VERSION}-Linux-${MAMBA_ARCH}.sh" && \
    chmod +x ~/mambaforge.sh && \
    bash ~/mambaforge.sh -b -p /opt/conda && \
    rm ~/mambaforge.sh

# Install pytorch
# On arm64 the build is aborted with an explicit message and a non-zero exit code.
# On every other platform conda is updated, then python, pytorch and the matching
# pytorch-cuda package (major.minor of CUDA_VERSION) are installed from the
# configured channels. The conda package cache is cleaned in the same layer.
RUN case "${TARGETPLATFORM}" in \
         "linux/arm64")  echo "pytorch-install: linux/arm64 is not supported" >&2; exit 1 ;; \
         *)              /opt/conda/bin/conda update -y conda && \
                         /opt/conda/bin/conda install -c "${INSTALL_CHANNEL}" -c "${CUDA_CHANNEL}" -y "python=${PYTHON_VERSION}" "pytorch=${PYTORCH_VERSION}" "pytorch-cuda=$(echo "${CUDA_VERSION}" | cut -d'.' -f 1-2)"  ;; \
    esac && \
    /opt/conda/bin/conda clean -ya

# CUDA kernels builder image
# Common parent of the individual *-builder stages that compile the CUDA kernels
FROM pytorch-install AS kernel-builder

86
87
ARG MAX_JOBS=8

88
RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
89
        ninja-build cmake \
90
91
92
93
94
95
96
97
98
99
100
101
        && rm -rf /var/lib/apt/lists/*

# Build Flash Attention CUDA kernels
FROM kernel-builder AS flash-att-builder

WORKDIR /usr/src

# The flash-attention build recipe is copied in as the default Makefile
COPY server/Makefile-flash-att Makefile

# Build specific version of flash attention
RUN make build-flash-attention

102
103
104
105
106
107
108
109
# Build Flash Attention v2 CUDA kernels
FROM kernel-builder AS flash-att-v2-builder

WORKDIR /usr/src

# The flash-attention v2 build recipe is copied in as the default Makefile
COPY server/Makefile-flash-att-v2 Makefile

# Build specific version of flash attention v2
fxmarty's avatar
fxmarty committed
110
RUN make build-flash-attention-v2-cuda
111

112
113
114
115
# Build Transformers exllama kernels
FROM kernel-builder AS exllama-kernels-builder
WORKDIR /usr/src
# Copy the exllama kernel sources into the working directory
COPY server/exllama_kernels/ .
Nicolas Patry's avatar
Nicolas Patry committed
116
117
118
119
120
121
122
123

RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" python setup.py build

# Build Transformers exllamav2 kernels
FROM kernel-builder AS exllamav2-kernels-builder
WORKDIR /usr/src
# Copy the exllamav2 kernel sources into the working directory
COPY server/exllamav2_kernels/ .

124
125
126
# Build the exllamav2 kernels for CUDA compute capabilities 8.0 and 8.6 (the latter with PTX)
RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" python setup.py build

127
128
129
130
131
132
133
# Build Transformers awq kernels
FROM kernel-builder AS awq-kernels-builder
WORKDIR /usr/src
# The awq build recipe is copied in as the default Makefile
COPY server/Makefile-awq Makefile
# Build the awq kernels for CUDA compute capabilities 8.0 and 8.6 (the latter with PTX)
RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" make build-awq

134
135
136
137
138
139
140
# Build eetq kernels
FROM kernel-builder AS eetq-kernels-builder
WORKDIR /usr/src
# The eetq build recipe is copied in as the default Makefile
COPY server/Makefile-eetq Makefile
# Build the eetq kernels for CUDA compute capabilities 8.0 and 8.6 (the latter with PTX)
RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" make build-eetq

141
142
143
# Build marlin kernels
FROM kernel-builder AS marlin-kernels-builder
WORKDIR /usr/src
144
COPY server/marlin/ .
145
# Build the marlin kernels for CUDA compute capabilities 8.0 and 8.6 (the latter with PTX)
146
RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" python setup.py build
147

148
# Build Transformers CUDA kernels
149
# Stage that compiles the kernels shipped under server/custom_kernels
FROM kernel-builder AS custom-kernels-builder
150
WORKDIR /usr/src
151
COPY server/custom_kernels/ .
152
# Build the custom CUDA kernels
153
RUN python setup.py build
154

155
156
157
158
159
# Build vllm CUDA kernels
FROM kernel-builder AS vllm-builder

WORKDIR /usr/src

OlivierDehaene's avatar
OlivierDehaene committed
160
161
ENV TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX"

162
163
164
COPY server/Makefile-vllm Makefile

# Build specific version of vllm
fxmarty's avatar
fxmarty committed
165
RUN make build-vllm-cuda
166

drbh's avatar
drbh committed
167
168
169
170
171
172
# Build mamba kernels
FROM kernel-builder AS mamba-builder
WORKDIR /usr/src
# The selective-scan build recipe is copied in as the default Makefile
COPY server/Makefile-selective-scan Makefile
# Run the Makefile's build-all target
RUN make build-all

173
# Text Generation Inference base image
174
# Runtime base shared by the sagemaker stage and the final image
FROM nvidia/cuda:12.1.0-base-ubuntu22.04 AS base
175
176
177
178
179
180
181

# Conda env
ENV PATH=/opt/conda/bin:$PATH \
    CONDA_PREFIX=/opt/conda

# Text Generation Inference base env
ENV HUGGINGFACE_HUB_CACHE=/data \
182
    HF_HUB_ENABLE_HF_TRANSFER=1 \
183
    PORT=80
Olivier Dehaene's avatar
Olivier Dehaene committed
184

185
WORKDIR /usr/src
186

187
188
189
190
RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
        libssl-dev \
        ca-certificates \
        make \
191
        curl \
drbh's avatar
drbh committed
192
        git \
193
        && rm -rf /var/lib/apt/lists/*
Olivier Dehaene's avatar
Olivier Dehaene committed
194

195
196
# Copy conda with PyTorch installed
COPY --from=pytorch-install /opt/conda /opt/conda
Olivier Dehaene's avatar
Olivier Dehaene committed
197

198
# Copy build artifacts from flash attention builder
199
200
201
COPY --from=flash-att-builder /usr/src/flash-attention/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
COPY --from=flash-att-builder /usr/src/flash-attention/csrc/layer_norm/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
COPY --from=flash-att-builder /usr/src/flash-attention/csrc/rotary/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
202

203
# Copy build artifacts from flash attention v2 builder
204
COPY --from=flash-att-v2-builder /opt/conda/lib/python3.10/site-packages/flash_attn_2_cuda.cpython-310-x86_64-linux-gnu.so /opt/conda/lib/python3.10/site-packages
205

206
# Copy build artifacts from custom kernels builder
207
COPY --from=custom-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
208
# Copy build artifacts from exllama kernels builder
209
COPY --from=exllama-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
Nicolas Patry's avatar
Nicolas Patry committed
210
211
# Copy build artifacts from exllamav2 kernels builder
COPY --from=exllamav2-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
212
# Copy build artifacts from awq kernels builder
213
COPY --from=awq-kernels-builder /usr/src/llm-awq/awq/kernels/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
214
# Copy build artifacts from eetq kernels builder
215
COPY --from=eetq-kernels-builder /usr/src/eetq/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
216
# Copy build artifacts from marlin kernels builder
217
COPY --from=marlin-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
218

219
# Copy build artifacts from vllm builder
220
COPY --from=vllm-builder /usr/src/vllm/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
221

drbh's avatar
drbh committed
222
223
224
225
# Copy build artifacts from mamba builder
COPY --from=mamba-builder /usr/src/mamba/build/lib.linux-x86_64-cpython-310/ /opt/conda/lib/python3.10/site-packages
COPY --from=mamba-builder /usr/src/causal-conv1d/build/lib.linux-x86_64-cpython-310/ /opt/conda/lib/python3.10/site-packages

OlivierDehaene's avatar
OlivierDehaene committed
226
# Install flash-attention dependencies
227
RUN pip install einops --no-cache-dir
228

Olivier Dehaene's avatar
Olivier Dehaene committed
229
# Install server
Nicolas Patry's avatar
Nicolas Patry committed
230
COPY proto proto
Olivier Dehaene's avatar
Olivier Dehaene committed
231
COPY server server
232
COPY server/Makefile server/Makefile
Olivier Dehaene's avatar
Olivier Dehaene committed
233
# Run the gen-server make target, then install the CUDA requirements file and the
# server package with its optional extras. --no-cache-dir is passed to both pip
# invocations so pip's download cache is not stored in the image layer.
RUN cd server && \
    make gen-server && \
    pip install -r requirements_cuda.txt --no-cache-dir && \
    pip install ".[bnb, accelerate, quantize, peft, outlines]" --no-cache-dir
Olivier Dehaene's avatar
Olivier Dehaene committed
237

Nicolas Patry's avatar
Nicolas Patry committed
238
239
240
241
242
243
244
245
# Install the compiler packages ahead of the Rust binaries: the binaries have the
# git SHA burned into them and therefore change on every build, whereas these
# packages change far less often, so this layer can be reused from cache.
RUN apt-get update \
    && DEBIAN_FRONTEND=noninteractive apt-get install --yes --no-install-recommends \
        build-essential \
        g++ \
    && rm -rf /var/lib/apt/lists/*

246
# Install benchmarker
247
COPY --from=builder /usr/src/target/release-opt/text-generation-benchmark /usr/local/bin/text-generation-benchmark
Olivier Dehaene's avatar
Olivier Dehaene committed
248
# Install router
249
COPY --from=builder /usr/src/target/release-opt/text-generation-router /usr/local/bin/text-generation-router
Nicolas Patry's avatar
Nicolas Patry committed
250
# Install launcher
251
COPY --from=builder /usr/src/target/release-opt/text-generation-launcher /usr/local/bin/text-generation-launcher
Olivier Dehaene's avatar
Olivier Dehaene committed
252

253

fxmarty's avatar
fxmarty committed
254
# AWS Sagemaker compatible image
255
256
257
258
259
260
261
FROM base AS sagemaker

# Copy the SageMaker entrypoint script into the working directory and make it executable
COPY sagemaker-entrypoint.sh entrypoint.sh
RUN chmod +x entrypoint.sh

# Exec-form entrypoint: the script is run directly rather than through a shell
ENTRYPOINT ["./entrypoint.sh"]

262
# Final image
263
264
FROM base

oOraph's avatar
oOraph committed
265
COPY ./tgi-entrypoint.sh /tgi-entrypoint.sh
OlivierDehaene's avatar
OlivierDehaene committed
266
RUN chmod +x /tgi-entrypoint.sh
oOraph's avatar
oOraph committed
267
268

ENTRYPOINT ["/tgi-entrypoint.sh"]
269
CMD ["--json-output"]