Dockerfile 8.39 KB
Newer Older
1
# Rust builder
# Base stage reused by the planner and builder stages: the cargo-chef image
# tagged for Rust 1.71, working in /usr/src.
FROM lukemathwalker/cargo-chef:latest-rust-1.71 AS chef
WORKDIR /usr/src

# Cargo setting that selects the sparse protocol for the crates.io registry.
ARG CARGO_REGISTRIES_CRATES_IO_PROTOCOL=sparse

7
8
9
10
# Planner stage: copy the workspace manifests and member directories, then let
# cargo-chef write the dependency recipe (recipe.json) that the builder stage
# copies in.
FROM chef AS planner
COPY Cargo.toml Cargo.toml
COPY rust-toolchain.toml rust-toolchain.toml
COPY proto proto
COPY benchmark benchmark
COPY router router
COPY launcher launcher
RUN cargo chef prepare --recipe-path recipe.json

# Builder stage: compiles the Rust workspace in release mode.
FROM chef AS builder

# Install protoc 21.12 (the binary plus its bundled include files) into
# /usr/local. `-f` makes curl exit non-zero on an HTTP error instead of saving
# the error page to disk.
RUN PROTOC_ZIP=protoc-21.12-linux-x86_64.zip && \
    curl -fOL "https://github.com/protocolbuffers/protobuf/releases/download/v21.12/$PROTOC_ZIP" && \
    unzip -o "$PROTOC_ZIP" -d /usr/local bin/protoc && \
    unzip -o "$PROTOC_ZIP" -d /usr/local 'include/*' && \
    rm -f "$PROTOC_ZIP"

# Cook the dependency recipe before any workspace source is copied, so this
# layer is reused until recipe.json changes.
COPY --from=planner /usr/src/recipe.json recipe.json
RUN cargo chef cook --release --recipe-path recipe.json

# Declared only after the cached dependency layers: an ARG defined earlier is
# part of the environment of every later RUN, so a new GIT_SHA or DOCKER_LABEL
# would otherwise force the protoc download and dependency build to rerun.
ARG GIT_SHA
ARG DOCKER_LABEL

COPY Cargo.toml Cargo.toml
COPY rust-toolchain.toml rust-toolchain.toml
COPY proto proto
COPY benchmark benchmark
COPY router router
COPY launcher launcher
RUN cargo build --release
Olivier Dehaene's avatar
v0.1.0  
Olivier Dehaene committed
37

38
39
# Python builder
# Adapted from: https://github.com/pytorch/pytorch/blob/master/Dockerfile
FROM nvidia/cuda:12.1.0-devel-ubuntu20.04 AS pytorch-install

# Versions consumed by the conda / PyTorch install steps of this stage.
ARG PYTORCH_VERSION=2.1.1
ARG PYTHON_VERSION=3.10
# Keep in sync with `server/pyproject.toml`
ARG CUDA_VERSION=12.1
ARG MAMBA_VERSION=23.3.1-1
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
# Conda channels passed to `conda install -c` when installing PyTorch.
ARG CUDA_CHANNEL=nvidia
ARG INSTALL_CHANNEL=pytorch
# Automatically set by buildx
ARG TARGETPLATFORM

# Put the conda prefix first on PATH. Uses the `key=value` form; the
# space-separated `ENV key value` form is deprecated.
ENV PATH=/opt/conda/bin:$PATH

# Base build packages (compiler toolchain, CA certificates, ccache, curl, git).
# The apt package lists are deleted in the same layer.
RUN apt-get update \
    && DEBIAN_FRONTEND=noninteractive apt-get install \
        --yes \
        --no-install-recommends \
        build-essential ca-certificates ccache curl git \
    && rm -rf /var/lib/apt/lists/*

# Install conda
# Translate Docker's TARGETPLATFORM into the Mambaforge installer architecture,
# then download, run and delete the installer in a single RUN so the installer
# script is not kept in an image layer.
RUN case ${TARGETPLATFORM} in \
         "linux/arm64")  MAMBA_ARCH=aarch64  ;; \
         *)              MAMBA_ARCH=x86_64   ;; \
    esac && \
    curl -fsSL -v -o ~/mambaforge.sh "https://github.com/conda-forge/miniforge/releases/download/${MAMBA_VERSION}/Mambaforge-${MAMBA_VERSION}-Linux-${MAMBA_ARCH}.sh" && \
    chmod +x ~/mambaforge.sh && \
    bash ~/mambaforge.sh -b -p /opt/conda && \
    rm ~/mambaforge.sh

# Install pytorch
# On arm64 we exit with an error code
# Otherwise: update conda, install the requested python / pytorch / pytorch-cuda
# versions from the configured channels (pytorch-cuda uses the first two
# dot-separated fields of CUDA_VERSION), then clean the conda caches in the
# same layer.
RUN case ${TARGETPLATFORM} in \
         "linux/arm64")  exit 1 ;; \
         *)              /opt/conda/bin/conda update -y conda &&  \
                         /opt/conda/bin/conda install -c "${INSTALL_CHANNEL}" -c "${CUDA_CHANNEL}" -y "python=${PYTHON_VERSION}" "pytorch=$PYTORCH_VERSION" "pytorch-cuda=$(echo "$CUDA_VERSION" | cut -d'.' -f 1-2)"  ;; \
    esac && \
    /opt/conda/bin/conda clean -ya

# CUDA kernels builder image
# Parent stage of the kernel build stages that follow.
FROM pytorch-install AS kernel-builder

# Presumably read by the kernel builds to cap parallel compile jobs - confirm
# against the Makefiles / setup.py files, which are not part of this file.
ARG MAX_JOBS=8

# ninja-build is the only extra package this stage installs.
RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
        ninja-build \
        && rm -rf /var/lib/apt/lists/*

# Build Flash Attention CUDA kernels
FROM kernel-builder AS flash-att-builder

WORKDIR /usr/src

# The flash-attention Makefile is installed under the default name `Makefile`.
COPY server/Makefile-flash-att Makefile

# Build specific version of flash attention
RUN make build-flash-attention

101
102
103
104
105
106
107
108
# Build Flash Attention v2 CUDA kernels
FROM kernel-builder AS flash-att-v2-builder

WORKDIR /usr/src

# The flash-attention v2 Makefile is installed under the default name `Makefile`.
COPY server/Makefile-flash-att-v2 Makefile

# Build specific version of flash attention v2
RUN make build-flash-attention-v2-cuda
110

111
112
113
114
# Build Transformers exllama kernels
FROM kernel-builder AS exllama-kernels-builder
WORKDIR /usr/src
COPY server/exllama_kernels/ .

# Build the extension with the target CUDA architectures restricted through
# TORCH_CUDA_ARCH_LIST.
RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" python setup.py build

# Build Transformers exllamav2 kernels
FROM kernel-builder AS exllamav2-kernels-builder
WORKDIR /usr/src
COPY server/exllamav2_kernels/ .

# Build the extension with the target CUDA architectures restricted through
# TORCH_CUDA_ARCH_LIST.
RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" python setup.py build

126
127
128
129
130
131
132
# Build Transformers awq kernels
FROM kernel-builder AS awq-kernels-builder
WORKDIR /usr/src
COPY server/Makefile-awq Makefile
# Run the `build-awq` Makefile target with the target CUDA architectures
# restricted through TORCH_CUDA_ARCH_LIST.
RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" make build-awq

133
134
135
136
137
138
139
# Build eetq kernels
FROM kernel-builder AS eetq-kernels-builder
WORKDIR /usr/src
COPY server/Makefile-eetq Makefile
# Run the `build-eetq` Makefile target with the target CUDA architectures
# restricted through TORCH_CUDA_ARCH_LIST.
RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" make build-eetq

140
# Build Transformers CUDA kernels
FROM kernel-builder AS custom-kernels-builder
WORKDIR /usr/src
COPY server/custom_kernels/ .
# Build the custom kernels extension from the copied sources
RUN python setup.py build
146

147
148
149
150
151
152
153
154
# Build vllm CUDA kernels
FROM kernel-builder AS vllm-builder

WORKDIR /usr/src

# The vllm Makefile is installed under the default name `Makefile`.
COPY server/Makefile-vllm Makefile

# Build specific version of vllm
RUN make build-vllm-cuda
156

drbh's avatar
drbh committed
157
158
159
160
161
162
# Build mamba kernels
FROM kernel-builder AS mamba-builder
WORKDIR /usr/src
# The selective-scan Makefile is installed under the default name `Makefile`;
# its `build-all` target performs the build.
COPY server/Makefile-selective-scan Makefile
RUN make build-all

OlivierDehaene's avatar
OlivierDehaene committed
163
164
165
166
167
# Build megablocks
FROM kernel-builder AS megablocks-builder

# Installed from a fork pinned to an exact commit.
RUN pip install git+https://github.com/OlivierDehaene/megablocks@181709df192de9a941fdf3a641cdc65a0462996e

168
# Text Generation Inference base image
FROM nvidia/cuda:12.1.0-base-ubuntu20.04 AS base

# Conda env
ENV PATH=/opt/conda/bin:$PATH \
    CONDA_PREFIX=/opt/conda

# Text Generation Inference base env
ENV HUGGINGFACE_HUB_CACHE=/data \
    HF_HUB_ENABLE_HF_TRANSFER=1 \
    PORT=80
Olivier Dehaene's avatar
Olivier Dehaene committed
179

180
WORKDIR /usr/src

# Runtime packages for the base image; the apt package lists are removed in
# the same layer.
RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
        libssl-dev \
        ca-certificates \
        make \
        curl \
        && rm -rf /var/lib/apt/lists/*
Olivier Dehaene's avatar
Olivier Dehaene committed
188

OlivierDehaene's avatar
OlivierDehaene committed
189
190
# Copy conda with PyTorch and Megablocks installed
COPY --from=megablocks-builder /opt/conda /opt/conda

# Every COPY below places a builder stage's build output directly into the
# conda environment's Python 3.10 site-packages directory.

# Copy build artifacts from flash attention builder
COPY --from=flash-att-builder /usr/src/flash-attention/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
COPY --from=flash-att-builder /usr/src/flash-attention/csrc/layer_norm/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
COPY --from=flash-att-builder /usr/src/flash-attention/csrc/rotary/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages

# Copy build artifacts from flash attention v2 builder
COPY --from=flash-att-v2-builder /usr/src/flash-attention-v2/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages

# Copy build artifacts from custom kernels builder
COPY --from=custom-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
# Copy build artifacts from exllama kernels builder
COPY --from=exllama-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
# Copy build artifacts from exllamav2 kernels builder
COPY --from=exllamav2-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
# Copy build artifacts from awq kernels builder
COPY --from=awq-kernels-builder /usr/src/llm-awq/awq/kernels/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
# Copy build artifacts from eetq kernels builder
COPY --from=eetq-kernels-builder /usr/src/eetq/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages

# Copy build artifacts from vllm builder
COPY --from=vllm-builder /usr/src/vllm/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages

# Copy build artifacts from mamba builder
COPY --from=mamba-builder /usr/src/mamba/build/lib.linux-x86_64-cpython-310/ /opt/conda/lib/python3.10/site-packages
COPY --from=mamba-builder /usr/src/causal-conv1d/build/lib.linux-x86_64-cpython-310/ /opt/conda/lib/python3.10/site-packages

218
219
# einops is installed as a flash-attention dependency; pip's download cache is
# disabled for this install.
RUN pip install --no-cache-dir einops
220

Olivier Dehaene's avatar
Olivier Dehaene committed
221
# Install server
COPY proto proto
COPY server server
COPY server/Makefile server/Makefile
# Run the `gen-server` Makefile target, then install the CUDA requirements and
# the server package with its extras. Both pip invocations pass --no-cache-dir
# so pip's download cache is not written into this image layer.
RUN cd server && \
    make gen-server && \
    pip install -r requirements_cuda.txt --no-cache-dir && \
    pip install ".[bnb, accelerate, quantize, peft]" --no-cache-dir
Olivier Dehaene's avatar
Olivier Dehaene committed
229

230
231
# The three release binaries below come from the Rust builder stage and are
# placed in /usr/local/bin.
# Install benchmarker
COPY --from=builder /usr/src/target/release/text-generation-benchmark /usr/local/bin/text-generation-benchmark
# Install router
COPY --from=builder /usr/src/target/release/text-generation-router /usr/local/bin/text-generation-router
# Install launcher
COPY --from=builder /usr/src/target/release/text-generation-launcher /usr/local/bin/text-generation-launcher
Olivier Dehaene's avatar
Olivier Dehaene committed
236

237
238
239
240
241
# Compiler toolchain installed into the base image; the apt package lists are
# removed in the same layer.
RUN apt-get update \
    && DEBIAN_FRONTEND=noninteractive apt-get install \
        --yes \
        --no-install-recommends \
        build-essential g++ \
    && rm -rf /var/lib/apt/lists/*

fxmarty's avatar
fxmarty committed
242
# AWS Sagemaker compatible image
FROM base AS sagemaker

# Install the Sagemaker entrypoint script into the working directory and make
# it executable.
COPY sagemaker-entrypoint.sh entrypoint.sh
RUN chmod +x entrypoint.sh

ENTRYPOINT ["./entrypoint.sh"]

250
# Final image
FROM base

# The launcher is the container entrypoint; `--json-output` is its default
# argument and is replaced by any arguments given to `docker run`.
ENTRYPOINT ["text-generation-launcher"]
CMD ["--json-output"]