Dockerfile_amd 6.59 KB
Newer Older
fxmarty's avatar
fxmarty committed
1
# Rust builder
2
# Shared cargo-chef toolchain stage (Rust 1.78 tag); the planner and builder
# stages are both derived from it via `FROM chef`.
FROM lukemathwalker/cargo-chef:latest-rust-1.78 AS chef
fxmarty's avatar
fxmarty committed
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
# The Rust stages work out of /usr/src.
WORKDIR /usr/src

# Have cargo use the sparse protocol for the crates.io registry.
ARG CARGO_REGISTRIES_CRATES_IO_PROTOCOL=sparse

# Planner stage: copies the workspace manifests and crates, then lets
# cargo-chef write recipe.json, which the builder stage copies from here.
FROM chef AS planner
COPY Cargo.toml Cargo.toml
COPY rust-toolchain.toml rust-toolchain.toml
COPY proto proto
COPY benchmark benchmark
COPY router router
COPY launcher launcher
RUN cargo chef prepare --recipe-path recipe.json

# Builder stage: installs protoc, pre-builds the dependencies described by the
# planner's recipe, then compiles the workspace with the release-opt profile.
FROM chef AS builder

# Install the protoc 21.12 binary and its bundled include files into /usr/local,
# removing the downloaded archive in the same layer.
RUN PROTOC_ZIP=protoc-21.12-linux-x86_64.zip && \
    curl -OL https://github.com/protocolbuffers/protobuf/releases/download/v21.12/$PROTOC_ZIP && \
    unzip -o $PROTOC_ZIP -d /usr/local bin/protoc && \
    unzip -o $PROTOC_ZIP -d /usr/local 'include/*' && \
    rm -f $PROTOC_ZIP

# Cook dependencies from the recipe alone so this layer is reused for as long
# as recipe.json is unchanged.
COPY --from=planner /usr/src/recipe.json recipe.json
RUN cargo chef cook --profile release-opt --recipe-path recipe.json

# Declared only after the dependency layers: every RUN that follows an ARG
# implicitly depends on its value, so placing a per-build GIT_SHA earlier would
# invalidate the protoc and `cargo chef cook` caches on each build.
# NOTE(review): presumably consumed by the crates' build scripts - confirm.
ARG GIT_SHA
ARG DOCKER_LABEL

COPY Cargo.toml Cargo.toml
COPY rust-toolchain.toml rust-toolchain.toml
COPY proto proto
COPY benchmark benchmark
COPY router router
COPY launcher launcher
RUN cargo build --profile release-opt
fxmarty's avatar
fxmarty committed
37
38

# Text Generation Inference base image for ROCm
fxmarty's avatar
fxmarty committed
39
# Base stage: ROCm development image plus the system toolchain and ROCm
# development libraries used by the builds later in this file.
FROM rocm/dev-ubuntu-22.04:6.1.1_hip_update AS base

# Update, install and clean the apt lists in one layer so the package index
# is not kept in the image.
RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
    build-essential \
    ca-certificates \
    ccache \
    curl \
    git \
    make \
    libssl-dev \
    g++ \
    # Needed to build VLLM & flash.
    rocthrust-dev \
    hipsparse-dev \
    hipblas-dev \
    hipblaslt-dev \
    rocblas-dev \
    hiprand-dev \
    rocrand-dev \
    miopen-hip-dev \
    hipfft-dev \
    hipcub-dev \
    hipsolver-dev \
    rccl-dev \
    cmake \
    python3-dev && \
    rm -rf /var/lib/apt/lists/*

# Keep in sync with `server/pyproject.toml`
ARG MAMBA_VERSION=23.1.0-1
ARG PYTORCH_VERSION='2.3.0'
ARG ROCM_VERSION='6.0.2'
ARG PYTHON_VERSION='3.10.10'
# Automatically set by buildx
ARG TARGETPLATFORM
# Put the conda installation first on PATH so its python/pip/mamba are the
# ones picked up by later instructions.
ENV PATH=/opt/conda/bin:$PATH

# TGI seems to require libssl.so.1.1 instead of libssl.so.3 so we can't use ubuntu 22.04. Ubuntu 20.04 has python==3.8, and TGI requires python>=3.9, hence the need for miniconda.
# Install mamba, translating Docker's TARGETPLATFORM into mamba arches.
# Download, install and delete the installer in a single layer so the script
# is not kept in the image. Only `-o` is passed to curl: there is a single URL,
# so the extra `-O` output option had nothing to apply to.
RUN case ${TARGETPLATFORM} in \
         "linux/arm64")  MAMBA_ARCH=aarch64  ;; \
         *)              MAMBA_ARCH=x86_64   ;; \
    esac && \
    curl -fsSL -v -o ~/mambaforge.sh "https://github.com/conda-forge/miniforge/releases/download/${MAMBA_VERSION}/Mambaforge-${MAMBA_VERSION}-Linux-${MAMBA_ARCH}.sh" && \
    chmod +x ~/mambaforge.sh && \
    bash ~/mambaforge.sh -b -p /opt/conda && \
    mamba init && \
    rm ~/mambaforge.sh

fxmarty's avatar
fxmarty committed
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
# Install flash-attention, torch dependencies
RUN pip install numpy einops ninja --no-cache-dir

# `-y` makes the non-interactive confirmation explicit rather than relying on
# conda's behaviour when no terminal is attached.
RUN conda install -y intel::mkl-static intel::mkl-include

# Remove any installed triton and install it from the ROCm fork's sources.
RUN pip uninstall -y triton && \
    git clone --depth 1 --single-branch https://github.com/ROCm/triton.git && \
    cd triton/python && \
    pip install . --no-cache-dir

# Fetch the patched PyTorch 2.3 branch (with submodules) and install its
# Python build requirements.
RUN git clone --depth 1 --recursive --single-branch --branch 2.3-patched https://github.com/fxmarty/pytorch.git pytorch && cd pytorch && pip install -r requirements.txt --no-cache-dir

# Build-time switches for the PyTorch build below. They are ARGs, so they are
# visible to the following RUN instructions but are not persisted in the image
# environment.
ARG _GLIBCXX_USE_CXX11_ABI="1"
ARG CMAKE_PREFIX_PATH="/opt/conda"
ARG PYTORCH_ROCM_ARCH="gfx90a;gfx942"
ARG BUILD_CAFFE2="0" \
    BUILD_CAFFE2_OPS="0" \
    USE_CUDA="0" \
    USE_ROCM="1" \
    BUILD_TEST="0" \
    USE_FBGEMM="0" \
    USE_NNPACK="0" \
    USE_QNNPACK="0" \
    USE_XNNPACK="0" \
    USE_FLASH_ATTENTION="1" \
    USE_MEM_EFF_ATTENTION="0"

# Run the AMD build preparation script, then build and install PyTorch from source.
RUN cd pytorch && python tools/amd_build/build_amd.py && python setup.py install

# Set as recommended: https://github.com/ROCm/triton/wiki/A-script-to-set-program-execution-environment-in-ROCm
ENV HIP_FORCE_DEV_KERNARG=1

120
121
122
# On MI250 and MI300, flash attention performance with Triton FA is slightly better than with CK.
# However, Triton requires tuning for each prompt length, which is prohibitive,
# so the Triton flash attention path is disabled by default here.
ENV ROCM_USE_FLASH_ATTN_V2_TRITON=0
fxmarty's avatar
fxmarty committed
123
124
125

# Common parent of the kernel build stages (vllm, flash attention v2, custom,
# exllama and exllamav2 kernels), each of which starts `FROM kernel-builder`.
FROM base AS kernel-builder

fxmarty's avatar
fxmarty committed
126
# Build vllm kernels
fxmarty's avatar
fxmarty committed
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
# vLLM kernels stage; its build output is copied into the runtime image by a
# later `COPY --from=vllm-builder`.
FROM kernel-builder AS vllm-builder
WORKDIR /usr/src

# Only the vLLM Makefile is copied in, under the default name `Makefile`, so
# that a plain `make <target>` finds it.
COPY server/Makefile-vllm Makefile

# Build specific version of vllm
RUN make build-vllm-rocm

# Build Flash Attention v2 kernels
FROM kernel-builder AS flash-att-v2-builder
WORKDIR /usr/src

# Same pattern as the vLLM stage: copy the dedicated Makefile as `Makefile`.
COPY server/Makefile-flash-att-v2 Makefile

# Build specific version of flash attention v2
RUN make build-flash-attention-v2-rocm

# Build Transformers CUDA kernels (gpt-neox and bloom)
# The sources are copied straight into /usr/src, so the setup.py build output
# ends up under /usr/src/build.
FROM kernel-builder AS custom-kernels-builder
WORKDIR /usr/src
COPY server/custom_kernels/ .
RUN python setup.py build
fxmarty's avatar
fxmarty committed
149

fxmarty's avatar
fxmarty committed
150
151
152
153
154
# Build exllama kernels
# The sources are copied straight into /usr/src, so the setup.py build output
# ends up under /usr/src/build.
FROM kernel-builder AS exllama-kernels-builder
WORKDIR /usr/src
COPY server/exllama_kernels/ .

RUN python setup.py build
fxmarty's avatar
fxmarty committed
156
157
158
159
160
161

# Build exllama v2 kernels
# The sources are copied straight into /usr/src, so the setup.py build output
# ends up under /usr/src/build.
FROM kernel-builder AS exllamav2-kernels-builder
WORKDIR /usr/src
COPY server/exllamav2_kernels/ .

RUN python setup.py build
fxmarty's avatar
fxmarty committed
163

fxmarty's avatar
fxmarty committed
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
# Runtime stage: starts from the base image and collects the kernel build
# outputs from the builder stages into conda's site-packages.
FROM base AS base-copy

# Text Generation Inference base env
ENV HUGGINGFACE_HUB_CACHE=/data \
    HF_HUB_ENABLE_HF_TRANSFER=1 \
    PORT=80

# Copy build artifacts from vllm builder
COPY --from=vllm-builder /usr/src/vllm/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages

# Copy build artifacts from flash attention v2 builder
COPY --from=flash-att-v2-builder /usr/src/flash-attention-v2/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages

# Copy build artifacts from custom kernels builder
COPY --from=custom-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages

# Copy build artifacts from exllama kernels builder
COPY --from=exllama-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages

# Copy build artifacts from exllamav2 kernels builder
COPY --from=exllamav2-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
fxmarty's avatar
fxmarty committed
185
186
187
188
189
190
191
192

# Install server
# `COPY server server` already brings in server/Makefile, so no separate copy
# of the Makefile is needed.
COPY proto proto
COPY server server
# Run the Makefile's gen-server target, install the ROCm requirements, then
# install the server package with its extras; the pip cache is not kept in
# the layer.
RUN cd server && \
    make gen-server && \
    pip install -r requirements_rocm.txt --no-cache-dir && \
    pip install ".[accelerate, peft, outlines]" --no-cache-dir
fxmarty's avatar
fxmarty committed
194
195

# The three binaries below come from the Rust builder stage's release-opt
# target directory and are placed in /usr/local/bin.

# Install benchmarker
COPY --from=builder /usr/src/target/release-opt/text-generation-benchmark /usr/local/bin/text-generation-benchmark
# Install router
COPY --from=builder /usr/src/target/release-opt/text-generation-router /usr/local/bin/text-generation-router
# Install launcher
COPY --from=builder /usr/src/target/release-opt/text-generation-launcher /usr/local/bin/text-generation-launcher
fxmarty's avatar
fxmarty committed
201
202

# AWS Sagemaker compatible image
FROM base AS sagemaker

# The script is copied to a relative path and invoked as ./entrypoint.sh, so
# both resolve against this stage's working directory.
COPY sagemaker-entrypoint.sh entrypoint.sh
RUN chmod +x entrypoint.sh

ENTRYPOINT ["./entrypoint.sh"]

# Final image
FROM base-copy

COPY ./tgi-entrypoint.sh /tgi-entrypoint.sh
RUN chmod +x /tgi-entrypoint.sh

# Exec-form entrypoint; CMD supplies the default argument passed to it and can
# be overridden at `docker run`.
ENTRYPOINT ["/tgi-entrypoint.sh"]
CMD ["--json-output"]