Unverified Commit 06ec4867 authored by Thomas Parnell's avatar Thomas Parnell Committed by GitHub
Browse files

Install `flash_attn` in Docker image (#3396)

parent 8fe83865
...@@ -57,6 +57,22 @@ ENV VLLM_INSTALL_PUNICA_KERNELS=1 ...@@ -57,6 +57,22 @@ ENV VLLM_INSTALL_PUNICA_KERNELS=1
RUN python3 setup.py build_ext --inplace RUN python3 setup.py build_ext --inplace
#################### EXTENSION Build IMAGE #################### #################### EXTENSION Build IMAGE ####################
#################### FLASH_ATTENTION Build IMAGE ####################
FROM dev as flash-attn-builder
# max jobs used for build
ARG max_jobs=2
ENV MAX_JOBS=${max_jobs}
# flash attention version
ARG flash_attn_version=v2.5.6
ENV FLASH_ATTN_VERSION=${flash_attn_version}
WORKDIR /usr/src/flash-attention-v2
# Download the wheel or build it if a pre-compiled release doesn't exist
RUN pip --verbose wheel flash-attn==${FLASH_ATTN_VERSION} \
--no-build-isolation --no-deps --no-cache-dir
#################### FLASH_ATTENTION Build IMAGE ####################
#################### TEST IMAGE #################### #################### TEST IMAGE ####################
# image to run unit testing suite # image to run unit testing suite
...@@ -68,6 +84,9 @@ WORKDIR /vllm-workspace ...@@ -68,6 +84,9 @@ WORKDIR /vllm-workspace
# ADD is used to preserve directory structure # ADD is used to preserve directory structure
ADD . /vllm-workspace/ ADD . /vllm-workspace/
COPY --from=build /workspace/vllm/*.so /vllm-workspace/vllm/ COPY --from=build /workspace/vllm/*.so /vllm-workspace/vllm/
# Install flash attention (from pre-built wheel)
RUN --mount=type=bind,from=flash-attn-builder,src=/usr/src/flash-attention-v2,target=/usr/src/flash-attention-v2 \
pip install /usr/src/flash-attention-v2/*.whl --no-cache-dir
# ignore build dependencies installation because we are using pre-complied extensions # ignore build dependencies installation because we are using pre-complied extensions
RUN rm pyproject.toml RUN rm pyproject.toml
RUN --mount=type=cache,target=/root/.cache/pip VLLM_USE_PRECOMPILED=1 pip install . --verbose RUN --mount=type=cache,target=/root/.cache/pip VLLM_USE_PRECOMPILED=1 pip install . --verbose
...@@ -88,6 +107,11 @@ WORKDIR /workspace ...@@ -88,6 +107,11 @@ WORKDIR /workspace
COPY requirements.txt requirements.txt COPY requirements.txt requirements.txt
RUN --mount=type=cache,target=/root/.cache/pip \ RUN --mount=type=cache,target=/root/.cache/pip \
pip install -r requirements.txt pip install -r requirements.txt
# Install flash attention (from pre-built wheel)
RUN --mount=type=bind,from=flash-attn-builder,src=/usr/src/flash-attention-v2,target=/usr/src/flash-attention-v2 \
pip install /usr/src/flash-attention-v2/*.whl --no-cache-dir
#################### RUNTIME BASE IMAGE #################### #################### RUNTIME BASE IMAGE ####################
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment