[CI/Build] refactor dockerfile & fix pip cache

[CI/Build] fix pip cache with vllm_nccl & refactor dockerfile to build wheels (#3859)

[CI/Build] refactor dockerfile & fix pip cache
[CI/Build] fix pip cache with vllm_nccl & refactor dockerfile to build wheels (#3859)
d03d64fd · youkaichao · GitHub · 78107fa0 · d03d64fd · d03d64fd
Unverified Commit d03d64fd authored Apr 04, 2024 by youkaichao Committed by GitHub Apr 04, 2024
Show whitespace changes
Inline Side-by-side

Showing with 49 additions and 41 deletions

.buildkite/test-pipeline.yaml .buildkite/test-pipeline.yaml +1 -1

Dockerfile Dockerfile +48 -37

docs/source/conf.py docs/source/conf.py +0 -3

No files found.
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -90,7 +90,7 @@ steps:
  - bash run-benchmarks.sh
 - label: Documentation Build
-  working_dir: "/vllm-workspace/docs"
+  working_dir: "/vllm-workspace/test_docs/docs"
  no_gpu: True
  commands:
  - pip install -r requirements-docs.txt

--- a/Dockerfile
+++ b/Dockerfile
@@ -2,6 +2,7 @@
 # to run the OpenAI compatible server.
 #################### BASE BUILD IMAGE ####################
+# prepare basic build environment
 FROM nvidia/cuda:12.1.0-devel-ubuntu22.04 AS dev
 RUN apt-get update -y \
@@ -34,7 +35,7 @@ ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
 #################### BASE BUILD IMAGE ####################
-#################### EXTENSION BUILD IMAGE ####################
+#################### WHEEL BUILD IMAGE ####################
 FROM dev AS build
 # install build dependencies
@@ -45,14 +46,14 @@ RUN --mount=type=cache,target=/root/.cache/pip \
 # install compiler cache to speed up compilation leveraging local or remote caching
 RUN apt-get update -y && apt-get install -y ccache
-# copy input files
+# files and directories needed to build the wheels
 COPY csrc csrc
 COPY setup.py setup.py
 COPY cmake cmake
 COPY CMakeLists.txt CMakeLists.txt
 COPY requirements.txt requirements.txt
 COPY pyproject.toml pyproject.toml
-COPY vllm/__init__.py vllm/__init__.py
+COPY vllm vllm
 # max jobs used by Ninja to build extensions
 ARG max_jobs=2
@@ -65,7 +66,15 @@ ENV VLLM_INSTALL_PUNICA_KERNELS=1
 ENV CCACHE_DIR=/root/.cache/ccache
 RUN --mount=type=cache,target=/root/.cache/ccache \
-    python3 setup.py build_ext --inplace
+    --mount=type=cache,target=/root/.cache/pip \
+    python3 setup.py bdist_wheel --dist-dir=dist
+# the `vllm_nccl` package must be installed from its source distribution.
+# pip caches the wheel it builds, and other CI jobs would then install that
+# cached wheel directly, which is not what we want, so remove it manually.
+# the pattern is quoted so that pip, not the shell, expands the glob
+RUN --mount=type=cache,target=/root/.cache/pip \
+    pip cache remove "vllm_nccl*"
 #################### EXTENSION Build IMAGE ####################
 #################### FLASH_ATTENTION Build IMAGE ####################
@@ -85,57 +94,59 @@ RUN pip --verbose wheel flash-attn==${FLASH_ATTN_VERSION} \
 #################### FLASH_ATTENTION Build IMAGE ####################
-#################### TEST IMAGE ####################
+#################### vLLM installation IMAGE ####################
-# image to run unit testing suite
+# image with vLLM installed
-FROM dev AS test
+FROM nvidia/cuda:12.1.0-base-ubuntu22.04 AS vllm-base
-# copy pytorch extensions separately to avoid having to rebuild
-# when python code changes
 WORKDIR /vllm-workspace
-# ADD is used to preserve directory structure
-ADD . /vllm-workspace/
+RUN apt-get update -y \
-COPY --from=build /workspace/vllm/*.so /vllm-workspace/vllm/
+    && apt-get install -y python3-pip git vim
-# Install flash attention (from pre-built wheel)
+# Workaround for https://github.com/openai/triton/issues/2507 and
+# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
+# this won't be needed for future versions of this docker image
+# or future versions of triton.
+RUN ldconfig /usr/local/cuda-12.1/compat/
+# install vllm wheel first, so that torch etc will be installed
+RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \
+    --mount=type=cache,target=/root/.cache/pip \
+    pip install dist/*.whl --verbose
 RUN --mount=type=bind,from=flash-attn-builder,src=/usr/src/flash-attention-v2,target=/usr/src/flash-attention-v2 \
+    --mount=type=cache,target=/root/.cache/pip \
    pip install /usr/src/flash-attention-v2/*.whl --no-cache-dir
-# ignore build dependencies installation because we are using pre-complied extensions
+#################### vLLM installation IMAGE ####################
-RUN rm pyproject.toml
-RUN --mount=type=cache,target=/root/.cache/pip VLLM_USE_PRECOMPILED=1 pip install . --verbose
-#################### TEST IMAGE ####################
-#################### RUNTIME BASE IMAGE ####################
+#################### TEST IMAGE ####################
-# We used base cuda image because pytorch installs its own cuda libraries.
+# image to run unit testing suite
-# However pynccl depends on cuda libraries so we had to switch to the runtime image
+# note that this uses vllm installed by `pip`
-# In the future it would be nice to get a container with pytorch and cuda without duplicating cuda
+FROM vllm-base AS test
-FROM nvidia/cuda:12.1.0-runtime-ubuntu22.04 AS vllm-base
-# libnccl required for ray
+ADD . /vllm-workspace/
-RUN apt-get update -y \
-    && apt-get install -y python3-pip
-WORKDIR /workspace
+# install development dependencies (for testing)
-COPY requirements.txt requirements.txt
 RUN --mount=type=cache,target=/root/.cache/pip \
-    pip install -r requirements.txt
+    pip install -r requirements-dev.txt
-# Install flash attention (from pre-built wheel)
-RUN --mount=type=bind,from=flash-attn-builder,src=/usr/src/flash-attention-v2,target=/usr/src/flash-attention-v2 \
-    pip install /usr/src/flash-attention-v2/*.whl --no-cache-dir
-#################### RUNTIME BASE IMAGE ####################
+# building the docs requires the source code.
+# we hide both inside `test_docs/`, so that this source code
+# will not be imported by other tests
+RUN mkdir test_docs
+RUN mv docs test_docs/
+RUN mv vllm test_docs/
+#################### TEST IMAGE ####################
 #################### OPENAI API SERVER ####################
 # openai api server alternative
 FROM vllm-base AS vllm-openai
 # install additional dependencies for openai api server
 RUN --mount=type=cache,target=/root/.cache/pip \
    pip install accelerate hf_transfer modelscope
-COPY --from=build /workspace/vllm/*.so /workspace/vllm/
-COPY vllm vllm
 ENV VLLM_USAGE_SOURCE production-docker-image
 ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]

--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -11,13 +11,10 @@
 # documentation root, use os.path.abspath to make it absolute, like shown here.
 import logging
-import os
 import sys
 from sphinx.ext import autodoc
-sys.path.insert(0, os.path.abspath(os.path.join('..', '..')))
 logger = logging.getLogger(__name__)
 # -- Project information -----------------------------------------------------