[Kernel] Build flash-attn from source (#8245)

71c60491 · Luka Govedič · GitHub · 0faab90e · 71c60491 · 71c60491
Unverified Commit 71c60491 authored Sep 21, 2024 by Luka Govedič Committed by GitHub Sep 20, 2024
9 changed files
--- a/.github/workflows/scripts/build.sh
+++ b/.github/workflows/scripts/build.sh
@@ -15,5 +15,6 @@ $python_executable -m pip install -r requirements-cuda.txt
 export MAX_JOBS=1
 # Make sure release wheels are built for the following architectures
 export TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX"
+export VLLM_FA_CMAKE_GPU_ARCHES="80-real;90-real"
 # Build
 $python_executable setup.py bdist_wheel --dist-dir=dist
--- a/.gitignore
+++ b/.gitignore
 # vllm commit id, generated by setup.py
 vllm/commit_id.py
+# vllm-flash-attn built from source
+vllm/vllm_flash_attn/
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]
@@ -12,6 +15,8 @@ __pycache__/
 # Distribution / packaging
 .Python
 build/
+cmake-build-*/
+CMakeUserPresets.json
 develop-eggs/
 dist/
 downloads/

--- a/CMakeLists.txt
+++ b/CMakeLists.txt
 cmake_minimum_required(VERSION 3.26)
+# When building directly using CMake, make sure you run the install step
+# (it places the .so files in the correct location).
+#
+# Example:
+# mkdir build && cd build
+# cmake -G Ninja -DVLLM_PYTHON_EXECUTABLE=`which python3` -DCMAKE_INSTALL_PREFIX=.. ..
+# cmake --build . --target install
+#
+# If you want to only build one target, make sure to install it manually:
+# cmake --build . --target _C
+# cmake --install . --component _C
 project(vllm_extensions LANGUAGES CXX)
 # CUDA by default, can be overridden by using -DVLLM_TARGET_DEVICE=... (used by setup.py)
@@ -13,6 +24,9 @@ include(${CMAKE_CURRENT_LIST_DIR}/cmake/utils.cmake)
 # Suppress potential warnings about unused manually-specified variables
 set(ignoreMe "${VLLM_PYTHON_PATH}")
+# Prevent installation of dependencies (cutlass) by default.
+install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS)
 #
 # Supported python versions.  These versions will be searched in order, the
 # first match will be selected.  These should be kept in sync with setup.py.
@@ -70,19 +84,6 @@ endif()
 find_package(Torch REQUIRED)
 #
-# Add the `default` target which detects which extensions should be
-# built based on platform/architecture.  This is the same logic that
-# setup.py uses to select which extensions should be built and should
-# be kept in sync.
-#
-# The `default` target makes direct use of cmake easier since knowledge
-# of which extensions are supported has been factored in, e.g.
-#
-# mkdir build && cd build
-# cmake -G Ninja -DVLLM_PYTHON_EXECUTABLE=`which python3` -DCMAKE_LIBRARY_OUTPUT_DIRECTORY=../vllm ..
-# cmake --build . --target default
-#
-add_custom_target(default)
 message(STATUS "Enabling core extension.")
 # Define _core_C extension
@@ -100,8 +101,6 @@ define_gpu_extension_target(
  USE_SABI 3
  WITH_SOABI)
-add_dependencies(default _core_C)
 #
 # Forward the non-CUDA device extensions to external CMake scripts.
 #
@@ -167,6 +166,8 @@ if(NVCC_THREADS AND VLLM_GPU_LANG STREQUAL "CUDA")
  list(APPEND VLLM_GPU_FLAGS "--threads=${NVCC_THREADS}")
 endif()
+include(FetchContent)
 #
 # Define other extension targets
 #
@@ -190,7 +191,6 @@ set(VLLM_EXT_SRC
  "csrc/torch_bindings.cpp")
 if(VLLM_GPU_LANG STREQUAL "CUDA")
-  include(FetchContent)
  SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library")
  FetchContent_Declare(
        cutlass
@@ -283,6 +283,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
    csrc/quantization/machete/machete_pytorch.cu)
 endif()
+message(STATUS "Enabling C extension.")
 define_gpu_extension_target(
  _C
  DESTINATION vllm
@@ -313,6 +314,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
      "csrc/moe/marlin_moe_ops.cu")
 endif()
+message(STATUS "Enabling moe extension.")
 define_gpu_extension_target(
  _moe_C
  DESTINATION vllm
@@ -323,7 +325,6 @@ define_gpu_extension_target(
  USE_SABI 3
  WITH_SOABI)
 if(VLLM_GPU_LANG STREQUAL "HIP")
  #
  # _rocm_C extension
@@ -343,16 +344,63 @@ if(VLLM_GPU_LANG STREQUAL "HIP")
    WITH_SOABI)
 endif()
+# vllm-flash-attn currently only supported on CUDA
+if (NOT VLLM_TARGET_DEVICE STREQUAL "cuda")
+  return()
+endif ()
-if(VLLM_GPU_LANG STREQUAL "CUDA" OR VLLM_GPU_LANG STREQUAL "HIP")
+#
-  message(STATUS "Enabling C extension.")
+# Build vLLM flash attention from source
-  add_dependencies(default _C)
+#
+# IMPORTANT: This has to be the last thing we do, because vllm-flash-attn uses the same macros/functions as vLLM.
+# Because functions all belong to the global scope, vllm-flash-attn's functions overwrite vLLM's.
+# They should be identical but if they aren't, this is a massive footgun.
+#
+# The vllm-flash-attn install rules are nested under vllm to make sure the library gets installed in the correct place.
+# To only install vllm-flash-attn, use --component vllm_flash_attn_c.
+# If no component is specified, vllm-flash-attn is still installed.
-  message(STATUS "Enabling moe extension.")
+# If VLLM_FLASH_ATTN_SRC_DIR is set, vllm-flash-attn is installed from that directory instead of downloading.
-  add_dependencies(default _moe_C)
+# This is to enable local development of vllm-flash-attn within vLLM.
+# It can be set as an environment variable or passed as a cmake argument.
+# The environment variable takes precedence.
+if (DEFINED ENV{VLLM_FLASH_ATTN_SRC_DIR})
+  set(VLLM_FLASH_ATTN_SRC_DIR $ENV{VLLM_FLASH_ATTN_SRC_DIR})
 endif()
-if(VLLM_GPU_LANG STREQUAL "HIP")
+if(VLLM_FLASH_ATTN_SRC_DIR)
-  message(STATUS "Enabling rocm extension.")
+  FetchContent_Declare(vllm-flash-attn SOURCE_DIR ${VLLM_FLASH_ATTN_SRC_DIR})
-  add_dependencies(default _rocm_C)
+else()
+  FetchContent_Declare(
+          vllm-flash-attn
+          GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
+          GIT_TAG 013f0c4fc47e6574060879d9734c1df8c5c273bd
+          GIT_PROGRESS TRUE
+  )
 endif()
+# Set the parent build flag so that the vllm-flash-attn library does not redo compile flag and arch initialization.
+set(VLLM_PARENT_BUILD ON)
+# Make sure vllm-flash-attn install rules are nested under vllm/
+install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY FALSE)" COMPONENT vllm_flash_attn_c)
+install(CODE "set(OLD_CMAKE_INSTALL_PREFIX \"\${CMAKE_INSTALL_PREFIX}\")" COMPONENT vllm_flash_attn_c)
+install(CODE "set(CMAKE_INSTALL_PREFIX \"\${CMAKE_INSTALL_PREFIX}/vllm/\")" COMPONENT vllm_flash_attn_c)
+# Fetch the vllm-flash-attn library
+FetchContent_MakeAvailable(vllm-flash-attn)
+message(STATUS "vllm-flash-attn is available at ${vllm-flash-attn_SOURCE_DIR}")
+# Restore the install prefix
+install(CODE "set(CMAKE_INSTALL_PREFIX \"\${OLD_CMAKE_INSTALL_PREFIX}\")" COMPONENT vllm_flash_attn_c)
+install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" COMPONENT vllm_flash_attn_c)
+# Copy over the vllm-flash-attn python files
+install(
+        DIRECTORY ${vllm-flash-attn_SOURCE_DIR}/vllm_flash_attn/
+        DESTINATION vllm/vllm_flash_attn
+        COMPONENT vllm_flash_attn_c
+        FILES_MATCHING PATTERN "*.py"
+)
+# Nothing after vllm-flash-attn, see comment about macros above
--- a/Dockerfile
+++ b/Dockerfile
@@ -48,6 +48,9 @@ RUN --mount=type=cache,target=/root/.cache/pip \
 # see https://github.com/pytorch/pytorch/pull/123243
 ARG torch_cuda_arch_list='7.0 7.5 8.0 8.6 8.9 9.0+PTX'
 ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
+# Override the arch list for flash-attn to reduce the binary size
+ARG vllm_fa_cmake_gpu_arches='80-real;90-real'
+ENV VLLM_FA_CMAKE_GPU_ARCHES=${vllm_fa_cmake_gpu_arches}
 #################### BASE BUILD IMAGE ####################
 #################### WHEEL BUILD IMAGE ####################

--- a/cmake/utils.cmake
+++ b/cmake/utils.cmake
@@ -364,5 +364,5 @@ function (define_gpu_extension_target GPU_MOD_NAME)
    target_link_libraries(${GPU_MOD_NAME} PRIVATE ${TORCH_LIBRARIES})
  endif()
-  install(TARGETS ${GPU_MOD_NAME} LIBRARY DESTINATION ${GPU_DESTINATION})
+  install(TARGETS ${GPU_MOD_NAME} LIBRARY DESTINATION ${GPU_DESTINATION} COMPONENT ${GPU_MOD_NAME})
 endfunction()
--- a/requirements-cuda.txt
+++ b/requirements-cuda.txt
@@ -8,4 +8,3 @@ torch == 2.4.0
 # These must be updated alongside torch
 torchvision == 0.19   # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
 xformers == 0.0.27.post2; platform_system == 'Linux' and platform_machine == 'x86_64'  # Requires PyTorch 2.4.0
-vllm-flash-attn == 2.6.1; platform_system == 'Linux' and platform_machine == 'x86_64'  # Requires PyTorch 2.4.0
--- a/setup.py
+++ b/setup.py
@@ -6,6 +6,7 @@ import re
 import subprocess
 import sys
 import warnings
+from pathlib import Path
 from shutil import which
 from typing import Dict, List
@@ -152,15 +153,8 @@ class cmake_build_ext(build_ext):
        default_cfg = "Debug" if self.debug else "RelWithDebInfo"
        cfg = envs.CMAKE_BUILD_TYPE or default_cfg
-        # where .so files will be written, should be the same for all extensions
-        # that use the same CMakeLists.txt.
-        outdir = os.path.abspath(
-            os.path.dirname(self.get_ext_fullpath(ext.name)))
        cmake_args = [
            '-DCMAKE_BUILD_TYPE={}'.format(cfg),
-            '-DCMAKE_LIBRARY_OUTPUT_DIRECTORY={}'.format(outdir),
-            '-DCMAKE_ARCHIVE_OUTPUT_DIRECTORY={}'.format(self.build_temp),
            '-DVLLM_TARGET_DEVICE={}'.format(VLLM_TARGET_DEVICE),
        ]
@@ -224,10 +218,12 @@ class cmake_build_ext(build_ext):
            os.makedirs(self.build_temp)
        targets = []
+        def target_name(s: str) -> str:
+            """Map a dotted extension name to its CMake target/component name.
+
+            Applies remove_prefix twice: first for "vllm.", then for
+            "vllm_flash_attn." on the result. Presumably remove_prefix leaves
+            the string unchanged when the prefix is absent (it is defined
+            elsewhere in this file - confirm there).
+            """
+            without_pkg = remove_prefix(s, "vllm.")
+            return remove_prefix(without_pkg, "vllm_flash_attn.")
        # Build all the extensions
        for ext in self.extensions:
            self.configure(ext)
-            targets.append(remove_prefix(ext.name, "vllm."))
+            targets.append(target_name(ext.name))
        num_jobs, _ = self.compute_num_jobs()
@@ -240,6 +236,28 @@ class cmake_build_ext(build_ext):
        subprocess.check_call(["cmake", *build_args], cwd=self.build_temp)
+        # Install the libraries
+        # Normalise build_temp to an absolute Path once. It is compared with
+        # outdir (a Path) below, and a Path never compares equal to a plain
+        # str, so comparing against self.build_temp directly (normally a str
+        # in setuptools) would never trigger the skip.
+        build_dir = Path(self.build_temp).absolute()
+        for ext in self.extensions:
+            # Install the extension into the proper location
+            outdir = Path(self.get_ext_fullpath(ext.name)).parent.absolute()
+            # Skip if the install directory is the same as the build directory
+            if outdir == build_dir:
+                continue
+            # CMake appends the extension prefix to the install path,
+            # and outdir already contains that prefix, so we need to remove it.
+            # One parent is stripped per '.' in the dotted extension name.
+            prefix = outdir
+            for _ in range(ext.name.count('.')):
+                prefix = prefix.parent
+            # prefix here should actually be the same for all components
+            install_args = [
+                "cmake", "--install", ".", "--prefix",
+                str(prefix), "--component",
+                target_name(ext.name)
+            ]
+            subprocess.check_call(install_args, cwd=self.build_temp)
 def _no_device() -> bool:
    return VLLM_TARGET_DEVICE == "empty"
@@ -467,6 +485,10 @@ if _is_cuda() or _is_hip():
 if _is_hip():
    ext_modules.append(CMakeExtension(name="vllm._rocm_C"))
+if _is_cuda():
+    ext_modules.append(
+        CMakeExtension(name="vllm.vllm_flash_attn.vllm_flash_attn_c"))
 if _build_custom_ops():
    ext_modules.append(CMakeExtension(name="vllm._C"))

--- a/vllm/attention/backends/flash_attn.py
+++ b/vllm/attention/backends/flash_attn.py
@@ -19,8 +19,13 @@ if TYPE_CHECKING:
    from vllm.worker.model_runner import (ModelInputForGPUBuilder,
                                          ModelInputForGPUWithSamplingMetadata)
-from vllm_flash_attn import flash_attn_varlen_func as _flash_attn_varlen_func
+# yapf: disable
-from vllm_flash_attn import flash_attn_with_kvcache as _flash_attn_with_kvcache
+from vllm.vllm_flash_attn import (
+    flash_attn_varlen_func as _flash_attn_varlen_func)
+from vllm.vllm_flash_attn import (
+    flash_attn_with_kvcache as _flash_attn_with_kvcache)
+# yapf: enable
 @torch.library.custom_op("vllm::flash_attn_varlen_func", mutates_args=[])

--- a/vllm/attention/selector.py
+++ b/vllm/attention/selector.py
@@ -244,8 +244,7 @@ def which_attn_to_use(
    # FlashAttn is valid for the model, checking if the package is installed.
    if selected_backend == _Backend.FLASH_ATTN:
        try:
-            import vllm_flash_attn  # noqa: F401
+            import vllm.vllm_flash_attn  # noqa: F401
            from vllm.attention.backends.flash_attn import (  # noqa: F401
                FlashAttentionBackend)
@@ -258,8 +257,9 @@ def which_attn_to_use(
        except ImportError:
            logger.info(
                "Cannot use FlashAttention-2 backend because the "
-                "vllm_flash_attn package is not found. "
+                "vllm.vllm_flash_attn package is not found. "
-                "`pip install vllm-flash-attn` for better performance.")
+                "Make sure that vllm_flash_attn was built and installed "
+                "(on by default).")
            selected_backend = _Backend.XFORMERS
    return selected_backend