flash-attention · commit 9c531bdc
Authored Aug 14, 2023 by Tri Dao · parent 67ae6fd7

Use single thread compilation for cuda12.1, torch2.1 to avoid OOM CI
Showing 3 changed files with 12 additions and 3 deletions (+12 −3)
.github/workflows/publish.yml   +8 −0
flash_attn/__init__.py          +1 −1
setup.py                        +3 −2
.github/workflows/publish.yml

@@ -88,8 +88,12 @@ jobs:
       - name: Free up disk space
         if: ${{ runner.os == 'Linux' }}
+        # https://github.com/easimon/maximize-build-space/blob/master/action.yml
+        # https://github.com/easimon/maximize-build-space/tree/test-report
         run: |
           sudo rm -rf /usr/share/dotnet
+          sudo rm -rf /opt/ghc
+          sudo rm -rf /opt/hostedtoolcache/CodeQL
       - name: Install CUDA ${{ matrix.cuda-version }}
         if: ${{ matrix.cuda-version != 'cpu' }}

@@ -137,6 +141,10 @@ jobs:
           pip install ninja packaging wheel
           export PATH=/usr/local/nvidia/bin:/usr/local/nvidia/lib64:$PATH
           export LD_LIBRARY_PATH=/usr/local/nvidia/lib64:/usr/local/cuda/lib64:$LD_LIBRARY_PATH
+          # Currently for this setting the runner goes OOM if we pass --threads 4 to nvcc
+          if [[ ${MATRIX_CUDA_VERSION} == "12.1" && ${MATRIX_TORCH_VERSION} == "2.1" ]]; then
+            export FLASH_ATTENTION_FORCE_SINGLE_THREAD="TRUE"
+          fi
           # Limit MAX_JOBS otherwise the github runner goes OOM
           MAX_JOBS=1 FLASH_ATTENTION_FORCE_BUILD="TRUE" FLASH_ATTENTION_FORCE_CXX11_ABI=${{ matrix.cxx11_abi}} python setup.py bdist_wheel --dist-dir=dist
           tmpname=cu${MATRIX_CUDA_VERSION}torch${MATRIX_TORCH_VERSION}cxx11abi${{ matrix.cxx11_abi }}
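The same workaround can be reproduced outside CI. A minimal sketch, not part of this commit, of driving a local source build with the env vars the workflow sets, assuming a CUDA 12.1 / torch 2.1 toolchain is already installed:

# Sketch: reproduce the CI workaround for a local source build.
import os
import subprocess

env = os.environ.copy()
env["FLASH_ATTENTION_FORCE_SINGLE_THREAD"] = "TRUE"  # drop nvcc's --threads 4
env["FLASH_ATTENTION_FORCE_BUILD"] = "TRUE"          # force building from source
env["MAX_JOBS"] = "1"                                # limit parallel jobs, as the workflow does

# Same command the workflow runs to produce the wheel.
subprocess.check_call(["python", "setup.py", "bdist_wheel", "--dist-dir=dist"], env=env)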
flash_attn/__init__.py

-__version__ = "2.0.6"
+__version__ = "2.0.6.post1"

 from flash_attn.flash_attn_interface import flash_attn_func
 from flash_attn.flash_attn_interface import flash_attn_kvpacked_func
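For reference, a quick check (not part of the diff) that the post-release version is what ends up installed and that the interface functions shown above import cleanly:

import flash_attn
from flash_attn.flash_attn_interface import flash_attn_func, flash_attn_kvpacked_func

# Builds of this commit should report the .post1 suffix introduced above.
print(flash_attn.__version__)  # expected: 2.0.6.post1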
setup.py

@@ -36,6 +36,8 @@ FORCE_BUILD = os.getenv("FLASH_ATTENTION_FORCE_BUILD", "FALSE") == "TRUE"
 SKIP_CUDA_BUILD = os.getenv("FLASH_ATTENTION_SKIP_CUDA_BUILD", "FALSE") == "TRUE"
 # For CI, we want the option to build with C++11 ABI since the nvcr images use C++11 ABI
 FORCE_CXX11_ABI = os.getenv("FLASH_ATTENTION_FORCE_CXX11_ABI", "FALSE") == "TRUE"
+# For CI, we want the option to not add "--threads 4" to nvcc, since the runner can OOM
+FORCE_SINGLE_THREAD = os.getenv("FLASH_ATTENTION_FORCE_SINGLE_THREAD", "FALSE") == "TRUE"


 def get_platform():

@@ -91,8 +93,7 @@ def raise_if_cuda_home_none(global_option: str) -> None:
 def append_nvcc_threads(nvcc_extra_args):
-    _, bare_metal_version = get_cuda_bare_metal_version(CUDA_HOME)
-    if bare_metal_version >= Version("11.2"):
+    if not FORCE_SINGLE_THREAD:
         return nvcc_extra_args + ["--threads", "4"]
     return nvcc_extra_args
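For context, a rough sketch of how a helper like append_nvcc_threads typically feeds into the extension definition in a setup.py of this kind; the extension name, source list, and flag set below are placeholders for illustration, not the repository's actual values:

# Hypothetical, simplified wiring of append_nvcc_threads into a CUDAExtension.
import os
from torch.utils.cpp_extension import CUDAExtension

FORCE_SINGLE_THREAD = os.getenv("FLASH_ATTENTION_FORCE_SINGLE_THREAD", "FALSE") == "TRUE"

def append_nvcc_threads(nvcc_extra_args):
    # With the env var set (as the CI workflow does for cuda 12.1 / torch 2.1),
    # nvcc compiles single-threaded and the build's peak memory stays lower.
    if not FORCE_SINGLE_THREAD:
        return nvcc_extra_args + ["--threads", "4"]
    return nvcc_extra_args

ext_modules = [
    CUDAExtension(
        name="flash_attn_cuda_example",       # placeholder name
        sources=["csrc/example_kernel.cu"],   # placeholder source
        extra_compile_args={
            "cxx": ["-O3"],
            "nvcc": append_nvcc_threads(["-O3", "--use_fast_math"]),
        },
    )
]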