"example/vscode:/vscode.git/clone" did not exist on "c7bf4232b0c1cff11792d4f049439067492566ee"
Commit 2ddeaa40 authored by Tri Dao

Fix wheel building

parent d8ec6a2f
#!/bin/bash
CUDA_HOME=/usr/local/cuda-10.2
LD_LIBRARY_PATH=${CUDA_HOME}/lib64:${LD_LIBRARY_PATH}
PATH=${CUDA_HOME}/bin:${PATH}
export FORCE_CUDA=1
export TORCH_CUDA_ARCH_LIST="3.5;5.0+PTX;6.0;7.0;7.5"
export CUDA_HOME=/usr/local/cuda-10.2
\ No newline at end of file
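These env files are meant to be sourced rather than executed so the exported variables persist in the calling shell. A minimal usage sketch (the path and file name follow the cu<version>-<os>-env.sh pattern the workflow sources later, and are assumptions here):

source .github/workflows/cuda/cu102-Linux-env.sh   # assumed path/name
which nvcc      # should resolve to /usr/local/cuda-10.2/bin/nvcc
nvcc --version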
#!/bin/bash
# Strip the periods from the version number
OS_VERSION=$(lsb_release -sr | tr -d .)
OS=ubuntu${OS_VERSION}
wget -nv https://developer.download.nvidia.com/compute/cuda/repos/${OS}/x86_64/cuda-${OS}.pin
sudo mv cuda-${OS}.pin /etc/apt/preferences.d/cuda-repository-pin-600
wget -nv https://developer.download.nvidia.com/compute/cuda/10.2/Prod/local_installers/cuda-repo-${OS}-10-2-local-10.2.89-440.33.01_1.0-1_amd64.deb
sudo dpkg -i cuda-repo-${OS}-10-2-local-10.2.89-440.33.01_1.0-1_amd64.deb
sudo apt-key add /var/cuda-repo-10-2-local-10.2.89-440.33.01/7fa2af80.pub
sudo apt-get -qq update
sudo apt install -y cuda cuda-nvcc-10-2 cuda-libraries-dev-10-2
sudo apt clean
rm -f cuda-repo-${OS}-10-2-local-10.2.89-440.33.01_1.0-1_amd64.deb
\ No newline at end of file
#!/bin/bash
CUDA_HOME=/usr/local/cuda-11.3
LD_LIBRARY_PATH=${CUDA_HOME}/lib64:${LD_LIBRARY_PATH}
PATH=${CUDA_HOME}/bin:${PATH}
export FORCE_CUDA=1
export TORCH_CUDA_ARCH_LIST="3.5;5.0+PTX;6.0;7.0;7.5;8.0;8.6"
export CUDA_HOME=/usr/local/cuda-11.3
\ No newline at end of file
#!/bin/bash
# Strip the periods from the version number
OS_VERSION=$(lsb_release -sr | tr -d .)
OS=ubuntu${OS_VERSION}
wget -nv https://developer.download.nvidia.com/compute/cuda/repos/${OS}/x86_64/cuda-${OS}.pin
sudo mv cuda-${OS}.pin /etc/apt/preferences.d/cuda-repository-pin-600
wget -nv https://developer.download.nvidia.com/compute/cuda/11.3.0/local_installers/cuda-repo-${OS}-11-3-local_11.3.0-465.19.01-1_amd64.deb
sudo dpkg -i cuda-repo-${OS}-11-3-local_11.3.0-465.19.01-1_amd64.deb
# TODO: On Ubuntu < 22.04 the apt-key call below still works.
# apt-key is deprecated on newer releases; the key should instead be moved into the trusted folder:
# sudo mv /var/cuda-repo-${OS}-11-3-local/7fa2af80.pub /etc/apt/trusted.gpg.d/
sudo apt-key add /var/cuda-repo-${OS}-11-3-local/7fa2af80.pub
sudo apt-get -qq update
sudo apt install -y cuda cuda-nvcc-11-3 cuda-libraries-dev-11-3
sudo apt clean
rm -f cuda-repo-${OS}-11-3-local_11.3.0-465.19.01-1_amd64.deb
\ No newline at end of file
#!/bin/bash
CUDA_HOME=/usr/local/cuda-11.6
LD_LIBRARY_PATH=${CUDA_HOME}/lib64:${LD_LIBRARY_PATH}
PATH=${CUDA_HOME}/bin:${PATH}
export FORCE_CUDA=1
export TORCH_CUDA_ARCH_LIST="3.5;5.0+PTX;6.0;7.0;7.5;8.0;8.6"
export CUDA_HOME=/usr/local/cuda-11.6
\ No newline at end of file
#!/bin/bash
# Strip the periods from the version number
OS_VERSION=$(lsb_release -sr | tr -d .)
OS=ubuntu${OS_VERSION}
wget -nv https://developer.download.nvidia.com/compute/cuda/repos/${OS}/x86_64/cuda-${OS}.pin
sudo mv cuda-${OS}.pin /etc/apt/preferences.d/cuda-repository-pin-600
wget -nv https://developer.download.nvidia.com/compute/cuda/11.6.2/local_installers/cuda-repo-${OS}-11-6-local_11.6.2-510.47.03-1_amd64.deb
sudo dpkg -i cuda-repo-${OS}-11-6-local_11.6.2-510.47.03-1_amd64.deb
sudo apt-key add /var/cuda-repo-${OS}-11-6-local/7fa2af80.pub
sudo apt-get -qq update
sudo apt install -y cuda cuda-nvcc-11-6 cuda-libraries-dev-11-6
sudo apt clean
rm -f cuda-repo-${OS}-11-6-local_11.6.2-510.47.03-1_amd64.deb
\ No newline at end of file
#!/bin/bash
CUDA_HOME=/usr/local/cuda-11.7
LD_LIBRARY_PATH=${CUDA_HOME}/lib64:${LD_LIBRARY_PATH}
PATH=${CUDA_HOME}/bin:${PATH}
export FORCE_CUDA=1
export TORCH_CUDA_ARCH_LIST="3.5;5.0+PTX;6.0;7.0;7.5;8.0;8.6"
export CUDA_HOME=/usr/local/cuda-11.7
\ No newline at end of file
#!/bin/bash
# Strip the periods from the version number
OS_VERSION=$(lsb_release -sr | tr -d .)
OS=ubuntu${OS_VERSION}
wget -nv https://developer.download.nvidia.com/compute/cuda/repos/${OS}/x86_64/cuda-${OS}.pin
sudo mv cuda-${OS}.pin /etc/apt/preferences.d/cuda-repository-pin-600
wget -nv https://developer.download.nvidia.com/compute/cuda/11.7.0/local_installers/cuda-repo-${OS}-11-7-local_11.7.0-515.43.04-1_amd64.deb
sudo dpkg -i cuda-repo-${OS}-11-7-local_11.7.0-515.43.04-1_amd64.deb
sudo cp /var/cuda-repo-${OS}-11-7-local/cuda-*-keyring.gpg /usr/share/keyrings/
sudo apt-get -qq update
sudo apt install -y cuda cuda-nvcc-11-7 cuda-libraries-dev-11-7
sudo apt clean
rm -f cuda-repo-${OS}-11-7-local_11.7.0-515.43.04-1_amd64.deb
#!/bin/bash
CUDA_HOME=/usr/local/cuda-12.0
LD_LIBRARY_PATH=${CUDA_HOME}/lib64:${LD_LIBRARY_PATH}
PATH=${CUDA_HOME}/bin:${PATH}
export FORCE_CUDA=1
export TORCH_CUDA_ARCH_LIST="3.5;5.0+PTX;6.0;7.0;7.5;8.0;8.6"
export CUDA_HOME=/usr/local/cuda-12.0
\ No newline at end of file
#!/bin/bash
# Strip the periods from the version number
OS_VERSION=$(lsb_release -sr | tr -d .)
OS=ubuntu${OS_VERSION}
wget -nv https://developer.download.nvidia.com/compute/cuda/repos/${OS}/x86_64/cuda-${OS}.pin
sudo mv cuda-${OS}.pin /etc/apt/preferences.d/cuda-repository-pin-600
wget -nv https://developer.download.nvidia.com/compute/cuda/12.0.0/local_installers/cuda-repo-${OS}-12-0-local_12.0.0-525.60.13-1_amd64.deb
sudo dpkg -i cuda-repo-${OS}-12-0-local_12.0.0-525.60.13-1_amd64.deb
sudo cp /var/cuda-repo-${OS}-12-0-local/cuda-*-keyring.gpg /usr/share/keyrings/
sudo apt-get -qq update
sudo apt install -y cuda cuda-nvcc-12-0 cuda-libraries-dev-12-0
sudo apt clean
rm -f cuda-repo-${OS}-12-0-local_12.0.0-525.60.13-1_amd64.deb
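A quick post-install sanity check for any of the installer scripts above (a sketch, not part of the committed scripts; package and path names follow the 12.0 example):

dpkg -l | grep cuda-nvcc-12-0             # confirm the compiler package landed
/usr/local/cuda-12.0/bin/nvcc --version   # should report release 12.0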
export LANG=C.UTF-8
export OFED_VERSION=5.3-1.0.0.1
sudo apt-get update && \
sudo apt-get install -y --no-install-recommends \
software-properties-common && \
sudo apt-get install -y --no-install-recommends \
build-essential \
apt-utils \
ca-certificates \
wget \
git \
vim \
libssl-dev \
curl \
unzip \
unrar \
cmake \
net-tools \
sudo \
autotools-dev \
rsync \
jq \
openssh-server \
tmux \
screen \
htop \
pdsh \
openssh-client \
lshw \
dmidecode \
util-linux \
automake \
autoconf \
libtool \
net-tools \
pciutils \
libpci-dev \
libaio-dev \
libcap2 \
libtinfo5 \
fakeroot \
devscripts \
debhelper \
nfs-common
# wget -O ~/miniconda.sh https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh && \
# chmod +x ~/miniconda.sh && \
# ~/miniconda.sh -b -p /opt/conda && \
# rm ~/miniconda.sh
# export PATH=/opt/conda/bin:$PATH
\ No newline at end of file
@@ -7,116 +7,120 @@
name: Build wheels and deploy
#on:
# create:
# tags:
# - '**'
on:
push
create:
tags:
- v*
jobs:
# setup_release:
# name: Create Release
# runs-on: ubuntu-latest
# steps:
# - name: Get the tag version
# id: extract_branch
# run: echo ::set-output name=branch::${GITHUB_REF#refs/tags/}
# shell: bash
# - name: Create Release
# id: create_release
# uses: actions/create-release@v1
# env:
# GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
# with:
# tag_name: ${{ steps.extract_branch.outputs.branch }}
# release_name: ${{ steps.extract_branch.outputs.branch }}
setup_release:
name: Create Release
runs-on: ubuntu-latest
steps:
- name: Get the tag version
id: extract_branch
run: echo ::set-output name=branch::${GITHUB_REF#refs/tags/}
shell: bash
- name: Create Release
id: create_release
uses: actions/create-release@v1
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
with:
tag_name: ${{ steps.extract_branch.outputs.branch }}
release_name: ${{ steps.extract_branch.outputs.branch }}
build_wheels:
name: Build Wheel
needs: setup_release
runs-on: ${{ matrix.os }}
#needs: setup_release
strategy:
fail-fast: false
matrix:
os: [ubuntu-20.04, ubuntu-22.04]
#python-version: ['3.7', '3.8', '3.9', '3.10']
#torch-version: ['1.11.0', '1.12.0', '1.13.0', '2.0.1']
#cuda-version: ['113', '116', '117', '120']
python-version: ['3.10']
torch-version: ['2.0.1']
cuda-version: ['120']
# Using ubuntu-20.04 instead of 22.04 for more compatibility (glibc). Ideally we'd use the
# manylinux docker image, but I haven't figured out how to install CUDA on manylinux.
os: [ubuntu-20.04]
python-version: ['3.7', '3.8', '3.9', '3.10']
torch-version: ['1.12.1', '1.13.1', '2.0.1', '2.1.0.dev20230731']
cuda-version: ['11.6.2', '11.7.1', '11.8.0', '12.1.0']
# We need separate wheels that either uses C++11 ABI (-D_GLIBCXX_USE_CXX11_ABI) or not.
# Pytorch wheels currently don't use it, but nvcr images have Pytorch compiled with C++11 ABI.
# Without this we get import error (undefined symbol: _ZN3c105ErrorC2ENS_14SourceLocationESs)
# when building without C++11 ABI and using it on nvcr images.
cxx11_abi: ['FALSE', 'TRUE']
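# Note (illustrative, not part of the original workflow): you can check which ABI a given
# torch build uses with
#   python -c "import torch; print(torch._C._GLIBCXX_USE_CXX11_ABI)"
# PyPI wheels report False, while the nvcr.io images ship torch built with the C++11 ABI,
# which is why both variants are built here.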
exclude:
# Nvidia only supports 11.7+ for ubuntu-22.04
- os: ubuntu-22.04
cuda-version: '116'
- os: ubuntu-22.04
cuda-version: '113'
# Torch only builds cuda 117 for 1.13.0+
- cuda-version: '117'
torch-version: '1.11.0'
- cuda-version: '117'
torch-version: '1.12.0'
# Torch only builds cuda 116 for 1.12.0+
- cuda-version: '116'
torch-version: '1.11.0'
# Torch only builds cuda 120 for 2.0.1+
- cuda-version: '120'
torch-version: '1.11.0'
- cuda-version: '120'
torch-version: '1.12.0'
- cuda-version: '120'
torch-version: '1.13.0'
# 1.13.0 drops support for cuda 11.3
- cuda-version: '113'
torch-version: '1.13.0'
- cuda-version: '113'
torch-version: '2.0.1'
# Fails with "Validation Error" on artifact upload
- cuda-version: '117'
torch-version: '1.13.0'
os: ubuntu-20.04
# Pytorch >= 2.0 only supports Python >= 3.8
- torch-version: '2.0.1'
python-version: '3.7'
- torch-version: '2.1.0.dev20230731'
python-version: '3.7'
# Pytorch <= 2.0 only supports CUDA <= 11.8
- torch-version: '1.12.1'
cuda-version: '12.1.0'
- torch-version: '1.13.1'
cuda-version: '12.1.0'
- torch-version: '2.0.1'
cuda-version: '12.1.0'
# Pytorch >= 2.1 only supports CUDA 12.1
- torch-version: '2.1.0.dev20230731'
cuda-version: '11.6.2'
- torch-version: '2.1.0.dev20230731'
cuda-version: '11.7.1'
- torch-version: '2.1.0.dev20230731'
cuda-version: '11.8.0'
steps:
- name: Checkout
uses: actions/checkout@v3
- name: Set up Python
uses: actions/setup-python@v3
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}
- name: Set up Linux Env
- name: Set CUDA and PyTorch versions
run: |
echo "MATRIX_CUDA_VERSION=$(echo ${{ matrix.cuda-version }} | awk -F \. {'print $1 $2'})" >> $GITHUB_ENV
echo "MATRIX_TORCH_VERSION=$(echo ${{ matrix.torch-version }} | awk -F \. {'print $1 "." $2'})" >> $GITHUB_ENV
- name: Free up disk space
if: ${{ runner.os == 'Linux' }}
run: |
sudo rm -rf /usr/share/dotnet
bash .github/workflows/env.sh
echo ${{ needs.create_release.outputs.upload_url }}
echo ${{ needs.steps.extract_branch.outputs.upload_url }}
shell:
bash
- name: Install CUDA ${{ matrix.cuda-version }}
if: ${{ matrix.cuda-version != 'cpu' }}
run: |
bash .github/workflows/cuda/cu${{ matrix.cuda-version }}-${{ runner.os }}.sh
shell:
bash
- name: Check GPU Env
if: ${{ matrix.cuda-version != 'cpu' }}
run: |
source .github/workflows/cuda/cu${{ matrix.cuda-version }}-${{ runner.os }}-env.sh
nvcc --version
shell:
bash
uses: Jimver/cuda-toolkit@v0.2.11
id: cuda-toolkit
with:
cuda: ${{ matrix.cuda-version }}
linux-local-args: '["--toolkit"]'
# default method is "local", and we're hitting some error with caching for CUDA 11.8 and 12.1
# method: ${{ (matrix.cuda-version == '11.8.0' || matrix.cuda-version == '12.1.0') && 'network' || 'local' }}
method: 'network'
# We need the cuda libraries (e.g. cuSparse, cuSolver) for compiling PyTorch extensions,
# not just nvcc
# sub-packages: '["nvcc"]'
- name: Install PyTorch ${{ matrix.torch-version }}+cu${{ matrix.cuda-version }}
run: |
pip install numpy pyyaml scipy ipython mkl mkl-include ninja cython typing pandas typing-extensions dataclasses && conda clean -ya
pip install --no-cache-dir torch==${{ matrix.torch-version }}
pip install --upgrade pip
# If we don't install before installing Pytorch, we get error for torch 2.0.1
# ERROR: Could not find a version that satisfies the requirement setuptools>=40.8.0 (from versions: none)
pip install lit
# We want to figure out the CUDA version to download pytorch
# e.g. we can have system CUDA version being 11.7 but if torch==1.12 then we need to download the wheel from cu116
# This code is ugly, maybe there's a better way to do this.
export TORCH_CUDA_VERSION=$(python -c "import os; minv = {'1.12': 113, '1.13': 116, '2.0': 117, '2.1': 121}[os.environ['MATRIX_TORCH_VERSION']]; maxv = {'1.12': 116, '1.13': 117, '2.0': 118, '2.1': 121}[os.environ['MATRIX_TORCH_VERSION']]; print(max(min(int(os.environ['MATRIX_CUDA_VERSION']), maxv), minv))")
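# Worked example of the clamping above (illustrative): MATRIX_TORCH_VERSION=1.13 with
# MATRIX_CUDA_VERSION=120 clamps to the 1.13 range [116, 117], so TORCH_CUDA_VERSION=117.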
if [[ ${{ matrix.torch-version }} == *"dev"* ]]; then
pip install --no-cache-dir --pre torch==${{ matrix.torch-version }} --index-url https://download.pytorch.org/whl/nightly/cu${TORCH_CUDA_VERSION}
else
pip install --no-cache-dir torch==${{ matrix.torch-version }} --index-url https://download.pytorch.org/whl/cu${TORCH_CUDA_VERSION}
fi
nvcc --version
python --version
python -c "import torch; print('PyTorch:', torch.__version__)"
python -c "import torch; print('CUDA:', torch.version.cuda)"
@@ -124,17 +128,26 @@ jobs:
shell:
bash
# - name: Install PyTorch ${{ matrix.torch-version }}+cu${{ matrix.cuda-version }}
# run: |
# pip install numpy pyyaml scipy ipython mkl mkl-include ninja cython typing pandas typing-extensions dataclasses && conda clean -ya
# pip install --no-index --no-cache-dir torch==${{ matrix.torch-version }} -f https://download.pytorch.org/whl/cu${{ matrix.cuda-version }}/torch_stable.html
# python --version
# python -c "import torch; print('PyTorch:', torch.__version__)"
# python -c "import torch; print('CUDA:', torch.version.cuda)"
# python -c "from torch.utils import cpp_extension; print (cpp_extension.CUDA_HOME)"
# shell:
# bash
- name: Build wheel
run: |
# We want setuptools >= 49.6.0 otherwise we can't compile the extension if system CUDA version is 11.7 and pytorch cuda version is 11.6
# https://github.com/pytorch/pytorch/blob/664058fa83f1d8eede5d66418abff6e20bd76ca8/torch/utils/cpp_extension.py#L810
# However this still fails so I'm using a newer version of setuptools
pip install setuptools==68.0.0
pip install ninja packaging wheel
export PATH=/usr/local/nvidia/bin:/usr/local/nvidia/lib64:$PATH
export LD_LIBRARY_PATH=/usr/local/nvidia/lib64:/usr/local/cuda/lib64:$LD_LIBRARY_PATH
# Limit MAX_JOBS otherwise the github runner goes OOM
MAX_JOBS=1 FLASH_ATTENTION_FORCE_BUILD="TRUE" FLASH_ATTENTION_FORCE_CXX11_ABI=${{ matrix.cxx11_abi}} python setup.py bdist_wheel --dist-dir=dist
tmpname=cu${MATRIX_CUDA_VERSION}torch${MATRIX_TORCH_VERSION}cxx11abi${{ matrix.cxx11_abi }}
wheel_name=$(ls dist/*whl | xargs -n 1 basename | sed "s/-/+$tmpname-/2")
ls dist/*whl |xargs -I {} mv {} dist/${wheel_name}
echo "wheel_name=${wheel_name}" >> $GITHUB_ENV
- name: Log Built Wheels
run: |
ls dist
- name: Get the tag version
id: extract_branch
run: echo ::set-output name=branch::${GITHUB_REF#refs/tags/}
@@ -147,62 +160,45 @@ jobs:
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
- name: Build wheel
- name: Upload Release Asset
id: upload_release_asset
uses: actions/upload-release-asset@v1
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
with:
upload_url: ${{ steps.get_current_release.outputs.upload_url }}
asset_path: ./dist/${{env.wheel_name}}
asset_name: ${{env.wheel_name}}
asset_content_type: application/*
publish_package:
name: Publish package
needs: [build_wheels]
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- uses: actions/setup-python@v4
with:
python-version: '3.10'
- name: Install dependencies
run: |
export FLASH_ATTENTION_FORCE_BUILD="TRUE"
export FORCE_CUDA="1"
export PATH=/usr/local/nvidia/bin:/usr/local/nvidia/lib64:$PATH
export LD_LIBRARY_PATH=/usr/local/nvidia/lib64:/usr/local/cuda/lib64:$LD_LIBRARY_PATH
export CUDA_INSTALL_DIR=/usr/local/cuda-11.3$CUDA_INSTALL_DIR
pip install ninja packaging setuptools wheel
python setup.py bdist_wheel --dist-dir=dist
tmpname=cu${{ matrix.cuda-version }}torch${{ matrix.torch-version }}
wheel_name=$(ls dist/*whl | xargs -n 1 basename | sed "s/-/+$tmpname-/2")
ls dist/*whl |xargs -I {} mv {} dist/${wheel_name}
echo "wheel_name=${wheel_name}" >> $GITHUB_ENV
pip install ninja packaging setuptools wheel twine
# We don't want to download anything CUDA-related here
pip install torch --index-url https://download.pytorch.org/whl/cpu
- name: Log Built Wheels
- name: Build core package
env:
FLASH_ATTENTION_SKIP_CUDA_BUILD: "TRUE"
run: |
ls dist
python setup.py sdist --dist-dir=dist
# - name: Upload Release Asset
# id: upload_release_asset
# uses: actions/upload-release-asset@v1
# env:
# GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
# with:
# upload_url: ${{ steps.get_current_release.outputs.upload_url }}
# asset_path: ./dist/${{env.wheel_name}}
# asset_name: ${{env.wheel_name}}
# asset_content_type: application/*
# publish_package:
# name: Publish package
# needs: [build_wheels]
# runs-on: ubuntu-latest
# steps:
# - uses: actions/checkout@v3
# - uses: actions/setup-python@v4
# with:
# python-version: '3.10'
# - name: Install dependencies
# run: |
# pip install ninja packaging setuptools wheel twine
# pip install torch
# - name: Build core package
# env:
# FLASH_ATTENTION_SKIP_CUDA_BUILD: "TRUE"
# run: |
# python setup.py sdist --dist-dir=dist
# - name: Deploy
# env:
# TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }}
# TWINE_PASSWORD: ${{ secrets.PYPI_TOKEN }}
# run: |
# python -m twine upload dist/*
- name: Deploy
env:
TWINE_USERNAME: "__token__"
TWINE_PASSWORD: ${{ secrets.PYPI_API_TOKEN }}
run: |
python -m twine upload dist/*
@@ -13,9 +13,10 @@ import subprocess
import urllib.request
import urllib.error
from wheel.bdist_wheel import bdist_wheel as _bdist_wheel
import torch
from torch.utils.cpp_extension import BuildExtension, CppExtension, CUDAExtension, CUDA_HOME
from wheel.bdist_wheel import bdist_wheel as _bdist_wheel
with open("README.md", "r", encoding="utf-8") as fh:
@@ -33,6 +34,8 @@ BASE_WHEEL_URL = "https://github.com/Dao-AILab/flash-attention/releases/download
# SKIP_CUDA_BUILD: Intended to allow CI to use a simple `python setup.py sdist` run to copy over raw files, without any cuda compilation
FORCE_BUILD = os.getenv("FLASH_ATTENTION_FORCE_BUILD", "FALSE") == "TRUE"
SKIP_CUDA_BUILD = os.getenv("FLASH_ATTENTION_SKIP_CUDA_BUILD", "FALSE") == "TRUE"
# For CI, we want the option to build with C++11 ABI since the nvcr images use C++11 ABI
FORCE_CXX11_ABI = os.getenv("FLASH_ATTENTION_FORCE_CXX11_ABI", "FALSE") == "TRUE"
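# Illustrative local invocation of these switches (a sketch, not part of this file):
#   FLASH_ATTENTION_FORCE_BUILD=TRUE FLASH_ATTENTION_FORCE_CXX11_ABI=TRUE python setup.py bdist_wheel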
def get_platform():
@@ -101,26 +104,27 @@ if not torch.cuda.is_available():
print(
"\nWarning: Torch did not find available GPUs on this system.\n",
"If your intention is to cross-compile, this is not an error.\n"
"By default, Apex will cross-compile for Pascal (compute capabilities 6.0, 6.1, 6.2),\n"
"Volta (compute capability 7.0), Turing (compute capability 7.5),\n"
"and, if the CUDA version is >= 11.0, Ampere (compute capability 8.0).\n"
"By default, FlashAttention will cross-compile for Ampere (compute capability 8.0, 8.6, "
"8.9), and, if the CUDA version is >= 11.8, Hopper (compute capability 9.0).\n"
"If you wish to cross-compile for a single specific architecture,\n"
'export TORCH_CUDA_ARCH_LIST="compute capability" before running setup.py.\n',
)
if os.environ.get("TORCH_CUDA_ARCH_LIST", None) is None and CUDA_HOME is not None:
_, bare_metal_version = get_cuda_bare_metal_version(CUDA_HOME)
if bare_metal_version >= Version("11.8"):
os.environ["TORCH_CUDA_ARCH_LIST"] = "6.0;6.1;6.2;7.0;7.5;8.0;8.6;9.0"
elif bare_metal_version >= Version("11.1"):
os.environ["TORCH_CUDA_ARCH_LIST"] = "6.0;6.1;6.2;7.0;7.5;8.0;8.6"
elif bare_metal_version == Version("11.0"):
os.environ["TORCH_CUDA_ARCH_LIST"] = "6.0;6.1;6.2;7.0;7.5;8.0"
os.environ["TORCH_CUDA_ARCH_LIST"] = "8.0;8.6;9.0"
elif bare_metal_version >= Version("11.4"):
os.environ["TORCH_CUDA_ARCH_LIST"] = "8.0;8.6"
else:
os.environ["TORCH_CUDA_ARCH_LIST"] = "6.0;6.1;6.2;7.0;7.5"
os.environ["TORCH_CUDA_ARCH_LIST"] = "8.0;8.6"
cmdclass = {}
ext_modules = []
# We want this even if SKIP_CUDA_BUILD because when we run python setup.py sdist we want the .hpp
# files included in the source distribution, in case the user compiles from source.
subprocess.run(["git", "submodule", "update", "--init", "csrc/cutlass"])
if not SKIP_CUDA_BUILD:
print("\n\ntorch.__version__ = {}\n\n".format(torch.__version__))
TORCH_MAJOR = int(torch.__version__.split(".")[0])
@@ -137,8 +141,8 @@ if not SKIP_CUDA_BUILD:
# Check, if CUDA11 is installed for compute capability 8.0
cc_flag = []
_, bare_metal_version = get_cuda_bare_metal_version(CUDA_HOME)
if bare_metal_version < Version("11.0"):
raise RuntimeError("FlashAttention is only supported on CUDA 11 and above")
if bare_metal_version < Version("11.4"):
raise RuntimeError("FlashAttention is only supported on CUDA 11.4 and above")
# cc_flag.append("-gencode")
# cc_flag.append("arch=compute_75,code=sm_75")
cc_flag.append("-gencode")
@@ -147,7 +151,11 @@ if not SKIP_CUDA_BUILD:
cc_flag.append("-gencode")
cc_flag.append("arch=compute_90,code=sm_90")
subprocess.run(["git", "submodule", "update", "--init", "csrc/cutlass"])
# HACK: The compiler flag -D_GLIBCXX_USE_CXX11_ABI is set to be the same as
# torch._C._GLIBCXX_USE_CXX11_ABI
# https://github.com/pytorch/pytorch/blob/8472c24e3b5b60150096486616d98b7bea01500b/torch/utils/cpp_extension.py#L920
if FORCE_CXX11_ABI:
torch._C._GLIBCXX_USE_CXX11_ABI = True
ext_modules.append(
CUDAExtension(
name="flash_attn_2_cuda",
@@ -213,6 +221,7 @@ if not SKIP_CUDA_BUILD:
Path(this_dir) / 'csrc' / 'cutlass' / 'include',
],
)
)
def get_package_version():
@@ -227,30 +236,33 @@ def get_package_version():
class CachedWheelsCommand(_bdist_wheel):
"""
The CachedWheelsCommand plugs into the default bdist wheel, which is ran by pip when it cannot
find an existing wheel (which is currently the case for all flash attention installs). We use
the environment parameters to detect whether there is already a pre-built version of a compatible
wheel available and short-circuits the standard full build pipeline.
"""
def run(self):
"""
The CachedWheelsCommand plugs into the default bdist wheel, which is ran by pip when it cannot
find an existing wheel (which is currently the case for all flash attention installs). We use
the environment parameters to detect whether there is already a pre-built version of a compatible
wheel available and short-circuits the standard full build pipeline.
"""
def run(self):
if FORCE_BUILD:
return super().run()
raise_if_cuda_home_none("flash_attn")
# Determine the version numbers that will be used to determine the correct wheel
_, cuda_version_raw = get_cuda_bare_metal_version(CUDA_HOME)
# We're using the CUDA version used to build torch, not the one currently installed
# _, cuda_version_raw = get_cuda_bare_metal_version(CUDA_HOME)
torch_cuda_version = parse(torch.version.cuda)
torch_version_raw = parse(torch.__version__)
python_version = f"cp{sys.version_info.major}{sys.version_info.minor}"
platform_name = get_platform()
flash_version = get_package_version()
cuda_version = f"{cuda_version_raw.major}{cuda_version_raw.minor}"
torch_version = f"{torch_version_raw.major}.{torch_version_raw.minor}.{torch_version_raw.micro}"
# cuda_version = f"{cuda_version_raw.major}{cuda_version_raw.minor}"
cuda_version = f"{torch_cuda_version.major}{torch_cuda_version.minor}"
torch_version = f"{torch_version_raw.major}.{torch_version_raw.minor}"
cxx11_abi = str(torch._C._GLIBCXX_USE_CXX11_ABI).upper()
# Determine wheel URL based on CUDA version, torch version, python version and OS
wheel_filename = f'{PACKAGE_NAME}-{flash_version}+cu{cuda_version}torch{torch_version}-{python_version}-{python_version}-{platform_name}.whl'
wheel_filename = f'{PACKAGE_NAME}-{flash_version}+cu{cuda_version}torch{torch_version}cxx11abi{cxx11_abi}-{python_version}-{python_version}-{platform_name}.whl'
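# Example resolved name (hypothetical versions), matching what the CI rename step produces:
#   flash_attn-2.0.0+cu118torch2.0cxx11abiFALSE-cp310-cp310-linux_x86_64.whl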
wheel_url = BASE_WHEEL_URL.format(
tag_name=f"v{flash_version}",
wheel_name=wheel_filename
@@ -279,7 +291,6 @@ class CachedWheelsCommand(_bdist_wheel):
setup(
# @pierce - TODO: Revert for official release
name=PACKAGE_NAME,
version=get_package_version(),
packages=find_packages(
......