Commit 26f4b5fb authored by Woosuk Kwon

Merge branch 'main' into Dao-AILab/main

parents 5018ac6a 12375706
# This workflow will:
# - Create a new Github release
# - Build wheels for supported architectures
# - Deploy the wheels to the Github release
# - Release the static code to PyPi
# For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries
# This workflow will upload a Python Package to Release asset
# For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions
name: Build wheels and deploy
name: Create Release
on:
create:
push:
tags:
- v*
jobs:
# Needed to create release and upload assets
permissions:
contents: write
setup_release:
jobs:
release:
# Retrieve tag and create release
name: Create Release
runs-on: ubuntu-latest
outputs:
upload_url: ${{ steps.create_release.outputs.upload_url }}
steps:
- name: Get the tag version
id: extract_branch
run: echo ::set-output name=branch::${GITHUB_REF#refs/tags/}
- name: Checkout
uses: actions/checkout@v3
- name: Extract branch info
shell: bash
run: |
echo "release_tag=${GITHUB_REF#refs/*/}" >> $GITHUB_ENV
- name: Create Release
id: create_release
uses: actions/create-release@v1
uses: "actions/github-script@v6"
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
RELEASE_TAG: ${{ env.release_tag }}
with:
tag_name: ${{ steps.extract_branch.outputs.branch }}
release_name: ${{ steps.extract_branch.outputs.branch }}
github-token: "${{ secrets.GITHUB_TOKEN }}"
script: |
const script = require('.github/workflows/scripts/create_release.js')
await script(github, context, core)
build_wheels:
wheel:
name: Build Wheel
needs: setup_release
runs-on: ${{ matrix.os }}
needs: release
strategy:
fail-fast: false
matrix:
# Using ubuntu-20.04 instead of 22.04 for more compatibility (glibc). Ideally we'd use the
# manylinux docker image, but I haven't figured out how to install CUDA on manylinux.
os: [ubuntu-20.04]
os: ['ubuntu-20.04']
python-version: ['3.8', '3.9', '3.10', '3.11', '3.12']
torch-version: ['2.0.1', '2.1.2', '2.2.2', '2.3.1', '2.4.0']
cuda-version: ['11.8.0', '12.3.2']
# We need separate wheels that either uses C++11 ABI (-D_GLIBCXX_USE_CXX11_ABI) or not.
# Pytorch wheels currently don't use it, but nvcr images have Pytorch compiled with C++11 ABI.
# Without this we get import error (undefined symbol: _ZN3c105ErrorC2ENS_14SourceLocationESs)
# when building without C++11 ABI and using it on nvcr images.
cxx11_abi: ['FALSE', 'TRUE']
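# Note (illustrative, not part of the original workflow): one way to check which ABI a
# given torch build was compiled with is
#   python -c "import torch; print(torch.compiled_with_cxx11_abi())"
# which prints False for the PyPI wheels and True for the nvcr images mentioned above.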
exclude:
# see https://github.com/pytorch/pytorch/blob/main/RELEASE.md#release-compatibility-matrix
# Pytorch < 2.2 does not support Python 3.12
- torch-version: '2.0.1'
python-version: '3.12'
- torch-version: '2.1.2'
python-version: '3.12'
# Pytorch <= 2.0 only supports CUDA <= 11.8
- torch-version: '2.0.1'
cuda-version: '12.3.2'
pytorch-version: ['2.4.0'] # Should be synced with setup.py.
cuda-version: ['12.1', '11.8']
steps:
- name: Checkout
uses: actions/checkout@v3
- name: Set up Python
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}
- name: Set CUDA and PyTorch versions
run: |
echo "MATRIX_CUDA_VERSION=$(echo ${{ matrix.cuda-version }} | awk -F \. {'print $1 $2'})" >> $GITHUB_ENV
echo "MATRIX_TORCH_VERSION=$(echo ${{ matrix.torch-version }} | awk -F \. {'print $1 "." $2'})" >> $GITHUB_ENV
- name: Setup ccache
uses: hendrikmuhs/ccache-action@v1.2
- name: Free up disk space
- name: Set up Linux Env
if: ${{ runner.os == 'Linux' }}
# https://github.com/easimon/maximize-build-space/blob/master/action.yml
# https://github.com/easimon/maximize-build-space/tree/test-report
run: |
sudo rm -rf /usr/share/dotnet
sudo rm -rf /opt/ghc
sudo rm -rf /opt/hostedtoolcache/CodeQL
bash -x .github/workflows/scripts/env.sh
- name: Set up swap space
if: runner.os == 'Linux'
uses: pierotofy/set-swap-space@v1.0
- name: Set up Python
uses: actions/setup-python@v4
with:
swap-size-gb: 10
python-version: ${{ matrix.python-version }}
- name: Install CUDA ${{ matrix.cuda-version }}
if: ${{ matrix.cuda-version != 'cpu' }}
uses: Jimver/cuda-toolkit@v0.2.14
id: cuda-toolkit
with:
cuda: ${{ matrix.cuda-version }}
linux-local-args: '["--toolkit"]'
# default method is "local", and we're hitting some error with caching for CUDA 11.8 and 12.1
# method: ${{ (matrix.cuda-version == '11.8.0' || matrix.cuda-version == '12.1.0') && 'network' || 'local' }}
method: 'network'
# We need the cuda libraries (e.g. cuSparse, cuSolver) for compiling PyTorch extensions,
# not just nvcc
# sub-packages: '["nvcc"]'
- name: Install PyTorch ${{ matrix.torch-version }}+cu${{ matrix.cuda-version }}
run: |
pip install --upgrade pip
# If we don't install before installing Pytorch, we get error for torch 2.0.1
# ERROR: Could not find a version that satisfies the requirement setuptools>=40.8.0 (from versions: none)
pip install lit
# For some reason torch 2.2.0 on python 3.12 errors saying no setuptools
pip install setuptools
# We want to figure out the CUDA version to download pytorch
# e.g. we can have system CUDA version being 11.7 but if torch==1.12 then we need to download the wheel from cu116
# see https://github.com/pytorch/pytorch/blob/main/RELEASE.md#release-compatibility-matrix
# This code is ugly, maybe there's a better way to do this.
export TORCH_CUDA_VERSION=$(python -c "from os import environ as env; \
minv = {'2.0': 117, '2.1': 118, '2.2': 118, '2.3': 118, '2.4': 118}[env['MATRIX_TORCH_VERSION']]; \
maxv = {'2.0': 118, '2.1': 121, '2.2': 121, '2.3': 121, '2.4': 121}[env['MATRIX_TORCH_VERSION']]; \
print(max(min(int(env['MATRIX_CUDA_VERSION']), maxv), minv))" \
)
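# For example, with MATRIX_TORCH_VERSION=2.1 and MATRIX_CUDA_VERSION=123 (CUDA 12.3),
# the value is clamped down to maxv=121, so the cu121 wheel index is used below.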
if [[ ${{ matrix.torch-version }} == *"dev"* ]]; then
pip install --no-cache-dir --pre torch==${{ matrix.torch-version }} --index-url https://download.pytorch.org/whl/nightly/cu${TORCH_CUDA_VERSION}
else
pip install --no-cache-dir torch==${{ matrix.torch-version }} --index-url https://download.pytorch.org/whl/cu${TORCH_CUDA_VERSION}
fi
nvcc --version
python --version
python -c "import torch; print('PyTorch:', torch.__version__)"
python -c "import torch; print('CUDA:', torch.version.cuda)"
python -c "from torch.utils import cpp_extension; print (cpp_extension.CUDA_HOME)"
shell:
bash
bash -x .github/workflows/scripts/cuda-install.sh ${{ matrix.cuda-version }} ${{ matrix.os }}
- name: Build wheel
- name: Install PyTorch ${{ matrix.pytorch-version }} with CUDA ${{ matrix.cuda-version }}
run: |
# We want setuptools >= 49.6.0 otherwise we can't compile the extension if system CUDA version is 11.7 and pytorch cuda version is 11.6
# https://github.com/pytorch/pytorch/blob/664058fa83f1d8eede5d66418abff6e20bd76ca8/torch/utils/cpp_extension.py#L810
# However this still fails so I'm using a newer version of setuptools
pip install setuptools==68.0.0
pip install ninja packaging wheel
export PATH=/usr/local/nvidia/bin:/usr/local/nvidia/lib64:$PATH
export LD_LIBRARY_PATH=/usr/local/nvidia/lib64:/usr/local/cuda/lib64:$LD_LIBRARY_PATH
# Limit MAX_JOBS otherwise the github runner goes OOM
# CUDA 11.8 can compile with 2 jobs, but CUDA 12.3 goes OOM
MAX_JOBS=$([ "$MATRIX_CUDA_VERSION" == "123" ] && echo 1 || echo 2) FLASH_ATTENTION_FORCE_BUILD="TRUE" FLASH_ATTENTION_FORCE_CXX11_ABI=${{ matrix.cxx11_abi}} python setup.py bdist_wheel --dist-dir=dist
tmpname=cu${MATRIX_CUDA_VERSION}torch${MATRIX_TORCH_VERSION}cxx11abi${{ matrix.cxx11_abi }}
wheel_name=$(ls dist/*whl | xargs -n 1 basename | sed "s/-/+$tmpname-/2")
ls dist/*whl |xargs -I {} mv {} dist/${wheel_name}
echo "wheel_name=${wheel_name}" >> $GITHUB_ENV
- name: Log Built Wheels
run: |
ls dist
- name: Get the tag version
id: extract_branch
run: echo ::set-output name=branch::${GITHUB_REF#refs/tags/}
bash -x .github/workflows/scripts/pytorch-install.sh ${{ matrix.python-version }} ${{ matrix.pytorch-version }} ${{ matrix.cuda-version }}
- name: Get Release with tag
id: get_current_release
uses: joutvhu/get-release@v1
with:
tag_name: ${{ steps.extract_branch.outputs.branch }}
- name: Build wheel
shell: bash
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
CMAKE_BUILD_TYPE: Release # do not compile with debug symbol to reduce wheel size
run: |
bash -x .github/workflows/scripts/build.sh ${{ matrix.python-version }} ${{ matrix.cuda-version }}
wheel_name=$(ls dist/*whl | xargs -n 1 basename)
asset_name=${wheel_name//"linux"/"manylinux1"}
echo "wheel_name=${wheel_name}" >> $GITHUB_ENV
echo "asset_name=${asset_name}" >> $GITHUB_ENV
- name: Upload Release Asset
id: upload_release_asset
uses: actions/upload-release-asset@v1
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
with:
upload_url: ${{ steps.get_current_release.outputs.upload_url }}
asset_path: ./dist/${{env.wheel_name}}
asset_name: ${{env.wheel_name}}
upload_url: ${{ needs.release.outputs.upload_url }}
asset_path: ./dist/${{ env.wheel_name }}
asset_name: ${{ env.asset_name }}
asset_content_type: application/*
publish_package:
name: Publish package
needs: [build_wheels]
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- uses: actions/setup-python@v4
with:
python-version: '3.10'
- name: Install dependencies
run: |
pip install ninja packaging setuptools wheel twine
# We don't want to download anything CUDA-related here
pip install torch --index-url https://download.pytorch.org/whl/cpu
- name: Build core package
env:
FLASH_ATTENTION_SKIP_CUDA_BUILD: "TRUE"
run: |
python setup.py sdist --dist-dir=dist
- name: Deploy
env:
TWINE_USERNAME: "__token__"
TWINE_PASSWORD: ${{ secrets.PYPI_API_TOKEN }}
run: |
python -m twine upload dist/*
.github/workflows/scripts/build.sh:
#!/bin/bash
python_executable=python$1
cuda_home=/usr/local/cuda-$2
# Update paths
PATH=${cuda_home}/bin:$PATH
LD_LIBRARY_PATH=${cuda_home}/lib64:$LD_LIBRARY_PATH
# Install requirements
$python_executable -m pip install wheel packaging
# Limit the number of parallel jobs to avoid OOM
export MAX_JOBS=1
# Build
$python_executable setup.py bdist_wheel --dist-dir=dist
.github/workflows/scripts/create_release.js:
// Uses Github's API to create the release and wait for result.
// We use a JS script since github CLI doesn't provide a way to wait for the release's creation and returns immediately.
module.exports = async (github, context, core) => {
try {
const response = await github.rest.repos.createRelease({
draft: false,
generate_release_notes: true,
name: process.env.RELEASE_TAG,
owner: context.repo.owner,
prerelease: true,
repo: context.repo.repo,
tag_name: process.env.RELEASE_TAG,
});
core.setOutput('upload_url', response.data.upload_url);
} catch (error) {
core.setFailed(error.message);
}
}
\ No newline at end of file
.github/workflows/scripts/cuda-install.sh:
#!/bin/bash
# Replace '.' with '-' ex: 11.8 -> 11-8
cuda_version=$(echo $1 | tr "." "-")
# Removes '-' and '.' ex: ubuntu-20.04 -> ubuntu2004
OS=$(echo $2 | tr -d ".\-")
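# Example invocation (as in the workflow step above): bash cuda-install.sh 11.8 ubuntu-20.04
# -> cuda_version=11-8, OS=ubuntu2004, so the cuda-11-8 packages are installed from the ubuntu2004 repo.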
# Installs CUDA
wget -nv https://developer.download.nvidia.com/compute/cuda/repos/${OS}/x86_64/cuda-keyring_1.1-1_all.deb
sudo dpkg -i cuda-keyring_1.1-1_all.deb
rm cuda-keyring_1.1-1_all.deb
sudo apt -qq update
sudo apt -y install cuda-${cuda_version} cuda-nvcc-${cuda_version} cuda-libraries-dev-${cuda_version}
sudo apt clean
# Test nvcc
PATH=/usr/local/cuda-$1/bin:${PATH}
nvcc --version
# Log gcc, g++, c++ versions
gcc --version
g++ --version
c++ --version
.github/workflows/scripts/env.sh:
#!/bin/bash
# This file installs common linux environment tools
export LANG=C.UTF-8
# python_version=$1
sudo apt-get update && \
sudo apt-get install -y --no-install-recommends \
    software-properties-common

sudo apt-get install -y --no-install-recommends \
build-essential \
apt-utils \
ca-certificates \
wget \
git \
vim \
libssl-dev \
curl \
unzip \
unrar \
cmake \
net-tools \
sudo \
autotools-dev \
rsync \
jq \
openssh-server \
tmux \
screen \
htop \
pdsh \
openssh-client \
lshw \
dmidecode \
util-linux \
automake \
autoconf \
libtool \
net-tools \
pciutils \
libpci-dev \
libaio-dev \
libcap2 \
libtinfo5 \
fakeroot \
devscripts \
debhelper \
nfs-common
# Remove github bloat files to free up disk space
sudo rm -rf "/usr/local/share/boost"
sudo rm -rf "$AGENT_TOOLSDIRECTORY"
sudo rm -rf "/usr/share/dotnet"
.github/workflows/scripts/pytorch-install.sh:
#!/bin/bash
python_executable=python$1
pytorch_version=$2
cuda_version=$3
# Install torch
$python_executable -m pip install numpy pyyaml scipy ipython mkl mkl-include ninja cython typing pandas typing-extensions dataclasses setuptools && conda clean -ya
$python_executable -m pip install torch==${pytorch_version}+cu${cuda_version//./} --extra-index-url https://download.pytorch.org/whl/cu${cuda_version//./}
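# e.g. python3.10 with pytorch_version=2.4.0 and cuda_version=12.1 resolves to:
#   pip install torch==2.4.0+cu121 --extra-index-url https://download.pytorch.org/whl/cu121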
# Print version information
$python_executable --version
$python_executable -c "import torch; print('PyTorch:', torch.__version__)"
$python_executable -c "import torch; print('CUDA:', torch.version.cuda)"
$python_executable -c "from torch.utils import cpp_extension; print (cpp_extension.CUDA_HOME)"
#!/bin/bash
# A simple build script for local testing.
# NOTE: This script is not used for the actual build process.
PYTORCH_VERSION="2.4.0"
pip install packaging ninja;
pip install torch==${PYTORCH_VERSION};
time python setup.py bdist_wheel --dist-dir=dist;
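# Example follow-up (illustrative): install the freshly built wheel with
#   pip install dist/*.whl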
Subproject commit 756c351b4994854b2f8c6dded3821ebbb580876b
Subproject commit 751eb9a8859ac36bfc77551f9e4a957c31a5a8b1
@@ -603,7 +603,7 @@ mha_varlen_fwd(at::Tensor &q, // total_q x num_heads x head_size, total_q := \s
const int max_num_blocks_per_seq = !paged_KV ? 0 : block_table.size(1);
const int num_blocks = !paged_KV ? 0 : k.size(0);
const int page_block_size = !paged_KV ? 1 : k.size(1);
TORCH_CHECK(!paged_KV || page_block_size % 256 == 0, "Paged KV cache block size must be divisible by 256");
TORCH_CHECK(!paged_KV || page_block_size % 16 == 0, "Paged KV cache block size must be divisible by 16");
if (max_seqlen_q == 1 && !alibi_slopes_.has_value()) { is_causal = false; } // causal=true is the same as causal=false in this case
if (is_causal) { window_size_right = 0; }
@@ -1353,13 +1353,15 @@ mha_fwd_kvcache(at::Tensor &q, // batch_size x seqlen_q x num_he
const int batch_size = sizes[0];
int seqlen_q = sizes[1];
const int seqlen_q_og = seqlen_q;
int num_heads = sizes[2];
const int num_heads_og = num_heads;
const int head_size_og = sizes[3];
const int max_num_blocks_per_seq = !paged_KV ? 0 : block_table.size(1);
const int num_blocks = !paged_KV ? 0 : kcache.size(0);
const int page_block_size = !paged_KV ? 1 : kcache.size(1);
TORCH_CHECK(!paged_KV || page_block_size % 256 == 0, "Paged KV cache block size must be divisible by 256");
TORCH_CHECK(!paged_KV || page_block_size % 16 == 0, "Paged KV cache block size must be divisible by 16");
const int seqlen_k = !paged_KV ? kcache.size(1) : max_num_blocks_per_seq * page_block_size;
const int num_heads_k = kcache.size(2);
const int batch_size_c = !paged_KV ? kcache.size(0) : batch_size;
@@ -1411,8 +1413,12 @@ mha_fwd_kvcache(at::Tensor &q, // batch_size x seqlen_q x num_he
TORCH_CHECK(out.dtype() == q_dtype, "Output must have the same dtype as inputs");
CHECK_DEVICE(out);
TORCH_CHECK(out.stride(-1) == 1, "Output tensor must have contiguous last dimension");
CHECK_SHAPE(out, batch_size, seqlen_q, num_heads, head_size_og);
if (head_size_og % 8 != 0) { out = torch::empty_like(q_padded); }
CHECK_SHAPE(out, batch_size, seqlen_q_og, num_heads_og, head_size_og);
if (head_size_og % 8 != 0) {
out = torch::empty_like(q_padded);
} else if (seqlenq_ngroups_swapped) {
out = out.reshape({batch_size, num_heads, seqlen_q, head_size_og}).transpose(1, 2);
}
} else {
out = torch::empty_like(q_padded);
}
@@ -1580,7 +1586,7 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.doc() = "FlashAttention";
m.def("fwd", &mha_fwd, "Forward pass");
m.def("varlen_fwd", &mha_varlen_fwd, "Forward pass (variable length)");
m.def("bwd", &mha_bwd, "Backward pass");
m.def("varlen_bwd", &mha_varlen_bwd, "Backward pass (variable length)");
// m.def("bwd", &mha_bwd, "Backward pass");
// m.def("varlen_bwd", &mha_varlen_bwd, "Backward pass (variable length)");
m.def("fwd_kvcache", &mha_fwd_kvcache, "Forward pass, with KV-cache");
}
// Copyright (c) 2024, Tri Dao.
// Splitting the different head dimensions to different files to speed up compilation.
// This file is auto-generated. See "generate_kernels.py"
#include "flash_bwd_launch_template.h"
template<>
void run_mha_bwd_<cutlass::bfloat16_t, 128, true>(Flash_bwd_params &params, cudaStream_t stream) {
run_mha_bwd_hdim128<cutlass::bfloat16_t, true>(params, stream);
}
// Copyright (c) 2024, Tri Dao.
// Splitting the different head dimensions to different files to speed up compilation.
// This file is auto-generated. See "generate_kernels.py"
#include "flash_bwd_launch_template.h"
template<>
void run_mha_bwd_<cutlass::bfloat16_t, 128, false>(Flash_bwd_params &params, cudaStream_t stream) {
run_mha_bwd_hdim128<cutlass::bfloat16_t, false>(params, stream);
}
// Copyright (c) 2024, Tri Dao.
// Splitting the different head dimensions to different files to speed up compilation.
// This file is auto-generated. See "generate_kernels.py"
#include "flash_bwd_launch_template.h"
template<>
void run_mha_bwd_<cutlass::half_t, 128, true>(Flash_bwd_params &params, cudaStream_t stream) {
run_mha_bwd_hdim128<cutlass::half_t, true>(params, stream);
}
// Copyright (c) 2024, Tri Dao.
// Splitting the different head dimensions to different files to speed up compilation.
// This file is auto-generated. See "generate_kernels.py"
#include "flash_bwd_launch_template.h"
template<>
void run_mha_bwd_<cutlass::half_t, 128, false>(Flash_bwd_params &params, cudaStream_t stream) {
run_mha_bwd_hdim128<cutlass::half_t, false>(params, stream);
}
// Copyright (c) 2024, Tri Dao.
// Splitting the different head dimensions to different files to speed up compilation.
// This file is auto-generated. See "generate_kernels.py"
#include "flash_bwd_launch_template.h"
template<>
void run_mha_bwd_<cutlass::bfloat16_t, 160, true>(Flash_bwd_params &params, cudaStream_t stream) {
run_mha_bwd_hdim160<cutlass::bfloat16_t, true>(params, stream);
}
// Copyright (c) 2024, Tri Dao.
// Splitting the different head dimensions to different files to speed up compilation.
// This file is auto-generated. See "generate_kernels.py"
#include "flash_bwd_launch_template.h"
template<>
void run_mha_bwd_<cutlass::bfloat16_t, 160, false>(Flash_bwd_params &params, cudaStream_t stream) {
run_mha_bwd_hdim160<cutlass::bfloat16_t, false>(params, stream);
}
// Copyright (c) 2024, Tri Dao.
// Splitting the different head dimensions to different files to speed up compilation.
// This file is auto-generated. See "generate_kernels.py"
#include "flash_bwd_launch_template.h"
template<>
void run_mha_bwd_<cutlass::half_t, 160, true>(Flash_bwd_params &params, cudaStream_t stream) {
run_mha_bwd_hdim160<cutlass::half_t, true>(params, stream);
}
// Copyright (c) 2024, Tri Dao.
// Splitting the different head dimensions to different files to speed up compilation.
// This file is auto-generated. See "generate_kernels.py"
#include "flash_bwd_launch_template.h"
template<>
void run_mha_bwd_<cutlass::half_t, 160, false>(Flash_bwd_params &params, cudaStream_t stream) {
run_mha_bwd_hdim160<cutlass::half_t, false>(params, stream);
}
// Copyright (c) 2024, Tri Dao.
// Splitting the different head dimensions to different files to speed up compilation.
// This file is auto-generated. See "generate_kernels.py"
#include "flash_bwd_launch_template.h"
template<>
void run_mha_bwd_<cutlass::bfloat16_t, 192, true>(Flash_bwd_params &params, cudaStream_t stream) {
run_mha_bwd_hdim192<cutlass::bfloat16_t, true>(params, stream);
}
// Copyright (c) 2024, Tri Dao.
// Splitting the different head dimensions to different files to speed up compilation.
// This file is auto-generated. See "generate_kernels.py"
#include "flash_bwd_launch_template.h"
template<>
void run_mha_bwd_<cutlass::bfloat16_t, 192, false>(Flash_bwd_params &params, cudaStream_t stream) {
run_mha_bwd_hdim192<cutlass::bfloat16_t, false>(params, stream);
}
// Copyright (c) 2024, Tri Dao.
// Splitting the different head dimensions to different files to speed up compilation.
// This file is auto-generated. See "generate_kernels.py"
#include "flash_bwd_launch_template.h"
template<>
void run_mha_bwd_<cutlass::half_t, 192, true>(Flash_bwd_params &params, cudaStream_t stream) {
run_mha_bwd_hdim192<cutlass::half_t, true>(params, stream);
}
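Each of the auto-generated launcher files above differs only in the data type, head dimension, and is_causal flag. Below is a minimal sketch of the kind of generator that could emit them, assuming the output file-name pattern and parameter grid; the repository's actual generate_kernels.py is the authoritative version.

#!/bin/bash
# Hedged sketch: regenerate the per-(dtype, headdim, causal) backward launchers shown above.
# The output file names are an assumption; see generate_kernels.py for the real logic.
for dtype in bf16 fp16; do
  cpp_dtype=$([ "$dtype" == "bf16" ] && echo "cutlass::bfloat16_t" || echo "cutlass::half_t")
  for hdim in 128 160 192; do
    for causal in true false; do
      suffix=$([ "$causal" == "true" ] && echo "_causal" || echo "")
      cat > "flash_bwd_hdim${hdim}_${dtype}${suffix}_sm80.cu" <<EOF
// Copyright (c) 2024, Tri Dao.
// Splitting the different head dimensions to different files to speed up compilation.
// This file is auto-generated. See "generate_kernels.py"
#include "flash_bwd_launch_template.h"
template<>
void run_mha_bwd_<${cpp_dtype}, ${hdim}, ${causal}>(Flash_bwd_params &params, cudaStream_t stream) {
    run_mha_bwd_hdim${hdim}<${cpp_dtype}, ${causal}>(params, stream);
}
EOF
    done
  done
done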