Use generate_kernels.py script from Driss Guessous

7a983df7 · Tri Dao · c3f2a632 · 7a983df7 · 7a983df7 · 7a983df7
Commit 7a983df7 authored Aug 28, 2023 by Tri Dao
13 changed files
--- a/csrc/flash_attn/src/flash_fwd_hdim192_bf16_sm80.cu
+++ b/csrc/flash_attn/src/flash_fwd_hdim192_bf16_sm80.cu
 // Copyright (c) 2023, Tri Dao.
-
 // Splitting the different head dimensions to different files to speed up compilation.
+// This file is auto-generated. See "generate_kernels.py"

 #include "flash_fwd_launch_template.h"

-// template<>
-// void run_mha_fwd_<cutlass::bfloat16_t, 192>(Flash_fwd_params &params, cudaStream_t stream) {
-//     using elem_type = cutlass::bfloat16_t;
-//     BOOL_SWITCH(params.p_dropout < 1.f, Is_dropout, [&] {
-//         run_flash_fwd<Flash_fwd_kernel_traits<192, 64, 64, 4, false, false, elem_type>, Is_dropout>(params, stream);
-//     });
-// }
-template<> void run_mha_fwd_<cutlass::bfloat16_t, 192>(Flash_fwd_params &params, cudaStream_t stream) {
+template<>
+void run_mha_fwd_<cutlass::bfloat16_t, 192>(Flash_fwd_params &params, cudaStream_t stream) {
    run_mha_fwd_hdim192<cutlass::bfloat16_t>(params, stream);
 }
--- a/csrc/flash_attn/src/flash_fwd_hdim192_fp16_sm80.cu
+++ b/csrc/flash_attn/src/flash_fwd_hdim192_fp16_sm80.cu
 // Copyright (c) 2023, Tri Dao.
-
 // Splitting the different head dimensions to different files to speed up compilation.
+// This file is auto-generated. See "generate_kernels.py"

 #include "flash_fwd_launch_template.h"

-// template<>
-// void run_mha_fwd_<cutlass::half_t, 192>(Flash_fwd_params &params, cudaStream_t stream) {
-//     using elem_type = cutlass::half_t;
-//     BOOL_SWITCH(params.p_dropout < 1.f, Is_dropout, [&] {
-//         run_flash_fwd<Flash_fwd_kernel_traits<192, 64, 64, 4, false, false, elem_type>, Is_dropout>(params, stream);
-//         run_flash_fwd<Flash_fwd_kernel_traits<192, 128, 32, 4, false, false, elem_type>, Is_dropout>(params, stream);
-//         run_flash_fwd<Flash_fwd_kernel_traits<192, 64, 32, 4, false, false, elem_type>, Is_dropout>(params, stream);
-//         // This one is slightly faster for causal?
-//         // run_flash_fwd<Flash_fwd_kernel_traits<192, 128, 64, 8, false, elem_type>>(params, stream);
-//         // run_flash_fwd<Flash_fwd_kernel_traits<192, 128, 32, 4, false, elem_type>>(params, stream);
-//         // run_flash_fwd<Flash_fwd_kernel_traits<192, 128, 64, 4, false, elem_type>>(params, stream);
-//         // run_flash_fwd<Flash_fwd_kernel_traits<192, 64, 128, 4, false, elem_type>>(params, stream);
-//         // run_flash_fwd<Flash_fwd_kernel_traits<192, 128, 128, 8, false, elem_type>>(params, stream);
-//     });
-//     // For A100 H100, 1st is faster with dropout, 3rd is faster without dropout
-//     // For A6000, 1st is faster when causal, 3rd is faster when not causal
-// }
 template<>
 void run_mha_fwd_<cutlass::half_t, 192>(Flash_fwd_params &params, cudaStream_t stream) {
    run_mha_fwd_hdim192<cutlass::half_t>(params, stream);
-}
\ No newline at end of file
+}
--- a/csrc/flash_attn/src/flash_fwd_hdim224_bf16_sm80.cu
+++ b/csrc/flash_attn/src/flash_fwd_hdim224_bf16_sm80.cu
 // Copyright (c) 2023, Tri Dao.
-
 // Splitting the different head dimensions to different files to speed up compilation.
+// This file is auto-generated. See "generate_kernels.py"

 #include "flash_fwd_launch_template.h"

-template<> void run_mha_fwd_<cutlass::bfloat16_t, 224>(Flash_fwd_params &params, cudaStream_t stream) {
+template<>
+void run_mha_fwd_<cutlass::bfloat16_t, 224>(Flash_fwd_params &params, cudaStream_t stream) {
    run_mha_fwd_hdim224<cutlass::bfloat16_t>(params, stream);
 }
--- a/csrc/flash_attn/src/flash_fwd_hdim224_fp16_sm80.cu
+++ b/csrc/flash_attn/src/flash_fwd_hdim224_fp16_sm80.cu
 // Copyright (c) 2023, Tri Dao.
-
 // Splitting the different head dimensions to different files to speed up compilation.
+// This file is auto-generated. See "generate_kernels.py"

 #include "flash_fwd_launch_template.h"

-template<> void run_mha_fwd_<cutlass::half_t, 224>(Flash_fwd_params &params, cudaStream_t stream) {
+template<>
+void run_mha_fwd_<cutlass::half_t, 224>(Flash_fwd_params &params, cudaStream_t stream) {
    run_mha_fwd_hdim224<cutlass::half_t>(params, stream);
 }
--- a/csrc/flash_attn/src/flash_fwd_hdim256_bf16_sm80.cu
+++ b/csrc/flash_attn/src/flash_fwd_hdim256_bf16_sm80.cu
 // Copyright (c) 2023, Tri Dao.
-
 // Splitting the different head dimensions to different files to speed up compilation.
+// This file is auto-generated. See "generate_kernels.py"

 #include "flash_fwd_launch_template.h"

-template<> void run_mha_fwd_<cutlass::bfloat16_t, 256>(Flash_fwd_params &params, cudaStream_t stream) {
+template<>
+void run_mha_fwd_<cutlass::bfloat16_t, 256>(Flash_fwd_params &params, cudaStream_t stream) {
    run_mha_fwd_hdim256<cutlass::bfloat16_t>(params, stream);
 }
--- a/csrc/flash_attn/src/flash_fwd_hdim256_fp16_sm80.cu
+++ b/csrc/flash_attn/src/flash_fwd_hdim256_fp16_sm80.cu
 // Copyright (c) 2023, Tri Dao.
-
 // Splitting the different head dimensions to different files to speed up compilation.
+// This file is auto-generated. See "generate_kernels.py"

 #include "flash_fwd_launch_template.h"

-template<> void run_mha_fwd_<cutlass::half_t, 256>(Flash_fwd_params &params, cudaStream_t stream) {
+template<>
+void run_mha_fwd_<cutlass::half_t, 256>(Flash_fwd_params &params, cudaStream_t stream) {
    run_mha_fwd_hdim256<cutlass::half_t>(params, stream);
 }
--- a/csrc/flash_attn/src/flash_fwd_hdim32_bf16_sm80.cu
+++ b/csrc/flash_attn/src/flash_fwd_hdim32_bf16_sm80.cu
 // Copyright (c) 2023, Tri Dao.
-
 // Splitting the different head dimensions to different files to speed up compilation.
+// This file is auto-generated. See "generate_kernels.py"

 #include "flash_fwd_launch_template.h"

 template<>
 void run_mha_fwd_<cutlass::bfloat16_t, 32>(Flash_fwd_params &params, cudaStream_t stream) {
    run_mha_fwd_hdim32<cutlass::bfloat16_t>(params, stream);
-}
\ No newline at end of file
+}
--- a/csrc/flash_attn/src/flash_fwd_hdim32_fp16_sm80.cu
+++ b/csrc/flash_attn/src/flash_fwd_hdim32_fp16_sm80.cu
 // Copyright (c) 2023, Tri Dao.
-
 // Splitting the different head dimensions to different files to speed up compilation.
+// This file is auto-generated. See "generate_kernels.py"

 #include "flash_fwd_launch_template.h"

-// template<>
-// void run_mha_fwd_<cutlass::half_t, 32>(Flash_fwd_params &params, cudaStream_t stream) {
-//     using elem_type = cutlass::half_t;
-//     BOOL_SWITCH(params.p_dropout < 1.f, Is_dropout, [&] {
-//         run_flash_fwd<Flash_fwd_kernel_traits<32, 128, 128, 4, false, false, elem_type>, Is_dropout>(params, stream);
-//         // For dropout there might be a lot of register spilling?
-//         // These two are very slow due to register spilling
-//         // run_flash_fwd<Flash_fwd_kernel_traits<32, 256, 128, 4, false, elem_type>>(params, stream);
-//         // run_flash_fwd<Flash_fwd_kernel_traits<32, 128, 256, 4, false, elem_type>>(params, stream);
-//         // This one is slightly slower
-//         // run_flash_fwd<Flash_fwd_kernel_traits<32, 256, 64, 4, false, elem_type>>(params, stream);
-//     });
-// }
 template<>
 void run_mha_fwd_<cutlass::half_t, 32>(Flash_fwd_params &params, cudaStream_t stream) {
    run_mha_fwd_hdim32<cutlass::half_t>(params, stream);
-}
\ No newline at end of file
+}
--- a/csrc/flash_attn/src/flash_fwd_hdim64_bf16_sm80.cu
+++ b/csrc/flash_attn/src/flash_fwd_hdim64_bf16_sm80.cu
 // Copyright (c) 2023, Tri Dao.
-
 // Splitting the different head dimensions to different files to speed up compilation.
+// This file is auto-generated. See "generate_kernels.py"

 #include "flash_fwd_launch_template.h"

-// template<>
-// void run_mha_fwd_<cutlass::bfloat16_t, 64>(Flash_fwd_params &params, cudaStream_t stream) {
-//     using elem_type = cutlass::bfloat16_t;
-//     if (params.p_dropout == 1.f) {
-//         run_flash_fwd<Flash_fwd_kernel_traits<64, 128, 64, 4, true, false, elem_type>, false>(params, stream);
-//     } else {
-//         run_flash_fwd<Flash_fwd_kernel_traits<64, 128, 64, 4, false, false, elem_type>, true>(params, stream);
-//     }
-// }
 template<>
 void run_mha_fwd_<cutlass::bfloat16_t, 64>(Flash_fwd_params &params, cudaStream_t stream) {
    run_mha_fwd_hdim64<cutlass::bfloat16_t>(params, stream);
-}
\ No newline at end of file
+}
--- a/csrc/flash_attn/src/flash_fwd_hdim64_fp16_sm80.cu
+++ b/csrc/flash_attn/src/flash_fwd_hdim64_fp16_sm80.cu
 // Copyright (c) 2023, Tri Dao.
-
 // Splitting the different head dimensions to different files to speed up compilation.
+// This file is auto-generated. See "generate_kernels.py"

 #include "flash_fwd_launch_template.h"

-// template<>
-// void run_mha_fwd_<cutlass::half_t, 64>(Flash_fwd_params &params, cudaStream_t stream) {
-//     using elem_type = cutlass::half_t;
-//     if (params.p_dropout == 1.f) {
-//         // Using 8 warps is 18% slower for seqlen=2k, 2 warps is 5% slower
-//         // Using block size (64 x 256) is 27% slower for seqlen=2k
-//         // Using block size (256 x 64) is 85% slower for seqlen=2k, because of register spilling
-//         run_flash_fwd<Flash_fwd_kernel_traits<64, 128, 128, 4, false, false, elem_type>, false>(params, stream);
-//         run_flash_fwd<Flash_fwd_kernel_traits<64, 128, 64, 4, true, false, elem_type>, false>(params, stream);
-//         run_flash_fwd<Flash_fwd_kernel_traits<64, 128, 64, 4, true, true, elem_type>, false>(params, stream);
-//     } else {
-//         run_flash_fwd<Flash_fwd_kernel_traits<64, 128, 64, 4, false, false, elem_type>, true>(params, stream);
-//         run_flash_fwd<Flash_fwd_kernel_traits<64, 128, 64, 4, true, true, elem_type>, true>(params, stream);
-//         run_flash_fwd<Flash_fwd_kernel_traits<64, 128, 64, 4, true, false, elem_type>, true>(params, stream);
-//     }
-// }
 template<>
 void run_mha_fwd_<cutlass::half_t, 64>(Flash_fwd_params &params, cudaStream_t stream) {
    run_mha_fwd_hdim64<cutlass::half_t>(params, stream);
-}
\ No newline at end of file
+}
--- a/csrc/flash_attn/src/flash_fwd_hdim96_bf16_sm80.cu
+++ b/csrc/flash_attn/src/flash_fwd_hdim96_bf16_sm80.cu
 // Copyright (c) 2023, Tri Dao.
-
 // Splitting the different head dimensions to different files to speed up compilation.
+// This file is auto-generated. See "generate_kernels.py"

 #include "flash_fwd_launch_template.h"

-// template<>
-// void run_mha_fwd_<cutlass::bfloat16_t, 96>(Flash_fwd_params &params, cudaStream_t stream) {
-//     using elem_type = cutlass::bfloat16_t;
-//     BOOL_SWITCH(params.p_dropout < 1.f, Is_dropout, [&] {
-//         run_flash_fwd<Flash_fwd_kernel_traits<96, 128, 64, 4, true, false, elem_type>, Is_dropout>(params, stream);
-//     });
-// }
 template<>
 void run_mha_fwd_<cutlass::bfloat16_t, 96>(Flash_fwd_params &params, cudaStream_t stream) {
    run_mha_fwd_hdim96<cutlass::bfloat16_t>(params, stream);
-}
\ No newline at end of file
+}
--- a/csrc/flash_attn/src/flash_fwd_hdim96_fp16_sm80.cu
+++ b/csrc/flash_attn/src/flash_fwd_hdim96_fp16_sm80.cu
 // Copyright (c) 2023, Tri Dao.
-
 // Splitting the different head dimensions to different files to speed up compilation.
+// This file is auto-generated. See "generate_kernels.py"

 #include "flash_fwd_launch_template.h"

-// template<>
-// void run_mha_fwd_<cutlass::half_t, 96>(Flash_fwd_params &params, cudaStream_t stream) {
-//     using elem_type = cutlass::half_t;
-//     BOOL_SWITCH(params.p_dropout < 1.f, Is_dropout, [&] {
-//         run_flash_fwd<Flash_fwd_kernel_traits<96, 128, 64, 4, true, false, elem_type>, Is_dropout>(params, stream);
-//         run_flash_fwd<Flash_fwd_kernel_traits<96, 128, 64, 4, true, true, elem_type>, Is_dropout>(params, stream);
-//         // This 3rd one is good for H100, and A100, A6000
-//         run_flash_fwd<Flash_fwd_kernel_traits<96, 128, 64, 4, false, false, elem_type>, Is_dropout>(params, stream);
-//         run_flash_fwd<Flash_fwd_kernel_traits<96, 128, 64, 4, false, true, elem_type>, Is_dropout>(params, stream);
-//         // These two are always slower
-//         // run_flash_fwd<Flash_fwd_kernel_traits<96, 128, 128, 4, true, elem_type>>(params, stream);
-//         // run_flash_fwd<Flash_fwd_kernel_traits<96, 64, 128, 4, true, elem_type>>(params, stream);
-//     });
-// }
-template<> void run_mha_fwd_<cutlass::half_t, 96>(Flash_fwd_params &params, cudaStream_t stream) {
+template<>
+void run_mha_fwd_<cutlass::half_t, 96>(Flash_fwd_params &params, cudaStream_t stream) {
    run_mha_fwd_hdim96<cutlass::half_t>(params, stream);
-}
\ No newline at end of file
+}
--- a/csrc/flash_attn/src/generate_kernels.py
+++ b/csrc/flash_attn/src/generate_kernels.py
+# Copied from Driss Guessous's PR in PyTorch: https://github.com/pytorch/pytorch/pull/105602
+
+# This file is run to generate the kernel instantiations for the flash_attn kernels
+# They are written to several files in order to speed up compilation
+
+import argparse
+import itertools
+from dataclasses import dataclass
+from pathlib import Path
+from typing import List, Optional
+
+# Maps the short dtype tag used in generated file names to the C++ element
+# type that is substituted for {DTYPE} in the templates below.
+DTYPE_MAP = {
+    "fp16": "cutlass::half_t",
+    "bf16": "cutlass::bfloat16_t",
+}
+
+# SM values to generate files for; each value is interpolated into the output
+# file name as "sm<value>" (see Kernel.filename).
+# NOTE(review): the original trailing comment here ("Sm80 kernels support up
+# to") was cut off mid-sentence -- confirm the intended wording with upstream.
+SM = [80]
+# Head dimensions for which one specialization file per dtype/direction is written.
+HEAD_DIMENSIONS = [32, 64, 96, 128, 160, 192, 224, 256]
+# Source text of a forward specialization. {DTYPE} and {HEAD_DIM} are filled in
+# with str.format; the doubled braces are format escapes for literal C++ braces.
+KERNEL_IMPL_TEMPLATE_FWD = """
+template<>
+void run_mha_fwd_<{DTYPE}, {HEAD_DIM}>(Flash_fwd_params &params, cudaStream_t stream) {{
+    run_mha_fwd_hdim{HEAD_DIM}<{DTYPE}>(params, stream);
+}}
+"""
+
+# Source text of a backward specialization; same placeholders as the forward
+# template, with an extra `configure` argument forwarded to the launcher.
+KERNEL_IMPL_TEMPLATE_BWD = """
+template<>
+void run_mha_bwd_<{DTYPE}, {HEAD_DIM}>(Flash_bwd_params &params, cudaStream_t stream, const bool configure) {{
+    run_mha_bwd_hdim{HEAD_DIM}<{DTYPE}>(params, stream, configure);
+}}
+"""
+
+
+@dataclass
+class Kernel:
+    sm: int
+    dtype: str
+    head_dim: int
+    direction: str
+
+    @property
+    def template(self) -> str:
+        if self.direction == "fwd":
+            return KERNEL_IMPL_TEMPLATE_FWD.format(
+                DTYPE=DTYPE_MAP[self.dtype], HEAD_DIM=self.head_dim
+            )
+        else:
+            return KERNEL_IMPL_TEMPLATE_BWD.format(
+                DTYPE=DTYPE_MAP[self.dtype], HEAD_DIM=self.head_dim
+            )
+
+    @property
+    def filename(self) -> str:
+        return f"flash_{self.direction}_hdim{self.head_dim}_{self.dtype}_sm{self.sm}.cu"
+
+
+def get_all_kernels() -> List[Kernel]:
+    for dtype, head_dim, sm in itertools.product(DTYPE_MAP.keys(), HEAD_DIMENSIONS, SM):
+        for direction in ["fwd", "bwd"]:
+            yield Kernel(sm=sm, dtype=dtype, head_dim=head_dim, direction=direction)
+
+
+def write_kernel(kernel: Kernel, autogen_dir: Path) -> None:
+    prelude = """// Copyright (c) 2023, Tri Dao.
+// Splitting the different head dimensions to different files to speed up compilation.
+// This file is auto-generated. See "generate_kernels.py"\n
+"""
+    include = f'#include "flash_{kernel.direction}_launch_template.h"\n'
+    (autogen_dir / kernel.filename).write_text(prelude + include + kernel.template)
+
+
+def main(output_dir: Optional[str]) -> None:
+    if output_dir is None:
+        output_dir = Path(__file__).parent
+    else:
+        output_dir = Path(output_dir)
+
+    for kernel in get_all_kernels():
+        write_kernel(kernel, output_dir)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        prog="generate_kernels",
+        description="Generate the flash_attention kernels template instantiations",
+    )
+    # Set an optional output directory
+    parser.add_argument(
+        "-o",
+        "--output_dir",
+        required=False,
+        help="Where to generate the kernels "
+        " will default to the current directory ",
+    )
+    args = parser.parse_args()
+    main(args.output_dir)