Commit 26f4b5fb authored by Woosuk Kwon

Merge branch 'main' into Dao-AILab/main

parents 5018ac6a 12375706
Pipeline #2015 failed
// Copyright (c) 2024, Tri Dao.
// Splitting the different head dimensions to different files to speed up compilation.
// This file is auto-generated. See "generate_kernels.py"
#include "flash_bwd_launch_template.h"
template<>
void run_mha_bwd_<cutlass::half_t, 192, false>(Flash_bwd_params &params, cudaStream_t stream) {
run_mha_bwd_hdim192<cutlass::half_t, false>(params, stream);
}
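Every instantiation file in this diff follows the pattern of the one above: a copyright header, one `#include`, and a single explicit specialization of `run_mha_bwd_` for one (dtype, head dim, is_causal) combination, delegating to the matching `run_mha_bwd_hdim*` launcher. The generator itself, `generate_kernels.py`, is not part of this diff, so the sketch below is a hypothetical reconstruction of how such a script might work: the names `DTYPE_MAP`, `HEAD_DIMENSIONS`, `IS_CAUSAL`, the output filename pattern, and the full head-dimension list (128 and 160 are not visible in the collapsed diff) are all assumptions inferred from the files shown, not the script's actual contents.

```python
# Hypothetical sketch of a kernel-instantiation generator in the spirit of
# generate_kernels.py. Names and the filename scheme are assumptions inferred
# from the generated files above, not the real script.
import itertools
from pathlib import Path

DTYPE_MAP = {"fp16": "cutlass::half_t", "bf16": "cutlass::bfloat16_t"}
HEAD_DIMENSIONS = [32, 64, 96, 128, 160, 192, 224, 256]  # 128/160 assumed
IS_CAUSAL = ["false", "true"]

# One translation unit per combination; double braces escape C++ braces.
KERNEL_IMPL_TEMPLATE = """\
// Copyright (c) 2024, Tri Dao.
// Splitting the different head dimensions to different files to speed up compilation.
// This file is auto-generated. See "generate_kernels.py"
#include "flash_bwd_launch_template.h"
template<>
void run_mha_bwd_<{cpp_dtype}, {head_dim}, {is_causal}>(Flash_bwd_params &params, cudaStream_t stream) {{
run_mha_bwd_hdim{head_dim}<{cpp_dtype}, {is_causal}>(params, stream);
}}
"""

def main() -> None:
    out_dir = Path(".")
    for (name, cpp_dtype), head_dim, is_causal in itertools.product(
        DTYPE_MAP.items(), HEAD_DIMENSIONS, IS_CAUSAL
    ):
        causal_tag = "_causal" if is_causal == "true" else ""
        filename = out_dir / f"flash_bwd_hdim{head_dim}_{name}{causal_tag}_sm80.cu"
        filename.write_text(KERNEL_IMPL_TEMPLATE.format(
            cpp_dtype=cpp_dtype, head_dim=head_dim, is_causal=is_causal
        ))

if __name__ == "__main__":
    main()
```

Emitting each (dtype, head dim, causal) combination as its own translation unit lets the build compile the heavy template instantiations in parallel and recompile only the files whose combination changed, which is exactly the rationale stated in each file's header comment.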
// Copyright (c) 2024, Tri Dao.
// Splitting the different head dimensions to different files to speed up compilation.
// This file is auto-generated. See "generate_kernels.py"
#include "flash_bwd_launch_template.h"
template<>
void run_mha_bwd_<cutlass::bfloat16_t, 224, true>(Flash_bwd_params &params, cudaStream_t stream) {
run_mha_bwd_hdim224<cutlass::bfloat16_t, true>(params, stream);
}
// Copyright (c) 2024, Tri Dao.
// Splitting the different head dimensions to different files to speed up compilation.
// This file is auto-generated. See "generate_kernels.py"
#include "flash_bwd_launch_template.h"
template<>
void run_mha_bwd_<cutlass::bfloat16_t, 224, false>(Flash_bwd_params &params, cudaStream_t stream) {
run_mha_bwd_hdim224<cutlass::bfloat16_t, false>(params, stream);
}
// Copyright (c) 2024, Tri Dao.
// Splitting the different head dimensions to different files to speed up compilation.
// This file is auto-generated. See "generate_kernels.py"
#include "flash_bwd_launch_template.h"
template<>
void run_mha_bwd_<cutlass::half_t, 224, true>(Flash_bwd_params &params, cudaStream_t stream) {
run_mha_bwd_hdim224<cutlass::half_t, true>(params, stream);
}
// Copyright (c) 2024, Tri Dao.
// Splitting the different head dimensions to different files to speed up compilation.
// This file is auto-generated. See "generate_kernels.py"
#include "flash_bwd_launch_template.h"
template<>
void run_mha_bwd_<cutlass::half_t, 224, false>(Flash_bwd_params &params, cudaStream_t stream) {
run_mha_bwd_hdim224<cutlass::half_t, false>(params, stream);
}
// Copyright (c) 2024, Tri Dao.
// Splitting the different head dimensions to different files to speed up compilation.
// This file is auto-generated. See "generate_kernels.py"
#include "flash_bwd_launch_template.h"
template<>
void run_mha_bwd_<cutlass::bfloat16_t, 256, true>(Flash_bwd_params &params, cudaStream_t stream) {
run_mha_bwd_hdim256<cutlass::bfloat16_t, true>(params, stream);
}
// Copyright (c) 2024, Tri Dao.
// Splitting the different head dimensions to different files to speed up compilation.
// This file is auto-generated. See "generate_kernels.py"
#include "flash_bwd_launch_template.h"
template<>
void run_mha_bwd_<cutlass::bfloat16_t, 256, false>(Flash_bwd_params &params, cudaStream_t stream) {
run_mha_bwd_hdim256<cutlass::bfloat16_t, false>(params, stream);
}
// Copyright (c) 2024, Tri Dao.
// Splitting the different head dimensions to different files to speed up compilation.
// This file is auto-generated. See "generate_kernels.py"
#include "flash_bwd_launch_template.h"
template<>
void run_mha_bwd_<cutlass::half_t, 256, true>(Flash_bwd_params &params, cudaStream_t stream) {
run_mha_bwd_hdim256<cutlass::half_t, true>(params, stream);
}
// Copyright (c) 2024, Tri Dao.
// Splitting the different head dimensions to different files to speed up compilation.
// This file is auto-generated. See "generate_kernels.py"
#include "flash_bwd_launch_template.h"
template<>
void run_mha_bwd_<cutlass::half_t, 256, false>(Flash_bwd_params &params, cudaStream_t stream) {
run_mha_bwd_hdim256<cutlass::half_t, false>(params, stream);
}
// Copyright (c) 2024, Tri Dao.
// Splitting the different head dimensions to different files to speed up compilation.
// This file is auto-generated. See "generate_kernels.py"
#include "flash_bwd_launch_template.h"
template<>
void run_mha_bwd_<cutlass::bfloat16_t, 32, true>(Flash_bwd_params &params, cudaStream_t stream) {
run_mha_bwd_hdim32<cutlass::bfloat16_t, true>(params, stream);
}
// Copyright (c) 2024, Tri Dao.
// Splitting the different head dimensions to different files to speed up compilation.
// This file is auto-generated. See "generate_kernels.py"
#include "flash_bwd_launch_template.h"
template<>
void run_mha_bwd_<cutlass::bfloat16_t, 32, false>(Flash_bwd_params &params, cudaStream_t stream) {
run_mha_bwd_hdim32<cutlass::bfloat16_t, false>(params, stream);
}
// Copyright (c) 2024, Tri Dao.
// Splitting the different head dimensions to different files to speed up compilation.
// This file is auto-generated. See "generate_kernels.py"
#include "flash_bwd_launch_template.h"
template<>
void run_mha_bwd_<cutlass::half_t, 32, true>(Flash_bwd_params &params, cudaStream_t stream) {
run_mha_bwd_hdim32<cutlass::half_t, true>(params, stream);
}
// Copyright (c) 2024, Tri Dao.
// Splitting the different head dimensions to different files to speed up compilation.
// This file is auto-generated. See "generate_kernels.py"
#include "flash_bwd_launch_template.h"
template<>
void run_mha_bwd_<cutlass::half_t, 32, false>(Flash_bwd_params &params, cudaStream_t stream) {
run_mha_bwd_hdim32<cutlass::half_t, false>(params, stream);
}
// Copyright (c) 2024, Tri Dao.
// Splitting the different head dimensions to different files to speed up compilation.
// This file is auto-generated. See "generate_kernels.py"
#include "flash_bwd_launch_template.h"
template<>
void run_mha_bwd_<cutlass::bfloat16_t, 64, true>(Flash_bwd_params &params, cudaStream_t stream) {
run_mha_bwd_hdim64<cutlass::bfloat16_t, true>(params, stream);
}
// Copyright (c) 2024, Tri Dao.
// Splitting the different head dimensions to different files to speed up compilation.
// This file is auto-generated. See "generate_kernels.py"
#include "flash_bwd_launch_template.h"
template<>
void run_mha_bwd_<cutlass::bfloat16_t, 64, false>(Flash_bwd_params &params, cudaStream_t stream) {
run_mha_bwd_hdim64<cutlass::bfloat16_t, false>(params, stream);
}
// Copyright (c) 2024, Tri Dao.
// Splitting the different head dimensions to different files to speed up compilation.
// This file is auto-generated. See "generate_kernels.py"
#include "flash_bwd_launch_template.h"
template<>
void run_mha_bwd_<cutlass::half_t, 64, true>(Flash_bwd_params &params, cudaStream_t stream) {
run_mha_bwd_hdim64<cutlass::half_t, true>(params, stream);
}
// Copyright (c) 2024, Tri Dao.
// Splitting the different head dimensions to different files to speed up compilation.
// This file is auto-generated. See "generate_kernels.py"
#include "flash_bwd_launch_template.h"
template<>
void run_mha_bwd_<cutlass::half_t, 64, false>(Flash_bwd_params &params, cudaStream_t stream) {
run_mha_bwd_hdim64<cutlass::half_t, false>(params, stream);
}
// Copyright (c) 2024, Tri Dao.
// Splitting the different head dimensions to different files to speed up compilation.
// This file is auto-generated. See "generate_kernels.py"
#include "flash_bwd_launch_template.h"
template<>
void run_mha_bwd_<cutlass::bfloat16_t, 96, true>(Flash_bwd_params &params, cudaStream_t stream) {
run_mha_bwd_hdim96<cutlass::bfloat16_t, true>(params, stream);
}
// Copyright (c) 2024, Tri Dao.
// Splitting the different head dimensions to different files to speed up compilation.
// This file is auto-generated. See "generate_kernels.py"
#include "flash_bwd_launch_template.h"
template<>
void run_mha_bwd_<cutlass::bfloat16_t, 96, false>(Flash_bwd_params &params, cudaStream_t stream) {
run_mha_bwd_hdim96<cutlass::bfloat16_t, false>(params, stream);
}
// Copyright (c) 2024, Tri Dao.
// Splitting the different head dimensions to different files to speed up compilation.
// This file is auto-generated. See "generate_kernels.py"
#include "flash_bwd_launch_template.h"
template<>
void run_mha_bwd_<cutlass::half_t, 96, true>(Flash_bwd_params &params, cudaStream_t stream) {
run_mha_bwd_hdim96<cutlass::half_t, true>(params, stream);
}
// Copyright (c) 2024, Tri Dao.
// Splitting the different head dimensions to different files to speed up compilation.
// This file is auto-generated. See "generate_kernels.py"
#include "flash_bwd_launch_template.h"
template<>
void run_mha_bwd_<cutlass::half_t, 96, false>(Flash_bwd_params &params, cudaStream_t stream) {
run_mha_bwd_hdim96<cutlass::half_t, false>(params, stream);
}