first commit

34e67b1e · zhangshao · 34e67b1e · 34e67b1e · 34e67b1e · 34e67b1e
Commit 34e67b1e authored May 09, 2026 by zhangshao
20 changed files
--- a/csrc/flash_attn/flash_api_sparse.cpp
+++ b/csrc/flash_attn/flash_api_sparse.cpp
--- a/csrc/flash_attn/src/alibi.h
+++ b/csrc/flash_attn/src/alibi.h
+#include <cmath>
+#include <cute/tensor.hpp>
+#include <cutlass/cutlass.h>
+#include <cutlass/array.h>
+#include "utils.h"
+namespace flash {
+using namespace cute;
+////////////////////////////////////////////////////////////////////////////////////////////////////
+template <bool Is_causal>
+struct Alibi {
+    const float alibi_slope;
+    const int max_seqlen_k, max_seqlen_q;
+    __forceinline__ __device__ Alibi(const float alibi_slope, const int max_seqlen_k, const int max_seqlen_q)
+        : alibi_slope(alibi_slope)
+        , max_seqlen_k(max_seqlen_k)
+        , max_seqlen_q(max_seqlen_q) {
+    };
+    template <typename Engine, typename Layout>
+    __forceinline__ __device__ void apply_alibi(Tensor<Engine, Layout> &tensor,
+                                      const int col_idx_offset_,
+                                      const int row_idx_offset,
+                                      const int warp_row_stride) {
+        // tensor has shape (nrow=(2, MMA_M), ncol=(2, MMA_N))
+        static_assert(Layout::rank == 2, "Only support 2D Tensor");
+        const int lane_id = threadIdx.x % 64;
+        const int col_idx_offset = col_idx_offset_ + lane_id / 16;
+        const int stride_between_each_repeat = 16;
+        const int stride_between_each_thread = 4;
+        if constexpr (Is_causal) {  // Simpler, we add the same bias vector to all rows
+            #pragma unroll
+            for (int nj = 0; nj < size<1, 1>(tensor); ++nj) {
+                const int col_idx_base = col_idx_offset + nj * stride_between_each_repeat;
+                #pragma unroll
+                for (int j = 0; j < size<1, 0>(tensor); ++j) {
+                    const int col_idx = col_idx_base + j * stride_between_each_thread;
+                    #pragma unroll
+                    for (int mi = 0; mi < size<0>(tensor); ++mi) {
+                        tensor(mi, make_coord(j, nj)) += alibi_slope * col_idx;
+                    }
+                }
+            }
+        } else {  // Bias depends on both row_idx and col_idx
+            #pragma unroll
+            for (int mi = 0; mi < size<0>(tensor); ++mi) {
+                const int row_idx = row_idx_offset + mi * warp_row_stride;
+                #pragma unroll
+                for (int nj = 0; nj < size<1, 1>(tensor); ++nj) {
+                    const int col_idx_base = col_idx_offset + nj * stride_between_each_repeat;
+                    #pragma unroll
+                    for (int j = 0; j < size<1, 0>(tensor); ++j) {
+                        const int col_idx = col_idx_base + j * stride_between_each_thread;
+                        tensor(mi, make_coord(j, nj)) -= alibi_slope * abs(row_idx + max_seqlen_k - max_seqlen_q - col_idx);
+                    }
+                }
+            }
+        }
+    }
+    template <typename Engine, typename Layout>
+    __forceinline__ __device__ void apply_alibi_continuous(Tensor<Engine, Layout> &tensor,
+                                      const int col_idx_offset_,
+                                      const int row_idx_offset,
+                                      const int warp_row_stride) {
+        // tensor has shape (nrow=(2, MMA_M), ncol=(2, MMA_N))
+        static_assert(Layout::rank == 2, "Only support 2D Tensor");
+        const int lane_id = threadIdx.x % 64;
+        const int col_idx_offset = col_idx_offset_ + (lane_id / 16) * 4;
+        const int stride_between_each_repeat = 16;
+        const int stride_between_each_thread = 1;
+        if constexpr (Is_causal) {  // Simpler, we add the same bias vector to all rows
+            #pragma unroll
+            for (int nj = 0; nj < size<1, 1>(tensor); ++nj) {
+                const int col_idx_base = col_idx_offset + nj * stride_between_each_repeat;
+                #pragma unroll
+                for (int j = 0; j < size<1, 0>(tensor); ++j) {
+                    const int col_idx = col_idx_base + j * stride_between_each_thread;
+                    #pragma unroll
+                    for (int mi = 0; mi < size<0>(tensor); ++mi) {
+                        tensor(mi, make_coord(j, nj)) += alibi_slope * col_idx;
+                    }
+                }
+            }
+        } else {  // Bias depends on both row_idx and col_idx
+            #pragma unroll
+            for (int mi = 0; mi < size<0>(tensor); ++mi) {
+                const int row_idx = row_idx_offset + mi * warp_row_stride;
+                #pragma unroll
+                for (int nj = 0; nj < size<1, 1>(tensor); ++nj) {
+                    const int col_idx_base = col_idx_offset + nj * stride_between_each_repeat;
+                    #pragma unroll
+                    for (int j = 0; j < size<1, 0>(tensor); ++j) {
+                        const int col_idx = col_idx_base + j * stride_between_each_thread;
+                        tensor(mi, make_coord(j, nj)) -= alibi_slope * abs(row_idx + max_seqlen_k - max_seqlen_q - col_idx);
+                    }
+                }
+            }
+        }
+    }
+    template <typename Engine, typename Layout>
+    __forceinline__ __device__ void apply_alibi_trans(Tensor<Engine, Layout> &tensor,
+                                      const int col_idx_offset_,
+                                      const int row_idx_offset,
+                                      const int warp_row_stride) {
+        // tensor has shape (nrow=(2, MMA_M), ncol=(2, MMA_N))
+        static_assert(Layout::rank == 2, "Only support 2D Tensor");
+        const int lane_id = threadIdx.x % 64;
+        const int col_idx_offset = col_idx_offset_ + (lane_id / 16) * 4;
+        const int stride_between_each_repeat = 16;
+        const int stride_between_each_thread = 1;
+        if constexpr (Is_causal) {  // Simpler, we add the same bias vector to all rows
+            #pragma unroll
+            for (int nj = 0; nj < size<1, 1>(tensor); ++nj) {
+                const int col_idx_base = col_idx_offset + nj * stride_between_each_repeat;
+                #pragma unroll
+                for (int j = 0; j < size<1, 0>(tensor); ++j) {
+                    const int col_idx = col_idx_base + j * stride_between_each_thread;
+                    #pragma unroll
+                    for (int mi = 0; mi < size<0>(tensor); ++mi) {
+                        const int row_idx = row_idx_offset + mi * warp_row_stride;
+                        tensor(mi, make_coord(j, nj)) += alibi_slope * row_idx;
+                    }
+                }
+            }
+        } else {  // Bias depends on both row_idx and col_idx
+            #pragma unroll
+            for (int mi = 0; mi < size<0>(tensor); ++mi) {
+                const int row_idx = row_idx_offset + mi * warp_row_stride;
+                #pragma unroll
+                for (int nj = 0; nj < size<1, 1>(tensor); ++nj) {
+                    const int col_idx_base = col_idx_offset + nj * stride_between_each_repeat;
+                    #pragma unroll
+                    for (int j = 0; j < size<1, 0>(tensor); ++j) {
+                        const int col_idx = col_idx_base + j * stride_between_each_thread;
+                        tensor(mi, make_coord(j, nj)) -= alibi_slope * abs(col_idx + max_seqlen_k - max_seqlen_q - row_idx);
+                    }
+                }
+            }
+        }
+    }
+};
+}  // namespace flash
--- a/csrc/flash_attn/src/block_info.h
+++ b/csrc/flash_attn/src/block_info.h
+/******************************************************************************
+ * Copyright (c) 2023, Tri Dao.
+ ******************************************************************************/
+#pragma once
+namespace flash {
+////////////////////////////////////////////////////////////////////////////////////////////////////
+template<bool Varlen=true>
+struct BlockInfo {
+    template<typename Params>
+    __device__ BlockInfo(const Params &params, const int bidb)
+        : sum_s_q(!Varlen || params.cu_seqlens_q == nullptr ? -1 : params.cu_seqlens_q[bidb])
+        , sum_s_k(!Varlen || params.cu_seqlens_k == nullptr || !params.is_seqlens_k_cumulative ? -1 : params.cu_seqlens_k[bidb])
+        , actual_seqlen_q(!Varlen || params.cu_seqlens_q == nullptr ? params.seqlen_q : params.cu_seqlens_q[bidb + 1] - sum_s_q)
+        // If is_seqlens_k_cumulative, then seqlen_k is cu_seqlens_k[bidb + 1] - cu_seqlens_k[bidb].
+        // Otherwise it's cu_seqlens_k[bidb], i.e., we use cu_seqlens_k to store the sequence lengths of K.
+        , leftpad_k(params.leftpad_k == nullptr ? 0 : params.leftpad_k[bidb])
+        , seqlen_k_cache((!Varlen || params.cu_seqlens_k == nullptr ? params.seqlen_k : (params.is_seqlens_k_cumulative ? params.cu_seqlens_k[bidb + 1] - sum_s_k : params.cu_seqlens_k[bidb])) - leftpad_k)
+        , actual_seqlen_k(params.seqused_k ? params.seqused_k[bidb] - leftpad_k : seqlen_k_cache + (params.knew_ptr == nullptr ? 0 : params.seqlen_knew))
+        {
+        }
+    template<typename Params>
+    __device__ BlockInfo(const Params &params, const int bidb, const bool padding_mask)
+        : sum_s_q(!Varlen || params.cu_seqlens_q == nullptr ? -1 : params.cu_seqlens_q[bidb])
+        , sum_s_k(!Varlen || params.cu_seqlens_k == nullptr || !params.is_seqlens_k_cumulative ? -1 : params.cu_seqlens_k[bidb])
+        , actual_seqlen_q(params.padding_mask[bidb])
+        // If is_seqlens_k_cumulative, then seqlen_k is cu_seqlens_k[bidb + 1] - cu_seqlens_k[bidb].
+        // Otherwise it's cu_seqlens_k[bidb], i.e., we use cu_seqlens_k to store the sequence lengths of K.
+        , leftpad_k(params.leftpad_k == nullptr ? 0 : params.leftpad_k[bidb])
+        , seqlen_k_cache((!Varlen || params.cu_seqlens_k == nullptr ? params.seqlen_k : (params.is_seqlens_k_cumulative ? params.cu_seqlens_k[bidb + 1] - sum_s_k : params.cu_seqlens_k[bidb])) - leftpad_k)
+        , actual_seqlen_k(params.padding_mask[bidb])
+        {
+        }
+    template <typename index_t>
+    __forceinline__ __device__ index_t q_offset(const index_t batch_stride, const index_t row_stride, const int bidb) const {
+        return sum_s_q == -1 ? bidb * batch_stride : uint32_t(sum_s_q) * row_stride;
+    }
+    template <typename index_t>
+    __forceinline__ __device__ index_t k_offset(const index_t batch_stride, const index_t row_stride, const int bidb) const {
+        return sum_s_k == -1 ? bidb * batch_stride + leftpad_k * row_stride : uint32_t(sum_s_k + leftpad_k) * row_stride;
+    }
+    const int sum_s_q;
+    const int sum_s_k;
+    const int actual_seqlen_q;
+    // We have to have seqlen_k_cache declared before actual_seqlen_k, otherwise actual_seqlen_k is set to 0.
+    const int leftpad_k;
+    const int seqlen_k_cache;
+    const int actual_seqlen_k;
+};
+////////////////////////////////////////////////////////////////////////////////////////////////////
+}  // namespace flash
--- a/csrc/flash_attn/src/dropout.h
+++ b/csrc/flash_attn/src/dropout.h
--- a/csrc/flash_attn/src/flash.h
+++ b/csrc/flash_attn/src/flash.h
+/******************************************************************************
+ * Copyright (c) 2023, Tri Dao.
+ ******************************************************************************/
+#pragma once
+#include <cuda.h>
+#include <vector>
+#ifdef OLD_GENERATOR_PATH
+#include <ATen/CUDAGeneratorImpl.h>
+#else
+#include <ATen/cuda/CUDAGeneratorImpl.h>
+#endif
+#include <ATen/cuda/CUDAGraphsUtils.cuh> // For at::cuda::philox::unpack
+// get environment variables for internal usage
+static inline bool get_env_(const char *env_var) {
+  if (char *value = std::getenv(env_var)) {
+    if (strcmp(value, "0") == 0) {
+      return false;
+    }
+    return true;
+  }
+  return false;
+}
+static std::string get_device_name()
+{
+    hipDeviceProp_t props{};
+    int device;
+    auto status = hipGetDevice(&device);
+    if(status != hipSuccess)
+    {
+        return std::string();
+    }
+    status = hipGetDeviceProperties(&props, device);
+    if(status != hipSuccess)
+    {
+        return std::string();
+    }
+    const std::string raw_name(props.gcnArchName);
+    return raw_name.substr(0, raw_name.find(':')); // str.substr(0, npos) returns str.
+}
+constexpr int TOTAL_DIM = 0;
+constexpr int H_DIM = 1;
+constexpr int D_DIM = 2;
+////////////////////////////////////////////////////////////////////////////////////////////////////
+struct Qkv_params {
+    using index_t = int64_t;
+    // The QKV matrices.
+    void *__restrict__ q_ptr;
+    void *__restrict__ k_ptr;
+    void *__restrict__ v_ptr;
+    // The stride between rows of the Q, K and V matrices.
+    index_t q_batch_stride;
+    index_t k_batch_stride;
+    index_t v_batch_stride;
+    index_t q_row_stride;
+    index_t k_row_stride;
+    index_t v_row_stride;
+    index_t q_head_stride;
+    index_t k_head_stride;
+    index_t v_head_stride;
+    // The number of heads.
+    int h, h_k;
+    // In the case of multi-query and grouped-query attention (MQA/GQA), nheads_k could be
+    // different from nheads (query).
+    int h_h_k_ratio; // precompute h / h_k,
+};
+////////////////////////////////////////////////////////////////////////////////////////////////////
+struct Flash_fwd_params : public Qkv_params {
+    // The O matrix (output).
+    void * __restrict__ o_ptr;
+    void * __restrict__ oaccum_ptr;
+    // The stride between rows of O.
+    index_t o_batch_stride;
+    index_t o_row_stride;
+    index_t o_head_stride;
+    // The pointer to the P matrix.
+    void * __restrict__ p_ptr;
+    // The pointer to the softmax sum.
+    void * __restrict__ softmax_lse_ptr;
+    void * __restrict__ softmax_lseaccum_ptr;
+    // The dimensions.
+    int b, seqlen_q, seqlen_k, seqlen_knew, d, seqlen_q_rounded, seqlen_k_rounded, d_rounded, rotary_dim, total_q;
+    // The scaling factors for the kernel.
+    float scale_softmax;
+    float scale_softmax_log2;
+	// For FP8 scaling
+    float * __restrict__ q_descale_ptr;
+    float * __restrict__ k_descale_ptr;
+    float * __restrict__ v_descale_ptr;
+    index_t q_descale_batch_stride;
+    index_t q_descale_head_stride;
+    index_t k_descale_batch_stride;
+    index_t k_descale_head_stride;
+    index_t v_descale_batch_stride;
+    index_t v_descale_head_stride;
+    // array of length b+1 holding starting offset of each sequence.
+    int * __restrict__ cu_seqlens_q;
+    int * __restrict__ cu_seqlens_k;
+    int * __restrict__ leftpad_k;
+    int * __restrict__ padding_mask;
+    // If provided, the actual length of each k sequence.
+    int * __restrict__ seqused_k;
+    int *__restrict__ blockmask;
+    // The K_new and V_new matrices.
+    void * __restrict__ knew_ptr;
+    void * __restrict__ vnew_ptr;
+    // The stride between rows of the Q, K and V matrices.
+    index_t knew_batch_stride;
+    index_t vnew_batch_stride;
+    index_t knew_row_stride;
+    index_t vnew_row_stride;
+    index_t knew_head_stride;
+    index_t vnew_head_stride;
+    // The cos and sin matrices for rotary embedding.
+    void * __restrict__ rotary_cos_ptr;
+    void * __restrict__ rotary_sin_ptr;
+    // The indices to index into the KV cache.
+    int * __restrict__ cache_batch_idx;
+    // Paged KV cache
+    int * __restrict__ block_table;
+    index_t block_table_batch_stride;
+    int page_block_size;
+    // The dropout probability (probability of keeping an activation).
+    float p_dropout;
+    // uint32_t p_dropout_in_uint;
+    // uint16_t p_dropout_in_uint16_t;
+    uint8_t p_dropout_in_uint8_t;
+    // Scale factor of 1 / (1 - p_dropout).
+    float rp_dropout;
+    float scale_softmax_rp_dropout;
+    // Local window size
+    int window_size_left, window_size_right;
+    float softcap;
+    // Random state.
+    at::PhiloxCudaState philox_args;
+    // Pointer to the RNG seed (idx 0) and offset (idx 1).
+    uint64_t * rng_state;
+    bool is_bf16;
+	bool is_fp8;
+    bool is_e4m3;
+    bool is_causal;
+    bool is_vllm_kvcache;
+    // If is_seqlens_k_cumulative, then seqlen_k is cu_seqlens_k[bidb + 1] - cu_seqlens_k[bidb].
+    // Otherwise it's cu_seqlens_k[bidb], i.e., we use cu_seqlens_k to store the sequence lengths of K.
+    bool is_seqlens_k_cumulative;
+    bool is_rotary_interleaved;
+    int num_splits;  // For split-KV version
+    void * __restrict__ alibi_slopes_ptr;
+    index_t alibi_slopes_batch_stride;
+    bool unpadded_lse;  // For varlen paths: LSE is in [nheads, total_seqlen_q] format instead of [b, nheads, seqlen_q].
+    bool seqlenq_ngroups_swapped;  // q has been transposed from (b, 1, (nheads_kv ngroups), d) to (b, ngroups, nheads_kv, d).
+    // Attention Sinks: precomputed LogSumExp for sink tokens
+    // Shape: [nheads], dtype: float32 (ElementAccum). Maximum 64 heads supported (shared memory limit).
+    // Used for streaming LLM inference to maintain attention to initial "sink" tokens.
+    void * __restrict__ s_aux_ptr;
+    int d_value, d_value_rounded;
+    float skip_softmax_threshold_scale_factor;
+    void * skip_blocks_info_ptr;
+    void * __restrict__ debug_ptr; // for debug
+    void * __restrict__ qq_bias_ptr; 
+    int qq_bias_stride_0;
+    int * __restrict__ mm_prefix_range_ptr; 
+    int max_mm_ranges = 0; 
+    bool use_alibi_sqrt = false;
+};
+////////////////////////////////////////////////////////////////////////////////////////////////////
+struct Flash_bwd_params : public Flash_fwd_params {
+    // The dO and dQKV matrices.
+    void *__restrict__ do_ptr;
+    void *__restrict__ dq_ptr;
+    void *__restrict__ dk_ptr;
+    void *__restrict__ dv_ptr;
+    // To accumulate dQ
+    void *__restrict__ dq_accum_ptr;
+    void *__restrict__ dk_accum_ptr;
+    void *__restrict__ dv_accum_ptr;
+    // // To accumulate dK and dV in case we're splitting the bwd along seqlen_q
+    // dimension void *__restrict__ dk_accum_ptr; void *__restrict__
+    // dv_accum_ptr;
+    // The stride between rows of the dO, dQ, dK and dV matrices.
+    // TD [2022-04-16]: We're using 32-bit indexing to save registers.
+    // The code probably won't work for arrays larger than 2GB.
+    index_t do_batch_stride;
+    index_t do_row_stride;
+    index_t do_head_stride;
+    index_t dq_batch_stride;
+    index_t dk_batch_stride;
+    index_t dv_batch_stride;
+    index_t dq_row_stride;
+    index_t dk_row_stride;
+    index_t dv_row_stride;
+    index_t dq_head_stride;
+    index_t dk_head_stride;
+    index_t dv_head_stride;
+    // The pointer to the softmax d sum.
+    void *__restrict__ dsoftmax_sum;
+    bool deterministic;
+    index_t dq_accum_split_stride;
+};
+////////////////////////////////////////////////////////////////////////////////////////////////////
+template<typename T, int Headdim, bool Is_causal> void run_mha_fwd_(Flash_fwd_params &params, cudaStream_t stream);
+template<typename T, int Headdim, int HeaddimV, bool Is_causal> void run_mha_fwd_mla_(Flash_fwd_params &params, cudaStream_t stream);
+template<typename T, int Headdim, bool Is_causal> void run_mha_fwd_splitkv_dispatch(Flash_fwd_params &params, cudaStream_t stream);
+template<typename T,typename TO, int Headdim, bool Is_causal> void run_mha_fwd_splitkv_dispatch_fp8(Flash_fwd_params &params, cudaStream_t stream);
+template<typename T,typename TO, int Headdim, bool Is_causal> void run_mha_fwd_splitkv_dispatch_kv_fp8(Flash_fwd_params &params, cudaStream_t stream);
+template<typename T, int Headdim, bool Is_causal> void run_mha_fwd_unified_dispatch(Flash_fwd_params &params, cudaStream_t stream);
+template<typename T, int Headdim, bool Is_causal> void run_mha_fwd_padding_mask_(Flash_fwd_params &params, cudaStream_t stream);
+template<typename T, int Headdim, bool Is_causal> void run_mha_blasst_fwd_(Flash_fwd_params &params, cudaStream_t stream);
+void run_mha_varlen_tiny_fwd_dim64(Flash_fwd_params &params, cudaStream_t stream);
+template<typename T, int Headdim, int HeaddimV, bool Is_causal> void run_mha_fwd_splitkv_mla_dispatch(Flash_fwd_params &params, cudaStream_t stream);
+template<typename T, int Headdim, bool Is_causal> void run_mha_bwd_(Flash_bwd_params &params, cudaStream_t stream);
+template<typename T, int Headdim, int HeaddimV, bool Is_causal> void run_mha_bwd_mla_(Flash_bwd_params &params, cudaStream_t stream);
--- a/csrc/flash_attn/src/flash_attnmask.h
+++ b/csrc/flash_attn/src/flash_attnmask.h
--- a/csrc/flash_attn/src/flash_bwd_attnmask_hdim128_bf16_causal_sm80.cu
+++ b/csrc/flash_attn/src/flash_bwd_attnmask_hdim128_bf16_causal_sm80.cu
+// Copyright (c) 2026, Attnmask extension.
+// Splitting the different head dimensions to different files to speed up compilation.
+#include "flash_bwd_attnmask_launch_template.h"
+template void run_mha_bwd_attnmask_<cutlass::bfloat16_t, 128, true>(
+    Flash_bwd_params_attnmask &params, cudaStream_t stream);
--- a/csrc/flash_attn/src/flash_bwd_attnmask_hdim128_bf16_sm80.cu
+++ b/csrc/flash_attn/src/flash_bwd_attnmask_hdim128_bf16_sm80.cu
+// Copyright (c) 2026, Attnmask extension.
+// Splitting the different head dimensions to different files to speed up compilation.
+#include "flash_bwd_attnmask_launch_template.h"
+template void run_mha_bwd_attnmask_<cutlass::bfloat16_t, 128, false>(
+    Flash_bwd_params_attnmask &params, cudaStream_t stream);
--- a/csrc/flash_attn/src/flash_bwd_attnmask_hdim128_fp16_causal_sm80.cu
+++ b/csrc/flash_attn/src/flash_bwd_attnmask_hdim128_fp16_causal_sm80.cu
+// Copyright (c) 2026, Attnmask extension.
+// Splitting the different head dimensions to different files to speed up compilation.
+#include "flash_bwd_attnmask_launch_template.h"
+template void run_mha_bwd_attnmask_<cutlass::half_t, 128, true>(
+    Flash_bwd_params_attnmask &params, cudaStream_t stream);
--- a/csrc/flash_attn/src/flash_bwd_attnmask_hdim128_fp16_sm80.cu
+++ b/csrc/flash_attn/src/flash_bwd_attnmask_hdim128_fp16_sm80.cu
+// Copyright (c) 2026, Attnmask extension.
+// Splitting the different head dimensions to different files to speed up compilation.
+#include "flash_bwd_attnmask_launch_template.h"
+template void run_mha_bwd_attnmask_<cutlass::half_t, 128, false>(
+    Flash_bwd_params_attnmask &params, cudaStream_t stream);
--- a/csrc/flash_attn/src/flash_bwd_attnmask_hdim64_bf16_causal_sm80.cu
+++ b/csrc/flash_attn/src/flash_bwd_attnmask_hdim64_bf16_causal_sm80.cu
--- a/csrc/flash_attn/src/flash_bwd_attnmask_hdim64_bf16_sm80.cu
+++ b/csrc/flash_attn/src/flash_bwd_attnmask_hdim64_bf16_sm80.cu
--- a/csrc/flash_attn/src/flash_bwd_attnmask_hdim64_fp16_causal_sm80.cu
+++ b/csrc/flash_attn/src/flash_bwd_attnmask_hdim64_fp16_causal_sm80.cu
--- a/csrc/flash_attn/src/flash_bwd_attnmask_hdim64_fp16_sm80.cu
+++ b/csrc/flash_attn/src/flash_bwd_attnmask_hdim64_fp16_sm80.cu
--- a/csrc/flash_attn/src/flash_bwd_attnmask_kernel.h
+++ b/csrc/flash_attn/src/flash_bwd_attnmask_kernel.h
--- a/csrc/flash_attn/src/flash_bwd_attnmask_launch_template.h
+++ b/csrc/flash_attn/src/flash_bwd_attnmask_launch_template.h
--- a/csrc/flash_attn/src/flash_bwd_hdim128_bf16_causal_sm80.cu
+++ b/csrc/flash_attn/src/flash_bwd_hdim128_bf16_causal_sm80.cu
--- a/csrc/flash_attn/src/flash_bwd_hdim128_bf16_sm80.cu
+++ b/csrc/flash_attn/src/flash_bwd_hdim128_bf16_sm80.cu
--- a/csrc/flash_attn/src/flash_bwd_hdim128_fp16_causal_sm80.cu
+++ b/csrc/flash_attn/src/flash_bwd_hdim128_fp16_causal_sm80.cu
--- a/csrc/flash_attn/src/flash_bwd_hdim128_fp16_sm80.cu
+++ b/csrc/flash_attn/src/flash_bwd_hdim128_fp16_sm80.cu