Commit 26f4b5fb authored by Woosuk Kwon's avatar Woosuk Kwon

Merge branch 'main' into Dao-AILab/main

parents 5018ac6a 12375706
/******************************************************************************
* Copyright (c) 2024, Tri Dao.
******************************************************************************/
#pragma once
#include <ATen/cuda/CUDAContext.h>
#include "static_switch.h"
#include "flash.h"
#include "flash_bwd_preprocess_kernel.h"
#include "flash_bwd_kernel.h"
// Determine if the architecture supports FLASH and define a macro to handle parameter modifiers
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
#define ARCH_SUPPORTS_FLASH
#define KERNEL_PARAM_MODIFIER __grid_constant__
#else
#define KERNEL_PARAM_MODIFIER
#endif
// Define a macro for unsupported architecture handling to centralize the error message
#define FLASH_UNSUPPORTED_ARCH printf("FATAL: FlashAttention requires building with sm version sm80-sm90, but was built for < 8.0!");
// Use a macro to clean up kernel definitions
#define DEFINE_FLASH_BACKWARD_KERNEL(kernelName, ...) \
template<typename Kernel_traits, __VA_ARGS__> \
__global__ void kernelName(KERNEL_PARAM_MODIFIER const Flash_bwd_params params)
DEFINE_FLASH_BACKWARD_KERNEL(flash_bwd_dq_dk_dv_loop_kernel, bool Is_dropout, bool Is_causal, bool Has_alibi, bool Is_even_M, bool Is_even_K) {
#if defined(ARCH_SUPPORTS_FLASH)
flash::compute_dq_dk_dv<Kernel_traits, Is_dropout, Is_causal, Has_alibi, Is_even_M, Is_even_K>(params);
#else
FLASH_UNSUPPORTED_ARCH
#endif
}
DEFINE_FLASH_BACKWARD_KERNEL(flash_bwd_dq_dk_dv_loop_seqk_parallel_kernel, bool Is_dropout, bool Is_causal, bool Is_local, bool Has_alibi, bool Is_even_MN, bool Is_even_K, bool Is_softcap) {
#if defined(ARCH_SUPPORTS_FLASH)
static_assert(!(Is_causal && Is_local)); // If Is_local is true, Is_causal should be false
flash::compute_dq_dk_dv_seqk_parallel<Kernel_traits, Is_dropout, Is_causal, Is_local, Has_alibi, Is_even_MN, Is_even_K, Is_softcap>(params);
#else
FLASH_UNSUPPORTED_ARCH
#endif
}
template<bool Clear_dQaccum=true, typename Kernel_traits>
__global__ void flash_bwd_dot_do_o_kernel(const Flash_bwd_params params) {
flash::compute_dot_do_o<Clear_dQaccum, Kernel_traits>(params);
}
template<typename Kernel_traits>
__global__ void flash_bwd_clear_dkvaccum_kernel(const Flash_bwd_params params) {
flash::clear_dKVaccum<Kernel_traits>(params);
}
template<typename Kernel_traits>
__global__ void flash_bwd_convert_dq_kernel(const Flash_bwd_params params, const int nsplits) {
flash::convert_dQ<Kernel_traits>(params, nsplits);
}
template<typename Kernel_traits>
__global__ void flash_bwd_convert_dkv_kernel(const Flash_bwd_params params) {
flash::convert_dKV<Kernel_traits>(params);
}
template<typename Kernel_traits, bool Is_dropout, bool Is_causal>
void run_flash_bwd_seqk_parallel(Flash_bwd_params &params, cudaStream_t stream) {
const int num_m_block = (params.seqlen_q + Kernel_traits::kBlockM - 1) / Kernel_traits::kBlockM;
dim3 grid_m(num_m_block, params.b, params.h);
const int num_n_block = (params.seqlen_k + Kernel_traits::kBlockN - 1) / Kernel_traits::kBlockN;
int gridDimx = num_n_block;
if (params.deterministic) {
auto dprops = at::cuda::getCurrentDeviceProperties();
gridDimx = (dprops->multiProcessorCount + params.b * params.h - 1) / (params.b * params.h);
}
dim3 grid_n(gridDimx, params.b, params.h);
if (!params.deterministic) {
flash_bwd_dot_do_o_kernel<true, Kernel_traits><<<grid_m, Kernel_traits::kNThreads, 0, stream>>>(params);
} else {
flash_bwd_dot_do_o_kernel<false, Kernel_traits><<<grid_m, Kernel_traits::kNThreads, 0, stream>>>(params);
}
C10_CUDA_KERNEL_LAUNCH_CHECK();
// We want to specialize to is_even_MN and not just is_even_M, since in the case where N is not
// a multiple of kBlockN, we'll need to apply mask in the loop.
const bool is_even_MN = params.cu_seqlens_q == nullptr && params.cu_seqlens_k == nullptr && params.seqlen_q % Kernel_traits::kBlockM == 0 && params.seqlen_k % Kernel_traits::kBlockN == 0;
const bool is_even_K = params.d == Kernel_traits::kHeadDim;
constexpr int smem_size_dq_dk_dv = Kernel_traits::kSmemSize1colblock;
// printf("smem_size_dq_dk_dv = %d\n", smem_size_dq_dk_dv);
BOOL_SWITCH(is_even_MN, IsEvenMNConst, [&] {
EVENK_SWITCH(is_even_K, IsEvenKConst, [&] {
LOCAL_SWITCH((params.window_size_left >= 0 || params.window_size_right >= 0) && !params.is_causal, Is_local, [&] {
ALIBI_SWITCH(params.alibi_slopes_ptr != nullptr, Has_alibi, [&] {
SOFTCAP_SWITCH(params.softcap > 0.0, Is_softcap, [&] {
// If not IsEvenKConst, we also set IsEvenMNConst to false to reduce number of templates.
// If head dim > 128, set IsEvenMNConst to false to reduce number of templates
// If Is_local, set Is_causal to false
auto kernel = &flash_bwd_dq_dk_dv_loop_seqk_parallel_kernel<Kernel_traits, Is_dropout && !Is_softcap, Is_causal, Is_local && !Is_causal, Has_alibi, IsEvenMNConst && IsEvenKConst && !Is_local && Kernel_traits::kHeadDim <= 128, IsEvenKConst, Is_softcap>;
// auto kernel = &flash_bwd_dq_dk_dv_loop_seqk_parallel_kernel<Kernel_traits, false, Is_causal, false, false, true, true>;
if (smem_size_dq_dk_dv >= 48 * 1024) {
C10_CUDA_CHECK(cudaFuncSetAttribute(
kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size_dq_dk_dv));
}
kernel<<<grid_n, Kernel_traits::kNThreads, smem_size_dq_dk_dv, stream>>>(params);
C10_CUDA_KERNEL_LAUNCH_CHECK();
});
});
});
});
});
auto kernel_dq = &flash_bwd_convert_dq_kernel<Kernel_traits>;
if (Kernel_traits::kSmemdQSize >= 48 * 1024) {
C10_CUDA_CHECK(cudaFuncSetAttribute(
kernel_dq, cudaFuncAttributeMaxDynamicSharedMemorySize, Kernel_traits::kSmemdQSize));
}
kernel_dq<<<grid_m, Kernel_traits::kNThreads, Kernel_traits::kSmemdQSize, stream>>>(params, !params.deterministic ? 1 : gridDimx);
C10_CUDA_KERNEL_LAUNCH_CHECK();
}
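// Editorial sketch (not part of the source): how the grids above are sized. The values below are
// assumptions chosen for illustration. With kBlockN = 128, seqlen_k = 4096, b = 2, h = 16 and a
// GPU with 108 SMs, the default path launches 32 column blocks per (batch, head); the
// deterministic path instead caps gridDimx at ceil(108 / (2 * 16)) = 4 and later sums dQaccum
// over those 4 splits in flash_bwd_convert_dq_kernel (which receives gridDimx as nsplits).
inline dim3 sketch_bwd_grid_n(int seqlen_k, int kBlockN, int b, int h, int sm_count, bool deterministic) {
    const int num_n_block = (seqlen_k + kBlockN - 1) / kBlockN;   // ceil-divide seqlen_k into kBlockN tiles
    const int gridDimx = deterministic
        ? (sm_count + b * h - 1) / (b * h)                        // fixed block count per (batch, head)
        : num_n_block;
    return dim3(gridDimx, b, h);
}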
template<typename Kernel_traits, bool Is_dropout, bool Is_causal>
void run_flash_bwd(Flash_bwd_params &params, cudaStream_t stream) {
#ifndef FLASHATTENTION_DISABLE_BACKWARD
run_flash_bwd_seqk_parallel<Kernel_traits, Is_dropout, Is_causal>(params, stream);
#endif
}
template<typename T, bool Is_causal>
void run_mha_bwd_hdim32(Flash_bwd_params &params, cudaStream_t stream) {
constexpr static int Headdim = 32;
int device;
cudaGetDevice(&device);
int max_smem_per_block;
cudaError status_ = cudaDeviceGetAttribute(
&max_smem_per_block, cudaDevAttrMaxSharedMemoryPerBlockOptin, device);
if (status_ != cudaSuccess) {
C10_CUDA_CHECK(status_);
}
DROPOUT_SWITCH(params.p_dropout < 1.f, Is_dropout, [&] {
if (max_smem_per_block >= 2 * ((3 * 128 + 2 * 128) * Headdim + 2 * 128 * 128)) { // 104 KB
if constexpr(!Is_dropout) { // We can afford more registers to keep V in registers
run_flash_bwd<Flash_bwd_kernel_traits<Headdim, 128, 128, 8, 4, 4, 4, true, false, T>, Is_dropout, Is_causal>(params, stream);
} else {
run_flash_bwd<Flash_bwd_kernel_traits<Headdim, 128, 128, 8, 4, 4, 4, false, false, T>, Is_dropout, Is_causal>(params, stream);
}
} else { // 96 KB
run_flash_bwd<Flash_bwd_kernel_traits<Headdim, 128, 128, 8, 4, 4, 4, true, false, T>, Is_dropout, Is_causal>(params, stream);
}
});
}
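// Editorial worked example (not part of the source): for Headdim = 32 the smem threshold above is
//   2 * ((3 * 128 + 2 * 128) * 32 + 2 * 128 * 128) = 2 * (20480 + 32768) = 106496 bytes ≈ 104 KB,
// matching the "104 KB" annotation; the leading factor of 2 presumably accounts for 2-byte
// (fp16/bf16) elements. Devices with at least that much opt-in shared memory per block take the
// 104 KB configurations above; others fall back to the 96 KB variant.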
template<typename T, bool Is_causal>
void run_mha_bwd_hdim64(Flash_bwd_params &params, cudaStream_t stream) {
constexpr static int Headdim = 64;
int device;
cudaGetDevice(&device);
int max_smem_per_block;
cudaError status_ = cudaDeviceGetAttribute(
&max_smem_per_block, cudaDevAttrMaxSharedMemoryPerBlockOptin, device);
if (status_ != cudaSuccess) {
C10_CUDA_CHECK(status_);
}
// printf("max_smem_per_block = %d\n", max_smem_per_block);
DROPOUT_SWITCH(params.p_dropout < 1.f, Is_dropout, [&] {
// Changing AtomLayoutMdQ from 2 to 4 takes the same time
// run_flash_bwd<Flash_bwd_kernel_traits<Headdim, 64, 128, 8, 2, 4, 2, false, false, T>>(params, stream);
// run_flash_bwd<Flash_bwd_kernel_traits<Headdim, 64, 128, 8, 2, 4, 2, true, false, T>>(params, stream);
// run_flash_bwd<Flash_bwd_kernel_traits<Headdim, 128, 128, 8, 2, 4, 4, false, false, T>>(params, stream);
// run_flash_bwd<Flash_bwd_kernel_traits<Headdim, 128, 64, 8, 4, 2, 4, false, false, T>, Is_dropout>(params, stream);
// This is slightly faster. We want to split M more so we need fewer registers to store LSE.
if (max_smem_per_block >= 144 * 1024) {
run_flash_bwd<Flash_bwd_kernel_traits<Headdim, 128, 128, 8, 4, 4, 4, false, false, T>, Is_dropout, Is_causal>(params, stream);
// This has a lot of register spilling
// run_flash_bwd<Flash_bwd_kernel_traits<Headdim, 128, 128, 8, 4, 4, 4, true, false, T>, Is_dropout>(params, stream);
} else {
// if (params.h == params.h_k) {
// run_flash_bwd<Flash_bwd_kernel_traits<Headdim, 64, 128, 8, 2, 4, 4, false, false, T>, Is_dropout>(params, stream);
run_flash_bwd<Flash_bwd_kernel_traits<Headdim, 64, 128, 8, 2, 4, 4, true, false, T>, Is_dropout, Is_causal>(params, stream);
// run_flash_bwd<Flash_bwd_kernel_traits<Headdim, 128, 64, 8, 4, 2, 4, false, false, T>, Is_dropout>(params, stream);
// run_flash_bwd<Flash_bwd_kernel_traits<Headdim, 128, 64, 8, 4, 2, 4, true, false, T>, Is_dropout>(params, stream);
// } else {
// }
}
});
// run_flash_bwd<Flash_bwd_kernel_traits<Headdim, 128, 64, 8, 4, 2, 4, true, false, T>>(params, stream);
// run_flash_bwd<Flash_bwd_kernel_traits<Headdim, 64, 64, 4, 2, 2, 2, true, false, T>>(params, stream);
// run_flash_bwd<Flash_bwd_kernel_traits<Headdim, 32, 128, 4, 1, 4, 1, false, false, T>>(params, stream);
// run_flash_bwd<Flash_bwd_kernel_traits<Headdim, 16, 128, 4, 1, 4, 1, false, false, T>>(params, stream);
// M=128, N=64 is quite slow, I think because we need to read/write dQaccum twice as many times
// run_flash_bwd<Flash_bwd_kernel_traits<Headdim, 128, 64, 8, 2, 2, 2, false, T>>(params, stream);
// run_flash_bwd<Flash_bwd_kernel_traits<Headdim, 128, 64, 8, false, T>>(params, stream);
// run_flash_bwd<Flash_bwd_kernel_traits<Headdim, 64, 64, 4, false, T>>(params, stream);
// run_flash_bwd<Flash_bwd_kernel_traits<Headdim, 128, 64, 4, 4, 2, 4, false, false, T>>(params, stream);
}
template<typename T, bool Is_causal>
void run_mha_bwd_hdim96(Flash_bwd_params &params, cudaStream_t stream) {
constexpr static int Headdim = 96;
int device;
cudaGetDevice(&device);
int max_smem_per_block;
cudaError status_ = cudaDeviceGetAttribute(
&max_smem_per_block, cudaDevAttrMaxSharedMemoryPerBlockOptin, device);
if (status_ != cudaSuccess) {
C10_CUDA_CHECK(status_);
}
// printf("max_smem_per_block = %d\n", max_smem_per_block);
DROPOUT_SWITCH(params.p_dropout < 1.f, Is_dropout, [&] {
if (max_smem_per_block >= 116 * 1024) {
if constexpr(!Is_dropout) { // 92KB
run_flash_bwd<Flash_bwd_kernel_traits<Headdim, 64, 128, 8, 2, 4, 4, true, false, T>, Is_dropout, Is_causal>(params, stream);
} else { // 116 KB
// This is faster for dropout since we don't have many registers to spare
run_flash_bwd<Flash_bwd_kernel_traits<Headdim, 64, 128, 8, 2, 4, 4, false, false, T>, Is_dropout, Is_causal>(params, stream);
}
} else {
run_flash_bwd<Flash_bwd_kernel_traits<Headdim, 64, 128, 8, 2, 4, 4, true, false, T>, Is_dropout, Is_causal>(params, stream);
}
});
}
template<typename T, bool Is_causal>
void run_mha_bwd_hdim128(Flash_bwd_params &params, cudaStream_t stream) {
constexpr static int Headdim = 128;
int device;
cudaGetDevice(&device);
int max_smem_per_block;
cudaError status_ = cudaDeviceGetAttribute(
&max_smem_per_block, cudaDevAttrMaxSharedMemoryPerBlockOptin, device);
if (status_ != cudaSuccess) {
C10_CUDA_CHECK(status_);
}
// printf("max_smem_per_block = %d\n", max_smem_per_block);
DROPOUT_SWITCH(params.p_dropout < 1.f, Is_dropout, [&] {
// run_flash_bwd<Flash_bwd_kernel_traits<Headdim, 32, 128, 8, 2, 2, 2, false, false, T>>(params, stream);
// This is faster, in the case of sequence-parallel bwd (where we need fewer registers).
// Out of these three, the 2nd one is slightly faster (2% faster than the first). Idk why.
// run_flash_bwd<Flash_bwd_kernel_traits<Headdim, 64, 128, 8, 2, 2, 2, false, false, T>>(params, stream);
if (max_smem_per_block >= 144 * 1024) {
run_flash_bwd<Flash_bwd_kernel_traits<Headdim, 64, 128, 8, 2, 4, 2, false, false, T>, Is_dropout, Is_causal>(params, stream);
// run_flash_bwd_seqk_parallel<Flash_bwd_kernel_traits<Headdim, 128, 128, 8, 4, 4, 4, false, false, T>, Is_dropout>(params, stream);
// run_flash_bwd_seqk_parallel<Flash_bwd_kernel_traits<Headdim, 128, 128, 8, 4, 4, 4, false, true, T>, Is_dropout>(params, stream);
// run_flash_bwd<Flash_bwd_kernel_traits<Headdim, 64, 128, 8, 2, 4, 2, true, false, T>, Is_dropout>(params, stream);
// run_flash_bwd<Flash_bwd_kernel_traits<Headdim, 128, 64, 8, 4, 2, 2, false, false, T>, Is_dropout>(params, stream);
// run_flash_bwd<Flash_bwd_kernel_traits<Headdim, 128, 64, 8, 4, 2, 2, true, false, T>, Is_dropout>(params, stream);
} else {
// run_flash_bwd<Flash_bwd_kernel_traits<Headdim, 64, 64, 8, 4, 2, 2, false, false, T>, Is_dropout>(params, stream);
run_flash_bwd<Flash_bwd_kernel_traits<Headdim, 64, 64, 8, 4, 2, 2, true, false, T>, Is_dropout, Is_causal>(params, stream);
}
// run_flash_bwd<Flash_bwd_kernel_traits<Headdim, 64, 128, 8, 2, 4, 4, false, false, T>>(params, stream);
// run_flash_bwd<Flash_bwd_kernel_traits<Headdim, 128, 64, 8, 4, 4, 4, false, false, T>>(params, stream);
});
}
template<typename T, bool Is_causal>
void run_mha_bwd_hdim160(Flash_bwd_params &params, cudaStream_t stream) {
constexpr static int Headdim = 160;
int device;
cudaGetDevice(&device);
int max_smem_per_block;
cudaError status_ = cudaDeviceGetAttribute(
&max_smem_per_block, cudaDevAttrMaxSharedMemoryPerBlockOptin, device);
if (status_ != cudaSuccess) {
C10_CUDA_CHECK(status_);
}
DROPOUT_SWITCH(params.p_dropout < 1.f, Is_dropout, [&] {
if (max_smem_per_block >= 116 * 1024) {
run_flash_bwd<Flash_bwd_kernel_traits<Headdim, 64, 64, 8, 4, 4, 4, false, false, T>, Is_dropout, Is_causal>(params, stream);
} else {
run_flash_bwd<Flash_bwd_kernel_traits<Headdim, 64, 64, 8, 4, 4, 4, false, true, T>, Is_dropout, Is_causal>(params, stream);
}
});
}
template<typename T, bool Is_causal>
void run_mha_bwd_hdim192(Flash_bwd_params &params, cudaStream_t stream) {
constexpr static int Headdim = 192;
int device;
cudaGetDevice(&device);
int max_smem_per_block;
cudaError status_ = cudaDeviceGetAttribute(
&max_smem_per_block, cudaDevAttrMaxSharedMemoryPerBlockOptin, device);
if (status_ != cudaSuccess) {
C10_CUDA_CHECK(status_);
}
DROPOUT_SWITCH(params.p_dropout < 1.f, Is_dropout, [&] {
if (max_smem_per_block >= 136 * 1024) {
run_flash_bwd<Flash_bwd_kernel_traits<Headdim, 64, 64, 8, 4, 2, 2, false, false, T>, Is_dropout, Is_causal>(params, stream);
} else {
run_flash_bwd<Flash_bwd_kernel_traits<Headdim, 64, 64, 8, 4, 2, 2, true, true, T>, Is_dropout, Is_causal>(params, stream);
}
});
}
template<typename T, bool Is_causal>
void run_mha_bwd_hdim256(Flash_bwd_params &params, cudaStream_t stream) {
constexpr static int Headdim = 256;
int device;
cudaGetDevice(&device);
int max_smem_per_block;
cudaError status_ = cudaDeviceGetAttribute(
&max_smem_per_block, cudaDevAttrMaxSharedMemoryPerBlockOptin, device);
if (status_ != cudaSuccess) {
C10_CUDA_CHECK(status_);
}
DROPOUT_SWITCH(params.p_dropout < 1.f, Is_dropout, [&] {
if (max_smem_per_block >= 176 * 1024) { // H100
run_flash_bwd<Flash_bwd_kernel_traits<Headdim, 64, 64, 8, 4, 2, 2, false, false, T>, Is_dropout, Is_causal>(params, stream);
} else if (max_smem_per_block >= 144 * 1024) { // A100, we don't do double buffering to save smem
run_flash_bwd<Flash_bwd_kernel_traits<Headdim, 64, 64, 8, 4, 2, 2, false, true, T>, Is_dropout, Is_causal>(params, stream);
} else { // sm86 and sm89, max smem is 99 KB. Only works without dropout. V in regs and no double buffering.
if constexpr (!Is_dropout) {
run_flash_bwd<Flash_bwd_kernel_traits<Headdim, 64, 32, 8, 4, 1, 2, true, true, T>, false, Is_causal>(params, stream);
}
}
});
}
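// Editorial sketch (hypothetical, not the library's dispatcher): the per-head-dimension launchers
// above are typically chosen by rounding the head dimension up to the nearest supported size; the
// helper below only illustrates that selection and is an assumption, not part of this commit.
template<typename T, bool Is_causal>
void sketch_run_mha_bwd_dispatch(Flash_bwd_params &params, cudaStream_t stream) {
    if (params.d <= 32)       { run_mha_bwd_hdim32<T, Is_causal>(params, stream); }
    else if (params.d <= 64)  { run_mha_bwd_hdim64<T, Is_causal>(params, stream); }
    else if (params.d <= 96)  { run_mha_bwd_hdim96<T, Is_causal>(params, stream); }
    else if (params.d <= 128) { run_mha_bwd_hdim128<T, Is_causal>(params, stream); }
    else if (params.d <= 160) { run_mha_bwd_hdim160<T, Is_causal>(params, stream); }
    else if (params.d <= 192) { run_mha_bwd_hdim192<T, Is_causal>(params, stream); }
    else                      { run_mha_bwd_hdim256<T, Is_causal>(params, stream); }
}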
/******************************************************************************
* Copyright (c) 2024, Tri Dao.
******************************************************************************/
#pragma once
#include <cute/tensor.hpp>
#include <cutlass/cutlass.h>
#include <cutlass/array.h>
#include <cutlass/numeric_types.h>
#include "block_info.h"
#include "kernel_traits.h"
#include "utils.h"
namespace flash {
using namespace cute;
////////////////////////////////////////////////////////////////////////////////////////////////////
template <int THREADS_PER_ROW, typename Engine0, typename Layout0, typename Engine1, typename Layout1>
inline __device__ void dot_do_o(Tensor<Engine0, Layout0> const &do_, Tensor<Engine0, Layout0> const &o,
Tensor<Engine1, Layout1> &dP_sum, const int gdP_col_stride, const float scale) {
static_assert(Layout0::rank == 3, "Only support 3D Tensor");
static_assert(Layout1::rank == 1, "Only support 1D Tensor");
CUTE_STATIC_ASSERT_V(do_.layout() == o.layout());
// Reshape do_ and o from (8, kBlockM / 32, kHeadDim / 64) to (kBlockM / 32, 8 * kHeadDim / 64)
// The last coordinate is the "page".
Tensor do_reshaped = make_tensor(do_.data(), make_layout(get<1>(do_.layout()),
make_layout(get<0>(do_.layout()),
get<2>(do_.layout()))));
Tensor o_reshaped = make_tensor(o.data(), do_reshaped.layout());
Tensor do_fp32 = flash::convert_type<float>(do_reshaped);
Tensor o_fp32 = flash::convert_type<float>(o_reshaped);
#pragma unroll
for (int mi = 0; mi < size<0>(do_reshaped); ++mi) {
float dP_sum_cur = do_fp32(mi, 0) * o_fp32(mi, 0);
#pragma unroll
for (int ni = 1; ni < size<1>(do_reshaped); ni++) {
dP_sum_cur += do_fp32(mi, ni) * o_fp32(mi, ni);
}
flash::SumOp<float> sum_op;
dP_sum_cur = flash::Allreduce<THREADS_PER_ROW>::run(dP_sum_cur, sum_op) * scale;
if (threadIdx.x % THREADS_PER_ROW == 0) {
dP_sum(mi * gdP_col_stride + threadIdx.x / THREADS_PER_ROW) = dP_sum_cur;
}
}
}
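// Editorial reference sketch (host-side, not part of the kernel): the per-row reduction that
// dot_do_o performs, ignoring the thread/layout partitioning. For row i it computes
// softmax_d[i] = scale * sum_k dO[i, k] * O[i, k]; in the device code the sum over k is split
// across THREADS_PER_ROW threads, combined with an Allreduce, and written by one thread per row.
// All names and the flat row-major layout below are assumptions made for illustration.
inline void sketch_dot_do_o_reference(const float* dO, const float* O, float* softmax_d,
                                      int rows, int headdim, float scale) {
    for (int i = 0; i < rows; ++i) {
        float acc = 0.f;
        for (int k = 0; k < headdim; ++k) { acc += dO[i * headdim + k] * O[i * headdim + k]; }
        softmax_d[i] = acc * scale;
    }
}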
////////////////////////////////////////////////////////////////////////////////////////////////////
// Just compute dot(do, o) and write the result (softmax_d) to global memory as a separate kernel.
// This is used in the case where we want to parallelize the backward across seqlen_k.
template<bool Clear_dQaccum=true, typename Kernel_traits, typename Params>
inline __device__ void compute_dot_do_o(const Params &params) {
using Element = typename Kernel_traits::Element;
using ElementAccum = typename Kernel_traits::ElementAccum;
using index_t = typename Kernel_traits::index_t;
const int m_block = blockIdx.x;
// The block index for the batch.
const int bidb = blockIdx.y;
// The block index for the head.
const int bidh = blockIdx.z;
// The thread index.
const int tidx = threadIdx.x;
constexpr int kBlockM = Kernel_traits::kBlockM;
constexpr int kHeadDim = Kernel_traits::kHeadDim;
const BlockInfo binfo(params, bidb);
if (m_block * kBlockM >= binfo.actual_seqlen_q) return;
const index_t row_offset_do = binfo.q_offset(params.do_batch_stride, params.do_row_stride, bidb)
+ m_block * kBlockM * params.do_row_stride + bidh * params.do_head_stride;
const index_t row_offset_o = binfo.q_offset(params.o_batch_stride, params.o_row_stride, bidb)
+ m_block * kBlockM * params.o_row_stride + bidh * params.o_head_stride;
const index_t row_offset_dq_accum = binfo.q_offset(params.seqlen_q_rounded * params.h * params.d_rounded, params.h * params.d_rounded, bidb)
+ (m_block * kBlockM + (params.cu_seqlens_q == nullptr ? 0 : 128 * bidb)) * params.h * params.d_rounded + bidh * params.d_rounded;
// Regarding 128 * params.b: see the comment in mha_varlen_bwd about the padding of dq_accum and softmax_d.
const index_t row_offset_dpsum = (params.unpadded_lse ? (bidh * (params.total_q + 128 * params.b) + binfo.q_offset(params.seqlen_q_rounded, 1, bidb) + 128 * bidb): (bidb * params.h + bidh) * params.seqlen_q_rounded) + m_block * kBlockM;
Tensor gdO = make_tensor(make_gmem_ptr(reinterpret_cast<Element *>(params.do_ptr) + row_offset_do),
Shape<Int<kBlockM>, Int<kHeadDim>>{},
make_stride(params.do_row_stride, _1{}));
Tensor gO = make_tensor(make_gmem_ptr(reinterpret_cast<Element *>(params.o_ptr) + row_offset_o),
Shape<Int<kBlockM>, Int<kHeadDim>>{},
make_stride(params.o_row_stride, _1{}));
Tensor gdQaccum = make_tensor(make_gmem_ptr(reinterpret_cast<ElementAccum *>(params.dq_accum_ptr) + row_offset_dq_accum),
Shape<Int<kBlockM>, Int<kHeadDim>>{},
make_stride(params.h * params.d_rounded, _1{}));
Tensor dP_sum = make_tensor(make_gmem_ptr(reinterpret_cast<ElementAccum *>(params.dsoftmax_sum) + row_offset_dpsum),
Shape<Int<kBlockM>>{}, Stride<_1>{});
typename Kernel_traits::GmemTiledCopydO gmem_tiled_copy_dO;
auto gmem_thr_copy_dO = gmem_tiled_copy_dO.get_thread_slice(tidx);
// TODO: careful, we're zeroing out dQaccum with type float4, but when
// we do atomicAdds, we use type float. The layouts are different. Check this.
typename Kernel_traits::GmemTiledCopydQaccum gmem_tiled_copy_dQaccum;
auto gmem_thr_copy_dQaccum = gmem_tiled_copy_dQaccum.get_thread_slice(tidx);
Tensor tdOgdO = gmem_thr_copy_dO.partition_S(gdO);
Tensor tdOgO = gmem_thr_copy_dO.partition_S(gO);
Tensor tdQgdQaccum = gmem_thr_copy_dQaccum.partition_D(gdQaccum);
Tensor cdO = make_identity_tensor(Shape<Int<kBlockM>, Int<kHeadDim>>{}); // (BLK_M,BLK_K) -> (blk_m,blk_k)
Tensor tdOcdO = gmem_thr_copy_dO.partition_S(cdO);
// Allocate predicate tensors for k
Tensor tdOpdO = make_tensor<bool>(make_shape(size<2>(tdOgdO)));
// Set predicates for k bounds
#pragma unroll
for (int k = 0; k < size(tdOpdO); ++k) {tdOpdO(k) = get<1>(tdOcdO(0, 0, k)) < params.d;}
Tensor tdOrdO = make_fragment_like(tdOgdO);
Tensor tdOrO = make_fragment_like(tdOgO);
flash::copy</*Is_even_MN=*/false, /*Is_even_K=*/false, /*Clear_OOB_MN=*/true>(
gmem_tiled_copy_dO, tdOgdO, tdOrdO, tdOcdO, tdOpdO, binfo.actual_seqlen_q - m_block * kBlockM
);
flash::copy</*Is_even_MN=*/false, /*Is_even_K=*/false, /*Clear_OOB_MN=*/true>(
gmem_tiled_copy_dO, tdOgO, tdOrO, tdOcdO, tdOpdO, binfo.actual_seqlen_q - m_block * kBlockM
);
// Strictly speaking, we should scale dP up by 1/p_dropout, but instead we only scale the final
// results (dQ and dK) by 1/p_dropout. So we keep dP_sum scaled down by p_dropout here,
// so that (dP - dP_sum) stays on the same scale.
dot_do_o<Kernel_traits::kGmemThreadsPerRow>(tdOrdO, tdOrO, dP_sum,
Kernel_traits::kNThreads / (Kernel_traits::kGmemThreadsPerRow), params.p_dropout);
if (Clear_dQaccum) {
// We're actually not zeroing out all of dQaccum, only the part that we're going to
// do atomicAdds on.
Tensor zero = make_fragment_like(tdQgdQaccum);
clear(zero);
cute::copy(gmem_tiled_copy_dQaccum, zero, tdQgdQaccum);
}
}
////////////////////////////////////////////////////////////////////////////////////////////////////
template<typename Kernel_traits, typename Params>
inline __device__ void clear_dKVaccum(const Params &params) {
using ElementAccum = typename Kernel_traits::ElementAccum;
using index_t = typename Kernel_traits::index_t;
const int n_block = blockIdx.x;
// The block index for the batch.
const int bidb = blockIdx.y;
// The block index for the head.
const int bidh = blockIdx.z;
// The thread index.
const int tidx = threadIdx.x;
constexpr int kBlockN = Kernel_traits::kBlockN;
constexpr int kHeadDim = Kernel_traits::kHeadDim;
const BlockInfo binfo(params, bidb);
if (n_block * kBlockN >= binfo.actual_seqlen_k) return;
const index_t row_offset_dkv_accum = ((bidb * params.h_k + bidh) * params.seqlen_k_rounded + n_block * kBlockN) * params.d_rounded;
Tensor gdKaccum = make_tensor(make_gmem_ptr(reinterpret_cast<ElementAccum *>(params.dk_accum_ptr) + row_offset_dkv_accum),
Shape<Int<kBlockN>, Int<kHeadDim>>{}, Stride<Int<kHeadDim>, _1>{});
Tensor gdVaccum = make_tensor(make_gmem_ptr(reinterpret_cast<ElementAccum *>(params.dv_accum_ptr) + row_offset_dkv_accum),
Shape<Int<kBlockN>, Int<kHeadDim>>{}, Stride<Int<kHeadDim>, _1>{});
typename Kernel_traits::GmemTiledCopydQaccum gmem_tiled_copy_dKVaccum;
auto gmem_thr_copy_dKVaccum = gmem_tiled_copy_dKVaccum.get_thread_slice(tidx);
Tensor tdKgdKaccum = gmem_thr_copy_dKVaccum.partition_D(gdKaccum);
Tensor tdVgdVaccum = gmem_thr_copy_dKVaccum.partition_D(gdVaccum);
Tensor zero = make_fragment_like(tdKgdKaccum);
clear(zero);
cute::copy(gmem_tiled_copy_dKVaccum, zero, tdKgdKaccum);
cute::copy(gmem_tiled_copy_dKVaccum, zero, tdVgdVaccum);
}
////////////////////////////////////////////////////////////////////////////////////////////////////
// Convert dQ from dQaccum (in float) to fp16/bf16.
// This is used in the case where we want to parallelize the backward across seqlen_k.
template<typename Kernel_traits, typename Params>
inline __device__ void convert_dQ(const Params &params, const int nsplits) {
using Element = typename Kernel_traits::Element;
using ElementAccum = typename Kernel_traits::ElementAccum;
using index_t = typename Kernel_traits::index_t;
// Shared memory.
extern __shared__ char smem_[];
const int m_block = blockIdx.x;
// The block index for the batch.
const int bidb = blockIdx.y;
// The block index for the head.
const int bidh = blockIdx.z;
// The thread index.
const int tidx = threadIdx.x;
constexpr int kBlockM = Kernel_traits::kBlockM;
constexpr int kHeadDim = Kernel_traits::kHeadDim;
const BlockInfo binfo(params, bidb);
if (m_block * kBlockM >= binfo.actual_seqlen_q) return;
const index_t row_offset_dq = binfo.q_offset(params.dq_batch_stride, params.dq_row_stride, bidb)
+ m_block * kBlockM * params.dq_row_stride + bidh * params.dq_head_stride;
const index_t row_offset_dq_accum = binfo.q_offset(params.seqlen_q_rounded * params.h * params.d_rounded, params.h * params.d_rounded, bidb)
+ (m_block * kBlockM + (params.cu_seqlens_q == nullptr ? 0 : 128 * bidb)) * params.h * params.d_rounded + bidh * params.d_rounded;
Tensor gdQ = make_tensor(make_gmem_ptr(reinterpret_cast<Element *>(params.dq_ptr) + row_offset_dq),
Shape<Int<kBlockM>, Int<kHeadDim>>{},
make_stride(params.dq_row_stride, _1{}));
Tensor gdQaccum = make_tensor(make_gmem_ptr(reinterpret_cast<ElementAccum *>(params.dq_accum_ptr) + row_offset_dq_accum),
Shape<Int<kBlockM>, Int<kHeadDim>>{},
make_stride(params.h * params.d_rounded, _1{}));
Tensor sdQ = make_tensor(make_smem_ptr(reinterpret_cast<Element *>(smem_)),
typename Kernel_traits::SmemLayoutdQ{});
typename Kernel_traits::GmemTiledCopydQ gmem_tiled_copy_dQ;
auto gmem_thr_copy_dQ = gmem_tiled_copy_dQ.get_thread_slice(tidx);
typename Kernel_traits::GmemTiledCopydQaccumAtomicAdd gmem_tiled_copy_dQaccum;
auto gmem_thr_copy_dQaccum = gmem_tiled_copy_dQaccum.get_thread_slice(tidx);
typename Kernel_traits::TiledMmadQ tiled_mma_dq;
auto smem_tiled_copy_dQ = make_tiled_copy_C(typename Kernel_traits::SmemCopyAtomdQ{}, tiled_mma_dq);
auto smem_thr_copy_dQ = smem_tiled_copy_dQ.get_thread_slice(tidx);
Tensor taccdQsdQ = smem_thr_copy_dQ.partition_D(sdQ); // ((Atom,AtomNum),PIPE_M,PIPE_N)
Tensor tdQsdQ = gmem_thr_copy_dQ.partition_S(sdQ); // ((Atom,AtomNum),ATOM_M,ATOM_N)
Tensor tdQgdQ = gmem_thr_copy_dQ.partition_D(gdQ);
Tensor tdQgdQaccum = gmem_thr_copy_dQaccum.partition_S(gdQaccum);
Tensor acc_dq = partition_fragment_C(tiled_mma_dq, Shape<Int<kBlockM>, Int<kHeadDim>>{}); // MMA, MMA_N, MMA_K
CUTE_STATIC_ASSERT_V(size(acc_dq) == size(tdQgdQaccum));
Tensor tdQrdQaccum = make_fragment_like(tdQgdQaccum);
clear(acc_dq);
for (int s = 0; s < nsplits; ++s) {
cute::copy(gmem_tiled_copy_dQaccum, tdQgdQaccum, tdQrdQaccum);
#pragma unroll
for (int i = 0; i < size(acc_dq); ++i) { acc_dq(i) += tdQrdQaccum(i); }
tdQgdQaccum.data() = tdQgdQaccum.data() + params.dq_accum_split_stride;
}
#pragma unroll
for (int i = 0; i < size(acc_dq); ++i) { acc_dq(i) *= params.scale_softmax_rp_dropout; }
// Convert acc_dq from fp32 to fp16
Tensor rdQ = flash::convert_type<Element>(acc_dq);
Tensor taccdQrdQ = smem_thr_copy_dQ.retile_S(rdQ); // ((Atom,AtomNum), MMA_N, MMA_N)
cute::copy(smem_tiled_copy_dQ, taccdQrdQ, taccdQsdQ);
__syncthreads();
Tensor tdQrdQ = make_tensor<Element>(shape(tdQgdQ));
cute::copy(gmem_tiled_copy_dQ, tdQsdQ, tdQrdQ);
Tensor cdQ = make_identity_tensor(Shape<Int<kBlockM>, Int<kHeadDim>>{}); // (BLK_M,BLK_K) -> (blk_m,blk_k)
Tensor tdQcdQ = gmem_thr_copy_dQ.partition_D(cdQ);
Tensor tdQpdQ = make_tensor<bool>(make_shape(size<2>(tdQgdQ)));
#pragma unroll
for (int k = 0; k < size(tdQpdQ); ++k) { tdQpdQ(k) = get<1>(tdQcdQ(0, 0, k)) < params.d; }
// Clear_OOB_K must be false since we don't want to write zeros to gmem
flash::copy</*Is_even_MN=*/false, /*Is_even_K=*/false, /*Clear_OOB_MN=*/false, /*Clear_OOB_K=*/false>(
gmem_tiled_copy_dQ, tdQrdQ, tdQgdQ, tdQcdQ, tdQpdQ, binfo.actual_seqlen_q - m_block * kBlockM
);
}
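// Editorial reference sketch (host-side, not part of the kernel): the element-wise math of
// convert_dQ, ignoring tiling and the smem round-trip. dQaccum holds float partial results from
// nsplits grid slices (nsplits > 1 only in the deterministic path); they are summed, rescaled by
// scale_softmax_rp_dropout, and narrowed to the output element type. Names are illustrative.
template<typename Element>
inline void sketch_convert_dq_reference(const float* dq_accum, Element* dq, long long n_elems,
                                        int nsplits, long long split_stride, float scale) {
    for (long long i = 0; i < n_elems; ++i) {
        float acc = 0.f;
        for (int s = 0; s < nsplits; ++s) { acc += dq_accum[i + s * split_stride]; }
        dq[i] = static_cast<Element>(acc * scale);
    }
}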
////////////////////////////////////////////////////////////////////////////////////////////////////
// Convert dK and dV from dKaccum and dVaccum (in float) to fp16/bf16.
// This is used in the case where we want to parallelize the backward across seqlen_q.
template<typename Kernel_traits, typename Params>
inline __device__ void convert_dKV(const Params &params) {
using Element = typename Kernel_traits::Element;
using ElementAccum = typename Kernel_traits::ElementAccum;
using index_t = typename Kernel_traits::index_t;
// Shared memory.
extern __shared__ char smem_[];
const int n_block = blockIdx.x;
// The block index for the batch.
const int bidb = blockIdx.y;
// The block index for the head.
const int bidh = blockIdx.z;
// The thread index.
const int tidx = threadIdx.x;
constexpr int kBlockN = Kernel_traits::kBlockN;
constexpr int kHeadDim = Kernel_traits::kHeadDim;
const BlockInfo binfo(params, bidb);
if (n_block * kBlockN >= binfo.actual_seqlen_k) return;
const index_t row_offset_dk = binfo.k_offset(params.dk_batch_stride, params.dk_row_stride, bidb)
+ n_block * kBlockN * params.dk_row_stride + bidh * params.dk_head_stride;
const index_t row_offset_dv = binfo.k_offset(params.dv_batch_stride, params.dv_row_stride, bidb)
+ n_block * kBlockN * params.dv_row_stride + bidh * params.dv_head_stride;
const index_t row_offset_dkv_accum = ((bidb * params.h_k + bidh) * params.seqlen_k_rounded
+ n_block * kBlockN) * params.d_rounded;
Tensor gdK = make_tensor(make_gmem_ptr(reinterpret_cast<Element *>(params.dk_ptr) + row_offset_dk),
Shape<Int<kBlockN>, Int<kHeadDim>>{},
make_stride(params.dk_row_stride, _1{}));
Tensor gdV = make_tensor(make_gmem_ptr(reinterpret_cast<Element *>(params.dv_ptr) + row_offset_dv),
Shape<Int<kBlockN>, Int<kHeadDim>>{},
make_stride(params.dv_row_stride, _1{}));
Tensor gdKaccum = make_tensor(make_gmem_ptr(reinterpret_cast<ElementAccum *>(params.dk_accum_ptr) + row_offset_dkv_accum),
Shape<Int<kBlockN>, Int<kHeadDim>>{},
Stride<Int<kHeadDim>, _1>{});
Tensor gdVaccum = make_tensor(make_gmem_ptr(reinterpret_cast<ElementAccum *>(params.dv_accum_ptr) + row_offset_dkv_accum),
Shape<Int<kBlockN>, Int<kHeadDim>>{},
Stride<Int<kHeadDim>, _1>{});
Tensor sdK = make_tensor(make_smem_ptr(reinterpret_cast<Element *>(smem_)),
typename Kernel_traits::SmemLayoutdKV{});
Tensor sdV = make_tensor(sdK.data() + size(sdK), typename Kernel_traits::SmemLayoutdKV{}); // (SMEM_N, SMEM_K)
typename Kernel_traits::GmemTiledCopydQ gmem_tiled_copy_dKV;
auto gmem_thr_copy_dKV = gmem_tiled_copy_dKV.get_thread_slice(tidx);
typename Kernel_traits::GmemTiledCopydQaccumAtomicAdd gmem_tiled_copy_dKVaccum;
auto gmem_thr_copy_dKVaccum = gmem_tiled_copy_dKVaccum.get_thread_slice(tidx);
typename Kernel_traits::TiledMmadKV tiled_mma_dkv;
auto smem_tiled_copy_dKV = make_tiled_copy_C(typename Kernel_traits::SmemCopyAtomdKV{}, tiled_mma_dkv);
auto smem_thr_copy_dKV = smem_tiled_copy_dKV.get_thread_slice(tidx);
Tensor taccdKsdK = smem_thr_copy_dKV.partition_D(sdK); // ((Atom,AtomNum),PIPE_M,PIPE_N)
Tensor taccdVsdV = smem_thr_copy_dKV.partition_D(sdV); // ((Atom,AtomNum),PIPE_M,PIPE_N)
Tensor tdKsdK = gmem_thr_copy_dKV.partition_S(sdK); // ((Atom,AtomNum),ATOM_M,ATOM_N)
Tensor tdKgdK = gmem_thr_copy_dKV.partition_D(gdK);
Tensor tdVsdV = gmem_thr_copy_dKV.partition_S(sdV); // ((Atom,AtomNum),ATOM_M,ATOM_N)
Tensor tdVgdV = gmem_thr_copy_dKV.partition_D(gdV);
Tensor tdKgdKaccum = gmem_thr_copy_dKVaccum.partition_S(gdKaccum);
Tensor tdVgdVaccum = gmem_thr_copy_dKVaccum.partition_S(gdVaccum);
Tensor acc_dk = partition_fragment_C(tiled_mma_dkv, Shape<Int<kBlockN>, Int<kHeadDim>>{}); // MMA, MMA_N, MMA_K
Tensor acc_dv = partition_fragment_C(tiled_mma_dkv, Shape<Int<kBlockN>, Int<kHeadDim>>{}); // MMA, MMA_N, MMA_K
CUTE_STATIC_ASSERT_V(size(acc_dk) == size(tdKgdKaccum));
CUTE_STATIC_ASSERT_V(size(acc_dv) == size(tdVgdVaccum));
Tensor tdKrdKaccum = make_fragment_like(tdKgdKaccum);
Tensor tdVrdVaccum = make_fragment_like(tdVgdVaccum);
cute::copy(gmem_tiled_copy_dKVaccum, tdKgdKaccum, tdKrdKaccum);
cute::copy(gmem_tiled_copy_dKVaccum, tdVgdVaccum, tdVrdVaccum);
#pragma unroll
for (int i = 0; i < size(acc_dk); ++i) {
acc_dk(i) = tdKrdKaccum(i) * params.scale_softmax_rp_dropout;
}
#pragma unroll
for (int i = 0; i < size(acc_dv); ++i) {
acc_dv(i) = tdVrdVaccum(i) * params.rp_dropout;
}
// Convert acc_dk from fp32 to fp16
Tensor rdK = flash::convert_type<Element>(acc_dk);
Tensor rdV = flash::convert_type<Element>(acc_dv);
Tensor taccdKrdK = smem_thr_copy_dKV.retile_S(rdK); // ((Atom,AtomNum), MMA_N, MMA_N)
Tensor taccdVrdV = smem_thr_copy_dKV.retile_S(rdV); // ((Atom,AtomNum), MMA_N, MMA_N)
cute::copy(smem_tiled_copy_dKV, taccdKrdK, taccdKsdK);
cute::copy(smem_tiled_copy_dKV, taccdVrdV, taccdVsdV);
__syncthreads();
Tensor tdKrdK = make_tensor<Element>(shape(tdKgdK));
Tensor tdVrdV = make_tensor<Element>(shape(tdVgdV));
cute::copy(gmem_tiled_copy_dKV, tdKsdK, tdKrdK);
cute::copy(gmem_tiled_copy_dKV, tdVsdV, tdVrdV);
Tensor cdKV = make_identity_tensor(Shape<Int<kBlockN>, Int<kHeadDim>>{}); // (BLK_M,BLK_K) -> (blk_m,blk_k)
Tensor tdKVcdKV = gmem_thr_copy_dKV.partition_D(cdKV);
Tensor tdKVpdKV = make_tensor<bool>(make_shape(size<2>(tdKgdK)));
#pragma unroll
for (int k = 0; k < size(tdKVpdKV); ++k) { tdKVpdKV(k) = get<1>(tdKVcdKV(0, 0, k)) < params.d; }
// Clear_OOB_K must be false since we don't want to write zeros to gmem
flash::copy</*Is_even_MN=*/false, /*Is_even_K=*/false, /*Clear_OOB_MN=*/false, /*Clear_OOB_K=*/false>(
gmem_tiled_copy_dKV, tdKrdK, tdKgdK, tdKVcdKV, tdKVpdKV, binfo.actual_seqlen_k - n_block * kBlockN
);
flash::copy</*Is_even_MN=*/false, /*Is_even_K=*/false, /*Clear_OOB_MN=*/false, /*Clear_OOB_K=*/false>(
gmem_tiled_copy_dKV, tdVrdV, tdVgdV, tdKVcdKV, tdKVpdKV, binfo.actual_seqlen_k - n_block * kBlockN
);
}
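// Editorial note (not part of the source): convert_dKV mirrors convert_dQ but reads a single
// accumulator (the seqlen_q-parallel path has no splits) and applies different scales: dK is
// multiplied by scale_softmax_rp_dropout while dV is multiplied by rp_dropout only, since the
// softmax scale enters the dK gradient but not dV.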
} // namespace flash
@@ -159,6 +159,7 @@ inline __device__ void compute_attn_1rowblock(const Params &params, const int bi
// Careful we're using the same smem for sQ and sK | sV if Share_Q_K_smem;
Tensor sK = make_tensor(sQ.data() + (Kernel_traits::Share_Q_K_smem ? 0 : size(sQ)),
typename Kernel_traits::SmemLayoutKV{});
Tensor sV = make_tensor(sK.data() + size(sK), typename Kernel_traits::SmemLayoutKV{});
Tensor sVt = make_tensor(sV.data(), typename Kernel_traits::SmemLayoutVtransposed{});
Tensor sVtNoSwizzle = make_tensor(sV.data().get(), typename Kernel_traits::SmemLayoutVtransposedNoSwizzle{});
@@ -579,16 +580,17 @@ inline __device__ void compute_attn_1rowblock_splitkv(const Params &params, cons
// We move K and V to the last block.
const int bidb_cache = params.cache_batch_idx == nullptr ? bidb : params.cache_batch_idx[bidb];
const int *block_table = params.block_table == nullptr ? nullptr : params.block_table + bidb * params.block_table_batch_stride;
const int block_table_idx = block_table == nullptr ? 0 : (n_block_max - 1) * kBlockN / params.page_block_size;
const int block_table_offset = block_table == nullptr ? 0 : (n_block_max - 1) * kBlockN - block_table_idx * params.page_block_size;
const index_t row_offset_k = block_table == nullptr
? binfo.k_offset(params.k_batch_stride, params.k_row_stride, bidb_cache)
+ (n_block_max - 1) * kBlockN * params.k_row_stride + (bidh / params.h_h_k_ratio) * params.k_head_stride
: block_table[block_table_idx] * params.k_batch_stride + block_table_offset * params.k_row_stride + (bidh / params.h_h_k_ratio) * params.k_head_stride;
: (bidh / params.h_h_k_ratio) * params.k_head_stride; // block addresses are later resolved per-thread
const index_t row_offset_v = block_table == nullptr
? binfo.k_offset(params.v_batch_stride, params.v_row_stride, bidb_cache)
+ (n_block_max - 1) * kBlockN * params.v_row_stride + (bidh / params.h_h_k_ratio) * params.v_head_stride
: block_table[block_table_idx] * params.v_batch_stride + block_table_offset * params.v_row_stride + (bidh / params.h_h_k_ratio) * params.v_head_stride;
: (bidh / params.h_h_k_ratio) * params.v_head_stride;
Tensor mQ = make_tensor(make_gmem_ptr(reinterpret_cast<Element*>(params.q_ptr) + binfo.q_offset(params.q_batch_stride, params.q_row_stride, bidb)),
make_shape(binfo.actual_seqlen_q, params.h, params.d),
@@ -602,7 +604,6 @@ inline __device__ void compute_attn_1rowblock_splitkv(const Params &params, cons
Tensor gV = make_tensor(make_gmem_ptr(reinterpret_cast<Element *>(params.v_ptr) + row_offset_v),
Shape<Int<kBlockN>, Int<kHeadDim>>{},
make_stride(params.v_row_stride, _1{}));
Tensor sQ = make_tensor(make_smem_ptr(reinterpret_cast<Element *>(smem_)),
typename Kernel_traits::SmemLayoutQ{});
Tensor sK = make_tensor(sQ.data() + size(sQ), typename Kernel_traits::SmemLayoutKV{});
@@ -610,15 +611,30 @@ inline __device__ void compute_attn_1rowblock_splitkv(const Params &params, cons
Tensor sVt = make_tensor(sV.data(), typename Kernel_traits::SmemLayoutVtransposed{});
Tensor sVtNoSwizzle = make_tensor(sV.data().get(), typename Kernel_traits::SmemLayoutVtransposedNoSwizzle{});
typename Kernel_traits::GmemTiledCopyQKV gmem_tiled_copy_QKV;
auto gmem_thr_copy_QKV = gmem_tiled_copy_QKV.get_thread_slice(tidx);
typename Kernel_traits::GmemTiledCopyQKV gmem_tiled_copy_Q;
auto gmem_thr_copy_Q = gmem_tiled_copy_Q.get_thread_slice(tidx);
typename Kernel_traits::GmemTiledCopyQKVPaged gmem_tiled_copy_KV;
auto gmem_thr_copy_KV = gmem_tiled_copy_KV.get_thread_slice(tidx);
Tensor tQgQ = gmem_thr_copy_QKV.partition_S(gQ);
Tensor tQsQ = gmem_thr_copy_QKV.partition_D(sQ);
Tensor tKgK = gmem_thr_copy_QKV.partition_S(gK); // (KCPY, KCPY_N, KCPY_K)
Tensor tKsK = gmem_thr_copy_QKV.partition_D(sK);
Tensor tVgV = gmem_thr_copy_QKV.partition_S(gV); // (VCPY, VCPY_N, VCPY_K)
Tensor tVsV = gmem_thr_copy_QKV.partition_D(sV);
Tensor tQgQ = gmem_thr_copy_Q.partition_S(gQ);
Tensor tQsQ = gmem_thr_copy_Q.partition_D(sQ);
Tensor tKgK_ = gmem_thr_copy_KV.partition_S(gK); // (KCPY, KCPY_N, KCPY_K)
Tensor tKsK_ = gmem_thr_copy_KV.partition_D(sK);
Tensor tVgV_ = gmem_thr_copy_KV.partition_S(gV); // (VCPY, VCPY_N, VCPY_K)
Tensor tVsV_ = gmem_thr_copy_KV.partition_D(sV);
Tensor tKgK = make_tensor(tKgK_.data(), reshape_thread_tile(tKgK_.layout()));
Tensor tKsK = make_tensor(tKsK_.data(), reshape_thread_tile(tKsK_.layout()));
Tensor tVgV = make_tensor(tVgV_.data(), reshape_thread_tile(tVgV_.layout()));
Tensor tVsV = make_tensor(tVsV_.data(), reshape_thread_tile(tVsV_.layout()));
if (block_table != nullptr) {
tKgK.data() = gK.data() + flash::resolve_thread_kv_page_slice_offset<Kernel_traits>(tidx, n_block_max, params.page_block_size,
block_table, params.k_batch_stride, params.k_row_stride);
tVgV.data() = gV.data() + flash::resolve_thread_kv_page_slice_offset<Kernel_traits>(tidx, n_block_max, params.page_block_size,
block_table, params.v_batch_stride, params.v_row_stride);
}
typename Kernel_traits::TiledMma tiled_mma;
auto thr_mma = tiled_mma.get_thread_slice(tidx);
@@ -656,8 +672,9 @@ inline __device__ void compute_attn_1rowblock_splitkv(const Params &params, cons
Tensor cKV = make_identity_tensor(make_shape(size<0>(sK), size<1>(sK))); // (BLK_N,BLK_K) -> (blk_n,blk_k)
// Repeat the partitioning with identity layouts
Tensor tQcQ = gmem_thr_copy_QKV.partition_S(cQ); // (ACPY,ACPY_M,ACPY_K) -> (blk_m,blk_k)
Tensor tKVcKV = gmem_thr_copy_QKV.partition_S(cKV); // (BCPY,BCPY_N,BCPY_K) -> (blk_n,blk_k)
Tensor tQcQ = gmem_thr_copy_Q.partition_S(cQ); // (ACPY,ACPY_M,ACPY_K) -> (blk_m,blk_k)
Tensor tKVcKV_ = gmem_thr_copy_KV.partition_S(cKV); // (BCPY,BCPY_N,BCPY_K) -> (blk_n,blk_k)
Tensor tKVcKV = make_tensor(tKVcKV_.data(), reshape_thread_tile(tKVcKV_.layout()));
// Allocate predicate tensors for k
Tensor tQpQ = make_tensor<bool>(make_shape(size<2>(tQsQ)));
@@ -674,11 +691,12 @@ inline __device__ void compute_attn_1rowblock_splitkv(const Params &params, cons
// Prologue
// Copy from Knew to K, optionally apply rotary embedding.
typename Kernel_traits::GmemTiledCopyRotcossin gmem_tiled_copy_rotary;
if constexpr (Append_KV) {
typename Kernel_traits::GmemTiledCopyRotcossinPaged gmem_tiled_copy_rotary;
auto gmem_thr_copy_rotary = gmem_tiled_copy_rotary.get_thread_slice(tidx);
typename Kernel_traits::GmemTiledCopyRotcossinCont gmem_tiled_copy_rotary_cont;
typename Kernel_traits::GmemTiledCopyRotcossinContPaged gmem_tiled_copy_rotary_cont;
auto gmem_thr_copy_rotary_cont = gmem_tiled_copy_rotary_cont.get_thread_slice(tidx);
if constexpr (Append_KV) {
// Even if we have MQA / GQA, all threadblocks responsible for the same KV head are writing to
// gmem. Technically it's a race condition, but they all write the same content anyway, and it's safe.
// We want to do this so that all threadblocks can proceed right after they finish writing the KV cache.
@@ -695,10 +713,17 @@ inline __device__ void compute_attn_1rowblock_splitkv(const Params &params, cons
Tensor gSinCont = make_tensor(make_gmem_ptr(reinterpret_cast<Element *>(params.rotary_sin_ptr) + row_offset_cossin),
Shape<Int<kBlockN>, Int<kHeadDim>>{},
make_stride(params.rotary_dim / 2, _1{}));
Tensor tRgCos = gmem_thr_copy_rotary.partition_S(gCos);
Tensor tRgSin = gmem_thr_copy_rotary.partition_S(gSin);
Tensor tRgCosCont = gmem_thr_copy_rotary_cont.partition_S(gCosCont);
Tensor tRgSinCont = gmem_thr_copy_rotary_cont.partition_S(gSinCont);
Tensor tRgCos_ = gmem_thr_copy_rotary.partition_S(gCos);
Tensor tRgSin_ = gmem_thr_copy_rotary.partition_S(gSin);
Tensor tRgCosCont_ = gmem_thr_copy_rotary_cont.partition_S(gCosCont);
Tensor tRgSinCont_ = gmem_thr_copy_rotary_cont.partition_S(gSinCont);
Tensor tRgCos = make_tensor(tRgCos_.data(), reshape_thread_tile(tRgCos_.layout()));
Tensor tRgSin = make_tensor(tRgSin_.data(), reshape_thread_tile(tRgSin_.layout()));
Tensor tRgCosCont = make_tensor(tRgCosCont_.data(), reshape_flatten_thread_tile(tRgCosCont_.layout()));
Tensor tRgSinCont = make_tensor(tRgSinCont_.data(), reshape_flatten_thread_tile(tRgSinCont_.layout()));
// if (cute::thread(0, 0)) { printf("rotary_cos_ptr = %p, gCos.data() = %p, tRgCos.data() = %p, rotary_dim = %d\n", params.rotary_cos_ptr, gCos.data(), tRgCos.data(), params.rotary_dim); }
// if (cute::thread(8, 0)) { print_tensor(gCos); }
// if (cute::thread(0, 0)) { print_tensor(tRgCos); }
@@ -721,8 +746,13 @@ inline __device__ void compute_attn_1rowblock_splitkv(const Params &params, cons
+ row_offset_vnew - binfo.seqlen_k_cache * params.vnew_row_stride),
Shape<Int<kBlockN>, Int<kHeadDim>>{},
make_stride(params.vnew_row_stride, _1{}));
Tensor tKgKnew = gmem_thr_copy_QKV.partition_S(gKnew); // (KCPY, KCPY_N, KCPY_K)
Tensor tVgVnew = gmem_thr_copy_QKV.partition_S(gVnew); // (VCPY, VCPY_N, VCPY_K)
typename Kernel_traits::GmemTiledCopyQKVPaged gmem_tiled_copy_KV_new;
auto gmem_thr_copy_KV_new = gmem_tiled_copy_KV_new.get_thread_slice(tidx);
Tensor tKgKnew_ = gmem_thr_copy_KV_new.partition_S(gKnew); // (KCPY, KCPY_N, KCPY_K)
Tensor tVgVnew_ = gmem_thr_copy_KV_new.partition_S(gVnew); // (VCPY, VCPY_N, VCPY_K)
auto tKgKnew = make_tensor(tKgKnew_.data(), reshape_thread_tile(tKgKnew_.layout()));
auto tVgVnew = make_tensor(tVgVnew_.data(), reshape_thread_tile(tVgVnew_.layout()));
const int n_block_copy_min = std::max(n_block_min, binfo.seqlen_k_cache / kBlockN);
auto tKgK_data = tKgK.data();
@@ -762,14 +792,10 @@ inline __device__ void compute_attn_1rowblock_splitkv(const Params &params, cons
tKgK.data() = tKgK.data() + (-int(kBlockN * params.k_row_stride));
} else {
if (n_block > n_block_copy_min) {
const int block_table_idx_cur = n_block * kBlockN / params.page_block_size;
const int block_table_offset_cur = n_block * kBlockN - block_table_idx_cur * params.page_block_size;
const int block_table_idx_next = (n_block - 1) * kBlockN / params.page_block_size;
const int block_table_offset_next = (n_block - 1) * kBlockN - block_table_idx_next * params.page_block_size;
const int table_diff = block_table[block_table_idx_next] - block_table[block_table_idx_cur];
const int offset_diff = block_table_offset_next - block_table_offset_cur;
tVgV.data() = tVgV.data() + table_diff * params.v_batch_stride + offset_diff * params.v_row_stride;
tKgK.data() = tKgK.data() + table_diff * params.k_batch_stride + offset_diff * params.k_row_stride;
tVgV.data() = gV.data() + flash::resolve_thread_kv_page_slice_offset<Kernel_traits>(tidx, n_block, params.page_block_size,
block_table, params.v_batch_stride, params.v_row_stride);
tKgK.data() = gK.data() + flash::resolve_thread_kv_page_slice_offset<Kernel_traits>(tidx, n_block, params.page_block_size,
block_table, params.k_batch_stride, params.k_row_stride);
}
}
}
@@ -782,9 +808,13 @@ inline __device__ void compute_attn_1rowblock_splitkv(const Params &params, cons
// Read Q from gmem to smem, optionally apply rotary embedding.
if (!Append_KV || params.rotary_dim == 0) {
// We don't need to clear the sQ smem tiles since we'll only write out the valid outputs
flash::copy<Is_even_MN, Is_even_K>(gmem_tiled_copy_QKV, tQgQ, tQsQ, tQcQ, tQpQ,
flash::copy<Is_even_MN, Is_even_K>(gmem_tiled_copy_Q, tQgQ, tQsQ, tQcQ, tQpQ,
binfo.actual_seqlen_q - m_block * kBlockM);
} else {
typename Kernel_traits::GmemTiledCopyRotcossin gmem_tiled_copy_rotary;
auto gmem_thr_copy_rotary = gmem_tiled_copy_rotary.get_thread_slice(tidx);
typename Kernel_traits::GmemTiledCopyRotcossinCont gmem_tiled_copy_rotary_cont;
auto gmem_thr_copy_rotary_cont = gmem_tiled_copy_rotary_cont.get_thread_slice(tidx);
const index_t row_offset_cossin = (binfo.seqlen_k_cache + (params.leftpad_k == nullptr ? 0 : params.leftpad_k[bidb]) + (Is_causal || Is_local ? m_block * kBlockM : 0)) * (params.rotary_dim / 2);
// If not causal, all the queries get the same cos/sin, taken at location seqlen_k_cache.
// We do this by setting the row stride of gCos / gSin to 0.
@@ -819,7 +849,7 @@ inline __device__ void compute_attn_1rowblock_splitkv(const Params &params, cons
int n_block = n_block_max - 1;
// We don't need to clear the sK smem tiles since we'll mask out the scores anyway.
flash::copy<Is_even_MN, Is_even_K>(gmem_tiled_copy_QKV, tKgK, tKsK, tKVcKV, tKVpKV,
flash::copy<Is_even_MN, Is_even_K>(gmem_tiled_copy_KV, tKgK, tKsK, tKVcKV, tKVpKV,
binfo.actual_seqlen_k - n_block * kBlockN);
cute::cp_async_fence();
@@ -858,17 +888,14 @@ inline __device__ void compute_attn_1rowblock_splitkv(const Params &params, cons
if (block_table == nullptr) {
tVgV.data() = tVgV.data() + (-int(kBlockN * params.v_row_stride));
} else {
const int block_table_idx_cur = (n_block + 1) * kBlockN / params.page_block_size;
const int block_table_offset_cur = (n_block + 1) * kBlockN - block_table_idx_cur * params.page_block_size;
const int block_table_idx_next = n_block * kBlockN / params.page_block_size;
const int block_table_offset_next = n_block * kBlockN - block_table_idx_next * params.page_block_size;
tVgV.data() = tVgV.data() + (block_table[block_table_idx_next] - block_table[block_table_idx_cur]) * params.v_batch_stride + (block_table_offset_next - block_table_offset_cur) * params.v_row_stride;
tVgV.data() = gV.data() + flash::resolve_thread_kv_page_slice_offset<Kernel_traits>(tidx, n_block + 1, params.page_block_size,
block_table, params.v_batch_stride, params.v_row_stride);
}
flash::copy</*Is_even_MN=*/true, Is_even_K>(gmem_tiled_copy_QKV, tVgV, tVsV, tKVcKV, tKVpKV);
flash::copy</*Is_even_MN=*/true, Is_even_K>(gmem_tiled_copy_KV, tVgV, tVsV, tKVcKV, tKVpKV);
} else {
// Clear the smem tiles to account for predicated off loads
flash::copy<Is_even_MN, Is_even_K, /*Clear_OOB_MN=*/true>(
gmem_tiled_copy_QKV, tVgV, tVsV, tKVcKV, tKVpKV, binfo.actual_seqlen_k - n_block * kBlockN
gmem_tiled_copy_KV, tVgV, tVsV, tKVcKV, tKVpKV, binfo.actual_seqlen_k - n_block * kBlockN
);
}
cute::cp_async_fence();
@@ -897,13 +924,10 @@ inline __device__ void compute_attn_1rowblock_splitkv(const Params &params, cons
if (block_table == nullptr) {
tKgK.data() = tKgK.data() + (-int(kBlockN * params.k_row_stride));
} else {
const int block_table_idx_cur = n_block * kBlockN / params.page_block_size;
const int block_table_offset_cur = n_block * kBlockN - block_table_idx_cur * params.page_block_size;
const int block_table_idx_next = (n_block - 1) * kBlockN / params.page_block_size;
const int block_table_offset_next =(n_block - 1) * kBlockN - block_table_idx_next * params.page_block_size;
tKgK.data() = tKgK.data() + (block_table[block_table_idx_next] - block_table[block_table_idx_cur]) * params.k_batch_stride + (block_table_offset_next - block_table_offset_cur) * params.k_row_stride;
tKgK.data() = gK.data() + flash::resolve_thread_kv_page_slice_offset<Kernel_traits>(tidx, n_block, params.page_block_size,
block_table, params.k_batch_stride, params.k_row_stride);
}
flash::copy</*Is_even_MN=*/true, Is_even_K>(gmem_tiled_copy_QKV, tKgK, tKsK, tKVcKV, tKVpKV);
flash::copy</*Is_even_MN=*/true, Is_even_K>(gmem_tiled_copy_KV, tKgK, tKsK, tKVcKV, tKVpKV);
// This cp_async_fence needs to be in the if block, otherwise the synchronization
// isn't right and we get race conditions.
cute::cp_async_fence();
@@ -940,13 +964,10 @@ inline __device__ void compute_attn_1rowblock_splitkv(const Params &params, cons
if (block_table == nullptr) {
tVgV.data() = tVgV.data() + (-int(kBlockN * params.v_row_stride));
} else {
const int block_table_idx_cur = (n_block + 1) * kBlockN / params.page_block_size;
const int block_table_offset_cur = (n_block + 1) * kBlockN - block_table_idx_cur * params.page_block_size;
const int block_table_idx_next = n_block * kBlockN / params.page_block_size;
const int block_table_offset_next = n_block * kBlockN - block_table_idx_next * params.page_block_size;
tVgV.data() = tVgV.data() + (block_table[block_table_idx_next] - block_table[block_table_idx_cur]) * params.v_batch_stride + (block_table_offset_next - block_table_offset_cur) * params.v_row_stride;
tVgV.data() = gV.data() + flash::resolve_thread_kv_page_slice_offset<Kernel_traits>(tidx, n_block + 1, params.page_block_size,
block_table, params.v_batch_stride, params.v_row_stride);
}
flash::copy</*Is_even_MN=*/true, Is_even_K>(gmem_tiled_copy_QKV, tVgV, tVsV, tKVcKV, tKVpKV);
flash::copy</*Is_even_MN=*/true, Is_even_K>(gmem_tiled_copy_KV, tVgV, tVsV, tKVcKV, tKVpKV);
cute::cp_async_fence();
flash::gemm(
@@ -964,13 +985,10 @@ inline __device__ void compute_attn_1rowblock_splitkv(const Params &params, cons
if (block_table == nullptr) {
tKgK.data() = tKgK.data() + (-int(kBlockN * params.k_row_stride));
} else {
const int block_table_idx_cur = n_block * kBlockN / params.page_block_size;
const int block_table_offset_cur = n_block * kBlockN - block_table_idx_cur * params.page_block_size;
const int block_table_idx_next = (n_block - 1) * kBlockN / params.page_block_size;
const int block_table_offset_next = (n_block - 1) * kBlockN - block_table_idx_next * params.page_block_size;
tKgK.data() = tKgK.data() + (block_table[block_table_idx_next] - block_table[block_table_idx_cur]) * params.k_batch_stride + (block_table_offset_next - block_table_offset_cur) * params.k_row_stride;
tKgK.data() = gK.data() + flash::resolve_thread_kv_page_slice_offset<Kernel_traits>(tidx, n_block, params.page_block_size,
block_table, params.k_batch_stride, params.k_row_stride);
}
flash::copy</*Is_even_MN=*/true, Is_even_K>(gmem_tiled_copy_QKV, tKgK, tKsK, tKVcKV, tKVpKV);
flash::copy</*Is_even_MN=*/true, Is_even_K>(gmem_tiled_copy_KV, tKgK, tKsK, tKVcKV, tKVpKV);
// This cp_async_fence needs to be in the if block, otherwise the synchronization
// isn't right and we get race conditions.
cute::cp_async_fence();
@@ -131,6 +131,17 @@ struct Flash_fwd_kernel_traits : public Base {
make_tiled_copy(Copy_Atom<Gmem_copy_struct, Element>{},
GmemLayoutAtom{},
Layout<Shape<_1, _8>>{})); // Val layout, 8 vals per read
// from how many rows does each thread have to fetch
static constexpr int kGmemRowsPerThread = kBlockN / (kNThreads / kGmemThreadsPerRow);
// Here we assign a contiguous tile to each thread, rather than a 1x8 row every
// (kNThreads / kGmemThreadsPerRow) rows, ensuring that the elements assigned to each thread
// do not cross a page boundary. This way, each thread need only fetch 1 page index per
// mainloop iteration. Rudimentary testing shows no slowdown.
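// Editorial worked example (values are assumptions, not taken from this file): with
// kBlockN = 128, kNThreads = 128 and kGmemThreadsPerRow = 8 there are 128 / 8 = 16 row-groups
// of threads, so kGmemRowsPerThread = 128 / 16 = 8 and each thread owns a contiguous 8-row by
// 8-element tile. Provided page_block_size is a multiple of that tile height, as the comment
// above assumes, the whole tile stays inside a single page each mainloop iteration.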
using GmemTiledCopyQKVPaged = decltype(
make_tiled_copy(Copy_Atom<Gmem_copy_struct, Element>{},
GmemLayoutAtom{},
Layout<Shape<Int<kGmemRowsPerThread>, _8>, Stride<_8, _1>>{}));
using GmemTiledCopyO = decltype(
make_tiled_copy(Copy_Atom<DefaultCopy, Element>{},
GmemLayoutAtom{},
@@ -156,6 +167,14 @@ struct Flash_fwd_kernel_traits : public Base {
make_tiled_copy(Copy_Atom<DefaultCopy, Element>{},
GmemLayoutAtomRotcossin{},
Layout<Shape < _1, _8>>{})); // Val layout, 8 vals per load
using GmemTiledCopyRotcossinPaged = decltype(
make_tiled_copy(Copy_Atom<UniversalCopy<uint64_t>, Element>{},
GmemLayoutAtomRotcossin{},
Layout<Shape<Int<kGmemRowsPerThread>, _4>, Stride<_4, _1>>{})); // Val layout, 4 vals per load
using GmemTiledCopyRotcossinContPaged = decltype(
make_tiled_copy(Copy_Atom<DefaultCopy, Element>{},
GmemLayoutAtomRotcossin{},
Layout<Shape<Int<kGmemRowsPerThread>, _8>, Stride<_8, _1>>{})); // Val layout, 8 vals per load
};
// Is_V_in_regs is an option to reduce smem usage, but will increase register pressure.
@@ -291,6 +291,53 @@ void cp_async_wait() {
////////////////////////////////////////////////////////////////////////////////////////////////////
// resolves offset of a slice of a paged kv copy from gmem.
// assumes that the tensor has already been positioned at the correct head.
template <typename Kernel_traits>
__forceinline__ __device__
int64_t resolve_thread_kv_page_slice_offset(const int tidx, const int n_block_max, const int page_block_size,
const int* block_table, const int page_stride, const int row_stride) {
constexpr int kGmemThreadsPerRow = Kernel_traits::kGmemThreadsPerRow;
constexpr int kGmemRowsPerThread = Kernel_traits::kGmemRowsPerThread;
constexpr int kGmemElemsPerLoad = Kernel_traits::kGmemElemsPerLoad;
constexpr int kBlockN = Kernel_traits::kBlockN;
const int64_t col_offset = tidx % kGmemThreadsPerRow * kGmemElemsPerLoad;
const int64_t block_row_offset = tidx / kGmemThreadsPerRow * kGmemRowsPerThread;
const int64_t global_row_offset = block_row_offset + (n_block_max - 1) * kBlockN;
const int64_t page_offset = global_row_offset % page_block_size;
const int64_t virtual_page_idx = global_row_offset / page_block_size;
return ((int64_t) block_table[virtual_page_idx]) * ((int64_t) page_stride)
+ page_offset * ((int64_t) row_stride)
+ col_offset;
}
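// Editorial worked example (hypothetical values): with kGmemThreadsPerRow = 8,
// kGmemRowsPerThread = 8, kGmemElemsPerLoad = 8, kBlockN = 128, page_block_size = 16,
// n_block_max = 3 and tidx = 21: col_offset = (21 % 8) * 8 = 40, block_row_offset = (21 / 8) * 8 = 16,
// global_row_offset = 16 + 2 * 128 = 272, virtual_page_idx = 272 / 16 = 17, page_offset = 272 % 16 = 0,
// so the thread starts reading at block_table[17] * page_stride + 0 * row_stride + 40, i.e. in the
// physical page holding logical row 272, at this thread's own column within that row.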
////////////////////////////////////////////////////////////////////////////////////////////////////
// Layout reshape function. Given a layout with modes ((v1, v2), m, k), returns (v1, v2, k),
// where v2 may be a tuple itself, in the case of swizzled smem-backed thread tiles. This ensures
// that paged and non-paged copies result in equivalently shaped, if not necessarily strided, tensors.
template <class Shape, class Stride>
__forceinline__ __device__
auto reshape_thread_tile(Layout<Shape, Stride> l) {
return make_layout(append(get<0>(l.shape()), get<2>(l.shape())),
append(get<0>(l.stride()), get<2>(l.stride())));
}
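// Editorial worked example (shapes are illustrative): a thread-tile layout with modes
// ((_8,_1),_1,_2) and strides ((_1,_0),_0,_128), i.e. ((v1, v2), m, k) with m == 1, becomes
// shape (_8,_1,_2) with strides (_1,_0,_128): the value modes and the k mode are appended and
// the unit m mode is dropped, so paged and non-paged partitions expose the same rank.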
// Reshapes and flattens the thread tile layout. A separate function is needed for the case where
// one of the modes of l is itself a layout and must be flattened, as opposed to being kept intact
// as in the swizzled-layout case above.
template <class Shape, class Stride>
__forceinline__ __device__
auto reshape_flatten_thread_tile(Layout<Shape, Stride> l) {
auto mode_0 = filter(flatten(get<0>(l)));
return make_layout(append(mode_0.shape(), get<2>(l.shape())),
append(mode_0.stride(), get<2>(l.stride())));
}
////////////////////////////////////////////////////////////////////////////////////////////////////
template <bool Is_even_MN=true, bool Is_even_K=true, bool Clear_OOB_MN=false, bool Clear_OOB_K=true,
typename TiledCopy, typename Engine0, typename Layout0, typename Engine1, typename Layout1,
typename Engine2, typename Layout2, typename Engine3, typename Layout3>
# Adapted from https://github.com/mlcommons/training_results_v1.1/blob/main/NVIDIA/benchmarks/bert/implementations/pytorch/padding.py
import torch
import torch.nn.functional as F
from einops import rearrange, repeat
class IndexFirstAxis(torch.autograd.Function):
@staticmethod
def forward(ctx, input, indices):
ctx.save_for_backward(indices)
assert input.ndim >= 2
ctx.first_axis_dim, other_shape = input.shape[0], input.shape[1:]
second_dim = other_shape.numel()
# TD [2022-03-04] For some reason torch.gather is a bit faster than indexing.
# return input[indices]
return torch.gather(
rearrange(input, "b ... -> b (...)"), 0, repeat(indices, "z -> z d", d=second_dim)
).reshape(-1, *other_shape)
@staticmethod
def backward(ctx, grad_output):
(indices,) = ctx.saved_tensors
assert grad_output.ndim >= 2
other_shape = grad_output.shape[1:]
grad_output = rearrange(grad_output, "b ... -> b (...)")
grad_input = torch.zeros(
[ctx.first_axis_dim, grad_output.shape[1]],
device=grad_output.device,
dtype=grad_output.dtype,
)
# TD [2022-03-04] For some reason torch.scatter is a bit faster than indexing.
# grad_input[indices] = grad_output
grad_input.scatter_(0, repeat(indices, "z -> z d", d=grad_output.shape[1]), grad_output)
return grad_input.reshape(ctx.first_axis_dim, *other_shape), None
index_first_axis = IndexFirstAxis.apply
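# Illustrative sketch (hypothetical helper, not part of the original file): index_first_axis(x, idx)
# should match plain integer indexing x[idx]; it just routes through gather/scatter, which the
# notes above say is slightly faster. Run manually, e.g. from a test.
def _demo_index_first_axis():
    x = torch.randn(6, 4, requires_grad=True)
    idx = torch.tensor([0, 2, 5])
    out = index_first_axis(x, idx)
    assert torch.allclose(out, x[idx])
    out.sum().backward()  # gradients flow back only to the selected rows of x
    assert x.grad[1].abs().sum() == 0  # unselected rows get zero gradient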
class IndexPutFirstAxis(torch.autograd.Function):
@staticmethod
def forward(ctx, values, indices, first_axis_dim):
ctx.save_for_backward(indices)
assert indices.ndim == 1
assert values.ndim >= 2
output = torch.zeros(
first_axis_dim, *values.shape[1:], device=values.device, dtype=values.dtype
)
# TD [2022-03-04] For some reason torch.scatter is a bit faster than indexing.
output[indices] = values
# output.scatter_(0, repeat(indices, 'z -> z d', d=values.shape[1]), values)
return output
@staticmethod
def backward(ctx, grad_output):
(indices,) = ctx.saved_tensors
# TD [2022-03-04] For some reason torch.gather is a bit faster than indexing.
grad_values = grad_output[indices]
# grad_values = torch.gather(grad_output, 0, repeat(indices, 'z -> z d', d=grad_output.shape[1]))
return grad_values, None, None
index_put_first_axis = IndexPutFirstAxis.apply
class IndexFirstAxisResidual(torch.autograd.Function):
@staticmethod
def forward(ctx, input, indices):
ctx.save_for_backward(indices)
assert input.ndim >= 2
ctx.first_axis_dim, other_shape = input.shape[0], input.shape[1:]
second_dim = other_shape.numel()
# TD [2022-03-04] For some reason torch.gather is a bit faster than indexing.
output = input[indices]
# We don't want to reshape input (b ... -> b (...)) since it could change the channel_last
# memory format to channel_first. In other words, input might not be contiguous.
# If we don't detach, Pytorch complains about output being a view and is being modified inplace
return output, input.detach()
@staticmethod
def backward(ctx, grad_output, grad_residual):
(indices,) = ctx.saved_tensors
assert grad_output.ndim >= 2
other_shape = grad_output.shape[1:]
assert grad_residual.shape[1:] == other_shape
grad_input = grad_residual
# grad_input[indices] += grad_output
indices = indices.reshape(indices.shape[0], *((1,) * (grad_output.ndim - 1)))
indices = indices.expand_as(grad_output)
grad_input.scatter_add_(0, indices, grad_output)
return grad_input.reshape(ctx.first_axis_dim, *other_shape), None
index_first_axis_residual = IndexFirstAxisResidual.apply
def unpad_input(hidden_states, attention_mask):
"""
Arguments:
hidden_states: (batch, seqlen, ...)
attention_mask: (batch, seqlen), bool / int, 1 means valid and 0 means not valid.
Return:
hidden_states: (total_nnz, ...), where total_nnz = number of tokens selected in attention_mask.
indices: (total_nnz), the indices of non-masked tokens from the flattened input sequence.
cu_seqlens: (batch + 1), the cumulative sequence lengths, used to index into hidden_states.
max_seqlen_in_batch: int
"""
seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
max_seqlen_in_batch = seqlens_in_batch.max().item()
cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
# TD [2022-03-04] We don't want to index with a bool mask, because Pytorch will expand the
# bool mask, then call nonzero to get the indices, then index with those. The indices are @dim
# times larger than they need to be, wasting memory. It's faster and more memory-efficient to
# index with integer indices. Moreover, torch's index is a bit slower than it needs to be,
# so we write custom forward and backward to make it a bit faster.
return (
index_first_axis(rearrange(hidden_states, "b s ... -> (b s) ..."), indices),
indices,
cu_seqlens,
max_seqlen_in_batch,
)
def unpad_input_for_concatenated_sequences(hidden_states, attention_mask_in_length):
"""
Supports concatenating short samples into one sequence. attention_mask_in_length is used to mask out the other short samples so that they do not attend to each other, which enables efficient training on variable-length samples (e.g., the supervised fine-tuning task for large language models).
The motivation for this function is explained [here](https://github.com/Dao-AILab/flash-attention/issues/432#issuecomment-1668822286).
For example, if batch = 3 and seqlen = 6, the attention_mask_in_length is:
```
[
[2, 3, 0, 0, 0, 0],
[3, 2, 0, 0, 0, 0],
[6, 0, 0, 0, 0, 0]
]
```
which corresponds to the 3D attention mask:
```
[
[
[1, 0, 0, 0, 0, 0],
[1, 1, 0, 0, 0, 0],
[0, 0, 1, 0, 0, 0],
[0, 0, 1, 1, 0, 0],
[0, 0, 1, 1, 1, 0],
[0, 0, 0, 0, 0, 1]
],
[
[1, 0, 0, 0, 0, 0],
[1, 1, 0, 0, 0, 0],
[1, 1, 1, 0, 0, 0],
[0, 0, 0, 1, 0, 0],
[0, 0, 0, 1, 1, 0],
[0, 0, 0, 0, 0, 1]
],
[
[1, 0, 0, 0, 0, 0],
[1, 1, 0, 0, 0, 0],
[1, 1, 1, 0, 0, 0],
[1, 1, 1, 1, 0, 0],
[1, 1, 1, 1, 1, 0],
[1, 1, 1, 1, 1, 1]
]
]
```
Arguments:
hidden_states: (batch, seqlen, ...)
attention_mask_in_length: (batch, seqlen), int. A nonzero entry (e.g., 1, 2, 3) gives the length of one concatenated sample in that batch row, and 0 means no sample.
Return:
hidden_states: (total_nnz, ...), where total_nnz = number of tokens selected according to attention_mask_in_length.
indices: (total_nnz), the indices of non-masked tokens from the flattened input sequence.
cu_seqlens: (batch + 1), the cumulative sequence lengths, used to index into hidden_states.
max_seqlen_in_batch: int
"""
length = attention_mask_in_length.sum(dim=-1)
seqlen = attention_mask_in_length.size(-1)
attention_mask_2d = torch.arange(seqlen, device=length.device, dtype=length.dtype).expand(len(length), seqlen) < length.unsqueeze(1)
real_indices_idx = torch.nonzero(attention_mask_in_length.flatten(), as_tuple=False).flatten()
seqlens_in_batch = attention_mask_in_length.flatten()[real_indices_idx]
indices = torch.nonzero(attention_mask_2d.flatten(), as_tuple=False).flatten()
max_seqlen_in_batch = seqlens_in_batch.max().item()
cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
# TD [2022-03-04] We don't want to index with a bool mask, because Pytorch will expand the
# bool mask, then call nonzero to get the indices, then index with those. The indices are @dim
# times larger than they need to be, wasting memory. It's faster and more memory-efficient to
# index with integer indices. Moreover, torch's index is a bit slower than it needs to be,
# so we write custom forward and backward to make it a bit faster.
return (
index_first_axis(rearrange(hidden_states, "b s ... -> (b s) ..."), indices),
indices,
cu_seqlens,
max_seqlen_in_batch,
)
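# Sketch (hypothetical helper, not part of the original file) reproducing the docstring example
# above: batch = 3, seqlen = 6, with packed sample lengths [2, 3], [3, 2], and [6].
def _demo_unpad_input_for_concatenated_sequences():
    attention_mask_in_length = torch.tensor(
        [[2, 3, 0, 0, 0, 0], [3, 2, 0, 0, 0, 0], [6, 0, 0, 0, 0, 0]]
    )
    hidden_states = torch.randn(3, 6, 8)
    out, indices, cu_seqlens, max_seqlen = unpad_input_for_concatenated_sequences(
        hidden_states, attention_mask_in_length
    )
    # The five packed samples have lengths [2, 3, 3, 2, 6], so cu_seqlens is
    # [0, 2, 5, 8, 10, 16] and out keeps 16 of the 18 token positions.
    assert cu_seqlens.tolist() == [0, 2, 5, 8, 10, 16]
    assert out.shape == (16, 8) and max_seqlen == 6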
def pad_input(hidden_states, indices, batch, seqlen):
"""
Arguments:
hidden_states: (total_nnz, ...), where total_nnz = number of tokens selected in attention_mask.
indices: (total_nnz), the indices that represent the non-masked tokens of the original padded input sequence.
batch: int, batch size for the padded sequence.
seqlen: int, maximum sequence length for the padded sequence.
Return:
hidden_states: (batch, seqlen, ...)
"""
dim = hidden_states.shape[-1]
# output = torch.zeros((batch * seqlen), dim, device=hidden_states.device, dtype=hidden_states.dtype)
# output[indices] = hidden_states
output = index_put_first_axis(hidden_states, indices, batch * seqlen)
return rearrange(output, "(b s) ... -> b s ...", b=batch)
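# Sketch (hypothetical helper, not part of the original file): unpad_input followed by pad_input
# recovers the valid tokens exactly and zero-fills the padded positions.
def _demo_unpad_pad_roundtrip():
    batch, seqlen, dim = 2, 4, 8
    hidden_states = torch.randn(batch, seqlen, dim)
    attention_mask = torch.tensor([[1, 1, 1, 0], [1, 1, 0, 0]])
    x_unpad, indices, cu_seqlens, max_seqlen = unpad_input(hidden_states, attention_mask)
    assert cu_seqlens.tolist() == [0, 3, 5] and max_seqlen == 3  # 3 + 2 valid tokens
    x_repad = pad_input(x_unpad, indices, batch, seqlen)
    mask = attention_mask.bool()
    assert torch.equal(x_repad[mask], hidden_states[mask])
    assert torch.all(x_repad[~mask] == 0)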
"""
*Experimental* implementation of FlashAttention in Triton.
Tested with triton==2.0.0.dev20221202.
Triton 2.0 has a new backend (MLIR) but it doesn't seem to work yet for head dimensions
other than 64:
https://github.com/openai/triton/blob/d376020f90002757eea3ea9475d4f7cfc2ec5ead/python/triton/ops/flash_attention.py#L207
We'll update this implementation with the new Triton backend once this is fixed.
We use the FlashAttention implementation from Phil Tillet as a starting point.
https://github.com/openai/triton/blob/master/python/tutorials/06-fused-attention.py
Changes:
- Implement both causal and non-causal attention.
- Implement both self-attention and cross-attention.
- Support arbitrary seqlens (not just multiples of 128), for both forward and backward.
- Support all head dimensions up to 128 (not just 16, 32, 64, 128), for both forward and backward.
- Support attention bias.
- Speed up the forward pass a bit, and only store the LSE instead of m and l.
- Make the backward for d=128 much faster by reducing register spilling.
- Optionally parallelize the backward pass across seqlen_k, to deal with the case of
small batch size * nheads.
Caution:
- This is an *experimental* implementation. The forward pass should be quite robust but
I'm not 100% sure that the backward pass doesn't have race conditions (due to the Triton compiler).
- This implementation has only been tested on A100.
- If you plan to use headdim other than 64 and 128, you should test for race conditions
(due to the Triton compiler), as done in tests/test_flash_attn.py
"test_flash_attn_triton_race_condition". I've tested and fixed many race conditions
for different head dimensions (40, 48, 64, 128, 80, 88, 96), but I'm still not 100% confident
that there are none left for other head dimensions.
Differences between this Triton version and the CUDA version:
- Triton version doesn't support dropout.
- Triton forward is generally faster than CUDA forward, while Triton backward is
generally slower than CUDA backward. Overall Triton forward + backward is slightly slower
than CUDA forward + backward.
- Triton version doesn't support different sequence lengths in a batch (i.e., RaggedTensor/NestedTensor).
- Triton version supports attention bias, while CUDA version doesn't.
"""
import math
import torch
import triton
import triton.language as tl
# Disabling autotune for now, set num_warps=4 if headdim=64 and num_warps=8 if headdim=128
# @triton.autotune(
# configs=[
# triton.Config({"BLOCK_M": 128, "BLOCK_N": 128}, num_warps=4, num_stages=1),
# # This config has a race condition when EVEN_M == False, disabling it for now.
# # triton.Config({"BLOCK_M": 64, "BLOCK_N": 64}, num_warps=4, num_stages=1),
# ],
# key=['CACHE_KEY_SEQLEN_Q', 'CACHE_KEY_SEQLEN_K', 'BIAS_TYPE', 'IS_CAUSAL', 'BLOCK_HEADDIM']
# )
@triton.heuristics(
{
"EVEN_M": lambda args: args["seqlen_q"] % args["BLOCK_M"] == 0,
"EVEN_N": lambda args: args["seqlen_k"] % args["BLOCK_N"] == 0,
"EVEN_HEADDIM": lambda args: args["headdim"] == args["BLOCK_HEADDIM"],
}
)
@triton.jit
def _fwd_kernel(
Q,
K,
V,
Bias,
Out,
Lse,
TMP,  # NOTE: TMP is a scratchpad buffer to work around a compiler bug
softmax_scale,
stride_qb,
stride_qh,
stride_qm,
stride_kb,
stride_kh,
stride_kn,
stride_vb,
stride_vh,
stride_vn,
stride_bb,
stride_bh,
stride_bm,
stride_ob,
stride_oh,
stride_om,
nheads,
seqlen_q,
seqlen_k,
seqlen_q_rounded,
headdim,
CACHE_KEY_SEQLEN_Q,
CACHE_KEY_SEQLEN_K,
BIAS_TYPE: tl.constexpr,
IS_CAUSAL: tl.constexpr,
BLOCK_HEADDIM: tl.constexpr,
EVEN_M: tl.constexpr,
EVEN_N: tl.constexpr,
EVEN_HEADDIM: tl.constexpr,
BLOCK_M: tl.constexpr,
BLOCK_N: tl.constexpr,
):
start_m = tl.program_id(0)
off_hb = tl.program_id(1)
off_b = off_hb // nheads
off_h = off_hb % nheads
# off_b = tl.program_id(1)
# off_h = tl.program_id(2)
# off_hb = off_b * nheads + off_h
# initialize offsets
offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M)
offs_n = tl.arange(0, BLOCK_N)
offs_d = tl.arange(0, BLOCK_HEADDIM)
# Initialize pointers to Q, K, V
# Adding parentheses around indexing might use int32 math instead of int64 math?
# https://github.com/openai/triton/issues/741
# I'm seeing a tiny bit of difference (5-7us)
q_ptrs = (
Q + off_b * stride_qb + off_h * stride_qh + (offs_m[:, None] * stride_qm + offs_d[None, :])
)
k_ptrs = (
K + off_b * stride_kb + off_h * stride_kh + (offs_n[:, None] * stride_kn + offs_d[None, :])
)
v_ptrs = (
V + off_b * stride_vb + off_h * stride_vh + (offs_n[:, None] * stride_vn + offs_d[None, :])
)
if BIAS_TYPE == "vector":
b_ptrs = Bias + off_b * stride_bb + off_h * stride_bh + offs_n
elif BIAS_TYPE == "matrix":
b_ptrs = (
Bias
+ off_b * stride_bb
+ off_h * stride_bh
+ (offs_m[:, None] * stride_bm + offs_n[None, :])
)
# initialize pointer to m and l
t_ptrs = TMP + off_hb * seqlen_q_rounded + offs_m
lse_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf")
m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf")
acc_o = tl.zeros([BLOCK_M, BLOCK_HEADDIM], dtype=tl.float32)
# load q: it will stay in SRAM throughout
# [2022-10-30] TD: Triton bug - in the case of EVEN_M=True and EVEN_N=False, if we just call
# tl.load(q_ptrs), we get the wrong output!
if EVEN_M & EVEN_N:
if EVEN_HEADDIM:
q = tl.load(q_ptrs)
else:
q = tl.load(q_ptrs, mask=offs_d[None, :] < headdim, other=0.0)
else:
if EVEN_HEADDIM:
q = tl.load(q_ptrs, mask=offs_m[:, None] < seqlen_q, other=0.0)
else:
q = tl.load(
q_ptrs, mask=(offs_m[:, None] < seqlen_q) & (offs_d[None, :] < headdim), other=0.0
)
# loop over k, v and update accumulator
end_n = seqlen_k if not IS_CAUSAL else tl.minimum((start_m + 1) * BLOCK_M, seqlen_k)
for start_n in range(0, end_n, BLOCK_N):
start_n = tl.multiple_of(start_n, BLOCK_N)
# -- compute qk ----
if EVEN_N & EVEN_M: # If we just do "if EVEN_N", there seems to be some race condition
if EVEN_HEADDIM:
k = tl.load(k_ptrs + start_n * stride_kn)
else:
k = tl.load(k_ptrs + start_n * stride_kn, mask=offs_d[None, :] < headdim, other=0.0)
else:
if EVEN_HEADDIM:
k = tl.load(
k_ptrs + start_n * stride_kn,
mask=(start_n + offs_n)[:, None] < seqlen_k,
other=0.0,
)
else:
k = tl.load(
k_ptrs + start_n * stride_kn,
mask=((start_n + offs_n)[:, None] < seqlen_k) & (offs_d[None, :] < headdim),
other=0.0,
)
qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)
qk += tl.dot(q, k, trans_b=True)
# Trying to combine the two masks seems to make the result wrong
if not EVEN_N: # Need to mask out otherwise the softmax is wrong
qk += tl.where((start_n + offs_n)[None, :] < seqlen_k, 0, float("-inf"))
if IS_CAUSAL:
qk += tl.where(offs_m[:, None] >= (start_n + offs_n)[None, :], 0, float("-inf"))
if BIAS_TYPE != "none":
if BIAS_TYPE == "vector":
if EVEN_N:
bias = tl.load(b_ptrs + start_n).to(tl.float32)
else:
bias = tl.load(
b_ptrs + start_n, mask=(start_n + offs_n) < seqlen_k, other=0.0
).to(tl.float32)
bias = bias[None, :]
elif BIAS_TYPE == "matrix":
if EVEN_M & EVEN_N:
bias = tl.load(b_ptrs + start_n).to(tl.float32)
else:
bias = tl.load(
b_ptrs + start_n,
mask=(offs_m[:, None] < seqlen_q)
& ((start_n + offs_n)[None, :] < seqlen_k),
other=0.0,
).to(tl.float32)
# Slightly faster to multiply the softmax_scale in the tl.exp below since the compiler
# can then fuse the mult and add into an fma instruction. But if we have bias we need
# to multiply with softmax_scale here.
qk = qk * softmax_scale + bias
m_ij = tl.maximum(tl.max(qk, 1), lse_i)
p = tl.exp(qk - m_ij[:, None])
else:
m_ij = tl.maximum(tl.max(qk, 1) * softmax_scale, lse_i)
p = tl.exp(qk * softmax_scale - m_ij[:, None])
l_ij = tl.sum(p, 1)
# scale acc_o
acc_o_scale = tl.exp(m_i - m_ij)
# # -- update output accumulator --
# BUG: have to store and immediately load
tl.store(t_ptrs, acc_o_scale)
acc_o_scale = tl.load(t_ptrs)
acc_o = acc_o * acc_o_scale[:, None]
# update acc_o
if EVEN_N & EVEN_M: # If we just do "if EVEN_N", there seems to be some race condition
if EVEN_HEADDIM:
v = tl.load(v_ptrs + start_n * stride_vn)
else:
v = tl.load(v_ptrs + start_n * stride_vn, mask=offs_d[None, :] < headdim, other=0.0)
else:
if EVEN_HEADDIM:
v = tl.load(
v_ptrs + start_n * stride_vn,
mask=(start_n + offs_n)[:, None] < seqlen_k,
other=0.0,
)
else:
v = tl.load(
v_ptrs + start_n * stride_vn,
mask=((start_n + offs_n)[:, None] < seqlen_k) & (offs_d[None, :] < headdim),
other=0.0,
)
p = p.to(v.dtype)
acc_o += tl.dot(p, v)
# -- update statistics
m_i = m_ij
l_i_new = tl.exp(lse_i - m_ij) + l_ij
lse_i = m_ij + tl.log(l_i_new)
o_scale = tl.exp(m_i - lse_i)
# BUG: have to store and immediately load
tl.store(t_ptrs, o_scale)
o_scale = tl.load(t_ptrs)
acc_o = acc_o * o_scale[:, None]
# rematerialize offsets to save registers
start_m = tl.program_id(0)
offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M)
# write back l and m
lse_ptrs = Lse + off_hb * seqlen_q_rounded + offs_m
tl.store(lse_ptrs, lse_i)
# initialize pointers to output
offs_d = tl.arange(0, BLOCK_HEADDIM)
out_ptrs = (
Out
+ off_b * stride_ob
+ off_h * stride_oh
+ (offs_m[:, None] * stride_om + offs_d[None, :])
)
if EVEN_M:
if EVEN_HEADDIM:
tl.store(out_ptrs, acc_o)
else:
tl.store(out_ptrs, acc_o, mask=offs_d[None, :] < headdim)
else:
if EVEN_HEADDIM:
tl.store(out_ptrs, acc_o, mask=offs_m[:, None] < seqlen_q)
else:
tl.store(
out_ptrs, acc_o, mask=(offs_m[:, None] < seqlen_q) & (offs_d[None, :] < headdim)
)
@triton.jit
def _bwd_preprocess_do_o_dot(
Out,
DO,
Delta,
stride_ob,
stride_oh,
stride_om,
stride_dob,
stride_doh,
stride_dom,
nheads,
seqlen_q,
seqlen_q_rounded,
headdim,
BLOCK_M: tl.constexpr,
BLOCK_HEADDIM: tl.constexpr,
):
start_m = tl.program_id(0)
off_hb = tl.program_id(1)
off_b = off_hb // nheads
off_h = off_hb % nheads
# initialize offsets
offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M)
offs_d = tl.arange(0, BLOCK_HEADDIM)
# load
o = tl.load(
Out + off_b * stride_ob + off_h * stride_oh + offs_m[:, None] * stride_om + offs_d[None, :],
mask=(offs_m[:, None] < seqlen_q) & (offs_d[None, :] < headdim),
other=0.0,
).to(tl.float32)
do = tl.load(
DO
+ off_b * stride_dob
+ off_h * stride_doh
+ offs_m[:, None] * stride_dom
+ offs_d[None, :],
mask=(offs_m[:, None] < seqlen_q) & (offs_d[None, :] < headdim),
other=0.0,
).to(tl.float32)
delta = tl.sum(o * do, axis=1)
# write-back
tl.store(Delta + off_hb * seqlen_q_rounded + offs_m, delta)
@triton.jit
def _bwd_store_dk_dv(
dk_ptrs,
dv_ptrs,
dk,
dv,
offs_n,
offs_d,
seqlen_k,
headdim,
EVEN_M: tl.constexpr,
EVEN_N: tl.constexpr,
EVEN_HEADDIM: tl.constexpr,
):
# [2022-11-01] TD: Same bug. In the case of EVEN_N=True and EVEN_M=False,
# if we just call tl.store(dv_ptrs), there's a race condition
if EVEN_N & EVEN_M:
if EVEN_HEADDIM:
tl.store(dv_ptrs, dv)
tl.store(dk_ptrs, dk)
else:
tl.store(dv_ptrs, dv, mask=offs_d[None, :] < headdim)
tl.store(dk_ptrs, dk, mask=offs_d[None, :] < headdim)
else:
if EVEN_HEADDIM:
tl.store(dv_ptrs, dv, mask=offs_n[:, None] < seqlen_k)
tl.store(dk_ptrs, dk, mask=offs_n[:, None] < seqlen_k)
else:
tl.store(dv_ptrs, dv, mask=(offs_n[:, None] < seqlen_k) & (offs_d[None, :] < headdim))
tl.store(dk_ptrs, dk, mask=(offs_n[:, None] < seqlen_k) & (offs_d[None, :] < headdim))
@triton.jit
def _bwd_kernel_one_col_block(
start_n,
Q,
K,
V,
Bias,
DO,
DQ,
DK,
DV,
LSE,
D,
softmax_scale,
stride_qm,
stride_kn,
stride_vn,
stride_bm,
stride_dom,
stride_dqm,
stride_dkn,
stride_dvn,
seqlen_q,
seqlen_k,
headdim,
ATOMIC_ADD: tl.constexpr,
BIAS_TYPE: tl.constexpr,
IS_CAUSAL: tl.constexpr,
BLOCK_HEADDIM: tl.constexpr,
EVEN_M: tl.constexpr,
EVEN_N: tl.constexpr,
EVEN_HEADDIM: tl.constexpr,
BLOCK_M: tl.constexpr,
BLOCK_N: tl.constexpr,
):
# We need to make sure begin_m is a multiple of BLOCK_M (not BLOCK_N)
begin_m = 0 if not IS_CAUSAL else ((start_n * BLOCK_N) // BLOCK_M) * BLOCK_M
# initialize row/col offsets
offs_qm = begin_m + tl.arange(0, BLOCK_M)
offs_n = start_n * BLOCK_N + tl.arange(0, BLOCK_N)
offs_m = tl.arange(0, BLOCK_M)
offs_d = tl.arange(0, BLOCK_HEADDIM)
# initialize pointers to value-like data
q_ptrs = Q + (offs_qm[:, None] * stride_qm + offs_d[None, :])
k_ptrs = K + (offs_n[:, None] * stride_kn + offs_d[None, :])
v_ptrs = V + (offs_n[:, None] * stride_vn + offs_d[None, :])
do_ptrs = DO + (offs_qm[:, None] * stride_dom + offs_d[None, :])
dq_ptrs = DQ + (offs_qm[:, None] * stride_dqm + offs_d[None, :])
if BIAS_TYPE == "vector":
b_ptrs = Bias + offs_n
elif BIAS_TYPE == "matrix":
b_ptrs = Bias + (offs_qm[:, None] * stride_bm + offs_n[None, :])
# initialize dv and dk
dv = tl.zeros([BLOCK_N, BLOCK_HEADDIM], dtype=tl.float32)
dk = tl.zeros([BLOCK_N, BLOCK_HEADDIM], dtype=tl.float32)
# There seems to be some problem with Triton pipelining that makes results wrong for
# headdim=64, seqlen=(113, 255), bias_type='matrix'. In this case the for loop
# may have zero step, and pipelining with the bias matrix could screw it up.
# So we just exit early.
if begin_m >= seqlen_q:
dv_ptrs = DV + (offs_n[:, None] * stride_dvn + offs_d[None, :])
dk_ptrs = DK + (offs_n[:, None] * stride_dkn + offs_d[None, :])
_bwd_store_dk_dv(
dk_ptrs,
dv_ptrs,
dk,
dv,
offs_n,
offs_d,
seqlen_k,
headdim,
EVEN_M=EVEN_M,
EVEN_N=EVEN_N,
EVEN_HEADDIM=EVEN_HEADDIM,
)
return
# k and v stay in SRAM throughout
# [2022-10-30] TD: Same bug as the fwd. In the case of EVEN_N=True and EVEN_M=False,
# if we just call tl.load(k_ptrs), we get the wrong output!
if EVEN_N & EVEN_M:
if EVEN_HEADDIM:
k = tl.load(k_ptrs)
v = tl.load(v_ptrs)
else:
k = tl.load(k_ptrs, mask=offs_d[None, :] < headdim, other=0.0)
v = tl.load(v_ptrs, mask=offs_d[None, :] < headdim, other=0.0)
else:
if EVEN_HEADDIM:
k = tl.load(k_ptrs, mask=offs_n[:, None] < seqlen_k, other=0.0)
v = tl.load(v_ptrs, mask=offs_n[:, None] < seqlen_k, other=0.0)
else:
k = tl.load(
k_ptrs, mask=(offs_n[:, None] < seqlen_k) & (offs_d[None, :] < headdim), other=0.0
)
v = tl.load(
v_ptrs, mask=(offs_n[:, None] < seqlen_k) & (offs_d[None, :] < headdim), other=0.0
)
# loop over rows
num_block_m = tl.cdiv(seqlen_q, BLOCK_M)
for start_m in range(begin_m, num_block_m * BLOCK_M, BLOCK_M):
start_m = tl.multiple_of(start_m, BLOCK_M)
offs_m_curr = start_m + offs_m
# load q, k, v, do on-chip
# Same bug as below. Otherwise gives wrong result for headdim=40, seqlen=(128, 117)
if EVEN_M & EVEN_HEADDIM:
q = tl.load(q_ptrs)
else:
if EVEN_HEADDIM:
q = tl.load(q_ptrs, mask=offs_m_curr[:, None] < seqlen_q, other=0.0)
else:
q = tl.load(
q_ptrs,
mask=(offs_m_curr[:, None] < seqlen_q) & (offs_d[None, :] < headdim),
other=0.0,
)
# recompute p = softmax(qk, dim=-1).T
qk = tl.dot(q, k, trans_b=True)
# Trying to combine the two masks seems to make the result wrong
if not EVEN_N: # Need to mask out otherwise the softmax is wrong
qk = tl.where(offs_n[None, :] < seqlen_k, qk, float("-inf"))
if IS_CAUSAL:
qk = tl.where(offs_m_curr[:, None] >= (offs_n[None, :]), qk, float("-inf"))
if BIAS_TYPE != "none":
tl.debug_barrier() # Race condition otherwise
if BIAS_TYPE == "vector":
if EVEN_N:
bias = tl.load(b_ptrs).to(tl.float32)
else:
bias = tl.load(b_ptrs, mask=offs_n < seqlen_k, other=0.0).to(tl.float32)
bias = bias[None, :]
elif BIAS_TYPE == "matrix":
if EVEN_M & EVEN_N:
bias = tl.load(b_ptrs).to(tl.float32)
else:
bias = tl.load(
b_ptrs,
mask=(offs_m_curr[:, None] < seqlen_q) & (offs_n[None, :] < seqlen_k),
other=0.0,
).to(tl.float32)
qk = qk * softmax_scale + bias
# There seems to be a race condition when headdim=48/96, and dq, dk, dv are wrong.
# Also wrong for headdim=64.
if not (EVEN_M & EVEN_HEADDIM):
tl.debug_barrier()
lse_i = tl.load(LSE + offs_m_curr)
if BIAS_TYPE == "none":
p = tl.exp(qk * softmax_scale - lse_i[:, None])
else:
p = tl.exp(qk - lse_i[:, None])
# compute dv
# [2022-10-30] TD: A Triton bug: if EVEN_M=True and EVEN_HEADDIM=False, if we call
# do = tl.load(do_ptrs, mask=offs_d[None, :] < headdim, other=0.0), we get wrong outputs
# in the case of headdim=48/96, seqlen_q & seqlen_k >= 512. If headdim=40 or seqlen < 512,
# the output is correct.
if EVEN_M & EVEN_HEADDIM:
do = tl.load(do_ptrs)
else:
# [2022-11-01] TD: Triton bug, there's a race condition if we just use m_mask and not d_mask.
do = tl.load(
do_ptrs,
mask=(offs_m_curr[:, None] < seqlen_q) & (offs_d[None, :] < headdim),
other=0.0,
)
# if EVEN_M:
# if EVEN_HEADDIM:
# do = tl.load(do_ptrs)
# else:
# do = tl.load(do_ptrs, mask=offs_d[None, :] < headdim, other=0.0)
# else:
# if EVEN_HEADDIM:
# do = tl.load(do_ptrs, mask=offs_m_curr[:, None] < seqlen_q, other=0.0)
# else:
# do = tl.load(do_ptrs, mask=(offs_m_curr[:, None] < seqlen_q)
# & (offs_d[None, :] < headdim), other=0.0)
dv += tl.dot(p.to(do.dtype), do, trans_a=True)
# compute dp = dot(v, do)
# There seems to be a race condition when headdim=48/96, and dq, dk are wrong.
# Also wrong for headdim=128, seqlen=(108, 256), and ATOMIC_ADD=True
# Also wrong for headdim=64, seqlen=(1023, 1024), and ATOMIC_ADD=False
if not (EVEN_M & EVEN_HEADDIM):
tl.debug_barrier()
dp = tl.dot(do, v, trans_b=True)
# There's a race condition for headdim=48
if not EVEN_HEADDIM:
tl.debug_barrier()
# compute ds = p * (dp - delta[:, None])
# Putting the subtraction after the dp matmul (instead of before) is slightly faster
Di = tl.load(D + offs_m_curr)
# Converting ds to q.dtype here reduces register pressure and makes it much faster
# for BLOCK_HEADDIM=128
ds = (p * (dp - Di[:, None]) * softmax_scale).to(q.dtype)
# compute dk = dot(ds.T, q)
dk += tl.dot(ds, q, trans_a=True)
# compute dq
if not (
EVEN_M & EVEN_HEADDIM
):  # Otherwise there's a race condition when BIAS_TYPE='matrix'
tl.debug_barrier()
if not ATOMIC_ADD:
if EVEN_M & EVEN_HEADDIM: # Race condition if we just do EVEN_M
dq = tl.load(dq_ptrs, eviction_policy="evict_last")
dq += tl.dot(ds, k)
tl.store(dq_ptrs, dq, eviction_policy="evict_last")
else:
if EVEN_HEADDIM:
dq = tl.load(
dq_ptrs,
mask=offs_m_curr[:, None] < seqlen_q,
other=0.0,
eviction_policy="evict_last",
)
dq += tl.dot(ds, k)
tl.store(
dq_ptrs,
dq,
mask=offs_m_curr[:, None] < seqlen_q,
eviction_policy="evict_last",
)
else:
dq = tl.load(
dq_ptrs,
mask=(offs_m_curr[:, None] < seqlen_q) & (offs_d[None, :] < headdim),
other=0.0,
eviction_policy="evict_last",
)
dq += tl.dot(ds, k)
tl.store(
dq_ptrs,
dq,
mask=(offs_m_curr[:, None] < seqlen_q) & (offs_d[None, :] < headdim),
eviction_policy="evict_last",
)
else: # If we're parallelizing across the seqlen_k dimension
dq = tl.dot(ds, k)
if EVEN_M & EVEN_HEADDIM: # Race condition if we just do EVEN_M
tl.atomic_add(dq_ptrs, dq)
else:
if EVEN_HEADDIM:
tl.atomic_add(dq_ptrs, dq, mask=offs_m_curr[:, None] < seqlen_q)
else:
tl.atomic_add(
dq_ptrs,
dq,
mask=(offs_m_curr[:, None] < seqlen_q) & (offs_d[None, :] < headdim),
)
# increment pointers
dq_ptrs += BLOCK_M * stride_dqm
q_ptrs += BLOCK_M * stride_qm
do_ptrs += BLOCK_M * stride_dom
if BIAS_TYPE == "matrix":
b_ptrs += BLOCK_M * stride_bm
# write-back
dv_ptrs = DV + (offs_n[:, None] * stride_dvn + offs_d[None, :])
dk_ptrs = DK + (offs_n[:, None] * stride_dkn + offs_d[None, :])
_bwd_store_dk_dv(
dk_ptrs,
dv_ptrs,
dk,
dv,
offs_n,
offs_d,
seqlen_k,
headdim,
EVEN_M=EVEN_M,
EVEN_N=EVEN_N,
EVEN_HEADDIM=EVEN_HEADDIM,
)
def init_to_zero(name):
return lambda nargs: nargs[name].zero_()
@triton.autotune(
configs=[
triton.Config(
{"BLOCK_M": 128, "BLOCK_N": 128, "SEQUENCE_PARALLEL": False},
num_warps=8,
num_stages=1,
pre_hook=init_to_zero("DQ"),
),
triton.Config(
{"BLOCK_M": 128, "BLOCK_N": 128, "SEQUENCE_PARALLEL": True},
num_warps=8,
num_stages=1,
pre_hook=init_to_zero("DQ"),
),
# Other configs seem to give wrong results when seqlen_q % 128 != 0, disabling them for now
# # Kernel is buggy (gives wrong results) if we set BLOCK_m=128, BLOCK_n=64, num_warps=*4*
# triton.Config({"BLOCK_M": 128, "BLOCK_N": 64, "SEQUENCE_PARALLEL": False}, num_warps=8, num_stages=1, pre_hook=init_to_zero('DQ')),
# triton.Config({"BLOCK_M": 128, "BLOCK_N": 64, "SEQUENCE_PARALLEL": True}, num_warps=8, num_stages=1, pre_hook=init_to_zero('DQ')),
# triton.Config({"BLOCK_M": 64, "BLOCK_N": 64, "SEQUENCE_PARALLEL": False}, num_warps=4, num_stages=1, pre_hook=init_to_zero('DQ')),
# triton.Config({"BLOCK_M": 64, "BLOCK_N": 64, "SEQUENCE_PARALLEL": True}, num_warps=4, num_stages=1, pre_hook=init_to_zero('DQ')),
],
key=["CACHE_KEY_SEQLEN_Q", "CACHE_KEY_SEQLEN_K", "BIAS_TYPE", "IS_CAUSAL", "BLOCK_HEADDIM"],
)
@triton.heuristics(
{
"EVEN_M": lambda args: args["seqlen_q"] % args["BLOCK_M"] == 0,
"EVEN_N": lambda args: args["seqlen_k"] % args["BLOCK_N"] == 0,
"EVEN_HEADDIM": lambda args: args["headdim"] == args["BLOCK_HEADDIM"],
}
)
@triton.jit
def _bwd_kernel(
Q,
K,
V,
Bias,
DO,
DQ,
DK,
DV,
LSE,
D,
softmax_scale,
stride_qb,
stride_qh,
stride_qm,
stride_kb,
stride_kh,
stride_kn,
stride_vb,
stride_vh,
stride_vn,
stride_bb,
stride_bh,
stride_bm,
stride_dob,
stride_doh,
stride_dom,
stride_dqb,
stride_dqh,
stride_dqm,
stride_dkb,
stride_dkh,
stride_dkn,
stride_dvb,
stride_dvh,
stride_dvn,
nheads,
seqlen_q,
seqlen_k,
seqlen_q_rounded,
headdim,
CACHE_KEY_SEQLEN_Q,
CACHE_KEY_SEQLEN_K,
BIAS_TYPE: tl.constexpr,
IS_CAUSAL: tl.constexpr,
BLOCK_HEADDIM: tl.constexpr,
SEQUENCE_PARALLEL: tl.constexpr,
EVEN_M: tl.constexpr,
EVEN_N: tl.constexpr,
EVEN_HEADDIM: tl.constexpr,
BLOCK_M: tl.constexpr,
BLOCK_N: tl.constexpr,
):
off_hb = tl.program_id(1)
off_b = off_hb // nheads
off_h = off_hb % nheads
# offset pointers for batch/head
Q += off_b * stride_qb + off_h * stride_qh
K += off_b * stride_kb + off_h * stride_kh
V += off_b * stride_vb + off_h * stride_vh
DO += off_b * stride_dob + off_h * stride_doh
DQ += off_b * stride_dqb + off_h * stride_dqh
DK += off_b * stride_dkb + off_h * stride_dkh
DV += off_b * stride_dvb + off_h * stride_dvh
if BIAS_TYPE != "none":
Bias += off_b * stride_bb + off_h * stride_bh
# pointer to row-wise quantities in value-like data
D += off_hb * seqlen_q_rounded
LSE += off_hb * seqlen_q_rounded
if not SEQUENCE_PARALLEL:
num_block_n = tl.cdiv(seqlen_k, BLOCK_N)
for start_n in range(0, num_block_n):
_bwd_kernel_one_col_block(
start_n,
Q,
K,
V,
Bias,
DO,
DQ,
DK,
DV,
LSE,
D,
softmax_scale,
stride_qm,
stride_kn,
stride_vn,
stride_bm,
stride_dom,
stride_dqm,
stride_dkn,
stride_dvn,
seqlen_q,
seqlen_k,
headdim,
ATOMIC_ADD=False,
BIAS_TYPE=BIAS_TYPE,
IS_CAUSAL=IS_CAUSAL,
BLOCK_HEADDIM=BLOCK_HEADDIM,
EVEN_M=EVEN_M,
EVEN_N=EVEN_N,
EVEN_HEADDIM=EVEN_HEADDIM,
BLOCK_M=BLOCK_M,
BLOCK_N=BLOCK_N,
)
else:
start_n = tl.program_id(0)
_bwd_kernel_one_col_block(
start_n,
Q,
K,
V,
Bias,
DO,
DQ,
DK,
DV,
LSE,
D,
softmax_scale,
stride_qm,
stride_kn,
stride_vn,
stride_bm,
stride_dom,
stride_dqm,
stride_dkn,
stride_dvn,
seqlen_q,
seqlen_k,
headdim,
ATOMIC_ADD=True,
BIAS_TYPE=BIAS_TYPE,
IS_CAUSAL=IS_CAUSAL,
BLOCK_HEADDIM=BLOCK_HEADDIM,
EVEN_M=EVEN_M,
EVEN_N=EVEN_N,
EVEN_HEADDIM=EVEN_HEADDIM,
BLOCK_M=BLOCK_M,
BLOCK_N=BLOCK_N,
)
def _flash_attn_forward(q, k, v, bias=None, causal=False, softmax_scale=None):
# shape constraints
batch, seqlen_q, nheads, d = q.shape
_, seqlen_k, _, _ = k.shape
assert k.shape == (batch, seqlen_k, nheads, d)
assert v.shape == (batch, seqlen_k, nheads, d)
assert d <= 128, "FlashAttention only supports head dimensions up to 128"
assert q.dtype == k.dtype == v.dtype, "All tensors must have the same type"
assert q.dtype in [torch.float16, torch.bfloat16], "Only support fp16 and bf16"
assert q.is_cuda and k.is_cuda and v.is_cuda
softmax_scale = softmax_scale or 1.0 / math.sqrt(d)
has_bias = bias is not None
bias_type = "none"
if has_bias:
assert bias.dtype in [q.dtype, torch.float]
assert bias.is_cuda
assert bias.dim() == 4
if bias.stride(-1) != 1:
bias = bias.contiguous()
if bias.shape[2:] == (1, seqlen_k):
bias_type = "vector"
elif bias.shape[2:] == (seqlen_q, seqlen_k):
bias_type = "matrix"
else:
raise RuntimeError(
"Last 2 dimensions of bias must be (1, seqlen_k)" " or (seqlen_q, seqlen_k)"
)
bias = bias.expand(batch, nheads, seqlen_q, seqlen_k)
bias_strides = (bias.stride(0), bias.stride(1), bias.stride(2)) if has_bias else (0, 0, 0)
seqlen_q_rounded = math.ceil(seqlen_q / 128) * 128
lse = torch.empty((batch, nheads, seqlen_q_rounded), device=q.device, dtype=torch.float32)
tmp = torch.empty((batch, nheads, seqlen_q_rounded), device=q.device, dtype=torch.float32)
o = torch.empty_like(q)
BLOCK_HEADDIM = max(triton.next_power_of_2(d), 16)
BLOCK = 128
num_warps = 4 if d <= 64 else 8
grid = lambda META: (triton.cdiv(seqlen_q, META["BLOCK_M"]), batch * nheads)
_fwd_kernel[grid](
q,
k,
v,
bias,
o,
lse,
tmp,
softmax_scale,
q.stride(0),
q.stride(2),
q.stride(1),
k.stride(0),
k.stride(2),
k.stride(1),
v.stride(0),
v.stride(2),
v.stride(1),
*bias_strides,
o.stride(0),
o.stride(2),
o.stride(1),
nheads,
seqlen_q,
seqlen_k,
seqlen_q_rounded,
d,
seqlen_q // 32,
seqlen_k // 32, # key for triton cache (limit number of compilations)
# Can't use kwargs here because triton autotune expects key to be args, not kwargs
# IS_CAUSAL=causal, BLOCK_HEADDIM=d,
bias_type,
causal,
BLOCK_HEADDIM,
BLOCK_M=BLOCK,
BLOCK_N=BLOCK,
num_warps=num_warps,
num_stages=1,
)
return o, lse, softmax_scale # softmax_scale could have been updated
def _flash_attn_backward(
do, q, k, v, o, lse, dq, dk, dv, bias=None, causal=False, softmax_scale=None
):
# Make sure that the last dimension is contiguous
if do.stride(-1) != 1:
do = do.contiguous()
batch, seqlen_q, nheads, d = q.shape
_, seqlen_k, _, _ = k.shape
# assert d in {16, 32, 64, 128}
assert d <= 128
seqlen_q_rounded = math.ceil(seqlen_q / 128) * 128
assert lse.shape == (batch, nheads, seqlen_q_rounded)
assert q.stride(-1) == k.stride(-1) == v.stride(-1) == o.stride(-1) == 1
assert dq.stride(-1) == dk.stride(-1) == dv.stride(-1) == 1
softmax_scale = softmax_scale or 1.0 / math.sqrt(d)
# dq_accum = torch.zeros_like(q, dtype=torch.float32)
dq_accum = torch.empty_like(q, dtype=torch.float32)
delta = torch.empty_like(lse)
# delta = torch.zeros_like(lse)
BLOCK_HEADDIM = max(triton.next_power_of_2(d), 16)
grid = lambda META: (triton.cdiv(seqlen_q, META["BLOCK_M"]), batch * nheads)
_bwd_preprocess_do_o_dot[grid](
o,
do,
delta,
o.stride(0),
o.stride(2),
o.stride(1),
do.stride(0),
do.stride(2),
do.stride(1),
nheads,
seqlen_q,
seqlen_q_rounded,
d,
BLOCK_M=128,
BLOCK_HEADDIM=BLOCK_HEADDIM,
)
has_bias = bias is not None
bias_type = "none"
if has_bias:
assert bias.dtype in [q.dtype, torch.float]
assert bias.is_cuda
assert bias.dim() == 4
assert bias.stride(-1) == 1
if bias.shape[2:] == (1, seqlen_k):
bias_type = "vector"
elif bias.shape[2:] == (seqlen_q, seqlen_k):
bias_type = "matrix"
else:
raise RuntimeError(
"Last 2 dimensions of bias must be (1, seqlen_k)" " or (seqlen_q, seqlen_k)"
)
bias = bias.expand(batch, nheads, seqlen_q, seqlen_k)
bias_strides = (bias.stride(0), bias.stride(1), bias.stride(2)) if has_bias else (0, 0, 0)
# BLOCK_M = 128
# BLOCK_N = 64
# num_warps = 4
grid = lambda META: (
triton.cdiv(seqlen_k, META["BLOCK_N"]) if META["SEQUENCE_PARALLEL"] else 1,
batch * nheads,
)
_bwd_kernel[grid](
q,
k,
v,
bias,
do,
dq_accum,
dk,
dv,
lse,
delta,
softmax_scale,
q.stride(0),
q.stride(2),
q.stride(1),
k.stride(0),
k.stride(2),
k.stride(1),
v.stride(0),
v.stride(2),
v.stride(1),
*bias_strides,
do.stride(0),
do.stride(2),
do.stride(1),
dq_accum.stride(0),
dq_accum.stride(2),
dq_accum.stride(1),
dk.stride(0),
dk.stride(2),
dk.stride(1),
dv.stride(0),
dv.stride(2),
dv.stride(1),
nheads,
seqlen_q,
seqlen_k,
seqlen_q_rounded,
d,
seqlen_q // 32,
seqlen_k // 32, # key for triton cache (limit number of compilations)
# Can't use kwargs here because triton autotune expects key to be args, not kwargs
# IS_CAUSAL=causal, BLOCK_HEADDIM=d,
bias_type,
causal,
BLOCK_HEADDIM,
# SEQUENCE_PARALLEL=False,
# BLOCK_M=BLOCK_M, BLOCK_N=BLOCK_N,
# num_warps=num_warps,
# num_stages=1,
)
dq.copy_(dq_accum)
class FlashAttnQKVPackedFunc(torch.autograd.Function):
@staticmethod
def forward(ctx, qkv, bias=None, causal=False, softmax_scale=None):
"""
qkv: (batch, seqlen, 3, nheads, headdim)
bias: optional, shape broadcastable to (batch, nheads, seqlen, seqlen).
For example, ALiBi mask for causal would have shape (1, nheads, 1, seqlen).
ALiBi mask for non-causal would have shape (1, nheads, seqlen, seqlen)
"""
# Make sure that the last dimension is contiguous
if qkv.stride(-1) != 1:
qkv = qkv.contiguous()
o, lse, ctx.softmax_scale = _flash_attn_forward(
qkv[:, :, 0],
qkv[:, :, 1],
qkv[:, :, 2],
bias=bias,
causal=causal,
softmax_scale=softmax_scale,
)
ctx.save_for_backward(qkv, o, lse, bias)
ctx.causal = causal
return o
@staticmethod
def backward(ctx, do):
qkv, o, lse, bias = ctx.saved_tensors
assert not ctx.needs_input_grad[1], "FlashAttention does not support bias gradient yet"
# Triton's autotune causes the Tensor._version to change, and so Pytorch autograd
# does a memcpy. To avoid this we run in inference_mode, which doesn't track the version.
with torch.inference_mode():
dqkv = torch.empty_like(qkv)
_flash_attn_backward(
do,
qkv[:, :, 0],
qkv[:, :, 1],
qkv[:, :, 2],
o,
lse,
dqkv[:, :, 0],
dqkv[:, :, 1],
dqkv[:, :, 2],
bias=bias,
causal=ctx.causal,
softmax_scale=ctx.softmax_scale,
)
return dqkv, None, None, None
flash_attn_qkvpacked_func = FlashAttnQKVPackedFunc.apply
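# Usage sketch (hypothetical helper, not part of the original file). Assumes a CUDA device;
# qkv is packed as (batch, seqlen, 3, nheads, headdim) in fp16 or bf16.
def _demo_flash_attn_qkvpacked_func():
    batch, seqlen, nheads, headdim = 2, 128, 4, 64
    qkv = torch.randn(
        batch, seqlen, 3, nheads, headdim,
        device="cuda", dtype=torch.float16, requires_grad=True,
    )
    out = flash_attn_qkvpacked_func(qkv, None, True, None)  # no bias, causal=True
    out.sum().backward()  # dqkv lands in qkv.grad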
class FlashAttnKVPackedFunc(torch.autograd.Function):
@staticmethod
def forward(ctx, q, kv, bias=None, causal=False, softmax_scale=None):
"""
q: (batch, seqlen_q, nheads, headdim)
kv: (batch, seqlen_k, 2, nheads, headdim)
bias: optional, shape broadcastable to (batch, nheads, seqlen_q, seqlen_k).
For example, ALiBi mask for causal would have shape (1, nheads, 1, seqlen_k).
ALiBi mask for non-causal would have shape (1, nheads, seqlen_q, seqlen_k)
"""
# Make sure that the last dimension is contiguous
q, kv = [x if x.stride(-1) == 1 else x.contiguous() for x in [q, kv]]
o, lse, ctx.softmax_scale = _flash_attn_forward(
q, kv[:, :, 0], kv[:, :, 1], bias=bias, causal=causal, softmax_scale=softmax_scale
)
ctx.save_for_backward(q, kv, o, lse, bias)
ctx.causal = causal
return o
@staticmethod
def backward(ctx, do):
q, kv, o, lse, bias = ctx.saved_tensors
if len(ctx.needs_input_grad) >= 3:
assert not ctx.needs_input_grad[2], "FlashAttention does not support bias gradient yet"
# Triton's autotune causes the Tensor._version to change, and so Pytorch autograd
# does a memcpy. To avoid this we run in inference_mode, which doesn't track the version.
with torch.inference_mode():
dq = torch.empty_like(q)
dkv = torch.empty_like(kv)
_flash_attn_backward(
do,
q,
kv[:, :, 0],
kv[:, :, 1],
o,
lse,
dq,
dkv[:, :, 0],
dkv[:, :, 1],
bias=bias,
causal=ctx.causal,
softmax_scale=ctx.softmax_scale,
)
return dq, dkv, None, None, None
flash_attn_kvpacked_func = FlashAttnKVPackedFunc.apply
class FlashAttnFunc(torch.autograd.Function):
@staticmethod
def forward(ctx, q, k, v, bias=None, causal=False, softmax_scale=None):
"""
q: (batch_size, seqlen_q, nheads, headdim)
k, v: (batch_size, seqlen_k, nheads, headdim)
bias: optional, shape broadcastable to (batch, nheads, seqlen_q, seqlen_k).
For example, ALiBi mask for causal would have shape (1, nheads, 1, seqlen_k).
ALiBi mask for non-causal would have shape (1, nheads, seqlen_q, seqlen_k)
"""
# Make sure that the last dimension is contiguous
q, k, v = [x if x.stride(-1) == 1 else x.contiguous() for x in [q, k, v]]
o, lse, ctx.softmax_scale = _flash_attn_forward(
q, k, v, bias=bias, causal=causal, softmax_scale=softmax_scale
)
ctx.save_for_backward(q, k, v, o, lse, bias)
ctx.causal = causal
return o
@staticmethod
def backward(ctx, do):
q, k, v, o, lse, bias = ctx.saved_tensors
assert not ctx.needs_input_grad[3], "FlashAttention does not support bias gradient yet"
# Triton's autotune causes the Tensor._version to change, and so Pytorch autograd
# does a memcpy. To avoid this we run in inference_mode, which doesn't track the version.
with torch.inference_mode():
dq = torch.empty_like(q)
dk = torch.empty_like(k)
dv = torch.empty_like(v)
_flash_attn_backward(
do,
q,
k,
v,
o,
lse,
dq,
dk,
dv,
bias=bias,
causal=ctx.causal,
softmax_scale=ctx.softmax_scale,
)
return dq, dk, dv, None, None, None
flash_attn_func = FlashAttnFunc.apply
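# Usage sketch (hypothetical helper, not part of the original file), following the shapes in the
# FlashAttnFunc docstring. Assumes a CUDA device and fp16/bf16 inputs; the bias here is an
# arbitrary vector bias just to exercise the "vector" bias path.
def _demo_flash_attn_func():
    batch, seqlen_q, seqlen_k, nheads, headdim = 2, 128, 128, 4, 64
    def make(*shape):
        return torch.randn(*shape, device="cuda", dtype=torch.float16, requires_grad=True)
    q = make(batch, seqlen_q, nheads, headdim)
    k = make(batch, seqlen_k, nheads, headdim)
    v = make(batch, seqlen_k, nheads, headdim)
    # Broadcastable to (batch, nheads, seqlen_q, seqlen_k); last two dims (1, seqlen_k)
    # select the "vector" bias type in _flash_attn_forward.
    bias = torch.randn(1, nheads, 1, seqlen_k, device="cuda", dtype=torch.float16)
    out = flash_attn_func(q, k, v, bias, True, None)  # causal=True, default softmax_scale
    out.sum().backward()  # dq, dk, dv land in q.grad, k.grad, v.grad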
# [2022-10-23] Downloaded from https://github.com/openai/triton/blob/master/python/tutorials/06-fused-attention.py
# for benchmarking.
# We fixed a few dtype casts to make it work for bf16
"""
Fused Attention
===============
This is a Triton implementation of the Flash Attention algorithm
(see: Dao et al., https://arxiv.org/pdf/2205.14135v2.pdf; Rabe and Staats https://arxiv.org/pdf/2112.05682v2.pdf)
"""
import pytest
import torch
import triton
import triton.language as tl
@triton.jit
def _fwd_kernel(
Q,
K,
V,
sm_scale,
TMP,
L,
M,  # NOTE: TMP is a scratchpad buffer to work around a compiler bug
Out,
stride_qz,
stride_qh,
stride_qm,
stride_qk,
stride_kz,
stride_kh,
stride_kn,
stride_kk,
stride_vz,
stride_vh,
stride_vk,
stride_vn,
stride_oz,
stride_oh,
stride_om,
stride_on,
Z,
H,
N_CTX,
BLOCK_M: tl.constexpr,
BLOCK_DMODEL: tl.constexpr,
BLOCK_N: tl.constexpr,
):
start_m = tl.program_id(0)
off_hz = tl.program_id(1)
# initialize offsets
offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M)
offs_n = tl.arange(0, BLOCK_N)
offs_d = tl.arange(0, BLOCK_DMODEL)
off_q = off_hz * stride_qh + offs_m[:, None] * stride_qm + offs_d[None, :] * stride_qk
off_k = off_hz * stride_qh + offs_n[:, None] * stride_kn + offs_d[None, :] * stride_kk
off_v = off_hz * stride_qh + offs_n[:, None] * stride_qm + offs_d[None, :] * stride_qk
# Initialize pointers to Q, K, V
q_ptrs = Q + off_q
k_ptrs = K + off_k
v_ptrs = V + off_v
# initialize pointer to m and l
t_ptrs = TMP + off_hz * N_CTX + offs_m
m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf")
l_i = tl.zeros([BLOCK_M], dtype=tl.float32)
acc = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=tl.float32)
# load q: it will stay in SRAM throughout
q = tl.load(q_ptrs)
# loop over k, v and update accumulator
for start_n in range(0, (start_m + 1) * BLOCK_M, BLOCK_N):
start_n = tl.multiple_of(start_n, BLOCK_N)
# -- compute qk ----
k = tl.load(k_ptrs + start_n * stride_kn)
qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)
qk += tl.dot(q, k, trans_b=True)
qk *= sm_scale
qk += tl.where(offs_m[:, None] >= (start_n + offs_n[None, :]), 0, float("-inf"))
# -- compute m_ij, p, l_ij
m_ij = tl.max(qk, 1)
p = tl.exp(qk - m_ij[:, None])
l_ij = tl.sum(p, 1)
# -- update m_i and l_i
m_i_new = tl.maximum(m_i, m_ij)
alpha = tl.exp(m_i - m_i_new)
beta = tl.exp(m_ij - m_i_new)
l_i_new = alpha * l_i + beta * l_ij
# -- update output accumulator --
# scale p
p_scale = beta / l_i_new
p = p * p_scale[:, None]
# scale acc
acc_scale = l_i / l_i_new * alpha
tl.store(t_ptrs, acc_scale)
acc_scale = tl.load(t_ptrs) # BUG: have to store and immediately load
acc = acc * acc_scale[:, None]
# update acc
v = tl.load(v_ptrs + start_n * stride_vk)
p = p.to(v.dtype)
acc += tl.dot(p, v)
# update m_i and l_i
l_i = l_i_new
m_i = m_i_new
# rematerialize offsets to save registers
start_m = tl.program_id(0)
offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M)
# write back l and m
l_ptrs = L + off_hz * N_CTX + offs_m
m_ptrs = M + off_hz * N_CTX + offs_m
tl.store(l_ptrs, l_i)
tl.store(m_ptrs, m_i)
# initialize pointers to output
offs_n = tl.arange(0, BLOCK_DMODEL)
off_o = off_hz * stride_oh + offs_m[:, None] * stride_om + offs_n[None, :] * stride_on
out_ptrs = Out + off_o
tl.store(out_ptrs, acc)
@triton.jit
def _bwd_preprocess(
Out,
DO,
L,
NewDO,
Delta,
BLOCK_M: tl.constexpr,
D_HEAD: tl.constexpr,
):
off_m = tl.program_id(0) * BLOCK_M + tl.arange(0, BLOCK_M)
off_n = tl.arange(0, D_HEAD)
# load
o = tl.load(Out + off_m[:, None] * D_HEAD + off_n[None, :]).to(tl.float32)
do = tl.load(DO + off_m[:, None] * D_HEAD + off_n[None, :]).to(tl.float32)
denom = tl.load(L + off_m).to(tl.float32)
# compute
do = do / denom[:, None]
delta = tl.sum(o * do, axis=1)
# write-back
tl.store(NewDO + off_m[:, None] * D_HEAD + off_n[None, :], do)
tl.store(Delta + off_m, delta)
@triton.jit
def _bwd_kernel(
Q,
K,
V,
sm_scale,
Out,
DO,
DQ,
DK,
DV,
L,
M,
D,
stride_qz,
stride_qh,
stride_qm,
stride_qk,
stride_kz,
stride_kh,
stride_kn,
stride_kk,
stride_vz,
stride_vh,
stride_vk,
stride_vn,
Z,
H,
N_CTX,
num_block,
BLOCK_M: tl.constexpr,
BLOCK_DMODEL: tl.constexpr,
BLOCK_N: tl.constexpr,
):
off_hz = tl.program_id(0)
off_z = off_hz // H
off_h = off_hz % H
# offset pointers for batch/head
Q += off_z * stride_qz + off_h * stride_qh
K += off_z * stride_qz + off_h * stride_qh
V += off_z * stride_qz + off_h * stride_qh
DO += off_z * stride_qz + off_h * stride_qh
DQ += off_z * stride_qz + off_h * stride_qh
DK += off_z * stride_qz + off_h * stride_qh
DV += off_z * stride_qz + off_h * stride_qh
for start_n in range(0, num_block):
lo = start_n * BLOCK_M
# initialize row/col offsets
offs_qm = lo + tl.arange(0, BLOCK_M)
offs_n = start_n * BLOCK_M + tl.arange(0, BLOCK_M)
offs_m = tl.arange(0, BLOCK_N)
offs_k = tl.arange(0, BLOCK_DMODEL)
# initialize pointers to value-like data
q_ptrs = Q + (offs_qm[:, None] * stride_qm + offs_k[None, :] * stride_qk)
k_ptrs = K + (offs_n[:, None] * stride_kn + offs_k[None, :] * stride_kk)
v_ptrs = V + (offs_n[:, None] * stride_qm + offs_k[None, :] * stride_qk)
do_ptrs = DO + (offs_qm[:, None] * stride_qm + offs_k[None, :] * stride_qk)
dq_ptrs = DQ + (offs_qm[:, None] * stride_qm + offs_k[None, :] * stride_qk)
# pointer to row-wise quantities in value-like data
D_ptrs = D + off_hz * N_CTX
m_ptrs = M + off_hz * N_CTX
# initialize dv and dk
dv = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=tl.float32)
dk = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=tl.float32)
# k and v stay in SRAM throughout
k = tl.load(k_ptrs)
v = tl.load(v_ptrs)
# loop over rows
for start_m in range(lo, num_block * BLOCK_M, BLOCK_M):
offs_m_curr = start_m + offs_m
# load q, k, v, do on-chip
q = tl.load(q_ptrs)
# recompute p = softmax(qk, dim=-1).T
# NOTE: `do` is pre-divided by `l`; no normalization here
qk = tl.dot(q, k, trans_b=True)
qk = tl.where(offs_m_curr[:, None] >= (offs_n[None, :]), qk, float("-inf"))
m = tl.load(m_ptrs + offs_m_curr)
p = tl.exp(qk * sm_scale - m[:, None])
# compute dv
do = tl.load(do_ptrs)
dv += tl.dot(p.to(do.dtype), do, trans_a=True)
# compute dp = dot(v, do)
Di = tl.load(D_ptrs + offs_m_curr)
dp = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32) - Di[:, None]
dp += tl.dot(do, v, trans_b=True)
# compute ds = p * (dp - delta[:, None])
ds = p * dp * sm_scale
# compute dk = dot(ds.T, q)
dk += tl.dot(ds.to(q.dtype), q, trans_a=True)
# # compute dq
dq = tl.load(dq_ptrs, eviction_policy="evict_last")
dq += tl.dot(ds.to(k.dtype), k)
tl.store(dq_ptrs, dq, eviction_policy="evict_last")
# # increment pointers
dq_ptrs += BLOCK_M * stride_qm
q_ptrs += BLOCK_M * stride_qm
do_ptrs += BLOCK_M * stride_qm
# write-back
dv_ptrs = DV + (offs_n[:, None] * stride_qm + offs_k[None, :] * stride_qk)
dk_ptrs = DK + (offs_n[:, None] * stride_kn + offs_k[None, :] * stride_kk)
tl.store(dv_ptrs, dv)
tl.store(dk_ptrs, dk)
class _attention(torch.autograd.Function):
@staticmethod
def forward(ctx, q, k, v, sm_scale):
BLOCK = 128
# shape constraints
Lq, Lk, Lv = q.shape[-1], k.shape[-1], v.shape[-1]
assert Lq == Lk and Lk == Lv
assert Lk in {16, 32, 64, 128}
o = torch.empty_like(q)
grid = (triton.cdiv(q.shape[2], BLOCK), q.shape[0] * q.shape[1])
tmp = torch.empty(
(q.shape[0] * q.shape[1], q.shape[2]), device=q.device, dtype=torch.float32
)
L = torch.empty((q.shape[0] * q.shape[1], q.shape[2]), device=q.device, dtype=torch.float32)
m = torch.empty((q.shape[0] * q.shape[1], q.shape[2]), device=q.device, dtype=torch.float32)
num_warps = 4 if Lk <= 64 else 8
_fwd_kernel[grid](
q,
k,
v,
sm_scale,
tmp,
L,
m,
o,
q.stride(0),
q.stride(1),
q.stride(2),
q.stride(3),
k.stride(0),
k.stride(1),
k.stride(2),
k.stride(3),
v.stride(0),
v.stride(1),
v.stride(2),
v.stride(3),
o.stride(0),
o.stride(1),
o.stride(2),
o.stride(3),
q.shape[0],
q.shape[1],
q.shape[2],
BLOCK_M=BLOCK,
BLOCK_N=BLOCK,
BLOCK_DMODEL=Lk,
num_warps=num_warps,
num_stages=1,
)
ctx.save_for_backward(q, k, v, o, L, m)
ctx.BLOCK = BLOCK
ctx.grid = grid
ctx.sm_scale = sm_scale
ctx.BLOCK_DMODEL = Lk
return o
@staticmethod
def backward(ctx, do):
q, k, v, o, l, m = ctx.saved_tensors
do = do.contiguous()
dq = torch.zeros_like(q, dtype=torch.float32)
dk = torch.empty_like(k)
dv = torch.empty_like(v)
do_scaled = torch.empty_like(do)
delta = torch.empty_like(l)
_bwd_preprocess[(ctx.grid[0] * ctx.grid[1],)](
o,
do,
l,
do_scaled,
delta,
BLOCK_M=ctx.BLOCK,
D_HEAD=ctx.BLOCK_DMODEL,
)
# NOTE: kernel currently buggy for other values of `num_warps`
num_warps = 8
_bwd_kernel[(ctx.grid[1],)](
q,
k,
v,
ctx.sm_scale,
o,
do_scaled,
dq,
dk,
dv,
l,
m,
delta,
q.stride(0),
q.stride(1),
q.stride(2),
q.stride(3),
k.stride(0),
k.stride(1),
k.stride(2),
k.stride(3),
v.stride(0),
v.stride(1),
v.stride(2),
v.stride(3),
q.shape[0],
q.shape[1],
q.shape[2],
ctx.grid[0],
BLOCK_M=ctx.BLOCK,
BLOCK_N=ctx.BLOCK,
BLOCK_DMODEL=ctx.BLOCK_DMODEL,
num_warps=num_warps,
num_stages=1,
)
return dq.to(q.dtype), dk, dv, None
attention = _attention.apply
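# Usage sketch (hypothetical helper, not part of the original file) for the benchmark kernel above:
# inputs are (batch, nheads, seqlen, headdim), seqlen a multiple of 128 since the kernel loads
# without bounds masks, and attention is always causal.
def _demo_benchmark_attention():
    q, k, v = [
        torch.randn(2, 4, 256, 64, device="cuda", dtype=torch.float16, requires_grad=True)
        for _ in range(3)
    ]
    out = attention(q, k, v, 0.125)  # sm_scale = 1 / sqrt(headdim) = 1 / sqrt(64)
    out.sum().backward()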
import math
import hydra
import torch
import torch.nn as nn
from einops import rearrange
from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input
from flash_attn.flash_blocksparse_attn_interface import (
convert_blockmask,
flash_blocksparse_attn_func,
)
class FlashBlocksparseAttention(nn.Module):
"""Implement the scaled dot product attention with softmax.
Arguments
---------
softmax_temp: The temperature to use for the softmax attention.
(default: 1/sqrt(d_keys) where d_keys is computed at
runtime)
attention_dropout: The dropout rate to apply to the attention
(default: 0.1)
"""
def __init__(
self,
sparsity_config,
softmax_temp=None,
attention_dropout=0.0,
max_seq_length=2048,
device=None,
dtype=None,
):
super().__init__()
self.sparsity_config = hydra.utils.instantiate(sparsity_config)
self.softmax_temp = softmax_temp
self.dropout_p = attention_dropout
# initialize sparse layout and register as buffer
max_seq_length = ((max_seq_length + 256 - 1) // 256) * 256
layout = self.sparsity_config.make_layout(max_seq_length)
self.register_buffer("layout", layout)
blockmask_converted = convert_blockmask(self.layout, causal=False)
self.register_buffer("blockmask_converted", blockmask_converted)
# logger.info(f'Attention class {self.__class__}: saving={self.layout.float().mean()}')
def forward(
self,
qkv,
attn_mask=None,
key_padding_mask=None,
causal=False,
cu_seqlens=None,
max_s=None,
need_weights=False,
convert_mask=True,
):
"""Implements the multihead softmax attention.
Arguments
---------
qkv: The tensor containing the query, key, and value. (B, S, 3, H, D) if key_padding_mask is None
attn_mask: An implementation of BaseMask that encodes where each
query can attend to
key_padding_mask: An implementation of BaseMask that encodes how
many queries each sequence in the batch consists of
"""
assert not need_weights
assert attn_mask is None
assert qkv.dtype == torch.float16
assert qkv.is_cuda
if cu_seqlens is None:
batch_size = qkv.shape[0]
seqlen = qkv.shape[1]
# Convert mask to take a subset
seqlen_rounded = ((seqlen + 256 - 1) // 256) * 256
assert seqlen_rounded // 16 <= self.layout.shape[0]
assert seqlen_rounded // 256 <= self.layout.shape[1]
blockmask = self.layout[: seqlen_rounded // 16, : seqlen_rounded // 256]
if key_padding_mask is None:
qkv = rearrange(qkv, "b s ... -> (b s) ...")
max_s = seqlen
cu_seqlens = torch.arange(
0, (batch_size + 1) * seqlen, step=seqlen, dtype=torch.int32, device=qkv.device
)
output = flash_blocksparse_attn_func(
qkv,
cu_seqlens,
blockmask,
self.dropout_p if self.training else 0.0,
max_s,
softmax_scale=self.softmax_temp,
causal=causal,
)
output = rearrange(output, "(b s) ... -> b s ...", b=batch_size)
else:
key_padding_mask_bool = key_padding_mask.bool_matrix
nheads = qkv.shape[-2]
x = rearrange(qkv, "b s three h d -> b s (three h d)")
x_unpad, indices, cu_seqlens, max_s = unpad_input(x, key_padding_mask_bool)
x_unpad = rearrange(x_unpad, "nnz (three h d) -> nnz three h d", three=3, h=nheads)
output_unpad = flash_blocksparse_attn_func(
x_unpad,
cu_seqlens,
blockmask,
self.dropout_p if self.training else 0.0,
max_s,
softmax_scale=self.softmax_temp,
causal=causal,
)
output = rearrange(
pad_input(
rearrange(output_unpad, "nnz h d -> nnz (h d)"), indices, batch_size, seqlen
),
"b s (h d) -> b s h d",
h=nheads,
)
else:
assert max_s is not None
seqlen = max_s
# Convert mask to take a subset
seqlen_rounded = ((seqlen + 256 - 1) // 256) * 256
assert seqlen_rounded // 16 <= self.layout.shape[0]
assert seqlen_rounded // 256 <= self.layout.shape[1]
blockmask = self.layout[: seqlen_rounded // 16, : seqlen_rounded // 256]
if convert_mask:
output = flash_blocksparse_attn_func(
qkv,
cu_seqlens,
blockmask,
self.dropout_p if self.training else 0.0,
max_s,
softmax_scale=self.softmax_temp,
causal=causal,
)
else:
output = flash_blocksparse_attn_func(
qkv,
cu_seqlens,
self.blockmask_converted,
self.dropout_p if self.training else 0.0,
max_s,
softmax_scale=self.softmax_temp,
causal=causal,
convert_mask=False,
)
return output, None
class FlashBlocksparseMHA(nn.Module):
def __init__(
self,
embed_dim,
num_heads,
sparsity_config,
bias=True,
batch_first=True,
attention_dropout=0.0,
causal=False,
max_seq_length=2048,
device=None,
dtype=None,
**kwargs,
) -> None:
assert batch_first
factory_kwargs = {"device": device, "dtype": dtype}
super().__init__()
self.embed_dim = embed_dim
self.causal = causal
self.num_heads = num_heads
assert self.embed_dim % num_heads == 0, "self.embed_dim must be divisible by num_heads"
self.head_dim = self.embed_dim // num_heads
assert self.head_dim in [16, 32, 64], "Only support head_dim == 16, 32, or 64"
self.Wqkv = nn.Linear(embed_dim, 3 * embed_dim, bias=bias, **factory_kwargs)
self.inner_attn = FlashBlocksparseAttention(
sparsity_config,
attention_dropout=attention_dropout,
max_seq_length=max_seq_length,
**factory_kwargs,
)
self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias, **factory_kwargs)
def forward(
self, x, x_ignored_, x_ignored_1_, attn_mask=None, key_padding_mask=None, need_weights=False
):
qkv = self.Wqkv(x)
qkv = rearrange(qkv, "b s (three h d) -> b s three h d", three=3, h=self.num_heads)
context, attn_weights = self.inner_attn(
qkv, key_padding_mask=key_padding_mask, need_weights=need_weights, causal=self.causal
)
return self.out_proj(rearrange(context, "b s h d -> b s (h d)")), attn_weights
# Adapted from https://github.com/mlcommons/training_results_v1.1/blob/main/NVIDIA/benchmarks/bert/implementations/pytorch/fmha.py
import flash_attn_cuda
import torch
import torch.nn as nn
def convert_blockmask(blockmask, causal):
"""Convert from the 0-1 format to the format used by the CUDA code.
0 means the block is skipped.
nonzero means the block is not skipped.
Argument:
blockmask: (row, col): a 0-1 tensor
Return:
blockmask_converted: (col, row), dtype torch.int32: for each column, it contains the row
indices of the nonzero blocks, padded with -1 to reach length @row.
The indices are multiplied by 4, with the lowest bit encoding whether it is the
first nonzero in its row, and the second-lowest bit encoding whether it is the
last nonzero in its row.
"""
assert not causal
# TD [2022-05-13]: The indexing and sorting is very tricky
nrow, ncol = blockmask.shape
# Sort does not support bool on CUDA
blockmask = blockmask.to(dtype=torch.uint8)
nonzero_val, nonzero_sorted_rowidx = blockmask.sort(dim=0, stable=True, descending=True)
nonzero_unsorted_rowidx = nonzero_sorted_rowidx.argsort(dim=0)
last_nonzero_col_per_row = blockmask.sort(dim=-1, stable=True).indices[:, -1]
last_nonzero_col_per_row_after_sort = nonzero_unsorted_rowidx[
torch.arange(nrow, device=blockmask.device), last_nonzero_col_per_row
]
first_nonzero_col_per_row = blockmask.sort(dim=-1, stable=True, descending=True).indices[:, 0]
first_nonzero_col_per_row_after_sort = nonzero_unsorted_rowidx[
torch.arange(nrow, device=blockmask.device), first_nonzero_col_per_row
]
nonzero_idx = nonzero_sorted_rowidx * 4
nonzero_idx[last_nonzero_col_per_row_after_sort, last_nonzero_col_per_row] += 2
nonzero_idx[first_nonzero_col_per_row_after_sort, first_nonzero_col_per_row] += 1
nonzero_idx[nonzero_val == 0] = -1
return nonzero_idx.T.contiguous().to(dtype=torch.int32)
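# Sketch (hypothetical helper, not part of the original file): convert a tiny 2x2 block mask
# where row 0 keeps only block-column 0 and row 1 keeps both columns.
def _demo_convert_blockmask():
    blockmask = torch.tensor([[1, 0], [1, 1]])
    converted = convert_blockmask(blockmask, causal=False)
    # converted has shape (ncol, nrow); entry // 4 recovers the row index of each kept
    # block, the two low bits flag first/last nonzero within that row, and -1 pads
    # columns that have fewer nonzero blocks.
    assert converted.shape == (2, 2) and converted.dtype == torch.int32
    assert converted[1, 1].item() == -1  # column 1 has only one nonzero block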
def _flash_blocksparse_attn_forward(
qkv, cu_seqlens, blockmask, dropout_p, max_s, softmax_scale, causal, return_softmax
):
context, softmax_lse, *rest = flash_attn_cuda.fwd_block(
qkv, cu_seqlens, blockmask, dropout_p, max_s, softmax_scale, causal, return_softmax, None
)
# if context.isnan().any() or softmax_lse.isnan().any():
# breakpoint()
S_dmask = rest[0] if return_softmax else None
return context, softmax_lse, S_dmask
def _flash_blocksparse_attn_backward(
dout,
qkv,
out,
S_dmask,
softmax_lse,
cu_seqlens,
blockmask,
dropout_p,
max_s,
softmax_scale,
causal,
):
dqkv, dp, softmax_d = flash_attn_cuda.bwd_block(
dout,
qkv,
out,
S_dmask,
softmax_lse,
cu_seqlens,
blockmask,
dropout_p,
softmax_scale,
max_s,
causal,
None,
)
# if dqkv.isnan().any() or softmax_d.isnan().any():
# breakpoint()
return dqkv
class FlashBlocksparseAttnFun(torch.autograd.Function):
@staticmethod
def forward(ctx, qkv, cu_seqlens, blockmask, dropout_p, max_s, softmax_scale, causal):
# Save rng_state because the backward pass will regenerate the dropout mask
rng_state = torch.cuda.get_rng_state() if dropout_p > 0 else None
if softmax_scale is None:
softmax_scale = qkv.shape[-1] ** (-0.5)
context, softmax_lse, S_dmask = _flash_blocksparse_attn_forward(
qkv,
cu_seqlens,
blockmask,
dropout_p,
max_s,
softmax_scale,
causal=causal,
return_softmax=False,
)
ctx.save_for_backward(qkv, context, S_dmask, softmax_lse, cu_seqlens, blockmask, rng_state)
ctx.dropout_p = dropout_p
ctx.max_s = max_s
ctx.softmax_scale = softmax_scale
ctx.causal = causal
return context
@staticmethod
def backward(ctx, dout):
qkv, context, S_dmask, softmax_lse, cu_seqlens, blockmask, rng_state = ctx.saved_tensors
if rng_state is not None:
cur_rng_state = torch.cuda.get_rng_state()
torch.cuda.set_rng_state(rng_state)
# S_dmask is None, temporarily use another tensor just to get it running
dqkv = _flash_blocksparse_attn_backward(
dout,
qkv,
context,
context,
softmax_lse,
cu_seqlens,
blockmask,
ctx.dropout_p,
ctx.max_s,
ctx.softmax_scale,
ctx.causal,
)
if rng_state is not None:
torch.cuda.set_rng_state(cur_rng_state)
return dqkv, None, None, None, None, None, None, None
# We duplicate code to return both the output and the softmax for testing
# Returning both makes backward a bit slower, so we want to keep using the other version for speed.
class FlashBlocksparseAttnFunWithS(torch.autograd.Function):
@staticmethod
def forward(ctx, qkv, cu_seqlens, blockmask, dropout_p, max_s, softmax_scale, causal):
        # Save rng_state because the backward pass will regenerate the dropout mask
rng_state = torch.cuda.get_rng_state() if dropout_p > 0 else None
if softmax_scale is None:
softmax_scale = qkv.shape[-1] ** (-0.5)
context, softmax_lse, S_dmask = _flash_blocksparse_attn_forward(
qkv,
cu_seqlens,
blockmask,
dropout_p,
max_s,
softmax_scale,
causal=causal,
return_softmax=True,
)
ctx.save_for_backward(qkv, context, S_dmask, softmax_lse, cu_seqlens, blockmask, rng_state)
ctx.dropout_p = dropout_p
ctx.max_s = max_s
ctx.softmax_scale = softmax_scale
ctx.causal = causal
return context, S_dmask, softmax_lse
@staticmethod
def backward(ctx, dout, _dS_dmask_ignored, _dsoftmax_sum_ignored):
qkv, context, S_dmask, softmax_lse, cu_seqlens, blockmask, rng_state = ctx.saved_tensors
if rng_state is not None:
cur_rng_state = torch.cuda.get_rng_state()
torch.cuda.set_rng_state(rng_state)
dqkv = _flash_blocksparse_attn_backward(
dout,
qkv,
context,
S_dmask,
softmax_lse,
cu_seqlens,
blockmask,
ctx.dropout_p,
ctx.max_s,
ctx.softmax_scale,
ctx.causal,
)
if rng_state is not None:
torch.cuda.set_rng_state(cur_rng_state)
return dqkv, None, None, None, None, None, None
def flash_blocksparse_attn_func(
qkv,
cu_seqlens,
blockmask,
dropout_p,
max_s,
softmax_scale=None,
causal=False,
return_attn_probs=False,
convert_mask=True,
):
"""dropout_p should be set to 0.0 during evaluation"""
func = FlashBlocksparseAttnFun if not return_attn_probs else FlashBlocksparseAttnFunWithS
if convert_mask:
blockmask = convert_blockmask(blockmask, causal=causal)
return func.apply(qkv, cu_seqlens, blockmask, dropout_p, max_s, softmax_scale, causal)
# [2022-10-23] Copied from https://github.com/NVIDIA/apex/blob/master/apex/transformer/functional/fused_softmax.py
# for benchmarking.
# We added support for seqlen=2k and seqlen=4k
# coding=utf-8
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
from apex._autocast_utils import _cast_if_autocast_enabled
from apex.transformer.enums import AttnMaskType
from fused_softmax_lib import (
scaled_masked_softmax_backward,
scaled_masked_softmax_forward,
scaled_masked_softmax_get_batch_per_block,
scaled_upper_triang_masked_softmax_backward,
scaled_upper_triang_masked_softmax_forward,
)
class ScaledUpperTriangMaskedSoftmax(torch.autograd.Function):
"""
    Fused operation which performs the following three operations in sequence:
    1. Scale the tensor.
    2. Apply upper triangular mask (typically used in GPT models).
3. Perform softmax.
"""
@staticmethod
def forward(ctx, inputs, scale):
scale_t = torch.tensor([scale])
softmax_results = scaled_upper_triang_masked_softmax_forward(inputs, scale_t[0])
ctx.save_for_backward(softmax_results, scale_t)
return softmax_results
@staticmethod
def backward(ctx, output_grads):
softmax_results, scale_t = ctx.saved_tensors
input_grads = scaled_upper_triang_masked_softmax_backward(
output_grads, softmax_results, scale_t[0]
)
return input_grads, None
def scaled_upper_triang_masked_softmax(inputs, _, scale):
b, np, sq, sk = inputs.size()
assert sq == sk, "causal mask is only for self attention"
# Reshaping input to 3D tensor (attn_batches, sq, sk)
inputs = inputs.view(-1, sq, sk)
args = _cast_if_autocast_enabled(inputs, scale)
with torch.cuda.amp.autocast(enabled=False):
probs = ScaledUpperTriangMaskedSoftmax.apply(*args)
return probs.view(b, np, sq, sk)
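# Illustrative sketch, not part of the original file: a pure-PyTorch reference for the math
# that ScaledUpperTriangMaskedSoftmax fuses into a single kernel (scale, causal mask, softmax).
# Useful for eyeballing correctness when fused_softmax_lib is unavailable; it is not
# numerically identical to the fused kernel and is much slower.
def _reference_scaled_upper_triang_masked_softmax(inputs, scale):
    # inputs: (attn_batches, sq, sk) with sq == sk
    scores = inputs.float() * scale
    causal_mask = torch.triu(
        torch.ones(scores.shape[-2], scores.shape[-1], device=scores.device), diagonal=1
    ).bool()
    scores = scores.masked_fill(causal_mask, float("-inf"))
    return torch.softmax(scores, dim=-1).to(inputs.dtype)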
# NOTE (mkozuki): `ScaledMaskedSoftmax` somehow doesn't work well with `torch.cuda.amp.custom_fwd`.
# Without `cast_inputs` kwarg, somehow inputs are not cast to dtype used in the autocast context.
# So I needed to manually write two `torch.autograd.Function` inheritances.
# Fused operation which performs the following three operations in sequence:
# 1. Scale the tensor.
# 2. Apply the mask.
# 3. Perform softmax.
class ScaledMaskedSoftmax(torch.autograd.Function):
@staticmethod
def forward(ctx, inputs, mask, scale):
scale_t = torch.tensor([scale])
softmax_results = scaled_masked_softmax_forward(inputs, mask, scale_t[0])
ctx.save_for_backward(softmax_results, scale_t)
return softmax_results
@staticmethod
def backward(ctx, output_grads):
softmax_results, scale_t = ctx.saved_tensors
input_grads = scaled_masked_softmax_backward(output_grads, softmax_results, scale_t[0])
return input_grads, None, None
def scaled_masked_softmax(inputs, mask, scale):
# input is 4D tensor (b, np, sq, sk)
args = _cast_if_autocast_enabled(inputs, mask, scale)
with torch.cuda.amp.autocast(enabled=False):
return ScaledMaskedSoftmax.apply(*args)
class FusedScaleMaskSoftmax(torch.nn.Module):
"""
fused operation: scaling + mask + softmax
Arguments:
        input_in_fp16: flag to indicate if the input is in fp16 data format.
        input_in_bf16: flag to indicate if the input is in bf16 data format.
        attn_mask_type: attention mask type (pad or causal)
        scaled_masked_softmax_fusion: flag to indicate the user wants to use softmax fusion
        mask_func: mask function to be applied.
        softmax_in_fp32: if true, softmax is performed at fp32 precision.
        scale: scaling factor used in input tensor scaling.
"""
def __init__(
self,
input_in_fp16,
input_in_bf16,
attn_mask_type,
scaled_masked_softmax_fusion,
mask_func,
softmax_in_fp32,
scale,
):
super().__init__()
self.input_in_fp16 = input_in_fp16
self.input_in_bf16 = input_in_bf16
if self.input_in_fp16 and self.input_in_bf16:
raise RuntimeError("both fp16 and bf16 flags cannot be active at the same time.")
self.input_in_float16 = self.input_in_fp16 or self.input_in_bf16
self.attn_mask_type = attn_mask_type
self.scaled_masked_softmax_fusion = scaled_masked_softmax_fusion
self.mask_func = mask_func
self.softmax_in_fp32 = softmax_in_fp32
self.scale = scale
if not (self.scale is None or softmax_in_fp32):
raise RuntimeError("softmax should be in fp32 when scaled")
if self.scaled_masked_softmax_fusion:
if self.attn_mask_type == AttnMaskType.causal:
self.fused_softmax_func = scaled_upper_triang_masked_softmax
elif self.attn_mask_type == AttnMaskType.padding:
self.fused_softmax_func = scaled_masked_softmax
else:
raise ValueError("Invalid attn_mask_type.")
def forward(self, input, mask):
# [b, np, sq, sk]
assert input.dim() == 4
if self.is_kernel_available(mask, *input.size()):
return self.forward_fused_softmax(input, mask)
else:
return self.forward_torch_softmax(input, mask)
def is_kernel_available(self, mask, b, np, sq, sk):
attn_batches = b * np
if (
            self.scaled_masked_softmax_fusion  # user wants to fuse
            and self.input_in_float16  # input must be fp16 or bf16
and (
self.attn_mask_type == AttnMaskType.causal
or (self.attn_mask_type == AttnMaskType.padding and mask is not None)
)
            and 16 < sk <= 8192  # sk must be in (16, 8192]
            and sq % 4 == 0  # sq must be a multiple of 4
            and sk % 4 == 0  # sk must be a multiple of 4
            and attn_batches % 4 == 0  # np * b must be a multiple of 4
):
if 0 <= sk <= 8192:
batch_per_block = self.get_batch_per_block(sq, sk, b, np)
if self.attn_mask_type == AttnMaskType.causal:
if attn_batches % batch_per_block == 0:
return True
else:
if sq % batch_per_block == 0:
return True
return False
def forward_fused_softmax(self, input, mask):
# input.shape = [b, np, sq, sk]
scale = self.scale if self.scale is not None else 1.0
return self.fused_softmax_func(input, mask, scale)
def forward_torch_softmax(self, input, mask):
if self.input_in_float16 and self.softmax_in_fp32:
input = input.float()
if self.scale is not None:
input = input * self.scale
mask_output = self.mask_func(input, mask) if mask is not None else input
probs = torch.nn.Softmax(dim=-1)(mask_output)
if self.input_in_float16 and self.softmax_in_fp32:
if self.input_in_fp16:
probs = probs.half()
else:
probs = probs.bfloat16()
return probs
@staticmethod
def get_batch_per_block(sq, sk, b, np):
return scaled_masked_softmax_get_batch_per_block(sq, sk, b, np)
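# Illustrative usage sketch for FusedScaleMaskSoftmax above, not part of the original file.
# It assumes apex and the compiled fused_softmax_lib extension are importable and that a
# CUDA device is available. The mask_func below (fills masked positions with -10000.0) is
# the usual Megatron-style choice, not something mandated by this module.
def _example_fused_scale_mask_softmax():
    softmax = FusedScaleMaskSoftmax(
        input_in_fp16=True,
        input_in_bf16=False,
        attn_mask_type=AttnMaskType.causal,
        scaled_masked_softmax_fusion=True,
        mask_func=lambda scores, mask: scores.masked_fill(mask, -10000.0),
        softmax_in_fp32=True,
        scale=None,
    )
    b, np, sq, sk = 2, 8, 128, 128  # (batch, heads, query len, key len)
    attn_scores = torch.randn(b, np, sq, sk, device="cuda", dtype=torch.float16)
    probs = softmax(attn_scores, mask=None)  # causal path ignores the mask argument
    return probs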
# We use the same API as https://github.com/rwightman/pytorch-image-models/blob/v0.6.11/timm/models/layers/patch_embed.py
# But we use nn.Linear instead of Conv2d and it's about 8x faster.
from functools import partial
import torch.nn as nn
from einops import rearrange
from torch import _assert
from torch.nn.modules.utils import _pair
try:
from flash_attn.ops.fused_dense import FusedDense
except ImportError:
FusedDense = None
class PatchEmbed(nn.Module):
"""2D Image to Patch Embedding"""
def __init__(
self,
img_size=224,
patch_size=16,
in_chans=3,
embed_dim=768,
norm_layer=None,
flatten=True,
bias=True,
fused_bias_fc=False,
):
super().__init__()
img_size = _pair(img_size)
patch_size = _pair(patch_size)
self.img_size = img_size
self.patch_size = patch_size
self.grid_size = (img_size[0] // patch_size[0], img_size[1] // patch_size[1])
self.num_patches = self.grid_size[0] * self.grid_size[1]
self.flatten = flatten
if fused_bias_fc and FusedDense is None:
raise ImportError("fused_dense is not installed")
linear_cls = nn.Linear if not fused_bias_fc or not bias else FusedDense
self.proj = linear_cls(in_chans * patch_size[0] * patch_size[1], embed_dim, bias=bias)
self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity()
def forward(self, x):
_, _, H, W = x.shape
_assert(
H == self.img_size[0],
f"Input image height ({H}) doesn't match model ({self.img_size[0]}).",
)
_assert(
W == self.img_size[1],
f"Input image width ({W}) doesn't match model ({self.img_size[1]}).",
)
x = self.proj(
rearrange(
x,
"b c (h p1) (w p2) -> b h w (c p1 p2)",
p1=self.patch_size[0],
p2=self.patch_size[1],
)
)
if self.flatten:
x = rearrange(x, "b h w c -> b (h w) c")
x = self.norm(x)
return x
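# Illustrative usage sketch for PatchEmbed above, not part of the original file: with the
# defaults below, a 224x224 RGB image is cut into 14 x 14 = 196 non-overlapping 16x16
# patches, each flattened and projected by nn.Linear to a 768-dim embedding.
def _example_patch_embed():
    import torch  # this module only imports torch.nn at the top level

    patch_embed = PatchEmbed(img_size=224, patch_size=16, in_chans=3, embed_dim=768)
    x = torch.randn(2, 3, 224, 224)
    out = patch_embed(x)
    assert out.shape == (2, 196, 768)  # flatten=True => (batch, num_patches, embed_dim)
    return out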
# Copyright (c) 2023, Tri Dao.
import math
from typing import Optional, Tuple, Union
import torch
from einops import rearrange, repeat
from flash_attn.ops.triton.rotary import apply_rotary
def rotate_half(x, interleaved=False):
if not interleaved:
x1, x2 = x.chunk(2, dim=-1)
return torch.cat((-x2, x1), dim=-1)
else:
x1, x2 = x[..., ::2], x[..., 1::2]
return rearrange(torch.stack((-x2, x1), dim=-1), "... d two -> ... (d two)", two=2)
def apply_rotary_emb_torch(x, cos, sin, interleaved=False):
"""
x: (batch_size, seqlen, nheads, headdim)
cos, sin: (seqlen, rotary_dim / 2) or (batch_size, seqlen, rotary_dim / 2)
"""
ro_dim = cos.shape[-1] * 2
assert ro_dim <= x.shape[-1]
cos = repeat(cos, "... d -> ... 1 (2 d)" if not interleaved else "... d -> ... 1 (d 2)")
sin = repeat(sin, "... d -> ... 1 (2 d)" if not interleaved else "... d -> ... 1 (d 2)")
return torch.cat(
[x[..., :ro_dim] * cos + rotate_half(x[..., :ro_dim], interleaved) * sin, x[..., ro_dim:]],
dim=-1,
)
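# Illustrative sketch, not part of the original file: exercises the pure-PyTorch reference
# above. cos/sin have shape (seqlen, rotary_dim / 2); only the first rotary_dim channels of
# the head dimension are rotated, the rest pass through unchanged. The inv_freq schedule
# below mirrors the usual RoFormer choice with base 10000 (an assumption for this sketch).
def _example_apply_rotary_emb_torch():
    batch, seqlen, nheads, headdim, rotary_dim = 2, 16, 4, 64, 32
    x = torch.randn(batch, seqlen, nheads, headdim)
    inv_freq = 1.0 / (
        10000.0 ** (torch.arange(0, rotary_dim, 2, dtype=torch.float32) / rotary_dim)
    )
    angles = torch.outer(torch.arange(seqlen, dtype=torch.float32), inv_freq)
    cos, sin = torch.cos(angles), torch.sin(angles)  # (seqlen, rotary_dim / 2)
    out = apply_rotary_emb_torch(x, cos, sin, interleaved=False)
    assert out.shape == x.shape
    assert torch.equal(out[..., rotary_dim:], x[..., rotary_dim:])  # tail is untouched
    return out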
class ApplyRotaryEmb(torch.autograd.Function):
@staticmethod
def forward(
ctx,
x,
cos,
sin,
interleaved=False,
inplace=False,
seqlen_offsets: Union[int, torch.Tensor] = 0,
cu_seqlens: Optional[torch.Tensor] = None,
max_seqlen: Optional[int] = None,
):
out = apply_rotary(
x,
cos,
sin,
seqlen_offsets=seqlen_offsets,
cu_seqlens=cu_seqlens,
max_seqlen=max_seqlen,
interleaved=interleaved,
inplace=inplace,
)
if isinstance(seqlen_offsets, int):
ctx.save_for_backward(cos, sin, cu_seqlens) # Can't save int with save_for_backward
ctx.seqlen_offsets = seqlen_offsets
else:
ctx.save_for_backward(cos, sin, cu_seqlens, seqlen_offsets)
ctx.seqlen_offsets = None
ctx.interleaved = interleaved
ctx.inplace = inplace
ctx.max_seqlen = max_seqlen
return out if not inplace else x
@staticmethod
def backward(ctx, do):
seqlen_offsets = ctx.seqlen_offsets
if seqlen_offsets is None:
cos, sin, cu_seqlens, seqlen_offsets = ctx.saved_tensors
else:
cos, sin, cu_seqlens = ctx.saved_tensors
# TD [2023-09-02]: For some reason Triton (2.0.0.post1) errors with
# "[CUDA]: invalid device context", and cloning makes it work. Idk why. Triton 2.1.0 works.
if not ctx.interleaved and not ctx.inplace:
do = do.clone()
dx = apply_rotary(
do,
cos,
sin,
seqlen_offsets=seqlen_offsets,
cu_seqlens=cu_seqlens,
max_seqlen=ctx.max_seqlen,
interleaved=ctx.interleaved,
inplace=ctx.inplace,
conjugate=True,
)
return dx, None, None, None, None, None, None, None
def apply_rotary_emb(
x,
cos,
sin,
interleaved=False,
inplace=False,
seqlen_offsets: Union[int, torch.Tensor] = 0,
cu_seqlens: Optional[torch.Tensor] = None,
max_seqlen: Optional[int] = None,
):
"""
Arguments:
x: (batch_size, seqlen, nheads, headdim) if cu_seqlens is None
else (total_seqlen, nheads, headdim)
cos, sin: (seqlen_rotary, rotary_dim / 2)
interleaved: if True, rotate pairs of even and odd dimensions (GPT-J style) instead
of 1st half and 2nd half (GPT-NeoX style).
inplace: if True, apply rotary embedding in-place.
seqlen_offsets: (batch_size,) or int. Each sequence in x is shifted by this amount.
Most commonly used in inference when we have KV cache.
cu_seqlens: (batch + 1,) or None
max_seqlen: int
Return:
out: (batch_size, seqlen, nheads, headdim) if cu_seqlens is None
else (total_seqlen, nheads, headdim)
rotary_dim must be <= headdim
Apply rotary embedding to the first rotary_dim of x.
"""
return ApplyRotaryEmb.apply(
x, cos, sin, interleaved, inplace, seqlen_offsets, cu_seqlens, max_seqlen
)
# For backward compatibility
apply_rotary_emb_func = apply_rotary_emb
class ApplyRotaryEmbQKV_(torch.autograd.Function):
@staticmethod
def forward(
ctx,
qkv,
cos,
sin,
cos_k=None,
sin_k=None,
interleaved=False,
seqlen_offsets: Union[int, torch.Tensor] = 0,
):
batch, seqlen, three, nheads, headdim = qkv.shape
assert three == 3
if cos_k is None and sin_k is None and qkv.is_contiguous():
# Call 1 kernel instead of 2 kernels
# We need qkv to be contiguous so that when we reshape to combine (3, nheads)
# dimensions, we get the same tensor
# qk = rearrange(qkv[:, :, :2], "b s t h d -> b s (t h) d")
qk = qkv[:, :, :2].reshape(batch, seqlen, -1, headdim)
apply_rotary(
qk, cos, sin, seqlen_offsets=seqlen_offsets, interleaved=interleaved, inplace=True
)
else:
cos_k = cos if cos_k is None else cos_k
sin_k = sin if sin_k is None else sin_k
q, k = qkv[:, :, 0], qkv[:, :, 1]
apply_rotary(q, cos, sin, seqlen_offsets, interleaved=interleaved, inplace=True)
apply_rotary(k, cos_k, sin_k, seqlen_offsets, interleaved=interleaved, inplace=True)
        if isinstance(seqlen_offsets, int):
            ctx.save_for_backward(cos, sin, cos_k, sin_k)  # Can't save int with save_for_backward
ctx.seqlen_offsets = seqlen_offsets
else:
ctx.save_for_backward(cos, sin, cos_k, sin_k, seqlen_offsets)
ctx.seqlen_offsets = None
ctx.interleaved = interleaved
return qkv
@staticmethod
def backward(ctx, dqkv):
seqlen_offsets = ctx.seqlen_offsets
if seqlen_offsets is None:
cos, sin, cos_k, sin_k, seqlen_offsets = ctx.saved_tensors
else:
cos, sin, cos_k, sin_k = ctx.saved_tensors
if cos_k is None and sin_k is None and dqkv.is_contiguous():
# Call 1 kernel instead of 2 kernels
# We need dqkv to be contiguous so that when we reshape to combine (3, nheads)
# dimensions, we get the same tensor
dqk = rearrange(dqkv[:, :, :2], "b s t h d -> b s (t h) d")
apply_rotary(
dqk,
cos,
sin,
seqlen_offsets=seqlen_offsets,
interleaved=ctx.interleaved,
inplace=True,
conjugate=True,
)
else:
cos_k = cos if cos_k is None else cos_k
sin_k = sin if sin_k is None else sin_k
dq, dk = dqkv[:, :, 0], dqkv[:, :, 1]
apply_rotary(
dq, cos, sin, seqlen_offsets, interleaved=ctx.interleaved, inplace=True, conjugate=True
)
apply_rotary(
dk,
cos_k,
sin_k,
seqlen_offsets,
interleaved=ctx.interleaved,
inplace=True,
conjugate=True,
)
return dqkv, None, None, None, None, None, None
def apply_rotary_emb_qkv_(
qkv,
cos,
sin,
cos_k=None,
sin_k=None,
interleaved=False,
seqlen_offsets: Union[int, torch.Tensor] = 0,
):
"""
Arguments:
qkv: (batch_size, seqlen, 3, nheads, headdim)
cos, sin: (seqlen, rotary_dim / 2)
cos_k, sin_k: (seqlen, rotary_dim / 2), optional
interleaved: if True, rotate pairs of even and odd dimensions (GPT-J style) instead of
1st half and 2nd half (GPT-NeoX style).
seqlen_offsets: (batch_size,) or int. Each sequence in Q and K is shifted by this amount.
Most commonly used in inference when we have KV cache.
Return:
qkv: (batch_size, seqlen, 3, nheads, headdim)
rotary_dim must be <= headdim
Apply rotary embedding *inplace* to the first rotary_dim of Q and K.
"""
return ApplyRotaryEmbQKV_.apply(qkv, cos, sin, cos_k, sin_k, interleaved, seqlen_offsets)
class ApplyRotaryEmbKV_(torch.autograd.Function):
@staticmethod
def forward(ctx, kv, cos, sin, interleaved=False, seqlen_offsets: Union[int, torch.Tensor] = 0):
batch, seqlen, two, nheads, headdim = kv.shape
assert two == 2
k = kv[:, :, 0]
apply_rotary(
k, cos, sin, seqlen_offsets=seqlen_offsets, interleaved=interleaved, inplace=True
)
if isinstance(seqlen_offsets, int):
ctx.save_for_backward(cos, sin) # Can't save int with save_for_backward
ctx.seqlen_offsets = seqlen_offsets
else:
ctx.save_for_backward(cos, sin, seqlen_offsets)
ctx.seqlen_offsets = None
ctx.interleaved = interleaved
return kv
@staticmethod
def backward(ctx, dkv):
seqlen_offsets = ctx.seqlen_offsets
if seqlen_offsets is None:
cos, sin, seqlen_offsets = ctx.saved_tensors
else:
cos, sin = ctx.saved_tensors
apply_rotary(
dkv[:, :, 0],
cos,
sin,
seqlen_offsets=seqlen_offsets,
interleaved=ctx.interleaved,
inplace=True,
conjugate=True,
)
return dkv, None, None, None, None
def apply_rotary_emb_kv_(
kv,
cos,
sin,
interleaved=False,
seqlen_offsets: Union[int, torch.Tensor] = 0,
):
"""
Arguments:
kv: (batch_size, seqlen, 2, nheads, headdim)
cos, sin: (seqlen, rotary_dim / 2)
interleaved: if True, rotate pairs of even and odd dimensions (GPT-J style) instead of
1st half and 2nd half (GPT-NeoX style).
seqlen_offsets: (batch_size,) or int. Each sequence in Q and K is shifted by this amount.
Most commonly used in inference when we have KV cache.
Return:
kv: (batch_size, seqlen, 2, nheads, headdim)
rotary_dim must be <= headdim
Apply rotary embedding *inplace* to the first rotary_dim of K.
"""
return ApplyRotaryEmbKV_.apply(kv, cos, sin, interleaved, seqlen_offsets)
class RotaryEmbedding(torch.nn.Module):
"""
    The rotary position embeddings from RoFormer_ (Su et al.).
A crucial insight from the method is that the query and keys are
transformed by rotation matrices which depend on the relative positions.
    Other implementations are available in the Rotary Transformer repo_ and in
    GPT-NeoX_; GPT-NeoX was an inspiration for this implementation.
.. _RoFormer: https://arxiv.org/abs/2104.09864
.. _repo: https://github.com/ZhuiyiTechnology/roformer
.. _GPT-NeoX: https://github.com/EleutherAI/gpt-neox
If scale_base is not None, this implements XPos (Sun et al., https://arxiv.org/abs/2212.10554).
A recommended value for scale_base is 512: https://github.com/HazyResearch/flash-attention/issues/96
Reference: https://github.com/sunyt32/torchscale/blob/main/torchscale/component/xpos_relative_position.py
"""
def __init__(
self,
dim: int,
base=10000.0,
interleaved=False,
scale_base=None,
pos_idx_in_fp32=True,
device=None,
):
"""
interleaved: if True, rotate pairs of even and odd dimensions (GPT-J style) instead
of 1st half and 2nd half (GPT-NeoX style).
pos_idx_in_fp32: if True, the position indices [0.0, ..., seqlen - 1] are in fp32,
otherwise they might be in lower precision.
This option was added because previously (before 2023-07-02), when we construct
the position indices, we use the dtype of self.inv_freq. In most cases this would
be fp32, but if the model is trained in pure bf16 (not mixed precision), then
self.inv_freq would be bf16, and the position indices are also in bf16.
Because of the limited precision of bf16 (e.g. 1995.0 is rounded to 2000.0), the
embeddings for some positions will coincide.
To maintain compatibility with models previously trained in pure bf16,
we add this option.
"""
super().__init__()
self.dim = dim
self.base = float(base)
self.pos_idx_in_fp32 = pos_idx_in_fp32
# Generate and save the inverse frequency buffer (non trainable)
inv_freq = self._compute_inv_freq(device)
self.register_buffer("inv_freq", inv_freq, persistent=False)
self.interleaved = interleaved
self.scale_base = scale_base
scale = (
(torch.arange(0, dim, 2, device=device, dtype=torch.float32) + 0.4 * dim) / (1.4 * dim)
if scale_base is not None
else None
)
self.register_buffer("scale", scale, persistent=False)
self._seq_len_cached = 0
self._cos_cached = None
self._sin_cached = None
self._cos_k_cached = None
self._sin_k_cached = None
def _compute_inv_freq(self, device=None):
return 1.0 / (
self.base
** (torch.arange(0, self.dim, 2, device=device, dtype=torch.float32) / self.dim)
)
def _update_cos_sin_cache(self, seqlen, device=None, dtype=None):
# Reset the tables if the sequence length has changed,
# if we're on a new device (possibly due to tracing for instance),
# or if we're switching from inference mode to training
if (
seqlen > self._seq_len_cached
or self._cos_cached is None
or self._cos_cached.device != device
or self._cos_cached.dtype != dtype
or (self.training and self._cos_cached.is_inference())
):
self._seq_len_cached = seqlen
# We want fp32 here, not self.inv_freq.dtype, since the model could be loaded in bf16
# And the output of arange can be quite large, so bf16 would lose a lot of precision.
# However, for compatibility reason, we add an option to use the dtype of self.inv_freq.
if self.pos_idx_in_fp32:
t = torch.arange(seqlen, device=device, dtype=torch.float32)
# We want fp32 here as well since inv_freq will be multiplied with t, and the output
# will be large. Having it in bf16 will lose a lot of precision and cause the
# cos & sin output to change significantly.
# We want to recompute self.inv_freq if it was not loaded in fp32
if self.inv_freq.dtype != torch.float32:
inv_freq = self._compute_inv_freq(device=device)
else:
inv_freq = self.inv_freq
else:
t = torch.arange(seqlen, device=device, dtype=self.inv_freq.dtype)
inv_freq = self.inv_freq
# Don't do einsum, it converts fp32 to fp16 under AMP
# freqs = torch.einsum("i,j->ij", t, self.inv_freq)
freqs = torch.outer(t, inv_freq)
if self.scale is None:
self._cos_cached = torch.cos(freqs).to(dtype)
self._sin_cached = torch.sin(freqs).to(dtype)
else:
power = (
torch.arange(seqlen, dtype=self.scale.dtype, device=self.scale.device)
- seqlen // 2
) / self.scale_base
scale = self.scale.to(device=power.device) ** rearrange(power, "s -> s 1")
# We want the multiplication by scale to happen in fp32
self._cos_cached = (torch.cos(freqs) * scale).to(dtype)
self._sin_cached = (torch.sin(freqs) * scale).to(dtype)
self._cos_k_cached = (torch.cos(freqs) / scale).to(dtype)
self._sin_k_cached = (torch.sin(freqs) / scale).to(dtype)
def forward(
self,
qkv: torch.Tensor,
kv: Optional[torch.Tensor] = None,
seqlen_offset: Union[int, torch.Tensor] = 0,
max_seqlen: Optional[int] = None,
) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
"""
        qkv: (batch, seqlen, 3, nheads, headdim) if kv is None,
else it's just q of shape (batch, seqlen, nheads, headdim)
kv: (batch, seqlen, 2, nheads, headdim)
seqlen_offset: (batch_size,) or int. Each sequence in x is shifted by this amount.
Most commonly used in inference when we have KV cache.
If it's a tensor of shape (batch_size,), then to update the cos / sin cache, one
should pass in max_seqlen, which will update the cos / sin cache up to that length.
Apply rotary embedding *inplace* to qkv and / or kv.
"""
seqlen = qkv.shape[1]
if max_seqlen is not None:
self._update_cos_sin_cache(max_seqlen, device=qkv.device, dtype=qkv.dtype)
elif isinstance(seqlen_offset, int):
self._update_cos_sin_cache(seqlen + seqlen_offset, device=qkv.device, dtype=qkv.dtype)
if kv is None:
if self.scale is None:
return apply_rotary_emb_qkv_(
qkv,
self._cos_cached,
self._sin_cached,
interleaved=self.interleaved,
seqlen_offsets=seqlen_offset,
)
else:
return apply_rotary_emb_qkv_(
qkv,
self._cos_cached,
self._sin_cached,
self._cos_k_cached,
self._sin_k_cached,
interleaved=self.interleaved,
seqlen_offsets=seqlen_offset,
)
else:
q = qkv
q = apply_rotary_emb_func(
q,
self._cos_cached,
self._sin_cached,
interleaved=self.interleaved,
inplace=True,
seqlen_offsets=seqlen_offset,
)
if self.scale is None:
kv = apply_rotary_emb_kv_(
kv,
self._cos_cached,
self._sin_cached,
interleaved=self.interleaved,
seqlen_offsets=seqlen_offset,
)
else:
kv = apply_rotary_emb_kv_(
kv,
self._cos_k_cached,
self._sin_k_cached,
interleaved=self.interleaved,
seqlen_offsets=seqlen_offset,
)
return q, kv
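# Illustrative usage sketch for RotaryEmbedding above, not part of the original file.
# RotaryEmbedding rotates q and k in place via the Triton apply_rotary kernel, so this
# sketch assumes a CUDA device with Triton installed; shapes follow the forward() docstring
# (here dim == headdim, so the full head dimension is rotated).
def _example_rotary_embedding():
    batch, seqlen, nheads, headdim = 2, 128, 8, 64
    rotary_emb = RotaryEmbedding(dim=headdim, interleaved=False, device="cuda")
    qkv = torch.randn(batch, seqlen, 3, nheads, headdim, device="cuda", dtype=torch.float16)
    qkv = rotary_emb(qkv, seqlen_offset=0)  # rotated in place; same (b, s, 3, h, d) shape
    return qkv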
# Copyright (c) 2023, Tri Dao.
import torch
import torch.nn as nn
from flash_attn.ops.triton.cross_entropy import cross_entropy_loss
class CrossEntropyLoss(nn.Module):
def __init__(
self,
ignore_index=-100,
reduction="mean",
label_smoothing=0.0,
logit_scale=1.0,
lse_square_scale=0.0,
inplace_backward=False,
process_group=None,
return_z_loss=False,
):
"""
Arguments:
ignore_index: int. If labels == ignore_index, the loss is set to 0.0.
label_smoothing: float
lse_square_scale: float. If > 0, we add lse_square_scale * lse(logits) ^ 2 to the loss.
This is also referred to as "z-loss".
inplace_backward: bool. If True, we do the backward pass in-place by modifying the logits.
This saves memory.
process_group: if not None, we're doing Tensor Parallel: each process is responsible for
one part of the vocab. The loss will be aggregated across processes.
return_z_loss: bool. If True, we return the component of the loss contributed by
the lse_square_scale value. This value is only for logging and does not support
backprop.
"""
super().__init__()
if reduction not in ["mean", "none", "sum"]:
raise NotImplementedError("Only support reduction = 'mean' or 'none' or 'sum'")
self.ignore_index = ignore_index
self.reduction = reduction
self.label_smoothing = label_smoothing
self.logit_scale = logit_scale
self.lse_square_scale = lse_square_scale
self.inplace_backward = inplace_backward
self.process_group = process_group
self.return_z_loss = return_z_loss
def forward(self, input, target):
"""
Arguments:
input: (batch, vocab_size)
target: (batch,)
Returns:
losses: (batch,) if reduction is 'none', else (1,), dtype float
z_loss: (batch,) if reduction is 'none', else (1,), dtype float (if self.return_z_loss)
"""
assert input.is_cuda and target.is_cuda, "Only support CUDA tensors"
loss, z_loss = cross_entropy_loss(
input,
target,
label_smoothing=self.label_smoothing,
logit_scale=self.logit_scale,
lse_square_scale=self.lse_square_scale,
ignore_index=self.ignore_index,
inplace_backward=self.inplace_backward,
process_group=self.process_group,
)
if self.reduction == "mean":
loss = loss.sum() / (target != self.ignore_index).sum()
elif self.reduction == "sum":
loss = loss.sum()
else:
loss = loss
if not self.return_z_loss:
return loss
if self.reduction == "mean":
z_loss = z_loss.sum() / (target != self.ignore_index).sum()
elif self.reduction == "sum":
z_loss = z_loss.sum()
else:
z_loss = z_loss
return loss, z_loss
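# Illustrative usage sketch for CrossEntropyLoss above, not part of the original file.
# The Triton cross-entropy kernel only accepts CUDA tensors (see the assert in forward);
# logits are (batch, vocab_size) and targets (batch,), and inplace_backward=True reuses
# the logits buffer for the gradient to save memory.
def _example_cross_entropy_loss():
    batch, vocab_size = 4, 32000
    logits = torch.randn(batch, vocab_size, device="cuda", requires_grad=True)
    labels = torch.randint(0, vocab_size, (batch,), device="cuda")
    loss_fn = CrossEntropyLoss(reduction="mean", inplace_backward=True)
    loss = loss_fn(logits, labels)
    loss.backward()
    return loss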
# Copyright (c) 2023, GGGGGGXY, Tri Dao.
import math
import json
import re
from pathlib import Path
from collections import OrderedDict
import torch
import torch.nn.functional as F
from einops import rearrange
from transformers import GPT2Config, AutoConfig, PretrainedConfig
def remap_state_dict_hf_baichuan(state_dict, config):
def key_mapping_layers(key):
return re.sub(r"^model.", "transformer.", key)
state_dict = OrderedDict((key_mapping_layers(k), v) for k, v in state_dict.items())
# Word embedding
def key_mapping_emb(key):
return re.sub(
r"^transformer.embed_tokens.",
"transformer.embeddings.word_embeddings.",
key,
)
state_dict = OrderedDict((key_mapping_emb(k), v) for k, v in state_dict.items())
word_embeddings = state_dict.pop("transformer.embeddings.word_embeddings.weight")
# It's possible that vocab_size is padded to be a multiple of 8, for example.
pad_vocab_size_multiple = getattr(config, "pad_vocab_size_multiple", 1)
vocab_size = (
math.ceil(word_embeddings.shape[0] / pad_vocab_size_multiple)
* pad_vocab_size_multiple
)
state_dict["transformer.embeddings.word_embeddings.weight"] = F.pad(
word_embeddings, (0, 0, 0, vocab_size - word_embeddings.shape[0])
)
if getattr(config, "tie_word_embeddings"):
state_dict["lm_head.weight"] = state_dict[
"transformer.embeddings.word_embeddings.weight"
]
else:
output_embeddings = state_dict.pop("lm_head.weight")
# Need to recompute vocab_size since Baichuan shards the word embeddings and output embeddings
# differently.
vocab_size = (
math.ceil(output_embeddings.shape[0] / pad_vocab_size_multiple)
* pad_vocab_size_multiple
)
# It's possible that vocab_size is padded to be a multiple of 8, for example.
state_dict["lm_head.weight"] = F.pad(
output_embeddings, (0, 0, 0, vocab_size - output_embeddings.shape[0])
)
# LayerNorm
def key_mapping_ln(key):
key = re.sub(r"^transformer.norm.", r"transformer.ln_f.", key)
key = re.sub(
r"^transformer.layers.(\d+).input_layernorm.",
r"transformer.layers.\1.norm1.",
key,
)
key = re.sub(
r"^transformer.layers.(\d+).post_attention_layernorm.",
r"transformer.layers.\1.norm2.",
key,
)
return key
state_dict = OrderedDict((key_mapping_ln(k), v) for k, v in state_dict.items())
# MLP
for l in range(config.n_layer):
w1 = state_dict.pop(f"transformer.layers.{l}.mlp.gate_proj.weight")
w3 = state_dict.pop(f"transformer.layers.{l}.mlp.up_proj.weight")
# Our ordering is different
state_dict[f"transformer.layers.{l}.mlp.fc1.weight"] = torch.cat(
[w3, w1], dim=0
)
def key_mapping_mlp(key):
return re.sub(
r"^transformer.layers.(\d+).mlp.down_proj.",
r"transformer.layers.\1.mlp.fc2.",
key,
)
state_dict = OrderedDict((key_mapping_mlp(k), v) for k, v in state_dict.items())
# Attention
def key_mapping_attn(key):
key = re.sub(
r"^transformer.layers.(\d+).self_attn.W_pack.",
r"transformer.layers.\1.mixer.Wqkv.",
key,
)
key = re.sub(
r"^transformer.layers.(\d+).self_attn.o_proj.",
r"transformer.layers.\1.mixer.out_proj.",
key,
)
return key
state_dict = OrderedDict((key_mapping_attn(k), v) for k, v in state_dict.items())
for l in range(config.n_layer):
# pop rotary_emb.inv_freq from state dict
state_dict.pop(f"transformer.layers.{l}.self_attn.rotary_emb.inv_freq", None)
return state_dict
def baichuan_config_to_gpt2_config(baichuan_config: PretrainedConfig) -> GPT2Config:
    # HACK: the config doesn't say whether it's rotary or alibi,
    # so we have to infer from the hidden size (7B -> rotary, 13B -> alibi).
    # HACK: the config doesn't say whether it uses norm head,
    # so we have to infer from the vocab size
    # (v1, vocab size 64k, no norm head; v2, vocab size 128k, norm head).
use_rotary = baichuan_config.hidden_size < 5000
return GPT2Config(
vocab_size=baichuan_config.vocab_size,
n_positions=0, # No absolute position embedding
n_embd=baichuan_config.hidden_size,
n_layer=baichuan_config.num_hidden_layers,
n_head=baichuan_config.num_attention_heads,
n_inner=baichuan_config.intermediate_size,
activation_function="swiglu", # Hardcode since HF calls it 'silu'
# baichuan doesn't have dropout, idk if it's because they only release the inference code
resid_pdrop=0.0,
embd_pdrop=0.0,
attn_pdrop=0.0,
layer_norm_epsilon=baichuan_config.rms_norm_eps,
initializer_range=baichuan_config.initializer_range,
bos_token_id=baichuan_config.bos_token_id,
eos_token_id=baichuan_config.eos_token_id,
# These are new arguments not in the original GPT2Config
pad_token_id=baichuan_config.pad_token_id, # Idk if this does anything
rms_norm=True,
rotary_emb_fraction=1.0 if use_rotary else 0.0,
rotary_emb_interleaved=False,
use_alibi=not use_rotary,
use_flash_attn=not use_rotary, # Alibi code path requires flash_attn
tie_word_embeddings=False,
norm_head=baichuan_config.vocab_size > 70000,
qkv_proj_bias=False,
out_proj_bias=False,
mlp_fc1_bias=False,
mlp_fc2_bias=False,
)
# Copyright (c) 2022, Tri Dao.
# This BERT implementation is based on our MLPerf 2.0 and MLPerf 2.1 BERT implementation.
# https://github.com/mlcommons/training_results_v2.0/blob/main/HazyResearch/benchmarks/bert/implementations/pytorch/modeling.py
# https://github.com/mlcommons/training_results_v2.1/blob/main/Azure-HazyResearch/benchmarks/bert/implementations/ND96amsr_A100_v4/modeling.py
# Inspired by https://github.com/huggingface/transformers/blob/main/src/transformers/models/bert/modeling_bert.py
import logging
import re
from collections import OrderedDict
from collections.abc import Sequence
from functools import partial
from typing import Any, Mapping
import torch
import torch.nn as nn
import torch.nn.functional as F
from einops import rearrange
from transformers import BertConfig, PretrainedConfig
from transformers.models.bert.modeling_bert import (
BaseModelOutputWithPoolingAndCrossAttentions,
BertForPreTrainingOutput,
)
from flash_attn.bert_padding import (
index_first_axis,
index_first_axis_residual,
pad_input,
unpad_input,
)
from flash_attn.modules.block import Block
from flash_attn.modules.embedding import BertEmbeddings
from flash_attn.modules.mha import MHA
from flash_attn.modules.mlp import FusedMLP, Mlp
from flash_attn.utils.pretrained import state_dict_from_pretrained
try:
from flash_attn.ops.fused_dense import FusedDense
except ImportError:
FusedDense = None
try:
from flash_attn.ops.triton.layer_norm import layer_norm_fn
except ImportError:
layer_norm_fn = None
try:
from flash_attn.losses.cross_entropy import CrossEntropyLoss
except ImportError:
CrossEntropyLoss = None
logger = logging.getLogger(__name__)
def create_mixer_cls(config, cross_attn=False, return_residual=False):
use_flash_attn = getattr(config, "use_flash_attn", False)
fused_bias_fc = getattr(config, "fused_bias_fc", False)
rotary_kwargs = {}
if config.position_embedding_type == "rotary":
rotary_kwargs["rotary_emb_dim"] = getattr(config, "rotary_emb_dim", config.hidden_size)
rotary_kwargs["rotary_emb_base"] = getattr(config, "rotary_emb_base", 10000.0)
rotary_kwargs["rotary_emb_scale_base"] = getattr(config, "rotary_emb_scale_base", None)
rotary_kwargs["rotary_emb_interleaved"] = getattr(config, "rotary_emb_interleaved", False)
mixer_cls = partial(
MHA,
num_heads=config.num_attention_heads,
cross_attn=cross_attn,
dropout=config.attention_probs_dropout_prob,
causal=False,
fused_bias_fc=fused_bias_fc,
use_flash_attn=use_flash_attn,
return_residual=return_residual,
**rotary_kwargs,
)
return mixer_cls
def create_mlp_cls(config, layer_idx=None, return_residual=False):
inner_dim = config.intermediate_size
fused_mlp = getattr(config, "fused_mlp", False)
if fused_mlp:
assert config.hidden_act in ["gelu_new", "gelu_fast", "gelu_pytorch_tanh"], (
"fused_mlp only " "supports approximate gelu"
)
if not fused_mlp:
approximate = (
"tanh"
if config.hidden_act in ["gelu_new", "gelu_fast", "gelu_pytorch_tanh"]
else "none"
)
mlp_cls = partial(
Mlp,
hidden_features=inner_dim,
activation=partial(F.gelu, approximate=approximate),
return_residual=return_residual,
)
else:
if FusedMLP is None:
raise ImportError("fused_dense is not installed")
mlp_checkpoint_lvl = getattr(config, "mlp_checkpoint_lvl", 0)
# mlp_checkpoint_lvl could be a list, which contains the checkpoint_lvl for each layer
if isinstance(mlp_checkpoint_lvl, Sequence):
assert layer_idx is not None
mlp_checkpoint_lvl = mlp_checkpoint_lvl[layer_idx]
mlp_cls = partial(
FusedMLP,
hidden_features=inner_dim,
checkpoint_lvl=mlp_checkpoint_lvl,
return_residual=return_residual,
)
return mlp_cls
def create_block(config, layer_idx=None):
last_layer_subset = getattr(config, "last_layer_subset", False)
cross_attn = last_layer_subset and layer_idx == config.num_hidden_layers - 1
# TD [2022-12-19]: For cross attention (last layer), we actually want to return the
# residual x_kv, not residual x. But it's annoying to change the API (and it only affects
# one layer) so we just choose not to return residual in this case.
return_residual = not cross_attn
mixer_cls = create_mixer_cls(config, cross_attn, return_residual=return_residual)
mlp_cls = create_mlp_cls(config, layer_idx, return_residual=return_residual)
norm_cls = partial(nn.LayerNorm, eps=config.layer_norm_eps)
block = Block(
config.hidden_size,
mixer_cls,
mlp_cls,
norm_cls=norm_cls,
prenorm=False,
resid_dropout1=config.hidden_dropout_prob,
resid_dropout2=config.hidden_dropout_prob,
fused_dropout_add_ln=getattr(config, "fused_dropout_add_ln", False),
return_residual=return_residual,
)
return block
# https://github.com/huggingface/transformers/blob/7032e0203262ebb2ebf55da8d2e01f873973e835/src/transformers/models/bert/modeling_bert.py#L748
def _init_weights(module, initializer_range=0.02):
if isinstance(module, nn.Linear):
nn.init.normal_(module.weight, std=initializer_range)
if module.bias is not None:
nn.init.zeros_(module.bias)
elif isinstance(module, nn.Embedding):
nn.init.normal_(module.weight, std=initializer_range)
if module.padding_idx is not None:
nn.init.zeros_(module.weight[module.padding_idx])
class BertEncoder(nn.Module):
def __init__(self, config: BertConfig):
super().__init__()
self.use_flash_attn = getattr(config, "use_flash_attn", False)
self.layers = nn.ModuleList(
[create_block(config, layer_idx=i) for i in range(config.num_hidden_layers)]
)
def forward(self, hidden_states, key_padding_mask=None, subset_mask=None):
"""If subset_mask is not None, we only want output for the subset of the sequence.
This means that we only compute the last layer output for these tokens.
subset_mask: (batch, seqlen), dtype=torch.bool
"""
if key_padding_mask is None or not self.use_flash_attn:
mixer_kwargs = (
{"key_padding_mask": key_padding_mask} if key_padding_mask is not None else None
)
for layer in self.layers:
hidden_states = layer(hidden_states, mixer_kwargs=mixer_kwargs)
if subset_mask is not None:
hidden_states = hidden_states[subset_mask]
else:
batch, seqlen = hidden_states.shape[:2]
hidden_states, indices, cu_seqlens, max_seqlen_in_batch = unpad_input(
hidden_states, key_padding_mask
)
mixer_kwargs = {"cu_seqlens": cu_seqlens, "max_seqlen": max_seqlen_in_batch}
if subset_mask is None:
for layer in self.layers:
hidden_states = layer(hidden_states, mixer_kwargs=mixer_kwargs)
hidden_states = pad_input(hidden_states, indices, batch, seqlen)
else:
for layer in self.layers[:-1]:
hidden_states = layer(hidden_states, mixer_kwargs=mixer_kwargs)
if key_padding_mask is not None:
subset_idx = torch.nonzero(
subset_mask[key_padding_mask], as_tuple=False
).flatten()
subset_seqlens = (subset_mask & key_padding_mask).sum(dim=-1, dtype=torch.int32)
subset_cu_seqlens = F.pad(
                        torch.cumsum(subset_seqlens, dim=0, dtype=torch.int32), (1, 0)
)
else:
subset_idx = torch.nonzero(subset_mask, as_tuple=False).flatten()
subset_seqlens = subset_mask.sum(dim=-1, dtype=torch.int32)
subset_cu_seqlens = F.pad(
                        torch.cumsum(subset_seqlens, dim=0, dtype=torch.int32), (1, 0)
)
hidden_states_subset, hidden_states = index_first_axis_residual(
hidden_states, subset_idx
)
# It's ok to set max_seqlen_q to be much larger
mixer_kwargs = {
"x_kv": hidden_states,
"cu_seqlens": subset_cu_seqlens,
"max_seqlen": max_seqlen_in_batch,
"cu_seqlens_k": cu_seqlens,
"max_seqlen_k": max_seqlen_in_batch,
}
hidden_states = self.layers[-1](hidden_states_subset, mixer_kwargs=mixer_kwargs)
return hidden_states
class BertPooler(nn.Module):
def __init__(self, config):
super().__init__()
fused_bias_fc = getattr(config, "fused_bias_fc", False)
if fused_bias_fc and FusedDense is None:
raise ImportError("fused_dense is not installed")
linear_cls = nn.Linear if not fused_bias_fc else FusedDense
self.dense = linear_cls(config.hidden_size, config.hidden_size)
self.activation = nn.Tanh()
def forward(self, hidden_states, pool=True):
# We "pool" the model by simply taking the hidden state corresponding
# to the first token.
first_token_tensor = hidden_states[:, 0] if pool else hidden_states
pooled_output = self.dense(first_token_tensor)
pooled_output = self.activation(pooled_output)
return pooled_output
class BertPredictionHeadTransform(nn.Module):
def __init__(self, config):
super().__init__()
fused_bias_fc = getattr(config, "fused_bias_fc", False)
if fused_bias_fc and FusedDense is None:
raise ImportError("fused_dense is not installed")
self.fused_dropout_add_ln = getattr(config, "fused_dropout_add_ln", False)
if self.fused_dropout_add_ln and layer_norm_fn is None:
raise ImportError("Triton is not installed")
linear_cls = nn.Linear if not fused_bias_fc else FusedDense
self.dense = linear_cls(config.hidden_size, config.hidden_size)
approximate = (
"tanh"
if config.hidden_act in ["gelu_new", "gelu_fast", "gelu_pytorch_tanh"]
else "none"
)
self.transform_act_fn = nn.GELU(approximate=approximate)
self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.transform_act_fn(hidden_states)
if not self.fused_dropout_add_ln:
hidden_states = self.layer_norm(hidden_states)
else:
hidden_states = layer_norm_fn(
hidden_states, self.layer_norm.weight, self.layer_norm.bias, eps=self.layer_norm.eps
)
return hidden_states
class BertLMPredictionHead(nn.Module):
def __init__(self, config):
super().__init__()
fused_bias_fc = getattr(config, "fused_bias_fc", False)
if fused_bias_fc and FusedDense is None:
raise ImportError("fused_dense is not installed")
linear_cls = nn.Linear if not fused_bias_fc else FusedDense
self.transform = BertPredictionHeadTransform(config)
# The output weights are the same as the input embeddings, but there is
# an output-only bias for each token.
self.decoder = linear_cls(config.hidden_size, config.vocab_size, bias=True)
def forward(self, hidden_states):
hidden_states = self.transform(hidden_states)
hidden_states = self.decoder(hidden_states)
return hidden_states
class BertPreTrainingHeads(nn.Module):
def __init__(self, config):
super().__init__()
self.predictions = BertLMPredictionHead(config)
self.seq_relationship = nn.Linear(config.hidden_size, 2)
def forward(self, sequence_output, pooled_output):
prediction_scores = self.predictions(sequence_output)
seq_relationship_score = self.seq_relationship(pooled_output)
return prediction_scores, seq_relationship_score
class BertPreTrainedModel(nn.Module):
"""An abstract class to handle weights initialization and
    a simple interface for downloading and loading pretrained models.
"""
def __init__(self, config, *inputs, **kwargs):
super().__init__()
if not isinstance(config, BertConfig):
raise ValueError(
"Parameter config in `{}(config)` should be an instance of class `BertConfig`. "
"To create a model from a Google pretrained model use "
"`model = {}.from_pretrained(PRETRAINED_MODEL_NAME)`".format(
self.__class__.__name__, self.__class__.__name__
)
)
self.config = config
@classmethod
def from_pretrained(cls, model_name, config, *inputs, **kwargs):
"""
Instantiate a BertPreTrainedModel from a pre-trained model file or a pytorch state dict.
Download and cache the pre-trained model file if needed.
Params:
            model_name: either:
- a path or url to a pretrained model archive containing:
. `bert_config.json` a configuration file for the model
. `pytorch_model.bin` a PyTorch dump of a BertForPretraining instance
- a path or url to a pretrained model archive containing:
. `bert_config.json` a configuration file for the model
. `model.chkpt` a TensorFlow checkpoint
*inputs, **kwargs: additional input for the specific Bert class
(ex: num_labels for BertForSequenceClassification)
"""
# Instantiate model.
model = cls(config, *inputs, **kwargs)
load_return = model.load_state_dict(
remap_state_dict(state_dict_from_pretrained(model_name), config), strict=False
)
logger.info(load_return)
return model
class BertModel(BertPreTrainedModel):
def __init__(self, config: BertConfig, add_pooling_layer=True):
super().__init__(config)
self.pad_vocab_size_multiple = getattr(config, "pad_vocab_size_multiple", 1)
if config.vocab_size % self.pad_vocab_size_multiple != 0:
config.vocab_size += self.pad_vocab_size_multiple - (
config.vocab_size % self.pad_vocab_size_multiple
)
self.fused_dropout_add_ln = getattr(config, "fused_dropout_add_ln", False)
if self.fused_dropout_add_ln and layer_norm_fn is None:
raise ImportError("Triton is not installed")
assert config.hidden_act in ["gelu", "gelu_new", "gelu_fast", "gelu_pytorch_tanh"]
self.embeddings = BertEmbeddings(
config.hidden_size,
config.vocab_size,
config.max_position_embeddings,
config.type_vocab_size,
padding_idx=config.pad_token_id,
)
self.emb_drop = nn.Dropout(config.hidden_dropout_prob)
self.emb_ln = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.encoder = BertEncoder(config)
self.pooler = BertPooler(config) if add_pooling_layer else None
self.apply(partial(_init_weights, initializer_range=config.initializer_range))
def forward(
self,
input_ids,
position_ids=None,
token_type_ids=None,
attention_mask=None,
masked_tokens_mask=None,
):
"""If masked_tokens_mask is not None (i.e. last_layer_subset == True in BertForPreTraining),
we only want the output for the masked tokens. This means that we only compute the last
layer output for these tokens.
masked_tokens_mask: (batch, seqlen), dtype=torch.bool
"""
hidden_states = self.embeddings(
input_ids, position_ids=position_ids, token_type_ids=token_type_ids
)
        # TD [2022-12-18]: Don't need to force residual in fp32
# BERT puts embedding LayerNorm before embedding dropout.
if not self.fused_dropout_add_ln:
hidden_states = self.emb_ln(hidden_states)
else:
hidden_states = layer_norm_fn(
hidden_states, self.emb_ln.weight, self.emb_ln.bias, eps=self.emb_ln.eps
)
hidden_states = self.emb_drop(hidden_states)
if masked_tokens_mask is not None:
batch_size, seqlen = input_ids.shape[:2]
# We also need the first column for the CLS token
first_col_mask = torch.zeros(
batch_size, seqlen, dtype=torch.bool, device=input_ids.device
)
first_col_mask[:, 0] = True
subset_mask = masked_tokens_mask | first_col_mask
else:
subset_mask = None
sequence_output = self.encoder(
hidden_states, key_padding_mask=attention_mask, subset_mask=subset_mask
)
if masked_tokens_mask is None:
pooled_output = self.pooler(sequence_output) if self.pooler is not None else None
else:
# TD [2022-03-01]: the indexing here is very tricky.
if attention_mask is not None:
subset_idx = subset_mask[attention_mask]
pool_input = sequence_output[first_col_mask[attention_mask][subset_idx]]
sequence_output = sequence_output[masked_tokens_mask[attention_mask][subset_idx]]
else:
pool_input = sequence_output[first_col_mask[subset_mask]]
sequence_output = sequence_output[masked_tokens_mask[subset_mask]]
pooled_output = self.pooler(pool_input, pool=False) if self.pooler is not None else None
return BaseModelOutputWithPoolingAndCrossAttentions(
last_hidden_state=sequence_output,
pooler_output=pooled_output,
)
class BertForPreTraining(BertPreTrainedModel):
def __init__(self, config: BertConfig):
super().__init__(config)
# If dense_seq_output, we only need to pass the hidden states for the masked out tokens
# (around 15%) to the classifier heads.
self.dense_seq_output = getattr(config, "dense_seq_output", False)
        # If last_layer_subset, we only need to compute the last layer for a subset of tokens
# (e.g., the tokens we need to compute the masked LM loss and the next-sentence prediction).
self.last_layer_subset = getattr(config, "last_layer_subset", False)
if self.last_layer_subset:
assert self.dense_seq_output, "last_layer_subset requires dense_seq_output"
use_xentropy = getattr(config, "use_xentropy", False)
if use_xentropy and CrossEntropyLoss is None:
raise ImportError("xentropy_cuda is not installed")
loss_cls = (
nn.CrossEntropyLoss
if not use_xentropy
else partial(CrossEntropyLoss, inplace_backward=True)
)
self.bert = BertModel(config)
self.cls = BertPreTrainingHeads(config)
self.mlm_loss = loss_cls(ignore_index=0)
self.nsp_loss = loss_cls(ignore_index=-1)
# Initialize weights and apply final processing
self.apply(partial(_init_weights, initializer_range=config.initializer_range))
self.tie_weights()
def tie_weights(self):
self.cls.predictions.decoder.weight = self.bert.embeddings.word_embeddings.weight
def forward(
self,
input_ids,
position_ids=None,
token_type_ids=None,
attention_mask=None,
labels=None,
next_sentence_label=None,
):
"""
If labels are provided, they must be 0 for masked out tokens (as specified in the attention
mask).
Outputs:
if `labels` and `next_sentence_label` are not `None`:
Outputs the total_loss which is the sum of the masked language modeling loss and the next
sentence classification loss.
if `labels` or `next_sentence_label` is `None`:
Outputs a tuple comprising
- the masked language modeling logits of shape [batch_size, sequence_length, vocab_size], and
- the next sentence classification logits of shape [batch_size, 2].
"""
masked_tokens_mask = labels > 0 if (self.last_layer_subset and labels is not None) else None
outputs = self.bert(
input_ids,
position_ids=position_ids,
token_type_ids=token_type_ids,
attention_mask=attention_mask.bool() if attention_mask is not None else None,
masked_tokens_mask=masked_tokens_mask,
)
sequence_output, pooled_output = outputs.last_hidden_state, outputs.pooler_output
if self.dense_seq_output and labels is not None:
masked_token_idx = torch.nonzero(labels.flatten() > 0, as_tuple=False).flatten()
if not self.last_layer_subset:
sequence_output = index_first_axis(
rearrange(sequence_output, "b s d -> (b s) d"), masked_token_idx
)
prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output)
total_loss = None
if labels is not None and next_sentence_label is not None:
if (
self.dense_seq_output and labels is not None
): # prediction_scores are already flattened
masked_lm_loss = self.mlm_loss(
prediction_scores, labels.flatten()[masked_token_idx]
)
else:
masked_lm_loss = self.mlm_loss(
rearrange(prediction_scores, "... v -> (...) v"),
rearrange(labels, "... -> (...)"),
)
next_sentence_loss = self.nsp_loss(
rearrange(seq_relationship_score, "... t -> (...) t"),
rearrange(next_sentence_label, "... -> (...)"),
)
total_loss = masked_lm_loss.float() + next_sentence_loss.float()
return BertForPreTrainingOutput(
loss=total_loss,
prediction_logits=prediction_scores,
seq_relationship_logits=seq_relationship_score,
)
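# Illustrative usage sketch, not part of the original file. from_pretrained (defined on
# BertPreTrainedModel above) downloads a Huggingface checkpoint, remaps its keys with
# remap_state_dict below, and loads them with strict=False. "bert-base-uncased" is just an
# example checkpoint name; loading it requires network access or a local Huggingface cache.
def _example_bert_from_pretrained():
    config = BertConfig.from_pretrained("bert-base-uncased")
    config.use_flash_attn = False  # flash_attn-specific flags are optional extras
    model = BertForPreTraining.from_pretrained("bert-base-uncased", config)
    return model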
def remap_state_dict(state_dict, config: PretrainedConfig):
"""
Map the state_dict of a Huggingface BERT model to be flash_attn compatible.
"""
# LayerNorm
def key_mapping_ln_gamma_beta(key):
key = re.sub(r"LayerNorm.gamma$", "LayerNorm.weight", key)
key = re.sub(r"LayerNorm.beta$", "LayerNorm.bias", key)
return key
state_dict = OrderedDict((key_mapping_ln_gamma_beta(k), v) for k, v in state_dict.items())
# Layers
def key_mapping_layers(key):
return re.sub(r"^bert.encoder.layer.", "bert.encoder.layers.", key)
state_dict = OrderedDict((key_mapping_layers(k), v) for k, v in state_dict.items())
# LayerNorm
def key_mapping_ln(key):
key = re.sub(r"^bert.embeddings.LayerNorm.", "bert.emb_ln.", key)
key = re.sub(
r"^bert.encoder.layers.(\d+).attention.output.LayerNorm.(weight|bias)",
r"bert.encoder.layers.\1.norm1.\2",
key,
)
key = re.sub(
r"^bert.encoder.layers.(\d+).output.LayerNorm.(weight|bias)",
r"bert.encoder.layers.\1.norm2.\2",
key,
)
key = re.sub(
r"^cls.predictions.transform.LayerNorm.(weight|bias)",
r"cls.predictions.transform.layer_norm.\1",
key,
)
return key
state_dict = OrderedDict((key_mapping_ln(k), v) for k, v in state_dict.items())
# MLP
def key_mapping_mlp(key):
key = re.sub(
r"^bert.encoder.layers.(\d+).intermediate.dense.(weight|bias)",
r"bert.encoder.layers.\1.mlp.fc1.\2",
key,
)
key = re.sub(
r"^bert.encoder.layers.(\d+).output.dense.(weight|bias)",
r"bert.encoder.layers.\1.mlp.fc2.\2",
key,
)
return key
state_dict = OrderedDict((key_mapping_mlp(k), v) for k, v in state_dict.items())
# Attention
last_layer_subset = getattr(config, "last_layer_subset", False)
for d in range(config.num_hidden_layers):
Wq = state_dict.pop(f"bert.encoder.layers.{d}.attention.self.query.weight")
Wk = state_dict.pop(f"bert.encoder.layers.{d}.attention.self.key.weight")
Wv = state_dict.pop(f"bert.encoder.layers.{d}.attention.self.value.weight")
bq = state_dict.pop(f"bert.encoder.layers.{d}.attention.self.query.bias")
bk = state_dict.pop(f"bert.encoder.layers.{d}.attention.self.key.bias")
bv = state_dict.pop(f"bert.encoder.layers.{d}.attention.self.value.bias")
if not (last_layer_subset and d == config.num_hidden_layers - 1):
state_dict[f"bert.encoder.layers.{d}.mixer.Wqkv.weight"] = torch.cat(
[Wq, Wk, Wv], dim=0
)
state_dict[f"bert.encoder.layers.{d}.mixer.Wqkv.bias"] = torch.cat([bq, bk, bv], dim=0)
else:
state_dict[f"bert.encoder.layers.{d}.mixer.Wq.weight"] = Wq
state_dict[f"bert.encoder.layers.{d}.mixer.Wkv.weight"] = torch.cat([Wk, Wv], dim=0)
state_dict[f"bert.encoder.layers.{d}.mixer.Wq.bias"] = bq
state_dict[f"bert.encoder.layers.{d}.mixer.Wkv.bias"] = torch.cat([bk, bv], dim=0)
def key_mapping_attn(key):
return re.sub(
r"^bert.encoder.layers.(\d+).attention.output.dense.(weight|bias)",
r"bert.encoder.layers.\1.mixer.out_proj.\2",
key,
)
state_dict = OrderedDict((key_mapping_attn(k), v) for k, v in state_dict.items())
def key_mapping_decoder_bias(key):
return re.sub(r"^cls.predictions.bias", "cls.predictions.decoder.bias", key)
state_dict = OrderedDict((key_mapping_decoder_bias(k), v) for k, v in state_dict.items())
# Word embedding
pad_vocab_size_multiple = getattr(config, "pad_vocab_size_multiple", 1)
if pad_vocab_size_multiple > 1:
word_embeddings = state_dict["bert.embeddings.word_embeddings.weight"]
state_dict["bert.embeddings.word_embeddings.weight"] = F.pad(
word_embeddings, (0, 0, 0, config.vocab_size - word_embeddings.shape[0])
)
decoder_weight = state_dict["cls.predictions.decoder.weight"]
state_dict["cls.predictions.decoder.weight"] = F.pad(
decoder_weight, (0, 0, 0, config.vocab_size - decoder_weight.shape[0])
)
# If the vocab was padded, we want to set the decoder bias for those padded indices to be
# strongly negative (i.e. the decoder shouldn't predict those indices).
# TD [2022-05-09]: I don't think it affects the MLPerf training.
decoder_bias = state_dict["cls.predictions.decoder.bias"]
state_dict["cls.predictions.decoder.bias"] = F.pad(
decoder_bias, (0, config.vocab_size - decoder_bias.shape[0]), value=-100.0
)
return state_dict
def inv_remap_state_dict(state_dict, config: PretrainedConfig):
"""
Map the state_dict of a flash_attn model to be Huggingface BERT compatible.
This function is meant to be the inverse of remap_state_dict.
"""
# Word embedding
pad_vocab_size_multiple = getattr(config, "pad_vocab_size_multiple", 1)
if pad_vocab_size_multiple > 1:
word_embeddings = state_dict["bert.embeddings.word_embeddings.weight"]
decoder_weight = state_dict["cls.predictions.decoder.weight"]
decoder_bias = state_dict["cls.predictions.decoder.bias"]
# unpad embeddings
state_dict["bert.embeddings.word_embeddings.weight"] = word_embeddings[
: config.orig_vocab_size, :
]
state_dict["cls.predictions.decoder.weight"] = decoder_weight[: config.orig_vocab_size, :]
state_dict["cls.predictions.decoder.bias"] = decoder_bias[: config.orig_vocab_size]
for d in range(config.num_hidden_layers):
last_layer_subset = getattr(config, "last_layer_subset", False)
if not last_layer_subset or d != (config.num_hidden_layers - 1):
Wqkv_weights = state_dict.pop(f"bert.encoder.layers.{d}.mixer.Wqkv.weight")
Wqkv_biases = state_dict.pop(f"bert.encoder.layers.{d}.mixer.Wqkv.bias")
state_dict[f"bert.encoder.layers.{d}.attention.self.query.weight"] = Wqkv_weights[
: Wqkv_weights.shape[0] // 3, :
]
state_dict[f"bert.encoder.layers.{d}.attention.self.key.weight"] = Wqkv_weights[
Wqkv_weights.shape[0] // 3 : 2 * Wqkv_weights.shape[0] // 3, :
]
state_dict[f"bert.encoder.layers.{d}.attention.self.value.weight"] = Wqkv_weights[
2 * Wqkv_weights.shape[0] // 3 :, :
]
state_dict[f"bert.encoder.layers.{d}.attention.self.query.bias"] = Wqkv_biases[
: Wqkv_biases.shape[0] // 3
]
state_dict[f"bert.encoder.layers.{d}.attention.self.key.bias"] = Wqkv_biases[
Wqkv_biases.shape[0] // 3 : 2 * Wqkv_biases.shape[0] // 3
]
state_dict[f"bert.encoder.layers.{d}.attention.self.value.bias"] = Wqkv_biases[
2 * Wqkv_biases.shape[0] // 3 :
]
else:
Wq_weight = state_dict.pop(f"bert.encoder.layers.{d}.mixer.Wq.weight")
Wkv_weights = state_dict.pop(f"bert.encoder.layers.{d}.mixer.Wkv.weight")
Wq_bias = state_dict.pop(f"bert.encoder.layers.{d}.mixer.Wq.bias")
Wkv_biases = state_dict.pop(f"bert.encoder.layers.{d}.mixer.Wkv.bias")
state_dict[f"bert.encoder.layers.{d}.attention.self.query.weight"] = Wq_weight
state_dict[f"bert.encoder.layers.{d}.attention.self.key.weight"] = Wkv_weights[
: Wkv_weights.shape[0] // 2, :
]
state_dict[f"bert.encoder.layers.{d}.attention.self.value.weight"] = Wkv_weights[
Wkv_weights.shape[0] // 2 :, :
]
state_dict[f"bert.encoder.layers.{d}.attention.self.query.bias"] = Wq_bias
state_dict[f"bert.encoder.layers.{d}.attention.self.key.bias"] = Wkv_biases[
: Wkv_biases.shape[0] // 2
]
state_dict[f"bert.encoder.layers.{d}.attention.self.value.bias"] = Wkv_biases[
Wkv_biases.shape[0] // 2 :
]
def inv_key_mapping_ln(key):
key = re.sub(r"bert.emb_ln.", "bert.embeddings.LayerNorm.", key)
key = re.sub(
r"bert.encoder.layers.(\d+).norm1.(weight|bias)",
r"bert.encoder.layers.\1.attention.output.LayerNorm.\2",
key,
)
key = re.sub(
r"bert.encoder.layers.(\d+).norm2.(weight|bias)",
r"bert.encoder.layers.\1.output.LayerNorm.\2",
key,
)
key = re.sub(
r"cls.predictions.transform.layer_norm.(weight|bias)",
r"cls.predictions.transform.LayerNorm.\1",
key,
)
return key
def inv_key_mapping_ln_gamma_beta(key):
key = re.sub(r"LayerNorm.weight$", "LayerNorm.gamma", key)
key = re.sub(r"LayerNorm.bias$", "LayerNorm.beta", key)
return key
def inv_key_mapping_layers(key):
return re.sub(r"bert.encoder.layers.", "bert.encoder.layer.", key)
def inv_key_mapping_mlp(key):
key = re.sub(
r"bert.encoder.layer.(\d+).mlp.fc1.(weight|bias)",
r"bert.encoder.layer.\1.intermediate.dense.\2",
key,
)
key = re.sub(
r"bert.encoder.layer.(\d+).mlp.fc2.(weight|bias)",
r"bert.encoder.layer.\1.output.dense.\2",
key,
)
return key
def inv_key_mapping_attn(key):
return re.sub(
r"bert.encoder.layer.(\d+).mixer.out_proj.(weight|bias)",
r"bert.encoder.layer.\1.attention.output.dense.\2",
key,
)
def inv_key_mapping_decoder_bias(key):
return re.sub(r"cls.predictions.decoder.bias", "cls.predictions.bias", key)
state_dict = OrderedDict((inv_key_mapping_ln(key), value) for key, value in state_dict.items())
state_dict = OrderedDict(
(inv_key_mapping_ln_gamma_beta(key), value) for key, value in state_dict.items()
)
state_dict = OrderedDict(
(inv_key_mapping_layers(key), value) for key, value in state_dict.items()
)
state_dict = OrderedDict((inv_key_mapping_mlp(key), value) for key, value in state_dict.items())
state_dict = OrderedDict(
(inv_key_mapping_attn(key), value) for key, value in state_dict.items()
)
state_dict = OrderedDict(
(inv_key_mapping_decoder_bias(key), value) for key, value in state_dict.items()
)
return state_dict
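

# A minimal, hypothetical round-trip sketch for the two BERT converters above. `hf_state_dict`
# and `config` are placeholders supplied by the caller; the config is assumed to carry the
# fields these functions read (e.g. num_hidden_layers, vocab_size, orig_vocab_size,
# pad_vocab_size_multiple, last_layer_subset).
def _example_bert_remap_round_trip(hf_state_dict, config):
    flash_sd = remap_state_dict(hf_state_dict, config)  # Huggingface -> flash_attn layout
    hf_sd_again = inv_remap_state_dict(flash_sd, config)  # flash_attn -> Huggingface layout
    # Up to the fused/unfused Q/K/V packing and the padded vocab rows, the key sets
    # should line up with the original checkpoint again.
    return flash_sd, hf_sd_again
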
import math
import re
from collections import OrderedDict
import torch
import torch.nn.functional as F
from transformers import GPT2Config, GPTBigCodeConfig, PretrainedConfig
def remap_state_dict_hf_bigcode(state_dict, config: PretrainedConfig):
"""
Map the state_dict of a Huggingface BigCode model to be flash_attn compatible.
"""
# Word embedding and position embedding
def key_mapping_pos_emb(key):
return re.sub(r"^transformer.wpe.", "transformer.embeddings.position_embeddings.", key)
state_dict = OrderedDict((key_mapping_pos_emb(k), v) for k, v in state_dict.items())
word_embeddings = state_dict.pop("transformer.wte.weight")
# It's possible that vocab_size is padded to be a multiple of 8, for example.
pad_vocab_size_multiple = getattr(config, "pad_vocab_size_multiple", 1)
vocab_size = math.ceil(config.vocab_size / pad_vocab_size_multiple) * pad_vocab_size_multiple
state_dict["transformer.embeddings.word_embeddings.weight"] = F.pad(
word_embeddings, (0, 0, 0, vocab_size - word_embeddings.shape[0])
)
state_dict["lm_head.weight"] = state_dict["transformer.embeddings.word_embeddings.weight"]
# LayerNorm
def key_mapping_ln(key):
key = re.sub(r"^transformer.ln_f.(weight|bias)", r"transformer.ln_f.\1", key)
key = re.sub(
r"^transformer.h.(\d+).ln_(1|2).(weight|bias)",
r"transformer.layers.\1.norm\2.\3",
key,
)
return key
state_dict = OrderedDict((key_mapping_ln(k), v) for k, v in state_dict.items())
def key_mapping_mlp(key):
key = re.sub(
r"^transformer.h.(\d+).mlp.c_fc.weight",
r"transformer.layers.\1.mlp.fc1.weight",
key,
)
key = re.sub(
r"^transformer.h.(\d+).mlp.c_proj.weight",
r"transformer.layers.\1.mlp.fc2.weight",
key,
)
key = re.sub(
r"^transformer.h.(\d+).mlp.c_fc.bias",
r"transformer.layers.\1.mlp.fc1.bias",
key,
)
key = re.sub(
r"^transformer.h.(\d+).mlp.c_proj.bias",
r"transformer.layers.\1.mlp.fc2.bias",
key,
)
return key
state_dict = OrderedDict((key_mapping_mlp(k), v) for k, v in state_dict.items())
# TODO: add support for multi-head attention
assert config.multi_query, "Only multi-query attention is supported"
# Attention
for d in range(config.num_hidden_layers):
embed_dim = config.n_embd
head_dim = embed_dim // config.n_head
c_attn_weight = state_dict.pop(f"transformer.h.{d}.attn.c_attn.weight")
        # With multi-query attention, c_attn projects to embed_dim + 2 * head_dim features
        # (a single shared k/v head), so its weight tensor has shape
        # ((n_head + 2) * head_dim, embed_dim).
        # see https://github.com/huggingface/transformers/blob/95b374952dc27d8511541d6f5a4e22c9ec11fb24/src/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py#L112
        # see also https://github.com/ggerganov/ggml/blob/dd1d575956e54c5bdc07632f25506b3b1884dbd2/examples/starcoder/convert-hf-to-ggml.py#L183
        # ((n_head + 2) * head_dim, embed_dim) -> (3 * n_head * head_dim, embed_dim)
q, k, v = torch.split(c_attn_weight, [embed_dim, head_dim, head_dim], dim=0)
        # duplicate k, v along the first axis: (head_dim, embed_dim) -> (n_head * head_dim, embed_dim)
k = torch.tile(k, (config.n_head, 1))
v = torch.tile(v, (config.n_head, 1))
state_dict[f"transformer.layers.{d}.mixer.Wqkv.weight"] = torch.cat((q, k, v), dim=0)
# same deal with the bias
c_attn_bias = state_dict.pop(f"transformer.h.{d}.attn.c_attn.bias")
        # the bias is 1D: ((n_head + 2) * head_dim,) -> (3 * n_head * head_dim,)
        q, k, v = torch.split(c_attn_bias, [embed_dim, head_dim, head_dim], dim=0)
        # duplicate k, v along the first axis: (head_dim,) -> (n_head * head_dim,)
k = torch.tile(k, (config.n_head,))
v = torch.tile(v, (config.n_head,))
state_dict[f"transformer.layers.{d}.mixer.Wqkv.bias"] = torch.cat((q, k, v), dim=0)
def key_mapping_attn(key):
key = re.sub(
r"^transformer.h.(\d+).attn.c_proj.weight",
r"transformer.layers.\1.mixer.out_proj.weight",
key,
)
key = re.sub(
r"^transformer.h.(\d+).attn.c_proj.bias",
r"transformer.layers.\1.mixer.out_proj.bias",
key,
)
return key
state_dict = OrderedDict((key_mapping_attn(k), v) for k, v in state_dict.items())
return state_dict
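

# Shape sketch for the multi-query -> packed-QKV expansion performed in the loop above.
# Purely illustrative helper with made-up sizes, not part of the original module.
def _example_mqa_expand_sketch():
    n_head, head_dim = 4, 8
    embed_dim = n_head * head_dim  # 32
    # HF BigCode c_attn weight: (embed_dim + 2 * head_dim, embed_dim), i.e. one shared k/v head.
    c_attn_weight = torch.randn(embed_dim + 2 * head_dim, embed_dim)
    q, k, v = torch.split(c_attn_weight, [embed_dim, head_dim, head_dim], dim=0)
    # Replicate the single k/v head across all n_head heads.
    k = torch.tile(k, (n_head, 1))
    v = torch.tile(v, (n_head, 1))
    Wqkv = torch.cat((q, k, v), dim=0)
    assert Wqkv.shape == (3 * embed_dim, embed_dim)
    return Wqkv
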
def inv_remap_state_dict_hf_bigcode(state_dict, config: PretrainedConfig):
"""
Map the state_dict of a flash_attn model to be Huggingface BigCode compatible.
This function is meant to be the inverse of remap_state_dict_hf_bigcode.
"""
# Word embedding and position embeddings
def inv_key_mapping_pos_emb(key):
return re.sub(r"^transformer.embeddings.position_embeddings.", "transformer.wpe.", key)
state_dict = OrderedDict((inv_key_mapping_pos_emb(k), v) for k, v in state_dict.items())
word_embeddings = state_dict.pop("transformer.embeddings.word_embeddings.weight")
    # Unpad the vocab (row) dimension back to the original vocab_size.
    word_embeddings = word_embeddings[: config.vocab_size, :]
state_dict["transformer.wte.weight"] = word_embeddings
state_dict["lm_head.weight"] = word_embeddings
# LayerNorm
def inv_key_mapping_ln(key):
key = re.sub(r"^transformer.ln_f.(weight|bias)", r"transformer.ln_f.\1", key)
key = re.sub(
r"^transformer.layers.(\d+).norm(1|2).(weight|bias)",
r"transformer.h.\1.ln_\2.\3",
key,
)
return key
state_dict = OrderedDict((inv_key_mapping_ln(k), v) for k, v in state_dict.items())
# MLPs
def inv_key_mapping_mlp(key):
key = re.sub(
r"^transformer.layers.(\d+).mlp.fc1.weight",
r"transformer.h.\1.mlp.c_fc.weight",
key,
)
key = re.sub(
r"^transformer.layers.(\d+).mlp.fc2.weight",
r"transformer.h.\1.mlp.c_proj.weight",
key,
)
key = re.sub(
r"^transformer.layers.(\d+).mlp.fc1.bias",
r"transformer.h.\1.mlp.c_fc.bias",
key,
)
key = re.sub(
r"^transformer.layers.(\d+).mlp.fc2.bias",
r"transformer.h.\1.mlp.c_proj.bias",
key,
)
return key
state_dict = OrderedDict((inv_key_mapping_mlp(k), v) for k, v in state_dict.items())
# Attention
for d in range(config.num_hidden_layers):
embed_dim = config.n_embd
head_dim = embed_dim // config.n_head
Wqkv_weight = state_dict.pop(f"transformer.layers.{d}.mixer.Wqkv.weight")
q, k, v = torch.split(
Wqkv_weight, [embed_dim, head_dim * config.n_head, head_dim * config.n_head], dim=0
)
c_attn_weight = torch.cat((q, k[:head_dim], v[:head_dim]), dim=0)
state_dict[f"transformer.h.{d}.attn.c_attn.weight"] = c_attn_weight
# Same deal with the bias
Wqkv_bias = state_dict.pop(f"transformer.layers.{d}.mixer.Wqkv.bias")
q, k, v = torch.split(
Wqkv_bias, [embed_dim, head_dim * config.n_head, head_dim * config.n_head], dim=0
)
c_attn_bias = torch.cat((q, k[:head_dim], v[:head_dim]), dim=0)
state_dict[f"transformer.h.{d}.attn.c_attn.bias"] = c_attn_bias
def inv_key_mapping_attn(key):
key = re.sub(
r"^transformer.layers.(\d+).mixer.out_proj.weight",
r"transformer.h.\1.attn.c_proj.weight",
key,
)
key = re.sub(
r"^transformer.layers.(\d+).mixer.out_proj.bias",
r"transformer.h.\1.attn.c_proj.bias",
key,
)
return key
state_dict = OrderedDict((inv_key_mapping_attn(k), v) for k, v in state_dict.items())
return state_dict
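

# Round-trip sketch for the attention projection only: expanding the shared k/v head (as in
# remap_state_dict_hf_bigcode) and then keeping just the first head_dim rows (as in
# inv_remap_state_dict_hf_bigcode) recovers the original c_attn weight exactly, because all
# tiled copies are identical. Illustrative helper, not part of the original module.
def _example_mqa_round_trip_sketch():
    n_head, head_dim = 4, 8
    embed_dim = n_head * head_dim
    c_attn = torch.randn(embed_dim + 2 * head_dim, embed_dim)
    q, k, v = torch.split(c_attn, [embed_dim, head_dim, head_dim], dim=0)
    Wqkv = torch.cat((q, torch.tile(k, (n_head, 1)), torch.tile(v, (n_head, 1))), dim=0)
    q2, k2, v2 = torch.split(Wqkv, [embed_dim, n_head * head_dim, n_head * head_dim], dim=0)
    c_attn_back = torch.cat((q2, k2[:head_dim], v2[:head_dim]), dim=0)
    assert torch.equal(c_attn, c_attn_back)
    return c_attn_back
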
def bigcode_config_to_gpt2_config(bigcode_config: GPTBigCodeConfig) -> GPT2Config:
return GPT2Config(
activation_function=bigcode_config.activation_function,
attn_pdrop=bigcode_config.attn_pdrop,
bos_token_id=bigcode_config.bos_token_id,
embd_pdrop=bigcode_config.embd_pdrop,
eos_token_id=bigcode_config.eos_token_id,
initializer_range=bigcode_config.initializer_range,
layer_norm_epsilon=bigcode_config.layer_norm_epsilon,
max_batch_size=bigcode_config.max_batch_size,
max_sequence_length=bigcode_config.max_sequence_length,
model_type=bigcode_config.model_type,
multi_query=bigcode_config.multi_query,
n_embd=bigcode_config.n_embd,
n_head=bigcode_config.n_head,
n_inner=bigcode_config.n_inner,
n_layer=bigcode_config.n_layer,
n_positions=bigcode_config.n_positions,
resid_pdrop=bigcode_config.resid_pdrop,
scale_attn_weights=bigcode_config.scale_attn_weights,
summary_activation=bigcode_config.summary_activation,
summary_first_dropout=bigcode_config.summary_first_dropout,
summary_proj_to_labels=bigcode_config.summary_proj_to_labels,
summary_type=bigcode_config.summary_type,
summary_use_proj=bigcode_config.summary_use_proj,
use_cache=bigcode_config.use_cache,
vocab_size=bigcode_config.vocab_size,
)
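

# Hypothetical end-to-end usage sketch: load a Huggingface BigCode checkpoint, convert its
# config into the GPT2Config layout consumed by the flash_attn GPT model, and remap the
# weights to match. The checkpoint name is a placeholder, and the conversion assumes the
# BigCode config exposes every attribute read by bigcode_config_to_gpt2_config above.
def _example_load_bigcode_as_gpt2(model_name="bigcode/starcoderbase-1b"):
    from transformers import AutoConfig, AutoModelForCausalLM

    bigcode_config = AutoConfig.from_pretrained(model_name)
    gpt2_config = bigcode_config_to_gpt2_config(bigcode_config)
    hf_state_dict = AutoModelForCausalLM.from_pretrained(model_name).state_dict()
    flash_state_dict = remap_state_dict_hf_bigcode(hf_state_dict, gpt2_config)
    return gpt2_config, flash_state_dict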