Commit 4f285b35 authored by Tri Dao

FlashAttention-2 release

parent 6d48e14a
/* Copyright (c) 2022, Tri Dao.
*/
#pragma once
#include "fmha_fprop_kernel_1xN.h"
#include "fmha_kernel.h"
#include "fmha_blockmask.h"
#include <fmha/kernel_traits.h>
#include <fmha/gemm.h>
namespace fmha {
////////////////////////////////////////////////////////////////////////////////////////////////////
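// Note (inferred from how this helper is used below): for each row of the dO / O tiles this
// computes D = rowsum(dO * O), i.e. the elementwise product summed over the head dimension,
// reduced across the warp and staged in shared memory. This is the "dp_sum" term of the
// attention backward pass, which enters as dS = P * (dP - D) elementwise.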
template <typename Smem_dp_sum, int M>
inline __device__ void dot_do_o(float (&sum)[M], const uint4 (&do_)[M], const uint4 (&o)[M],
Smem_dp_sum smem, const int buffer_idx) {
#pragma unroll
for (int mi = 0; mi < M; ++mi) {
sum[mi] = smem.reduce_warp(fmha::hmulsum8<__half>(do_[mi], o[mi]));
}
static_assert(M == 1);
smem.store(sum[0], buffer_idx);
}
////////////////////////////////////////////////////////////////////////////////////////////////////
template<typename Kernel_traits, bool Is_dropout, bool Is_causal, bool Is_first, bool Is_last, typename Params, typename Prng>
inline __device__ void compute_block_dq_dk_dv_1xN_one_iter(const Params &params, Prng &ph,
const int loop_step_idx) {
// The description of the CTA tile for the 1st batched GEMM.
using Cta_tile_p = typename Kernel_traits::Cta_tile_p;
// The description of the CTA tile for the 2nd batched GEMM.
using Cta_tile_dq = typename Kernel_traits::Cta_tile_o;
// The description of the CTA tile for the 3rd batched GEMM.
using Cta_tile_dkv =
fmha::Cta_tile_extd<Cta_tile_p::N, Cta_tile_p::K, Cta_tile_p::M, Cta_tile_p::WARPS_N, 1, Cta_tile_p::WARPS_M>;
static_assert(Cta_tile_dkv::M == 512 || Cta_tile_dkv::M == 256 || Cta_tile_dkv::M == 128);
static_assert(Cta_tile_dkv::N == 16 || Cta_tile_dkv::N == 32 || Cta_tile_dkv::N == 64);
static_assert(Cta_tile_dkv::K == 16);
// The MMA tile for the 1st GEMM.
using Mma_tile_p = fmha::Hmma_tile<Cta_tile_p>;
// The MMA tile for the 2nd GEMM.
using Mma_tile_dq = fmha::Hmma_tile<Cta_tile_dq>;
// The MMA tile for the 3rd GEMM.
using Mma_tile_dkv = fmha::Hmma_tile<Cta_tile_dkv>;
// The global memory tile to load Q.
using Gmem_tile_q = typename Kernel_traits::Gmem_tile_q;
// The shared memory tile to reload Q transposed.
using Smem_tile_qt = fmha::Smem_tile_b<Cta_tile_dkv, fmha::Row, Gmem_tile_q::BYTES_PER_LDG, 2>;
// The global memory tile to load K.
using Gmem_tile_k = typename Kernel_traits::Gmem_tile_k;
// The shared memory tile to swizzle K^T. Treat K^T as V
using Smem_tile_kt = typename Kernel_traits::Smem_tile_v;
// Treating V as K: we need to use Kernel_traits::Smem_tile_k, otherwise the loads will be wrong.
// The global memory tile to load V.
using Gmem_tile_v = typename Kernel_traits::Gmem_tile_k;
// The shared memory tile to swizzle V.
using Smem_tile_v = typename Kernel_traits::Smem_tile_k;
// The global memory tile to load dO.
using Gmem_tile_do = typename Kernel_traits::Gmem_tile_do;
// The shared memory tile to load dO.
// Treating dO as Q.
using Smem_tile_do = typename Kernel_traits::Smem_tile_q;
// The shared memory tile to reload dO transposed.
using Smem_tile_dot = fmha::Smem_tile_b<Cta_tile_dkv, fmha::Row, Gmem_tile_q::BYTES_PER_LDG, 2>;
// The global memory tile to load O. Loading O here is similar to loading dO.
using Gmem_tile_o = Gmem_tile_do;
// The global memory tile to store dQ.
using Gmem_tile_dq = typename Kernel_traits::Gmem_tile_o;
using Gmem_tile_dq_tmp = fmha::Gmem_tile_o<Cta_tile_dq, 4>;
// The shared memory tile to swizzle dQ.
using Smem_tile_dq = typename Kernel_traits::Smem_tile_o;
// The global memory tile to store dV.
using Gmem_tile_dv = typename Kernel_traits::Gmem_tile_v;
// The shared memory tile to swizzle dV.
using Smem_tile_dv = fmha::Smem_tile_mma_epilogue<Cta_tile_dkv>;
// The global memory tile to store dK.
using Gmem_tile_dk = typename Kernel_traits::Gmem_tile_v;
// The shared memory tile to swizzle dK.
using Smem_tile_dk = fmha::Smem_tile_mma_epilogue<Cta_tile_dkv>;
static_assert(Smem_tile_dk::NUM_LDS == Gmem_tile_dk::LDGS);
static_assert(Smem_tile_dk::THREADS_PER_ROW == Gmem_tile_dk::THREADS_PER_ROW);
using Gmem_tile_s = typename Kernel_traits::Gmem_tile_s;
using Smem_tile_st = typename Kernel_traits::Smem_tile_st;
using Gmem_softmax_sum = typename Kernel_traits::Gmem_softmax_sum;
using Smem_dp_sum = typename Kernel_traits::Smem_dp_sum;
// using Gemm1 = Gemm_Q_K<Kernel_traits, Kernel_traits::K_IN_REGS>;
using Gemm1 = Gemm_Q_K<Kernel_traits, /*K_in_regs=*/false>;
using Softmax = fmha::Softmax<Cta_tile_p, Kernel_traits>;
// Shared memory.
extern __shared__ char smem_[];
// Shared memory layout if we keep V in registers:
// dO | Q | K / V | dQ | S | dP | dP_sum
// dV | dK
// Shared memory layout if we keep V in shared memory:
// dO | Q | K | V | dQ | S | dP | dP_sum
// dV | dK
// The block index for the batch.
const int bidb = blockIdx.x;
// The block index for the head.
const int bidh = blockIdx.y;
// The thread index.
const int tidx = threadIdx.x;
const BlockInfoPadded<Kernel_traits::THREADS> binfo(params, bidb, bidh, tidx);
// if( binfo.stop_early() ) return;
if( binfo.stop_early(loop_step_idx * Cta_tile_p::N) ) return;
Blockmask blockmask(params, loop_step_idx);
int block_row_idx = 0;
int mask_val = blockmask.mask_val(0);
if (mask_val == -1) return;
// if ((threadIdx.x == 0) && (blockIdx.x == 0) && (blockIdx.y == 0)) {
// printf("mask_val = %d.\n", mask_val);
// }
Gemm1 gemm_q_k(&smem_[Smem_tile_do::BYTES_PER_TILE], tidx);
// Allocate the global memory tile loader for Q.
Gmem_tile_q gmem_q(params.q_ptr, params.q_row_stride_in_elts, params.q_head_stride_in_elts,
params.d, binfo, tidx, true);
// Allocate the global memory tile loader for dQ.
Gmem_tile_dq gmem_dq(params.dq_ptr, params.dq_row_stride_in_elts, params.dq_head_stride_in_elts,
params.d, binfo, tidx);
Gmem_tile_dq_tmp gmem_dq_tmp(params.o_tmp_ptr, params.o_row_stride_in_elts, params.o_head_stride_in_elts,
params.d, binfo, tidx);
// Allocate the global memory tile loader for S.
Gmem_tile_s gmem_s(params, binfo, tidx);
fmha::Mask<Cta_tile_p, Is_causal> mask(binfo, tidx, loop_step_idx);
// Allocate the global memory tile loader for K.
Gmem_tile_k gmem_k(params.k_ptr, params.k_row_stride_in_elts, params.k_head_stride_in_elts,
params.d, binfo, tidx, false);
// Allocate the global memory tile loader for V.
Gmem_tile_v gmem_v(params.v_ptr, params.v_row_stride_in_elts, params.v_head_stride_in_elts,
params.d, binfo, tidx, false);
// The base pointer of smem_v.
char *smem_v_ = &smem_[Smem_tile_do::BYTES_PER_TILE + Gemm1::SMEM_OFFSET_V];
// Allocate the shared memory tile loader for V. We use the same as K so be careful!!!
Smem_tile_v smem_v(smem_v_, tidx);
// Allocate the shared memory tile loader for K^T. We use the same as K so be careful!!!
Smem_tile_kt smem_kt(&smem_[Smem_tile_do::BYTES_PER_TILE + Gemm1::Smem_tile_q::BYTES_PER_TILE], tidx);
// Allocate the global memory tile loader for dO.
Gmem_tile_do gmem_do(params.do_ptr, params.o_row_stride_in_elts, params.o_head_stride_in_elts,
params.d, binfo, tidx, true);
// Allocate the shared memory tile loader for dO.
Smem_tile_do smem_do(&smem_[0], tidx);
Smem_tile_dot smem_dot(&smem_[0], tidx);
// Allocate the shared memory tile loader for Q^T.
// TODO: assert that this points to the same memory as gemm_q_k.smem_q
Smem_tile_qt smem_qt(&smem_[Smem_tile_do::BYTES_PER_TILE], tidx);
Smem_tile_st smem_s(&smem_[Smem_tile_do::BYTES_PER_TILE + Gemm1::SMEM_OFFSET_O + Smem_tile_dq::BYTES_PER_TILE], tidx);
Smem_tile_st smem_dp(&smem_[Smem_tile_do::BYTES_PER_TILE + Gemm1::SMEM_OFFSET_O + Smem_tile_dq::BYTES_PER_TILE + Smem_tile_st::BYTES_PER_TILE], tidx);
// Allocate the global memory tile loader for O.
Gmem_tile_o gmem_o(params.o_ptr, params.o_row_stride_in_elts, params.o_head_stride_in_elts,
params.d, binfo, tidx, true);
// Allocate the shared memory tile loader for O. We use the same as K so be careful!!!
Smem_tile_dq smem_dq(&smem_[Smem_tile_do::BYTES_PER_TILE + Gemm1::SMEM_OFFSET_O], tidx);
Gmem_softmax_sum gmem_softmax_lse(params.softmax_lse_ptr, params, tidx);
Gmem_softmax_sum gmem_softmax_d(params.dsoftmax_sum, params, tidx);
static_assert(Cta_tile_p::N % Cta_tile_p::M == 0);
const int steps = (params.seqlen_q + Cta_tile_p::M - 1) / Cta_tile_p::M;
// Wind gmem tiles to the correct position.
int block_row_idx_next = mask_val / 4;
int block_row_idx_to_move = block_row_idx_next - block_row_idx;
block_row_idx = block_row_idx_next;
gmem_q.move(block_row_idx_to_move);
gmem_do.move(block_row_idx_to_move);
gmem_o.move(block_row_idx_to_move);
gmem_dq.move(block_row_idx_to_move);
gmem_dq_tmp.move(block_row_idx_to_move);
// TODO: need to move gmem_s if we want the intermediate result for debugging
gmem_softmax_lse.move(block_row_idx_to_move);
gmem_softmax_d.move(block_row_idx_to_move);
block_row_idx = block_row_idx_next;
if (!Is_first) {
gmem_k.move(loop_step_idx);
gmem_v.move(loop_step_idx);
}
// Trigger the loads for K.
gmem_k.load();
// Trigger the loads for Q.
gmem_q.load();
// Trigger the loads for V.
gmem_v.load();
// Trigger the loads for dO.
gmem_do.load();
// Trigger the loads for O.
// if (Is_first) { gmem_o.load(); }
// if (true) { gmem_o.load(); }
if (Is_first || mask_val % 2 == 1) { gmem_o.load(); }
float p_lse[Mma_tile_p::MMAS_M * 2];
gmem_softmax_lse.load(reinterpret_cast<uint32_t(&)[Mma_tile_p::MMAS_M * 2]>(p_lse));
float dp_sum[Mma_tile_p::MMAS_M * 2];
// if (!Is_first) {
// if (false) {
if (!(Is_first || mask_val % 2 == 1)) {
gmem_softmax_d.load(reinterpret_cast<uint32_t(&)[Mma_tile_p::MMAS_M * 2]>(dp_sum));
}
float dp_sum_regs[Gmem_tile_do::LDGS];
Smem_dp_sum smem_dp_sum(reinterpret_cast<float *>(&smem_[Smem_tile_do::BYTES_PER_TILE + Gemm1::SMEM_OFFSET_O + Smem_tile_dq::BYTES_PER_TILE + Smem_tile_st::BYTES_PER_TILE * 2]), tidx);
if (!Is_first) { __syncthreads(); }
// Commit the data for Q, dO, and V to shared memory.
gmem_q.commit(gemm_q_k.smem_q);
gmem_do.commit(smem_do);
// if (Is_first) {
// if (true) {
if (Is_first || mask_val % 2 == 1) {
dot_do_o(dp_sum_regs, gmem_do.fetch_, gmem_o.fetch_, smem_dp_sum, 0);
const int dp_sum_row = tidx / Smem_dp_sum::THREADS_PER_ROW;
if ((dp_sum_row < Smem_dp_sum::ROWS) && (tidx % Smem_dp_sum::THREADS_PER_ROW == 0)) {
gmem_softmax_d.store_row(reinterpret_cast<uint32_t(&)[Gmem_tile_do::LDGS]>(dp_sum_regs), dp_sum_row);
}
}
// Instead of scaling dP by rp_dropout, we scale V.
if (Is_dropout) {
const uint32_t scale_dropout = params.scale_dropout;
#pragma unroll
for(int it=0; it < Gmem_tile_v::LDGS; it++){
gmem_v.fetch_[it] = fmha::hmul8(scale_dropout, gmem_v.fetch_[it]);
}
}
gmem_v.commit(smem_v);
// const uint32_t scale_bmm1 = reinterpret_cast<const uint32_t&>(params.scale_bmm1);
// #pragma unroll
// for(int it=0; it < Gmem_tile_k::LDGS; it++){
// gmem_k.fetch_[it] = fmha::hmul8(scale_bmm1, gmem_k.fetch_[it]);
// }
// Commit the data for K to shared memory.
if( !Kernel_traits::SHARE_SMEM_FOR_K_AND_V ) {
gmem_k.commit(gemm_q_k.smem_k);
}
__syncthreads();
// Load the fragments for Q.
gemm_q_k.load_q();
// Load the fragments for V. If V_IN_REGS, we keep the data in registers for the entire kernel.
typename Smem_tile_v::Fragment frag_v[Kernel_traits::V_IN_REGS ? Mma_tile_p::MMAS_K : 2][Mma_tile_p::MMAS_N];
if (Kernel_traits::V_IN_REGS) {
#pragma unroll
for( int ki = 0; ki < Mma_tile_p::MMAS_K; ++ki ) {
smem_v.load(frag_v[ki], ki);
}
}
// Commit the data for V to shared memory if it has not been done already.
if( Kernel_traits::SHARE_SMEM_FOR_K_AND_V ) {
// Make sure we are done loading the fragments for K.
__syncthreads();
// Commit the data to shared memory for V.
gmem_k.commit(gemm_q_k.smem_k);
// Make sure the data is in shared memory.
__syncthreads();
}
// Load the fragments for K.
gemm_q_k.load_k();
// Load the fragments for K^T.
// typename Smem_tile_kt::Fragment frag_kt[2][Mma_tile_dq::MMAS_N];
// smem_kt.load(frag_kt[0], 0);
// typename Smem_tile_kt::Fragment frag_kt[Mma_tile_dq::MMAS_K][Mma_tile_dq::MMAS_N];
// #pragma unroll
// for( int ki = 0; ki < Mma_tile_dq::MMAS_K; ++ki ) {
// smem_kt.load(frag_kt[ki], ki);
// }
// Create the object to do the softmax.
// We won't be using the shared memory for this softmax at all
Softmax softmax(params, smem_, tidx);
// Declare the accumulators for the 3rd gemm.
fmha::Fragment_accumulator acc_dv[Mma_tile_dkv::MMAS_M][Mma_tile_dkv::MMAS_N];
fmha::Clear_accumulator<fmha::Accumulator_type, Cta_tile_dkv::WARPS_K>::apply(acc_dv);
fmha::Fragment_accumulator acc_dk[Mma_tile_dkv::MMAS_M][Mma_tile_dkv::MMAS_N];
fmha::Clear_accumulator<fmha::Accumulator_type, Cta_tile_dkv::WARPS_K>::apply(acc_dk);
// Load over the entire sequence length.
for( int l = 0; l < steps; l++ ) {
// if ((threadIdx.x == 0) && (blockIdx.x == 0) && (blockIdx.y == 0)) {
// printf("block_row_idx = %d\n", block_row_idx);
// }
if (block_row_idx * Cta_tile_p::M >= binfo.actual_seqlen_q) break;
int mask_val_next = l < steps - 1 ? blockmask.mask_val(l + 1) : -1;
// if ((threadIdx.x == 0) && (blockIdx.x == 0) && (blockIdx.y == 0)) {
// printf("mask_val = %d, mask_val_next = %d\n", mask_val, mask_val_next);
// }
// Load the fragments for V.
// typename Smem_tile_v::Fragment frag_v[2][Mma_tile_p::MMAS_N];
if (!Kernel_traits::V_IN_REGS) { smem_v.load(frag_v[0], 0); }
// Load the fragments for dO.
typename Smem_tile_do::Fragment frag_do[2][Mma_tile_p::MMAS_M];
smem_do.load(frag_do[0], 0);
// Declare the accumulators for the 1st gemm.
fmha::Fragment_accumulator acc_p[Mma_tile_p::MMAS_M][Mma_tile_p::MMAS_N];
fmha::Clear_accumulator<typename fmha::Accumulator_type, Cta_tile_p::WARPS_K>::apply(acc_p);
// Do this part of P^T = (Q * K^T)^T.
gemm_q_k(acc_p);
// Load the mask for that iteration.
mask.load(block_row_idx);
// Convert from the accumulator type to FP32 for Softmax.
softmax.unpack_noscale(acc_p);
// Apply the mask.
softmax.apply_mask(mask);
// Scale by log-sum-exp of the softmax
// softmax.apply_exp(p_lse);
softmax.template scale_apply_exp</*scale_max=*/false>(p_lse, params.scale_bmm1f);
if (Is_dropout) {
// softmax.apply_dropout(ph, params.p_dropout_in_uint);
// softmax.template apply_dropout</*encode_dropout_in_sign_bit=*/true>(ph, params.p_dropout_in_uint);
softmax.template apply_dropout_16bits</*encode_dropout_in_sign_bit=*/true>(ph, params.p_dropout_in_uint16_t);
}
using Frag_p = fmha::Fragment_a<fmha::Row>;
Frag_p frag_p[Mma_tile_dq::MMAS_K][Mma_tile_dq::MMAS_M];
static_assert(Mma_tile_dq::MMAS_M == Mma_tile_p::MMAS_M);
static_assert(Mma_tile_dq::MMAS_K == Mma_tile_p::MMAS_N);
softmax.template pack<__half>(frag_p);
// Store s * dmask to smem for transpose
smem_s.store(frag_p);
// Trigger the load for the next Q values.
bool not_last_iter = (l < steps - 1) && (mask_val_next != -1);
block_row_idx_next = mask_val_next / 4;
int block_row_idx_to_move = block_row_idx_next - block_row_idx;
if (not_last_iter) {
gemm_q_k.smem_q.move_to_next_write_buffer();
gmem_q.move(block_row_idx_to_move);
gmem_q.load();
}
// if( Kernel_traits::SHARE_SMEM_FOR_K_AND_V && l == 0 ) {
// // if we share K and V, it could be that V was not fully read yet but we write into smem for reduction
// __syncthreads();
// }
bool is_first_read = Is_first || mask_val % 2 == 1;
// TD [2022-04-24]: if Is_first, then it's faster to set acc_dp to zero and then subtract dp_sum
// later. If !Is_first, then it's faster to set acc_dp to -dp_sum and not subtract later.
// This is because loading dp_sum earlier uses more registers.
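// Either way, the quantity carried into the softmax registers is dP - dp_sum (the subtraction
// happens either through this initialization or through subtract_dp_sum after the unpack),
// i.e. the (dP - D) factor of dS = P * (dP - D). (Note: inferred from the surrounding code.)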
fmha::Fragment_accumulator acc_dp[Mma_tile_p::MMAS_M][Mma_tile_p::MMAS_N];
// if (Is_first) {
// if (true) {
if (is_first_read) {
fmha::Clear_accumulator<fmha::Accumulator_type, Cta_tile_p::WARPS_K>::apply(acc_dp);
} else {
#pragma unroll
for (int mi = 0; mi < Mma_tile_p::MMAS_M; ++mi) {
#pragma unroll
for (int ni = 0; ni < Mma_tile_p::MMAS_N; ++ni) {
#pragma unroll
for (int ii = 0; ii < 8; ++ii) {
acc_dp[mi][ni].elt(ii) = -dp_sum[mi * 2 + ((ii / 2) % 2)];
}
}
}
}
// Do this part of dP^T = (dO * V^T)^T.
#pragma unroll
for( int ki = 1; ki < Mma_tile_p::MMAS_K; ++ki ) {
// Trigger the load from shared memory for the next series of dO values.
smem_do.load(frag_do[ki & 1], ki);
if (!Kernel_traits::V_IN_REGS) {
smem_v.load(frag_v[ki & 1], ki);
fmha::gemm_cl<__half>(acc_dp, frag_do[(ki - 1) & 1], frag_v[(ki - 1) & 1]);
} else {
fmha::gemm_cl<__half>(acc_dp, frag_do[(ki - 1) & 1], frag_v[ki - 1]);
}
// if ((threadIdx.x == 0) && (blockIdx.x == 0) && (blockIdx.y == 0) && (l < 4)) {
// float2 tmp = __half22float2(reinterpret_cast<__half2 &>(frag_do[(ki - 1) & 1]));
// printf("frag_do=%.6f, %.6f\n", tmp.x, tmp.y);
// tmp = __half22float2(reinterpret_cast<__half2 &>(frag_v[(ki - 1) & 1]));
// printf("frag_v=%.6f, %.6f\n", tmp.x, tmp.y);
// }
}
// Do the final stage of math.
{
int ki = Mma_tile_p::MMAS_K;
if (!Kernel_traits::V_IN_REGS) {
fmha::gemm_cl<__half>(acc_dp, frag_do[(ki - 1) & 1], frag_v[(ki - 1) & 1]);
} else {
fmha::gemm_cl<__half>(acc_dp, frag_do[(ki - 1) & 1], frag_v[(ki - 1)]);
}
}
// Load the fragments for K^T.
typename Smem_tile_kt::Fragment frag_kt[2][Mma_tile_dq::MMAS_N];
smem_kt.load(frag_kt[0], 0);
// if (Is_first) {
// if (true) {
if (is_first_read) {
const int quad = (tidx % Cta_tile_p::THREADS_PER_WARP) / 4;
const int row[2] = {quad, quad + 8};
smem_dp_sum.load(dp_sum, row, l % 2);
}
// Trigger the load for the next dO values.
if (not_last_iter) {
smem_do.move_to_next_write_buffer();
gmem_do.move(block_row_idx_to_move);
gmem_do.load();
gmem_o.move(block_row_idx_to_move);
// if (Is_first) {
// if (true) {
if (Is_first || mask_val_next % 2 == 1) {
gmem_o.load();
}
}
softmax.unpack_noscale(acc_dp);
// // TD [2022-04-01]: Don't need to apply mask since the corresponding value in softmax
// // will be zero.
// for (int mi = 0; mi < Mma_tile_p::MMAS_M * 2; mi++) { dp_sum[mi] *= params.p_dropout; }
// if (Is_first) { softmax.subtract_dp_sum(dp_sum); }
// if (true) { softmax.subtract_dp_sum(dp_sum); }
if (is_first_read) { softmax.subtract_dp_sum(dp_sum); }
Frag_p frag_dp[Mma_tile_dq::MMAS_K][Mma_tile_dq::MMAS_M];
softmax.template pack<__half>(frag_dp);
if (!Is_dropout) {
#pragma unroll
for( int mi = 0; mi < Mma_tile_p::MMAS_M; mi++ ) {
#pragma unroll
for( int ni = 0; ni < Mma_tile_p::MMAS_N; ni++ ) {
frag_p[mi][ni].hmul(frag_dp[mi][ni]);
}
}
} else {
__half2 dp_sum_half[Mma_tile_p::MMAS_M * 2];
for (int mi = 0; mi < Mma_tile_p::MMAS_M * 2; mi++) {
dp_sum_half[mi] = __float2half2_rn(dp_sum[mi]);
}
const __half zero_h = __half(0.f);
#pragma unroll
for( int mi = 0; mi < Mma_tile_p::MMAS_M; mi++ ) {
#pragma unroll
for( int ni = 0; ni < Mma_tile_p::MMAS_N; ni++ ) {
#pragma unroll
for (int ii = 0; ii < 4; ++ii) {
const __half2 p = frag_p[mi][ni].template elt_as<__half2>(ii);
const __half2 pdp = __hmul2(p, frag_dp[mi][ni].template elt_as<__half2>(ii));
// If this element is dropped, then frag_p stores -p instead of p.
// So pd holds -p * dp_sum in that case.
const __half2 pd = __hmul2(p, dp_sum_half[mi * 2 + (ii % 2)]);
const __half low = __low2half(p) >= zero_h ? __low2half(pdp) : __low2half(pd);
const __half high = __high2half(p) >= zero_h ? __high2half(pdp) : __high2half(pd);
frag_p[mi][ni].template elt_as<__half2>(ii) = __halves2half2(low, high);
}
}
}
}
// Store dp to smem for transpose
smem_dp.store(frag_p);
// gmem_s.store(frag_p, mask);
// gmem_s.move();
// Declare the accumulators for the 2nd gemm.
fmha::Fragment_accumulator acc_dq[Mma_tile_dq::MMAS_M][Mma_tile_dq::MMAS_N];
fmha::Clear_accumulator<typename fmha::Accumulator_type, Cta_tile_dq::WARPS_K>::apply(acc_dq);
// Do this part of dQ = dP * K.
#pragma unroll
for( int ki = 1; ki < Mma_tile_dq::MMAS_K; ++ki ) {
// Trigger the load from shared memory for the next series of K^T values.
smem_kt.load(frag_kt[ki & 1], ki);
// Do the math for the values already in registers.
fmha::gemm_cl<__half>(acc_dq, frag_p[ki - 1], frag_kt[(ki - 1) & 1]);
// fmha::gemm_cl<__half>(acc_dq, frag_p[ki - 1], frag_kt[(ki - 1)]);
}
// Do the final stage of math.
{
int ki = Mma_tile_dq::MMAS_K;
fmha::gemm_cl<__half>(acc_dq, frag_p[ki - 1], frag_kt[(ki - 1) & 1]);
// fmha::gemm_cl<__half>(acc_dq, frag_p[ki - 1], frag_kt[(ki - 1)]);
}
static_assert(Gmem_tile_dq::LOOPS == 1);
// Swizzle the elements and do the final reduction.
smem_dq.store(acc_dq, 0);
typename Smem_tile_dot::Fragment frag_dot[2][Mma_tile_dkv::MMAS_N];
static_assert(Smem_tile_dot::Fragment::NUM_REGS == 4);
static_assert(Mma_tile_dkv::MMAS_K == 1);
smem_dot.load(frag_dot[0], 0);
// Threads in a warp are communicating via shared memory (smem_s and smem_dp).
__syncwarp();
typename Smem_tile_st::Fragment frag_s[Mma_tile_dkv::MMAS_K][Mma_tile_dkv::MMAS_M];
smem_s.load(frag_s);
if (Is_dropout) {
#pragma unroll
for( int ki = 0; ki < Mma_tile_dkv::MMAS_K; ki++ ) {
#pragma unroll
for( int mi = 0; mi < Mma_tile_dkv::MMAS_M; mi++ ) {
frag_s[ki][mi].template hrelu_<__half>();
}
}
}
#pragma unroll
for( int ki = 1; ki < Mma_tile_dkv::MMAS_K; ++ki ) {
// Trigger the load from shared memory for the next series of dO^T values.
smem_dot.load(frag_dot[ki & 1], ki);
// Do the math for the values already in registers.
fmha::gemm_cl<__half>(acc_dv, frag_s[(ki - 1)], frag_dot[(ki - 1) & 1]);
}
// Do the final stage of math.
{
int ki = Mma_tile_dkv::MMAS_K;
fmha::gemm_cl<__half>(acc_dv, frag_s[(ki - 1)], frag_dot[(ki - 1) & 1]);
}
// __syncthreads();
// Commit the values for Q into shared memory.
if (not_last_iter) {
gmem_q.commit(gemm_q_k.smem_q);
}
uint4 dq_out[Gmem_tile_dq::STGS_PER_LOOP];
// if (!Is_first) { gmem_dq_tmp.load(dq_out, 0); }
if (!is_first_read) { gmem_dq_tmp.load(dq_out, 0); }
// __syncthreads();
// Commit the values for dO into shared memory.
if (not_last_iter) {
gmem_do.commit(smem_do);
// if (Is_first) {
// if (true) {
gmem_softmax_d.move(block_row_idx_to_move);
if (Is_first || mask_val_next % 2 == 1) {
// dot_do_o(dp_sum_regs, gmem_do.fetch_, gmem_o.fetch_, smem_dp_sum);
// smem_dp_sum.move_to_next_write_buffer();
dot_do_o(dp_sum_regs, gmem_do.fetch_, gmem_o.fetch_, smem_dp_sum, (l + 1) % 2);
const int dp_sum_row_1 = tidx / Smem_dp_sum::THREADS_PER_ROW;
if ((dp_sum_row_1 < Smem_dp_sum::ROWS) && (tidx % Smem_dp_sum::THREADS_PER_ROW == 0)) {
gmem_softmax_d.store_row(reinterpret_cast<uint32_t(&)[Gmem_tile_do::LDGS]>(dp_sum_regs), dp_sum_row_1);
}
}
gmem_softmax_lse.move(block_row_idx_to_move);
gmem_softmax_lse.load(reinterpret_cast<uint32_t(&)[Mma_tile_p::MMAS_M * 2]>(p_lse));
// if (!Is_first) {
if (!(Is_first || mask_val_next % 2 == 1)) {
gmem_softmax_d.load(reinterpret_cast<uint32_t(&)[Mma_tile_p::MMAS_M * 2]>(dp_sum));
}
}
typename Smem_tile_st::Fragment frag_dpt[Mma_tile_dkv::MMAS_K][Mma_tile_dkv::MMAS_M];
smem_dp.load(frag_dpt);
gemm_q_k.reload_k();
typename Smem_tile_qt::Fragment frag_qt[2][Mma_tile_dkv::MMAS_N];
static_assert(Smem_tile_qt::Fragment::NUM_REGS == 4);
static_assert(Mma_tile_dkv::MMAS_K == 1);
smem_qt.load(frag_qt[0], 0);
#pragma unroll
for( int ki = 1; ki < Mma_tile_dkv::MMAS_K; ++ki ) {
// Trigger the load from shared memory for the next series of Q^T values.
smem_qt.load(frag_qt[ki & 1], ki);
// Do the math for the values already in registers.
fmha::gemm_cl<__half>(acc_dk, frag_dpt[(ki - 1)], frag_qt[(ki - 1) & 1]);
}
// Do the final stage of math.
{
int ki = Mma_tile_dkv::MMAS_K;
fmha::gemm_cl<__half>(acc_dk, frag_dpt[(ki - 1)], frag_qt[(ki - 1) & 1]);
}
// Make sure dQ is in shared memory.
__syncthreads();
// Load from shared memory.
is_first_read ? smem_dq.template load</*zero_init=*/true>(dq_out) : smem_dq.template load</*zero_init=*/false>(dq_out);
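// Note on the write below (inferred from this kernel): dQ accumulates contributions from every
// K/V block (loop_step_idx). Only the final contribution applies the softmax scale and converts
// to __half; intermediate partial sums are kept in the wider dq_tmp buffer and reloaded on the
// next pass over these rows.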
const bool is_final_write =
Is_last
|| ((loop_step_idx + 1) * Cta_tile_p::N >= binfo.actual_seqlen_k)
|| ((mask_val & 0x2) != 0)
|| ((Is_causal) && (block_row_idx * Cta_tile_p::M < (loop_step_idx + 1) * Cta_tile_p::N));
if (is_final_write) {
// if (Is_dropout) {
// dq_out[0] = fmha::fmul4(dq_out[0], params.rp_dropout);
// }
dq_out[0] = fmha::fmul4(dq_out[0], params.scale_bmm1f);
// Output the values.
gmem_dq.template store<__half>(dq_out, 0);
} else {
// Output the values.
gmem_dq_tmp.store(dq_out, 0);
}
// Move to the next part of the output.
gmem_dq.move(block_row_idx_to_move);
if (!(Is_first && Is_last)) { gmem_dq_tmp.move(block_row_idx_to_move); }
// // Make sure the data is in shared memory.
// __syncthreads();
// Move to the next read buffers for Q and dO.
if (not_last_iter) {
gemm_q_k.smem_q.move_to_next_read_buffer();
gemm_q_k.reload_q();
smem_qt.move_to_next_read_buffer();
// smem_qt.load(frag_qt[0], 0);
smem_do.move_to_next_read_buffer();
smem_dot.move_to_next_read_buffer();
// smem_dot.load(frag_dot[0], 0);
}
if (mask_val_next == -1) break;
mask_val = mask_val_next;
block_row_idx += block_row_idx_to_move;
} // Outer loop over the sequence length.
if (Is_dropout) {
for( int mi = 0; mi < Mma_tile_dkv::MMAS_M; mi++ ) {
for( int ni = 0; ni < Mma_tile_dkv::MMAS_N; ni++ ) {
acc_dv[mi][ni].mul_(params.rp_dropout);
}
}
}
// if ((threadIdx.x == 0) && (blockIdx.x == 0) && (blockIdx.y == 0)) {
// printf("l final, acc_dk=%.6f, %.6f\n", acc_dk[0][0].elt(0), acc_dk[0][0].elt(1));
// }
for( int mi = 0; mi < Mma_tile_dkv::MMAS_M; mi++ ) {
for( int ni = 0; ni < Mma_tile_dkv::MMAS_N; ni++ ) {
// acc_dk[mi][ni].mul_(Is_dropout ? params.rp_dropout * params.scale_bmm1f : params.scale_bmm1f);
acc_dk[mi][ni].mul_(params.scale_bmm1f);
}
}
// if ((threadIdx.x == 0) && (blockIdx.x == 0) && (blockIdx.y == 0)) {
// printf("l final, acc_dk=%.6f, %.6f\n", acc_dk[0][0].elt(0), acc_dk[0][0].elt(1));
// }
__syncthreads();
// TODO [TD - 2022-05-04]: Are there cases where the shared mem for dV and dK are larger than
// the total amount of shared mem?
// Epilogue swizzle for dV
Smem_tile_dv smem_dv(&smem_[0], tidx);
smem_dv.template store<__half>(acc_dv);
// Epilogue swizzle for dK
Smem_tile_dk smem_dk(&smem_[Smem_tile_dv::BYTES_PER_TILE], tidx);
smem_dk.template store<__half>(acc_dk);
__syncthreads();
uint4 dv_out[Smem_tile_dv::NUM_LDS];
smem_dv.load(dv_out);
Gmem_tile_dv gmem_dv(params.dv_ptr, params.dv_row_stride_in_elts, params.dv_head_stride_in_elts,
params.d, binfo, tidx, false);
if (!Is_first) {
gmem_dv.move(loop_step_idx);
}
gmem_dv.store(dv_out);
uint4 dk_out[Smem_tile_dk::NUM_LDS];
smem_dk.load(dk_out);
// for (int ii = 0; ii < Smem_tile_dk::NUM_LDS; ++ii) {
// dk_out[ii] = fmha::fmul4(dk_out[ii], params.scale_bmm1f);
// }
Gmem_tile_dk gmem_dk(params.dk_ptr, params.dk_row_stride_in_elts, params.dk_head_stride_in_elts,
params.d, binfo, tidx, false);
if (!Is_first) {
gmem_dk.move(loop_step_idx);
}
gmem_dk.store(dk_out);
}
////////////////////////////////////////////////////////////////////////////////////////////////////
// loop_steps = -1 means the number of steps will be params.seqlen_k / Kernel_traits::Cta_tile_p::N.
// This template parameter is there so we can specialize with loop_steps == 1 and loop_steps == 2.
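// Illustrative example (hypothetical sizes): with params.seqlen_k = 512 and Cta_tile_p::N = 256,
// the generic loop_steps == -1 path below runs max_loop_steps = 2 iterations, which is exactly
// what the loop_steps == 2 specialization hard-codes (Is_first on iteration 0, Is_last on
// iteration 1).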
template<typename Kernel_traits, bool Is_dropout, bool Is_causal, int loop_steps=-1, typename Params>
inline __device__ void compute_block_dq_dk_dv_1xN(const Params &params) {
constexpr int blocksize_c = Kernel_traits::Cta_tile_p::N;
// The block index for the batch.
const int bidb = blockIdx.x;
// The block index for the head.
const int bidh = blockIdx.y;
// The thread index.
const int tidx = threadIdx.x;
const int tidx_global = (bidb * params.h + bidh) * blockDim.x + tidx;
auto seeds = at::cuda::philox::unpack(params.philox_args);
Philox ph(std::get<0>(seeds), tidx_global, std::get<1>(seeds));
if (loop_steps == 1) {
compute_block_dq_dk_dv_1xN_one_iter<Kernel_traits, Is_dropout, Is_causal, true, true>(params, ph, 0);
} else if (loop_steps == 2) {
compute_block_dq_dk_dv_1xN_one_iter<Kernel_traits, Is_dropout, Is_causal, true, false>(params, ph, 0);
compute_block_dq_dk_dv_1xN_one_iter<Kernel_traits, Is_dropout, Is_causal, false, true>(params, ph, 1);
} else {
if (params.seqlen_k == blocksize_c) {
compute_block_dq_dk_dv_1xN_one_iter<Kernel_traits, Is_dropout, Is_causal, true, true>(params, ph, 0);
} else {
const int max_loop_steps = (params.seqlen_k + blocksize_c - 1) / blocksize_c;
compute_block_dq_dk_dv_1xN_one_iter<Kernel_traits, Is_dropout, Is_causal, true, false>(params, ph, 0);
for (int loop_step_idx = 1; loop_step_idx < max_loop_steps - 1; loop_step_idx++) {
compute_block_dq_dk_dv_1xN_one_iter<Kernel_traits, Is_dropout, Is_causal, false, false>(params, ph, loop_step_idx);
}
compute_block_dq_dk_dv_1xN_one_iter<Kernel_traits, Is_dropout, Is_causal, false, true>(params, ph, max_loop_steps - 1);
}
}
}
////////////////////////////////////////////////////////////////////////////////////////////////////
} // namespace fmha
/******************************************************************************
* Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#include "fmha.h"
#include "fmha_block_fprop_kernel_1xN.h"
template<typename Kernel_traits, bool Is_dropout, bool Is_causal, bool Return_softmax>
__global__ void fmha_block_fprop_fp16_sm80_loop_kernel(FMHA_fprop_params params) {
fmha::device_block_1xN_loop<Kernel_traits, Is_dropout, Is_causal, Return_softmax>(params);
}
template<typename Kernel_traits>
void run_fmha_block_fp16_sm80_loop_(Launch_params<FMHA_fprop_params> &launch_params,
const bool configure) {
bool is_causal = launch_params.params.is_causal;
// TD [2022-04-27]: This case work is pretty ugly, maybe there's a better way?
auto kernel = launch_params.is_dropout
? (is_causal
? (launch_params.return_softmax ? &fmha_block_fprop_fp16_sm80_loop_kernel<Kernel_traits, true, true, true> : &fmha_block_fprop_fp16_sm80_loop_kernel<Kernel_traits, true, true, false>)
: (launch_params.return_softmax ? &fmha_block_fprop_fp16_sm80_loop_kernel<Kernel_traits, true, false, true> : &fmha_block_fprop_fp16_sm80_loop_kernel<Kernel_traits, true, false, false>))
: (is_causal
? (launch_params.return_softmax ? &fmha_block_fprop_fp16_sm80_loop_kernel<Kernel_traits, false, true, true> : &fmha_block_fprop_fp16_sm80_loop_kernel<Kernel_traits, false, true, false>)
: (launch_params.return_softmax ? &fmha_block_fprop_fp16_sm80_loop_kernel<Kernel_traits, false, false, true> : &fmha_block_fprop_fp16_sm80_loop_kernel<Kernel_traits, false, false, false>));
constexpr int blocksize_c = Kernel_traits::Cta_tile_p::N;
const int loop_steps = (launch_params.params.seqlen_k + blocksize_c - 1) / blocksize_c;
constexpr int smem_size_softmax_lse = Kernel_traits::Smem_dp_sum::BYTES_PER_TILE;
// Don't need smem_size_softmax_lse if we're not looping
const int smem_size = fmha::get_dynamic_smem_size<Kernel_traits>()
+ (loop_steps > 1 ? smem_size_softmax_lse : 0);
if( smem_size >= 48 * 1024 ) {
FMHA_CHECK_CUDA(cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size));
}
if (configure) {
using Mma_tile_p = fmha::Hmma_tile<typename Kernel_traits::Cta_tile_p>;
constexpr int M = Kernel_traits::Cta_tile_p::M;
size_t STEPS = (launch_params.params.seqlen_q + M - 1) / M;
constexpr size_t MMAS_M = Mma_tile_p::MMAS_M;
constexpr size_t MMAS_N = Mma_tile_p::MMAS_N;
size_t elts_per_head = STEPS * MMAS_M * MMAS_N * 8 * loop_steps;
launch_params.elts_per_thread = elts_per_head;
return;
}
dim3 grid(launch_params.params.b, launch_params.params.h);
kernel<<<grid, Kernel_traits::THREADS, smem_size, launch_params.stream>>>(
launch_params.params);
FMHA_CHECK_CUDA(cudaPeekAtLastError());
}
void run_fmha_block_fp16_sm80(Launch_params<FMHA_fprop_params> &launch_params,
const bool configure) {
if (launch_params.params.d == 16) {
using Kernel_traits = FMHA_kernel_traits<256, 16, 16, 1, 4, 0x08u>;
run_fmha_block_fp16_sm80_loop_<Kernel_traits>(launch_params, configure);
} else if (launch_params.params.d == 32) {
using Kernel_traits = FMHA_kernel_traits<256, 32, 16, 1, 4, 0x08u>;
run_fmha_block_fp16_sm80_loop_<Kernel_traits>(launch_params, configure);
} else if (launch_params.params.d == 64) {
using Kernel_traits = FMHA_kernel_traits<256, 64, 16, 1, 4, 0x08u>;
run_fmha_block_fp16_sm80_loop_<Kernel_traits>(launch_params, configure);
}
}
\ No newline at end of file
/***************************************************************************************************
* Copyright (c) 2022, Tri Dao.
* Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#pragma once
#include "fmha_fprop_kernel_1xN.h"
#include "fmha_kernel.h"
#include "fmha_blockmask.h"
#include <fmha/kernel_traits.h>
#include <fmha/gemm.h>
namespace fmha {
template<typename Kernel_traits, bool Is_dropout, bool Is_causal, bool Return_softmax, bool Is_first, bool Is_last, typename Params, typename Prng>
inline __device__ void device_block_1xN_(const Params &params, const int bidb, const int bidh, int steps, Prng &ph0, Prng &ph1, const int loop_step_idx) {
// The description of the CTA tile for the 1st batched GEMM.
using Cta_tile_p = typename Kernel_traits::Cta_tile_p;
// The description of the CTA tile for the 2nd batched GEMM.
using Cta_tile_o = typename Kernel_traits::Cta_tile_o;
// The MMA tile for the 1st GEMM.
using Mma_tile_p = fmha::Hmma_tile<Cta_tile_p>;
// The MMA tile for the 2nd GEMM.
using Mma_tile_o = fmha::Hmma_tile<Cta_tile_o>;
// The global memory tile to load Q.
using Gmem_tile_q = typename Kernel_traits::Gmem_tile_q;
// The global memory tile to load K.
using Gmem_tile_k = typename Kernel_traits::Gmem_tile_k;
// The global memory tile to load V.
using Gmem_tile_v = typename Kernel_traits::Gmem_tile_v;
// The shared memory tile to swizzle V.
using Smem_tile_v = typename Kernel_traits::Smem_tile_v;
// The global memory tile to store O.
using Gmem_tile_o = typename Kernel_traits::Gmem_tile_o;
using Gmem_tile_o_tmp = fmha::Gmem_tile_o<Cta_tile_o, 4>;
// The shared memory tile to swizzle O.
using Smem_tile_o = typename Kernel_traits::Smem_tile_o;
using Gmem_tile_s = typename Kernel_traits::Gmem_tile_s;
using Gmem_softmax_sum = typename Kernel_traits::Gmem_softmax_sum;
using Smem_softmax_sum = typename Kernel_traits::Smem_dp_sum;
using Gemm1 = Gemm_Q_K<Kernel_traits, Kernel_traits::K_IN_REGS>;
using Softmax = fmha::Softmax<Cta_tile_p, Kernel_traits>;
// Shared memory.
extern __shared__ char smem_[];
// The thread index.
const int tidx = threadIdx.x;
const BlockInfoPadded<Kernel_traits::THREADS> binfo(params, bidb, bidh, tidx);
// if( binfo.stop_early() ) return;
if( binfo.stop_early(loop_step_idx * Cta_tile_p::N) ) return;
Blockmask blockmask(params, loop_step_idx);
int block_row_idx = 0;
int mask_val = blockmask.mask_val(0);
if (mask_val == -1) return;
// if ((threadIdx.x == 0) && (blockIdx.x == 0) && (blockIdx.y == 0)) {
// printf("mask_val = %d.\n", mask_val);
// }
Gemm1 gemm_q_k(smem_, tidx);
// Allocate the global memory tile loader for Q.
Gmem_tile_q gmem_q(params.q_ptr, params.q_row_stride_in_elts, params.q_head_stride_in_elts,
params.d, binfo, tidx, true);
// Allocate the global memory tile loader for O.
Gmem_tile_o gmem_o(params.o_ptr, params.o_row_stride_in_elts, params.o_head_stride_in_elts,
params.d, binfo, tidx);
Gmem_tile_o_tmp gmem_o_tmp(params.o_tmp_ptr, params.o_row_stride_in_elts, params.o_head_stride_in_elts,
params.d, binfo, tidx);
// Allocate the global memory tile loader for S.
Gmem_tile_s gmem_s(params, binfo, tidx);
Gmem_softmax_sum gmem_softmax_lse(params.softmax_lse_ptr, params, tidx);
// Wind gmem tiles to the correct position.
static_assert(Cta_tile_p::N % Cta_tile_p::M == 0);
int block_row_idx_next = mask_val / 4;
int block_row_idx_to_move = block_row_idx_next - block_row_idx;
gmem_q.move(block_row_idx_to_move);
gmem_o.move(block_row_idx_to_move);
gmem_o_tmp.move(block_row_idx_to_move);
if (Return_softmax) { gmem_s.move(block_row_idx_to_move); }
gmem_softmax_lse.move(block_row_idx_to_move);
block_row_idx = block_row_idx_next;
// if ((threadIdx.x == 0) && (blockIdx.x == 0) && (blockIdx.y == 0)) {
// printf("begin = %d, steps = %d\n", begin, steps);
// }
fmha::Mask<Cta_tile_p, Is_causal> mask(binfo, tidx, loop_step_idx);
// Allocate the global memory tile loader for K.
Gmem_tile_k gmem_k(params.k_ptr, params.k_row_stride_in_elts, params.k_head_stride_in_elts,
params.d, binfo, tidx, false);
// Allocate the global memory tile loader for V.
Gmem_tile_v gmem_v(params.v_ptr, params.v_row_stride_in_elts, params.v_head_stride_in_elts,
params.d, binfo, tidx, false);
// The base pointer of smem_v.
char *smem_v_ = &smem_[Gemm1::SMEM_OFFSET_V];
// Allocate the shared memory tile loader for V. We use the same as K so be careful!!!
Smem_tile_v smem_v(smem_v_, tidx);
// Allocate the shared memory tile loader for O. We use the same as K so be careful!!!
Smem_tile_o smem_o(&smem_[Gemm1::SMEM_OFFSET_O], tidx);
if (!Is_first) {
gmem_k.move(loop_step_idx);
gmem_v.move(loop_step_idx);
if (Return_softmax) { gmem_s.move(loop_step_idx * steps); }
}
// Trigger the loads for K.
gmem_k.load();
// Trigger the loads for Q.
gmem_q.load();
// Trigger the loads for V.
gmem_v.load();
if (!Is_first) { __syncthreads(); }
float p_prev_lse[Mma_tile_p::MMAS_M * 2];
if (!(Is_first || mask_val % 2 == 1)) {
gmem_softmax_lse.load(reinterpret_cast<uint32_t(&)[Mma_tile_p::MMAS_M * 2]>(p_prev_lse));
}
// Commit the data for Q and V to shared memory.
gmem_q.commit(gemm_q_k.smem_q);
gmem_v.commit(smem_v);
// const uint32_t scale_bmm1 = reinterpret_cast<const uint32_t&>(params.scale_bmm1);
// #pragma unroll
// for(int it=0;it < Gmem_tile_k::LDGS;it++){
// gmem_k.fetch_[it] = fmha::hmul8(scale_bmm1, gmem_k.fetch_[it]);
// }
// Commit the data for K to shared memory.
if( !Kernel_traits::SHARE_SMEM_FOR_K_AND_V ) {
gmem_k.commit(gemm_q_k.smem_k);
}
__syncthreads();
// Load the fragments for Q.
gemm_q_k.load_q();
// Load the fragments for V. We keep the data in registers during the entire kernel.
typename Smem_tile_v::Fragment frag_v[Mma_tile_o::MMAS_K][Mma_tile_o::MMAS_N];
#pragma unroll
for( int ki = 0; ki < Mma_tile_o::MMAS_K; ++ki ) {
smem_v.load(frag_v[ki], ki);
}
// Commit the data for V to shared memory if it has not been done already.
if( Kernel_traits::SHARE_SMEM_FOR_K_AND_V ) {
// Make sure we are done loading the fragments for K.
__syncthreads();
// Commit the data to shared memory for V.
gmem_k.commit(gemm_q_k.smem_k);
// Make sure the data is in shared memory.
__syncthreads();
}
// Load the fragments for K.
gemm_q_k.load_k();
// Create the object to do the softmax.
Softmax softmax(params, &smem_[Gemm1::SMEM_OFFSET_SOFTMAX], tidx);
Smem_softmax_sum smem_softmax_lse(reinterpret_cast<float *>(&smem_[Gemm1::SMEM_BYTES]), tidx);
// Load over the entire sequence length.
for( int l = 0; l < steps; l++ ) {
// if ((threadIdx.x == 0) && (blockIdx.x == 0) && (blockIdx.y == 0)) {
// printf("block_row_idx = %d\n", block_row_idx);
// }
if (block_row_idx * Cta_tile_p::M >= binfo.actual_seqlen_q) break;
int mask_val_next = l < steps - 1 ? blockmask.mask_val(l + 1) : -1;
// if ((threadIdx.x == 0) && (blockIdx.x == 0) && (blockIdx.y == 0)) {
// printf("mask_val = %d, mask_val_next = %d\n", mask_val, mask_val_next);
// }
// Declare the accumulators for the 1st gemm.
fmha::Fragment_accumulator acc_p[Mma_tile_p::MMAS_M][Mma_tile_p::MMAS_N];
fmha::Clear_accumulator<typename fmha::Accumulator_type, Cta_tile_p::WARPS_K>::apply(acc_p);
// Do this part of P = Q * K^T.
gemm_q_k(acc_p);
uint4 out[Gmem_tile_o::STGS_PER_LOOP];
bool is_first_read = Is_first || mask_val % 2 == 1;
// if (!Is_first) { gmem_o_tmp.load(out, 0); }
if (!is_first_read) { gmem_o_tmp.load(out, 0); }
// Trigger the load for the next Q values.
bool not_last_iter = (l < steps - 1) && (mask_val_next != -1);
block_row_idx_next = mask_val_next / 4;
int block_row_idx_to_move = block_row_idx_next - block_row_idx;
if (not_last_iter) {
gemm_q_k.smem_q.move_to_next_write_buffer();
gmem_q.move(block_row_idx_to_move);
gmem_q.load();
}
// Load the mask for that iteration.
mask.load(block_row_idx);
// Convert from the accumulator type to FP32 for Softmax.
softmax.unpack_noscale(acc_p);
// Apply the mask.
softmax.apply_mask(mask);
// softmax.unpack_noscale_half_and_apply_mask(acc_p, mask);
if( Kernel_traits::SHARE_SMEM_FOR_K_AND_V && l == 0 ) {
// if we share K and V, it could be that V was not fully read yet but we write into smem for reduction
__syncthreads();
}
// if (!Is_first) {
// if ((threadIdx.x == 0) && (blockIdx.x == 0) && (blockIdx.y == 0) && (l == 0)) {
// printf("p_prev_lse=%.6f, %.6f\n", p_prev_lse[0], p_prev_lse[1]);
// }
// }
// Compute the max.
float p_max[Mma_tile_p::MMAS_M * 2];
// if (!Is_first) {
if (!is_first_read) {
smem_softmax_lse.store_pair(p_prev_lse, l % 2);
// for (int mi = 0; mi < Mma_tile_p::MMAS_M * 2; mi++) { p_max[mi] = p_prev_lse[mi]; }
for (int mi = 0; mi < Mma_tile_p::MMAS_M * 2; mi++) { p_max[mi] = p_prev_lse[mi] / params.scale_bmm1f; }
}
// Trigger the load for the next LSE values.
if (not_last_iter) {
// if (!Is_first) {
if (!(Is_first || mask_val_next % 2 == 1)) {
gmem_softmax_lse.load_next(reinterpret_cast<uint32_t(&)[Mma_tile_p::MMAS_M * 2]>(p_prev_lse),
block_row_idx_to_move);
}
}
// __half2 p_max[Mma_tile_p::MMAS_M];
// softmax.template reduce_max</*zero_init=*/Is_first>(p_max);
is_first_read ? softmax.template reduce_max</*zero_init=*/true>(p_max) : softmax.template reduce_max</*zero_init=*/false>(p_max);
// if ((threadIdx.x == 0) && (l == 38)) {
// printf("loop_step_idx %d, p_max = %.6f, %.6f., p_prev_lse = %.6f, %.6f\n", loop_step_idx, p_max[0], p_max[1], Is_first ? -10000.f : p_prev_lse[0], Is_first ? -10000.f : p_prev_lse[1]);
// }
// if (!Is_first) {
// if ((threadIdx.x == 0) && (blockIdx.x == 0) && (blockIdx.y == 0) && (l == 0)) {
// printf("after reduce_max=%.6f, %.6f\n", softmax.elt_[0][0], softmax.elt_[0][1]);
// }
// }
// Compute the exponential value.
// softmax.apply_exp(p_max);
softmax.scale_apply_exp(p_max, params.scale_bmm1f);
// if (!Is_first) {
// if ((threadIdx.x == 0) && (blockIdx.x == 0) && (blockIdx.y == 0) && (l == 0)) {
// printf("after apply_exp=%.6f, %.6f\n", softmax.elt_[0][0], softmax.elt_[0][1]);
// }
// }
// Compute the sum.
float p_sum[Mma_tile_p::MMAS_M * 2];
// if (!Is_first) {
// int warp = tidx / Cta_tile_p::THREADS_PER_WARP;
// int lane = tidx % Cta_tile_p::THREADS_PER_WARP;
// for (int mi = 0; mi < Mma_tile_p::MMAS_M * 2; mi++) {
// p_sum[mi] = ((warp == 0) && (lane % 4 == 0)) ? expf(p_prev_lse[mi] - p_max[mi]) : 0;
// }
// }
// softmax.reduce_sum(p_sum);
softmax.reduce_sum_before_sync_(p_sum);
// softmax.template reduce_sum_before_sync_</*zero_init=*/Is_first>(p_sum);
// float p_sum_log[Mma_tile_p::MMAS_M * 2];
// for (int mi = 0; mi < Mma_tile_p::MMAS_M * 2; ++mi) {
// float sum = p_sum[mi];
// // p_sum_log[mi] = (sum == 0.f || sum != sum) ? INFINITY : p_max[mi] + __logf(sum);
// constexpr float kLog2e = M_LOG2E;
// p_sum_log[mi] = (sum == 0.f || sum != sum) ? INFINITY : p_max[mi] * kLog2e + __log2f(sum);
// }
// // gmem_softmax_lse.store(reinterpret_cast<uint32_t(&)[Mma_tile_p::MMAS_M * 2]>(p_sum));
// gmem_softmax_lse.store(reinterpret_cast<uint32_t(&)[Mma_tile_p::MMAS_M * 2]>(p_sum_log));
// gmem_softmax_lse.move();
// // Finalize softmax on the accumulators of P^T.
// softmax.scale(p_sum);
constexpr bool encode_dropout_in_sign_bit = Return_softmax;
if (Is_dropout) {
// softmax.template apply_dropout<encode_dropout_in_sign_bit>(ph0, params.p_dropout_in_uint);
// softmax.template apply_dropout<encode_dropout_in_sign_bit>(ph0, ph1, params.p_dropout_in_uint);
softmax.template apply_dropout_16bits<encode_dropout_in_sign_bit>(ph0, ph1, params.p_dropout_in_uint16_t);
}
using Frag_p = fmha::Fragment_a<fmha::Row>;
Frag_p frag_p[Mma_tile_o::MMAS_K][Mma_tile_o::MMAS_M];
static_assert(Mma_tile_o::MMAS_M == Mma_tile_p::MMAS_M);
static_assert(Mma_tile_o::MMAS_K == Mma_tile_p::MMAS_N);
softmax.template pack<__half>(frag_p);
if (Return_softmax) {
gmem_s.store(frag_p, mask);
if (not_last_iter) {
gmem_s.move(block_row_idx_to_move);
}
}
// Commit the values for Q into shared memory.
if (not_last_iter) {
gmem_q.commit(gemm_q_k.smem_q);
}
if (Is_dropout && encode_dropout_in_sign_bit) {
#pragma unroll
for( int ki = 0; ki < Mma_tile_o::MMAS_K; ki++ ) {
#pragma unroll
for( int mi = 0; mi < Mma_tile_o::MMAS_M; mi++ ) {
frag_p[ki][mi].template hrelu_<__half>();
}
}
}
// Declare the accumulators for the 2nd gemm.
fmha::Fragment_accumulator acc_o[Mma_tile_o::MMAS_M][Mma_tile_o::MMAS_N];
fmha::Clear_accumulator<typename fmha::Accumulator_type, Cta_tile_o::WARPS_K>::apply(acc_o);
// Do this part of O = P^T * V^T.
#pragma unroll
for( int ki = 0; ki < Mma_tile_o::MMAS_K; ++ki ) {
fmha::gemm_cl<__half>(acc_o, frag_p[ki], frag_v[ki]);
}
// The mapping from tidx to rows changes between the softmax and the O-reduction.
// So we recalculate the max.
float p_max_o[Gmem_tile_o::STGS_PER_LOOP][Mma_tile_o::MMAS_M];
int rows[Gmem_tile_o::STGS_PER_LOOP];
for (int jj = 0; jj < Gmem_tile_o::STGS_PER_LOOP; jj++) {
rows[jj] = tidx / Gmem_tile_o::THREADS_PER_ROW + jj * Gmem_tile_o::ROWS_PER_STG;
}
softmax.reduce_max_after_sync_(p_max_o, rows);
static_assert(Mma_tile_o::MMAS_M == 1);
for (int jj = 0; jj < Gmem_tile_o::STGS_PER_LOOP; jj++) {
p_max_o[jj][0] *= params.scale_bmm1f;
}
float p_prev_scale_o[Gmem_tile_o::STGS_PER_LOOP];
// if (!Is_first) { smem_softmax_lse.load(p_prev_scale_o, rows, l % 2); }
if (!is_first_read) { smem_softmax_lse.load(p_prev_scale_o, rows, l % 2); }
// if (!Is_first) {
// if ((threadIdx.x == 0) && (blockIdx.x == 0) && (blockIdx.y == 0) && (l == 0)) {
// printf("p_prev_scale_o=%.6f\n", p_prev_scale_o[0]);
// }
// }
static_assert(Gmem_tile_o::LOOPS == 1);
// Swizzle the elements and do the final reduction.
smem_o.store(acc_o, 0);
// Make sure the data is in shared memory.
__syncthreads();
static_assert(Mma_tile_o::MMAS_M == 1);
float p_sum_o[Gmem_tile_o::STGS_PER_LOOP][Mma_tile_o::MMAS_M];
softmax.reduce_sum_after_sync_(p_sum_o, rows);
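// Online-softmax bookkeeping for the rescaling below (a description of this code, writing m for
// the running max and l for the running sum): p_prev_scale_o starts out as the previous
// LSE = m_prev + log(l_prev), so exp(LSE_prev - m_new) = l_prev * exp(m_prev - m_new). Adding
// that factor to p_sum_o yields the new denominator, and multiplying the previously written
// partial O by the same factor before adding this block's contribution and dividing by the new
// sum reproduces the softmax over all K/V blocks processed so far.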
// if (!Is_first) {
if (!is_first_read) {
for (int jj = 0; jj < Gmem_tile_o::STGS_PER_LOOP; jj++) {
p_prev_scale_o[jj] = expf(p_prev_scale_o[jj] - p_max_o[jj][0]);
p_sum_o[jj][0] += p_prev_scale_o[jj];
}
}
float p_sum_log[Gmem_tile_o::STGS_PER_LOOP][Mma_tile_o::MMAS_M];
#pragma unroll
for (int jj = 0; jj < Gmem_tile_o::STGS_PER_LOOP; jj++) {
float sum = p_sum_o[jj][0];
p_sum_log[jj][0] = (sum == 0.f || sum != sum) ? -INFINITY : p_max_o[jj][0] + __logf(sum);
// if (sum == 0.f || sum != sum) {
// printf("loop_step_idx = %d, l = %d, tidx = %d, sum = %.6f, p_max_o = %.6f\n", loop_step_idx, l, tidx, sum, p_max_o[jj][0]);
// }
// if (Is_first) {
// if ((threadIdx.x == 0) && (blockIdx.x == 0) && (blockIdx.y == 0) && (l == 0)) {
// printf("p_sum_log=%.6f\n", p_sum_log[jj][0]);
// }
// }
if ((tidx % Gmem_tile_o::THREADS_PER_ROW == 0) && (tidx / Gmem_tile_o::THREADS_PER_ROW < Gmem_tile_o::ROWS)) {
gmem_softmax_lse.store_row(
reinterpret_cast<uint32_t(&)[Mma_tile_p::MMAS_M]>(p_sum_log[jj]), rows[jj]);
}
}
if (not_last_iter) {
gmem_softmax_lse.move(block_row_idx_to_move);
}
// Load from shared memory.
// if (!Is_first) {
if (!is_first_read) {
for (int jj = 0; jj < Gmem_tile_o::STGS_PER_LOOP; jj++) {
out[jj] = fmha::fmul4(out[jj], p_prev_scale_o[jj]);
}
}
// smem_o.template load</*zero_init=*/Is_first>(out);
is_first_read ? smem_o.template load</*zero_init=*/true>(out) : smem_o.template load</*zero_init=*/false>(out);
const bool is_final_write =
Is_last
|| ((loop_step_idx + 1) * Cta_tile_p::N >= binfo.actual_seqlen_k)
|| ((mask_val & 0x2) != 0)
|| ((Is_causal) && (block_row_idx * Cta_tile_p::M < (loop_step_idx + 1) * Cta_tile_p::N));
// if ((threadIdx.x == 0) && (blockIdx.x == 0) && (blockIdx.y == 0)) {
// printf("is_final_write = %d\n", is_final_write);
// }
#pragma unroll
for (int jj = 0; jj < Gmem_tile_o::STGS_PER_LOOP; jj++) {
float sum = p_sum_o[jj][0];
float inv_sum = (sum == 0.f || sum != sum) ? 1.f : 1.f / sum;
if (Is_dropout && is_final_write) {
inv_sum *= params.rp_dropout;
}
out[jj] = fmha::fmul4(out[jj], inv_sum);
}
// if (Is_dropout && Is_last) {
// for (int jj = 0; jj < Gmem_tile_o::STGS_PER_LOOP; jj++) {
// out[jj] = fmha::fmul4(out[jj], params.rp_dropout);
// }
// }
// Output the values.
if (is_final_write) {
gmem_o.template store<__half>(out, 0);
} else {
gmem_o_tmp.store(out, 0);
}
// Move to the next part of the output.
gmem_o.move(block_row_idx_to_move);
if (!(Is_first && Is_last)) { gmem_o_tmp.move(block_row_idx_to_move); }
gemm_q_k.reload_k();
// Make sure we are reading from the correct buffer.
gemm_q_k.smem_q.move_to_next_read_buffer();
// Trigger the load from shared memory for the next series of Q values.
if (not_last_iter) {
gemm_q_k.reload_q();
}
if (mask_val_next == -1) break;
mask_val = mask_val_next;
block_row_idx += block_row_idx_to_move;
} // Outer loop over the sequence length.
}
////////////////////////////////////////////////////////////////////////////////////////////////////
template<typename Kernel_traits, bool Is_dropout, bool Is_causal, bool Return_softmax, typename Params>
inline __device__ void device_block_1xN_loop(const Params &params) {
// The block index for the batch.
const int bidb = blockIdx.x;
// The block index for the head.
const int bidh = blockIdx.y;
// The thread index.
const int tidx = threadIdx.x;
const int tidx_global = (bidb * params.h + bidh) * blockDim.x * 2 + tidx;
auto seeds = at::cuda::philox::unpack(params.philox_args);
Philox ph0(std::get<0>(seeds), tidx_global, std::get<1>(seeds));
Philox ph1(std::get<0>(seeds), tidx_global + blockDim.x, std::get<1>(seeds));
constexpr int M = Kernel_traits::Cta_tile_p::M;
const int STEPS = (params.seqlen_q + M - 1) / M;
constexpr int blocksize_c = Kernel_traits::Cta_tile_p::N;
if (params.seqlen_k == blocksize_c) {
fmha::device_block_1xN_<Kernel_traits, Is_dropout, Is_causal, Return_softmax, true, true>(params, bidb, bidh, STEPS, ph0, ph1, 0);
} else {
const int max_loop_steps = (params.seqlen_k + blocksize_c - 1) / blocksize_c;
fmha::device_block_1xN_<Kernel_traits, Is_dropout, Is_causal, Return_softmax, true, false>(params, bidb, bidh, STEPS, ph0, ph1, 0);
for (int loop_step_idx = 1; loop_step_idx < max_loop_steps - 1; loop_step_idx++) {
fmha::device_block_1xN_<Kernel_traits, Is_dropout, Is_causal, Return_softmax, false, false>(params, bidb, bidh, STEPS, ph0, ph1, loop_step_idx);
}
fmha::device_block_1xN_<Kernel_traits, Is_dropout, Is_causal, Return_softmax, false, true>(params, bidb, bidh, STEPS, ph0, ph1, max_loop_steps - 1);
}
}
////////////////////////////////////////////////////////////////////////////////////////////////////
} // namespace fmha
/******************************************************************************
* Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#pragma once
#include <fmha.h>
#include <fmha/utils.h>
#include <fmha/smem_tile.h>
#include <fmha/gmem_tile.h>
#include <fmha/mask.h>
#include <fmha/softmax.h>
namespace fmha {
////////////////////////////////////////////////////////////////////////////////////////////////////
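// Blockmask holds one mask value per (loop step, Q block-row) for the block-sparse kernels
// above. The encoding is not documented here, but from how mask_val is consumed in this commit:
// -1 means there are no further block rows for this K/V block; mask_val / 4 is the Q block-row
// index to process; bit 0 (mask_val % 2 == 1) marks the first K/V block contributing to that
// row (no previous partial result to read); bit 1 (mask_val & 0x2) marks the last one, which
// triggers the final O / dQ write instead of a write to the temporary buffer.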
struct Blockmask {
template<typename Params>
__device__ Blockmask(const Params &params, int loop_step_idx) :
blockmask_ptr(params.blockmask + loop_step_idx * params.seqlen_q / 16) {
}
__device__ int mask_val(int block_row_idx) const {
return blockmask_ptr[block_row_idx];
}
const int *blockmask_ptr;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
} // namespace fmha
// Copyright (c) 2022, Tri Dao.
// Splitting the different head dimensions to different files to speed up compilation.
#include "fmha_bwd_launch_template.h"
void run_fmha_bwd_hdim128(FMHA_dgrad_params &params, cudaStream_t stream, const bool configure) {
FP16_SWITCH(params.is_bf16, ([&] {
using Kernel_traits = FMHA_kernel_traits<128, 128, 16, 1, 8, 0x100u, elem_type>;
run_fmha_bwd_loop<Kernel_traits>(params, stream, configure);
}));
}
\ No newline at end of file
// Copyright (c) 2022, Tri Dao.
// Splitting the different head dimensions to different files to speed up compilation.
#include "fmha_bwd_launch_template.h"
void run_fmha_bwd_hdim32(FMHA_dgrad_params &params, cudaStream_t stream, const bool configure) {
FP16_SWITCH(params.is_bf16, ([&] {
if (params.seqlen_k == 128) {
using Kernel_traits = FMHA_kernel_traits<128, 32, 16, 1, 8, 0x08u, elem_type>;
run_fmha_bwd_loop<Kernel_traits>(params, stream, configure);
} else if (params.seqlen_k >= 256) {
using Kernel_traits = FMHA_kernel_traits<256, 32, 16, 1, 8, 0x08u, elem_type>;
run_fmha_bwd_loop<Kernel_traits>(params, stream, configure);
}
}));
}
\ No newline at end of file
// Copyright (c) 2022, Tri Dao.
// Splitting the different head dimensions to different files to speed up compilation.
#include "fmha_bwd_launch_template.h"
void run_fmha_bwd_hdim64(FMHA_dgrad_params &params, cudaStream_t stream, const bool configure) {
FP16_SWITCH(params.is_bf16, ([&] {
auto dprops = at::cuda::getCurrentDeviceProperties();
if (params.seqlen_k == 128) {
using Kernel_traits = FMHA_kernel_traits<128, 64, 16, 1, 8, 0x08u, elem_type>;
run_fmha_bwd_loop<Kernel_traits>(params, stream, configure);
} else if (params.seqlen_k >= 256) {
if ((dprops->major == 8 && dprops->minor == 0) || (dprops->major == 9 && dprops->minor == 0)) {
// Don't share smem for K & V, and don't keep V in registers
// This speeds things up by 2-3% by avoiding register spills, but it
// uses more shared memory, which is fine on A100 and H100 but not other GPUs.
// For other GPUs, we keep V in registers.
using Kernel_traits = FMHA_kernel_traits<256, 64, 16, 1, 8, 0x100u, elem_type>;
run_fmha_bwd_loop<Kernel_traits>(params, stream, configure);
} else if (dprops->major == 8 && dprops->minor > 0) {
using Kernel_traits = FMHA_kernel_traits<256, 64, 16, 1, 8, 0x08u, elem_type>;
run_fmha_bwd_loop<Kernel_traits>(params, stream, configure);
} else if (dprops->major == 7 && dprops->minor == 5) {
using Kernel_traits = FMHA_kernel_traits<128, 64, 16, 1, 8, 0x08u, elem_type>;
run_fmha_bwd_loop<Kernel_traits>(params, stream, configure);
}
}
}));
}
// Copyright (c) 2022, Tri Dao.
#pragma once
#include "static_switch.h"
#include "fmha.h"
#include "fmha_dgrad_kernel_1xN_loop.h"
// Pick whether we should parallelize across seqlen_k (num_splits > 1) or not (num_splits=1).
// Parallelizing will have better occupancy, but has some overhead due to having to zero out
// dq_tmp and having to copy dq_tmp to dq.
inline int num_splits_heuristic_bwd(int batch_nheads, int num_SMs, int ctas_per_sm, int seqlen,
int blocksize, bool is_causal) {
float n_waves_1 = float(batch_nheads) / (num_SMs * ctas_per_sm);
float eff_1 = n_waves_1 / ceil(n_waves_1);
int num_splits_parallel = seqlen / blocksize;
float n_waves_parallel = float(batch_nheads * num_splits_parallel) / (num_SMs * ctas_per_sm);
float eff_parallel_raw = n_waves_parallel / ceil(n_waves_parallel);
float discount_factor;
if (!is_causal) {
discount_factor = 1.f + float(blocksize) / seqlen;
} else { // For causal, parallelizing seems to help with load-balancing as well
// For example, if headdim=128, seqlen >= 1280 always prefers parallel
if (seqlen / blocksize >= 10) return num_splits_parallel;
discount_factor = 1.f + 0.5 * float(blocksize) / seqlen;
}
float eff_parallel = eff_parallel_raw / discount_factor;
return eff_1 >= eff_parallel ? 1 : num_splits_parallel;
}
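// Illustrative example (hypothetical numbers): batch_nheads = 8, num_SMs = 108,
// ctas_per_sm = 1, seqlen = 2048, blocksize = 128, non-causal.
// eff_1 = (8 / 108) / 1 ~= 0.07; num_splits_parallel = 16;
// eff_parallel_raw = (8 * 16 / 108) / 2 ~= 0.59; discount = 1 + 128 / 2048 = 1.0625;
// eff_parallel ~= 0.56 > eff_1, so the heuristic returns 16 splits.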
template<typename Kernel_traits>
__global__ void fmha_bwd_dot_do_o_kernel(FMHA_dgrad_params params) {
fmha::compute_dot_do_o<Kernel_traits>(params);
}
template<typename Kernel_traits, bool Is_dropout, bool Is_causal, int loop_steps=-1>
__global__ void fmha_bwd_dq_dk_dv_loop_kernel(FMHA_dgrad_params params) {
fmha::compute_dq_dk_dv_1xN<Kernel_traits, Is_dropout, Is_causal, loop_steps>(params);
}
template<typename Kernel_traits, bool Is_dropout, bool Is_causal>
__global__ void fmha_bwd_q_dk_dv_loop_seqparallel_kernel(FMHA_dgrad_params params) {
fmha::compute_dq_dk_dv_seqparallel<Kernel_traits, Is_dropout, Is_causal>(params);
}
template<typename Kernel_traits>
void run_fmha_bwd_loop(FMHA_dgrad_params &params, cudaStream_t stream, const bool configure) {
constexpr int smem_size_softmax = Kernel_traits::Cta_tile_p::M * Kernel_traits::Cta_tile_p::WARPS_N * sizeof(float);
constexpr int smem_size_q = Kernel_traits::Smem_tile_q::BYTES_PER_TILE;
constexpr int smem_size_v = Kernel_traits::Smem_tile_v::BYTES_PER_TILE;
constexpr int smem_size_dq = Kernel_traits::Smem_tile_o::BYTES_PER_TILE;
using Smem_tile_s = fmha::Smem_tile_mma_transposed<typename Kernel_traits::Cta_tile_p>;
constexpr int smem_size_s = Smem_tile_s::BYTES_PER_TILE;
static_assert(smem_size_s == 16 * Kernel_traits::Cta_tile_p::N * 2);
static_assert(smem_size_dq == 16 * Kernel_traits::Cta_tile_p::K * 4 * Kernel_traits::Cta_tile_p::WARPS_N);
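// This total mirrors the per-CTA layout described in the kernel below: tiles for dO and
// Q (hence smem_size_q * 2), one or two K/V-sized tiles depending on V_IN_REGS, the dQ
// tile, and the two transposed S / dP tiles.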
constexpr int smem_size_dq_dk_dv = smem_size_q * 2 + smem_size_v * (Kernel_traits::V_IN_REGS ? 1 : 2) + smem_size_dq + smem_size_s * 2;
constexpr int blocksize_c = Kernel_traits::Cta_tile_p::N;
// printf("blocksize_c = %d, WARPS_N = %d, Smem size = %d\n", blocksize_c, Kernel_traits::Cta_tile_p::WARPS_N, smem_size_dq_dk_dv);
bool is_dropout = params.p_dropout < 1.f; // params.p_dropout is the probability of "keeping"
// Work-around for gcc 7. It doesn't like nested BOOL_SWITCH.
BOOL_SWITCH(is_dropout, IsDropoutConst, ([&] {
auto kernel = params.is_causal
? &fmha_bwd_dq_dk_dv_loop_kernel<Kernel_traits, IsDropoutConst, true>
: &fmha_bwd_dq_dk_dv_loop_kernel<Kernel_traits, IsDropoutConst, false>;
if (params.seqlen_k == blocksize_c) {
kernel = params.is_causal
? &fmha_bwd_dq_dk_dv_loop_kernel<Kernel_traits, IsDropoutConst, true, /*loop_steps=*/1>
: &fmha_bwd_dq_dk_dv_loop_kernel<Kernel_traits, IsDropoutConst, false, /*loop_steps=*/1>;
} else if (params.seqlen_k == blocksize_c * 2) {
kernel = params.is_causal
? &fmha_bwd_dq_dk_dv_loop_kernel<Kernel_traits, IsDropoutConst, true, /*loop_steps=*/2>
: &fmha_bwd_dq_dk_dv_loop_kernel<Kernel_traits, IsDropoutConst, false, /*loop_steps=*/2>;
}
auto kernel_seqparallel = params.is_causal
? &fmha_bwd_q_dk_dv_loop_seqparallel_kernel<Kernel_traits, IsDropoutConst, true>
: &fmha_bwd_q_dk_dv_loop_seqparallel_kernel<Kernel_traits, IsDropoutConst, false>;
if( smem_size_dq_dk_dv >= 48 * 1024 ) {
FMHA_CHECK_CUDA(cudaFuncSetAttribute(
kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size_dq_dk_dv));
FMHA_CHECK_CUDA(cudaFuncSetAttribute(
kernel_seqparallel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size_dq_dk_dv));
}
// Automatically set num_splits to maximize occupancy
if (params.num_splits <= 0) {
int ctas_per_sm;
cudaError status_ = cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&ctas_per_sm, kernel, Kernel_traits::THREADS, smem_size_dq_dk_dv);
auto dprops = at::cuda::getCurrentDeviceProperties();
// printf("CTAS_PER_SM = %d, nSMs = %d\n", ctas_per_sm, dprops->multiProcessorCount);
constexpr int M = Kernel_traits::Cta_tile_p::M;
// We don't want more than 10 splits due to numerical error.
// Numerical error on dk/dv scales as sqrt(num_splits).
params.num_splits = num_splits_heuristic_bwd(
params.b * params.h, dprops->multiProcessorCount,
ctas_per_sm, params.seqlen_k, blocksize_c, params.is_causal
);
}
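// In configure mode we only needed to pick num_splits above; return before launching,
// presumably so the caller can size temporary buffers (e.g. dq_tmp) accordingly.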
if (configure) return;
if (params.num_splits == 1) {
dim3 grid(params.b, params.h, params.num_splits);
kernel<<<grid, Kernel_traits::THREADS, smem_size_dq_dk_dv, stream>>>(params);
} else {
dim3 grid_dot(params.b, params.h, (params.seqlen_q + 128 - 1) / 128);
fmha_bwd_dot_do_o_kernel<Kernel_traits><<<grid_dot, Kernel_traits::THREADS, 0, stream>>>(params);
int num_splits = params.seqlen_k / blocksize_c; // seqlen_k is divisible by blocksize_c
dim3 grid(params.b, params.h, num_splits);
kernel_seqparallel<<<grid, Kernel_traits::THREADS, smem_size_dq_dk_dv, stream>>>(params);
}
FMHA_CHECK_CUDA(cudaPeekAtLastError());
}));
}
/* Copyright (c) 2022, Tri Dao.
*/
#pragma once
#include "fmha_fprop_kernel_1xN.h"
#include "fmha_kernel.h"
#include <fmha/kernel_traits.h>
#include <fmha/gemm.h>
namespace fmha {
////////////////////////////////////////////////////////////////////////////////////////////////////
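// Per row i of the tile, compute sum_j dO[i][j] * O[i][j] (the dsoftmax_sum / "D" term
// of the softmax backward), reduce it across the THREADS_PER_ROW threads that own the
// row, apply the scale, and let one thread per row write the result via gmem_softmax_d.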
template <int ROWS, int THREADS_PER_ROW, typename elem_type=__half, int M, typename Gmem_softmax_sum>
inline __device__ void dot_do_o(const uint4 (&do_)[M], const uint4 (&o)[M], const float scale,
Gmem_softmax_sum gmem_softmax_d, int tidx) {
float sum[M];
fmha::SumOp<float> sum_op;
#pragma unroll
for (int mi = 0; mi < M; ++mi) {
sum[mi] = fmha::Allreduce<THREADS_PER_ROW>::run(
fmha::hmulsum8<elem_type>(do_[mi], o[mi]), sum_op
) * scale;
}
const int dp_sum_row = tidx / THREADS_PER_ROW;
if ((dp_sum_row < ROWS) && (tidx % THREADS_PER_ROW == 0)) {
gmem_softmax_d.store_row(reinterpret_cast<const uint32_t (&)[M]>(sum), dp_sum_row);
}
}
////////////////////////////////////////////////////////////////////////////////////////////////////
// Just compute dot(do, o) and write the result (softmax_d) to global memory as a separate kernel.
// This is used in the case where we want to parallelize the backward across seqlen_k.
template<typename Kernel_traits, typename Params>
inline __device__ void compute_dot_do_o(const Params &params) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
using elem_type = typename Kernel_traits::elem_type;
#else
constexpr bool is_fp16_type = std::is_same<typename Kernel_traits::elem_type, __half>::value;
assert(is_fp16_type);
using elem_type = __half;
#endif
// The description of the CTA tile for the 1st batched GEMM.
using Cta_tile_p = typename Kernel_traits::Cta_tile_p;
// The description of the CTA tile for the 3rd batched GEMM.
using Cta_tile_dkv =
fmha::Cta_tile_extd<Cta_tile_p::N, Cta_tile_p::K, Cta_tile_p::M, Cta_tile_p::WARPS_N, 1, Cta_tile_p::WARPS_M>;
static_assert(Cta_tile_dkv::N == 16 || Cta_tile_dkv::N == 32 || Cta_tile_dkv::N == 64 || Cta_tile_dkv::N == 128);
static_assert(Cta_tile_dkv::K == 16);
// The global memory tile to load dO.
using Gmem_tile_do = typename Kernel_traits::Gmem_tile_do;
// The global memory tile to load O. Loading O here is similar to loading dO.
using Gmem_tile_o = Gmem_tile_do;
using Gmem_softmax_sum = typename Kernel_traits::Gmem_softmax_sum;
// The block index for the batch.
const int bidb = blockIdx.x;
// The block index for the head.
const int bidh = blockIdx.y;
// The thread index.
const int tidx = threadIdx.x;
// How many steps to jump per iteration.
const int step_stride = gridDim.z;
const BlockInfoPadded<Kernel_traits::THREADS> binfo(params, bidb, bidh, tidx);
if( binfo.stop_early() ) return;
// Allocate the global memory tile loader for dO.
Gmem_tile_do gmem_do(params.do_ptr, params.o_row_stride_in_elts, params.o_head_stride_in_elts,
params.d, binfo, tidx, true);
// Allocate the global memory tile loader for O.
Gmem_tile_o gmem_o(params.o_ptr, params.o_row_stride_in_elts, params.o_head_stride_in_elts,
params.d, binfo, tidx, true);
Gmem_softmax_sum gmem_softmax_d(params.dsoftmax_sum, params, tidx);
static_assert(Cta_tile_p::N % Cta_tile_p::M == 0);
const int steps = (params.seqlen_q + Cta_tile_p::M - 1) / Cta_tile_p::M;
// Wind gmem tiles to the correct position.
gmem_do.move(blockIdx.z);
gmem_o.move(blockIdx.z);
gmem_softmax_d.move(blockIdx.z);
// Load over the entire sequence length.
for (int l = blockIdx.z; l < steps; l += step_stride) {
if (l * Cta_tile_p::M >= binfo.actual_seqlen_q)
break;
gmem_do.load();
gmem_do.move(step_stride);
gmem_o.load();
gmem_o.move(step_stride);
dot_do_o<Gmem_tile_do::ROWS, Gmem_tile_do::THREADS_PER_ROW, elem_type>(
gmem_do.fetch_, gmem_o.fetch_, params.p_dropout, gmem_softmax_d, tidx
);
gmem_softmax_d.move(step_stride);
} // Outer loop over the sequence length.
}
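// In the seqlen_k-parallel path this is launched as fmha_bwd_dot_do_o_kernel (see the
// launch template above) with gridDim.z blocks striding over the sequence: each z-block
// handles every step_stride-th chunk of Cta_tile_p::M query rows.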
////////////////////////////////////////////////////////////////////////////////////////////////////
template<typename Kernel_traits, bool Is_dropout, bool Is_causal, bool Is_first, bool Is_last, bool Seq_parallel=false, typename Params, typename Prng>
inline __device__ void compute_dq_dk_dv_1xN_one_iter(const Params &params, Prng &ph,
const int loop_step_idx) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
using elem_type = typename Kernel_traits::elem_type;
#else
constexpr bool is_fp16_type = std::is_same<typename Kernel_traits::elem_type, __half>::value;
assert(is_fp16_type);
using elem_type = __half;
#endif
// The description of the CTA tile for the 1st batched GEMM.
using Cta_tile_p = typename Kernel_traits::Cta_tile_p;
// The description of the CTA tile for the 2nd batched GEMM.
using Cta_tile_dq = typename Kernel_traits::Cta_tile_o;
// The description of the CTA tile for the 3rd batched GEMM.
using Cta_tile_dkv =
fmha::Cta_tile_extd<Cta_tile_p::N, Cta_tile_p::K, Cta_tile_p::M, Cta_tile_p::WARPS_N, 1, Cta_tile_p::WARPS_M>;
static_assert(Cta_tile_dkv::M == 512 || Cta_tile_dkv::M == 256 || Cta_tile_dkv::M == 128);
static_assert(Cta_tile_dkv::N == 16 || Cta_tile_dkv::N == 32 || Cta_tile_dkv::N == 64 || Cta_tile_dkv::N == 128);
static_assert(Cta_tile_dkv::K == 16);
// The MMA tile for the 1st GEMM.
using Mma_tile_p = fmha::Hmma_tile<Cta_tile_p>;
// The MMA tile for the 2nd GEMM.
using Mma_tile_dq = fmha::Hmma_tile<Cta_tile_dq>;
// The MMA tile for the 3rd GEMM.
using Mma_tile_dkv = fmha::Hmma_tile<Cta_tile_dkv>;
// The global memory tile to load Q.
using Gmem_tile_q = typename Kernel_traits::Gmem_tile_q;
// The shared memory tile to reload Q transposed.
using Smem_tile_qt = fmha::Smem_tile_b<Cta_tile_dkv, fmha::Row, Gmem_tile_q::BYTES_PER_LDG, 2>;
// The global memory tile to load K.
using Gmem_tile_k = typename Kernel_traits::Gmem_tile_k;
// The shared memory tile to swizzle K^T. Treat K^T as V
using Smem_tile_kt = typename Kernel_traits::Smem_tile_v;
// Treating V as K. We need to use Kernel_traits::Smem_tile_k, otherwise loading will be wrong.
// The global memory tile to load V.
using Gmem_tile_v = typename Kernel_traits::Gmem_tile_k;
// The shared memory tile to swizzle V.
using Smem_tile_v = typename Kernel_traits::Smem_tile_k;
// The global memory tile to load dO.
using Gmem_tile_do = typename Kernel_traits::Gmem_tile_do;
// The shared memory tile to load dO.
// Treating dO as Q.
using Smem_tile_do = typename Kernel_traits::Smem_tile_q;
// The shared memory tile to reload dO transposed.
using Smem_tile_dot = fmha::Smem_tile_b<Cta_tile_dkv, fmha::Row, Gmem_tile_q::BYTES_PER_LDG, 2>;
// The global memory tile to load O. Loading O here is similar to loading dO.
using Gmem_tile_o = Gmem_tile_do;
// The global memory tile to store dQ.
using Gmem_tile_dq = typename Kernel_traits::Gmem_tile_o;
using Gmem_tile_dq_tmp = fmha::Gmem_tile_o<Cta_tile_dq, 4>;
// The shared memory tile to swizzle dQ.
using Smem_tile_dq = typename Kernel_traits::Smem_tile_o;
// The global memory tile to store dV.
using Gmem_tile_dv = typename Kernel_traits::Gmem_tile_v;
// The shared memory tile to swizzle dV.
using Smem_tile_dv = fmha::Smem_tile_mma_epilogue<Cta_tile_dkv>;
// The global memory tile to store dK.
using Gmem_tile_dk = typename Kernel_traits::Gmem_tile_v;
// The shared memory tile to swizzle dK.
using Smem_tile_dk = fmha::Smem_tile_mma_epilogue<Cta_tile_dkv>;
static_assert(Smem_tile_dk::NUM_LDS == Gmem_tile_dk::LDGS);
static_assert(Smem_tile_dk::THREADS_PER_ROW == Gmem_tile_dk::THREADS_PER_ROW);
using Gmem_tile_s = typename Kernel_traits::Gmem_tile_s;
using Smem_tile_st = typename Kernel_traits::Smem_tile_st;
using Gmem_softmax_sum = typename Kernel_traits::Gmem_softmax_sum;
// using Gemm1 = Gemm_Q_K<Kernel_traits, Kernel_traits::K_IN_REGS>;
using Gemm1 = Gemm_Q_K<Kernel_traits, /*K_in_regs=*/false, elem_type>;
using Softmax = fmha::Softmax<Cta_tile_p, Kernel_traits>;
// Shared memory.
extern __shared__ char smem_[];
// Shared memory layout if we keep V in registers:
// dO | Q | K / V | dQ | S | dP | dP_sum
// dV | dK
// Shared memory layout if we keep V in shared memory:
// dO | Q | K | V | dQ | S | dP | dP_sum
// dV | dK
// The block index for the batch.
const int bidb = blockIdx.x;
// The block index for the head.
const int bidh = blockIdx.y;
// The thread index.
const int tidx = threadIdx.x;
const BlockInfoPadded<Kernel_traits::THREADS> binfo(params, bidb, bidh, tidx);
// if( binfo.stop_early() ) return;
if( binfo.stop_early(loop_step_idx * Cta_tile_p::N) ) return;
Gemm1 gemm_q_k(&smem_[Smem_tile_do::BYTES_PER_TILE], tidx);
// Allocate the global memory tile loader for Q.
Gmem_tile_q gmem_q(params.q_ptr, params.q_row_stride_in_elts, params.q_head_stride_in_elts,
params.d, binfo, tidx, true);
// Allocate the global memory tile loader for dQ.
Gmem_tile_dq gmem_dq(params.dq_ptr, params.dq_row_stride_in_elts, params.dq_head_stride_in_elts,
params.d, binfo, tidx);
Gmem_tile_dq_tmp gmem_dq_tmp(params.o_tmp_ptr, params.o_row_stride_in_elts, params.o_head_stride_in_elts,
params.d, binfo, tidx);
// Allocate the global memory tile loader for S.
Gmem_tile_s gmem_s(params, binfo, tidx);
fmha::Mask<Cta_tile_p, Is_causal> mask(binfo, tidx, loop_step_idx);
// Allocate the global memory tile loader for K.
Gmem_tile_k gmem_k(params.k_ptr, params.k_row_stride_in_elts, params.k_head_stride_in_elts,
params.d, binfo, tidx, false);
// Allocate the global memory tile loader for V.
Gmem_tile_v gmem_v(params.v_ptr, params.v_row_stride_in_elts, params.v_head_stride_in_elts,
params.d, binfo, tidx, false);
// The base pointer of smem_v.
char *smem_v_ = &smem_[Smem_tile_do::BYTES_PER_TILE + Gemm1::SMEM_OFFSET_V];
// Allocate the shared memory tile loader for V. We use the same as K so be careful!!!
Smem_tile_v smem_v(smem_v_, tidx);
// Allocate the shared memory tile loader for K^T. We use the same as K so be careful!!!
Smem_tile_kt smem_kt(&smem_[Smem_tile_do::BYTES_PER_TILE + Gemm1::Smem_tile_q::BYTES_PER_TILE], tidx);
// Allocate the global memory tile loader for dO.
Gmem_tile_do gmem_do(params.do_ptr, params.o_row_stride_in_elts, params.o_head_stride_in_elts,
params.d, binfo, tidx, true);
// Allocate the shared memory tile loader for dO.
Smem_tile_do smem_do(&smem_[0], tidx);
Smem_tile_dot smem_dot(&smem_[0], tidx);
// Allocate the shared memory tile loader for Q^T.
// TODO: assert that this points to the same memory as gemm_q_k.smem_q
Smem_tile_qt smem_qt(&smem_[Smem_tile_do::BYTES_PER_TILE], tidx);
Smem_tile_st smem_s(&smem_[Smem_tile_do::BYTES_PER_TILE + Gemm1::SMEM_OFFSET_O + Smem_tile_dq::BYTES_PER_TILE], tidx);
Smem_tile_st smem_dp(&smem_[Smem_tile_do::BYTES_PER_TILE + Gemm1::SMEM_OFFSET_O + Smem_tile_dq::BYTES_PER_TILE + Smem_tile_st::BYTES_PER_TILE], tidx);
// Allocate the global memory tile loader for O.
Gmem_tile_o gmem_o(params.o_ptr, params.o_row_stride_in_elts, params.o_head_stride_in_elts,
params.d, binfo, tidx, true);
// Allocate the shared memory tile loader for O. We use the same as K so be careful!!!
Smem_tile_dq smem_dq(&smem_[Smem_tile_do::BYTES_PER_TILE + Gemm1::SMEM_OFFSET_O], tidx);
Gmem_softmax_sum gmem_softmax_lse(params.softmax_lse_ptr, params, tidx);
Gmem_softmax_sum gmem_softmax_d(params.dsoftmax_sum, params, tidx);
static_assert(Cta_tile_p::N % Cta_tile_p::M == 0);
int begin = Is_causal ? loop_step_idx * Cta_tile_p::N / Cta_tile_p::M : 0;
// Otherwise we'd be reading out-of-bounds memory before the loop.
if (begin * Cta_tile_p::M >= binfo.actual_seqlen_q) {
// Still need to zero out dk and dv before returning
static_assert(Smem_tile_dk::NUM_LDS == Smem_tile_dv::NUM_LDS);
uint4 dkv_out[Smem_tile_dk::NUM_LDS];
#pragma unroll
for (int i = 0; i < Smem_tile_dk::NUM_LDS; ++i) { dkv_out[i] = make_uint4(0u, 0u, 0u, 0u); }
Gmem_tile_dk gmem_dk(params.dk_ptr, params.dk_row_stride_in_elts, params.dk_head_stride_in_elts,
params.d, binfo, tidx, false);
if (!Is_first) { gmem_dk.move(loop_step_idx); }
gmem_dk.store(dkv_out);
Gmem_tile_dv gmem_dv(params.dv_ptr, params.dv_row_stride_in_elts, params.dv_head_stride_in_elts,
params.d, binfo, tidx, false);
if (!Is_first) { gmem_dv.move(loop_step_idx); }
gmem_dv.store(dkv_out);
return;
}
const int steps = (params.seqlen_q + Cta_tile_p::M - 1) / Cta_tile_p::M - begin;
// Wind gmem tiles to the correct position.
gmem_q.move(begin);
gmem_do.move(begin);
gmem_o.move(begin);
if (!Seq_parallel) { gmem_dq.move(begin); } // If Seq_parallel, we're not using gmem_dq at all
gmem_dq_tmp.move(begin);
// TODO: need to move gmem_s if we want the intermediate result for debugging
gmem_softmax_lse.move(begin);
gmem_softmax_d.move(begin);
if (!Is_first) {
gmem_k.move(loop_step_idx);
gmem_v.move(loop_step_idx);
}
// Trigger the loads for K.
gmem_k.load();
// Trigger the loads for Q.
gmem_q.load();
// Trigger the loads for V.
gmem_v.load();
// Trigger the loads for dO.
gmem_do.load();
// Trigger the loads for O.
if (Is_first) { gmem_o.load(); }
float p_lse[Mma_tile_p::MMAS_M * 2];
gmem_softmax_lse.load(reinterpret_cast<uint32_t(&)[Mma_tile_p::MMAS_M * 2]>(p_lse));
if (!Is_first) { __syncthreads(); }
// Commit the data for Q, dO, and V to shared memory.
gmem_q.commit(gemm_q_k.smem_q);
gmem_do.commit(smem_do);
if (Is_first) {
dot_do_o<Gmem_tile_do::ROWS, Gmem_tile_do::THREADS_PER_ROW, elem_type>(
gmem_do.fetch_, gmem_o.fetch_, params.p_dropout, gmem_softmax_d, tidx
);
}
// // Instead of scaling dP by rp_dropout, we scale V instead
// if (Is_dropout) {
// const uint32_t scale_dropout = params.scale_dropout;
// #pragma unroll
// for(int it=0; it < Gmem_tile_v::LDGS; it++){
// gmem_v.fetch_[it] = fmha::hmul8(scale_dropout, gmem_v.fetch_[it]);
// }
// }
gmem_v.commit(smem_v);
// const uint32_t scale_bmm1 = reinterpret_cast<const uint32_t&>(params.scale_bmm1);
// #pragma unroll
// for(int it=0; it < Gmem_tile_k::LDGS; it++){
// gmem_k.fetch_[it] = fmha::hmul8(scale_bmm1, gmem_k.fetch_[it]);
// }
// Commit the data for K to shared memory.
if( !Kernel_traits::SHARE_SMEM_FOR_K_AND_V ) {
gmem_k.commit(gemm_q_k.smem_k);
}
__syncthreads();
// Load the fragments for Q.
gemm_q_k.load_q();
// Load the fragments for V. We keep the data in registers during the entire kernel.
typename Smem_tile_v::Fragment frag_v[Kernel_traits::V_IN_REGS ? Mma_tile_p::MMAS_K : 2][Mma_tile_p::MMAS_N];
if (Kernel_traits::V_IN_REGS) {
#pragma unroll
for( int ki = 0; ki < Mma_tile_p::MMAS_K; ++ki ) {
smem_v.load(frag_v[ki], ki);
}
}
float dp_sum[Mma_tile_p::MMAS_M * 2];
gmem_softmax_d.load(reinterpret_cast<uint32_t(&)[Mma_tile_p::MMAS_M * 2]>(dp_sum));
// Commit the data for V to shared memory if it has not been done already.
if( Kernel_traits::SHARE_SMEM_FOR_K_AND_V ) {
// Make sure we are done loading the fragments for K.
__syncthreads();
// Commit the data to shared memory for V.
gmem_k.commit(gemm_q_k.smem_k);
// Make sure the data is in shared memory.
__syncthreads();
}
// Load the fragments for K.
gemm_q_k.load_k();
// Load the fragments for K^T.
// typename Smem_tile_kt::Fragment frag_kt[2][Mma_tile_dq::MMAS_N];
// smem_kt.load(frag_kt[0], 0);
// typename Smem_tile_kt::Fragment frag_kt[Mma_tile_dq::MMAS_K][Mma_tile_dq::MMAS_N];
// #pragma unroll
// for( int ki = 0; ki < Mma_tile_dq::MMAS_K; ++ki ) {
// smem_kt.load(frag_kt[ki], ki);
// }
// Create the object to do the softmax.
// We won't be using the shared memory for this softmax at all
Softmax softmax(params, smem_, tidx);
// Declare the accumulators for the 3rd gemm.
fmha::Fragment_accumulator acc_dv[Mma_tile_dkv::MMAS_M][Mma_tile_dkv::MMAS_N];
fmha::Clear_accumulator<fmha::Accumulator_type, Cta_tile_dkv::WARPS_K>::apply(acc_dv);
fmha::Fragment_accumulator acc_dk[Mma_tile_dkv::MMAS_M][Mma_tile_dkv::MMAS_N];
fmha::Clear_accumulator<fmha::Accumulator_type, Cta_tile_dkv::WARPS_K>::apply(acc_dk);
// Load over the entire sequence length.
for (int l = 0; l < steps; l++) {
if ((begin + l) * Cta_tile_p::M >= binfo.actual_seqlen_q)
break;
// Load the fragments for V.
// typename Smem_tile_v::Fragment frag_v[2][Mma_tile_p::MMAS_N];
if (!Kernel_traits::V_IN_REGS) { smem_v.load(frag_v[0], 0); }
// Load the fragments for dO.
typename Smem_tile_do::Fragment frag_do[2][Mma_tile_p::MMAS_M];
smem_do.load(frag_do[0], 0);
// Declare the accumulators for the 1st gemm.
fmha::Fragment_accumulator acc_p[Mma_tile_p::MMAS_M][Mma_tile_p::MMAS_N];
fmha::Clear_accumulator<typename fmha::Accumulator_type, Cta_tile_p::WARPS_K>::apply(acc_p);
// Do this part of P^T = (Q * K^T)^T.
gemm_q_k(acc_p);
// Load the mask for that iteration.
mask.load(begin + l);
// Convert from the accumulator type to FP32 for Softmax.
softmax.unpack_noscale(acc_p);
// Apply the mask.
softmax.apply_mask(mask);
// Scale by log-sum-exp of the softmax
// softmax.apply_exp(p_lse);
softmax.template scale_apply_exp</*scale_max=*/false>(p_lse, params.scale_bmm1f);
if (Is_dropout) {
// softmax.apply_dropout(ph, params.p_dropout_in_uint);
// softmax.template apply_dropout</*encode_dropout_in_sign_bit=*/true>(ph, params.p_dropout_in_uint);
// softmax.template apply_dropout_16bits</*encode_dropout_in_sign_bit=*/true>(ph, params.p_dropout_in_uint16_t);
unsigned int warp_idx = threadIdx.x / 32;
// TODO: this should change after we rearrange the warps (e.g. cutlass branch)
unsigned int block_col_idx = loop_step_idx * Cta_tile_p::N / 16 + warp_idx;
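// Each (query row block, 16-column block) pair of the attention matrix gets its own
// Philox subsequence, indexed as row_block * num_col_blocks + col_block. The forward
// kernel (in the fprop file further down) uses the same numbering with actual_seqlen_k,
// so the identical dropout mask is regenerated here.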
unsigned long long philox_subsequence = (begin + l) * (binfo.actual_seqlen_k / 16) + block_col_idx;
softmax.template apply_dropout_16bits</*encode_dropout_in_sign_bit=*/true>(ph, params.p_dropout_in_uint16_t, philox_subsequence);
}
using Frag_p = fmha::Fragment_a<fmha::Row>;
Frag_p frag_p[Mma_tile_dq::MMAS_K][Mma_tile_dq::MMAS_M];
static_assert(Mma_tile_dq::MMAS_M == Mma_tile_p::MMAS_M);
static_assert(Mma_tile_dq::MMAS_K == Mma_tile_p::MMAS_N);
softmax.template pack<elem_type>(frag_p);
// Store s * dmask to smem for transpose
smem_s.store(frag_p);
// Trigger the load for the next Q values.
if (l + 1 < steps) {
gemm_q_k.smem_q.move_to_next_write_buffer();
gmem_q.move();
gmem_q.load();
}
// if( Kernel_traits::SHARE_SMEM_FOR_K_AND_V && l == 0 ) {
// // if we share K and V, it could be that V was not fully read yet but we write into smem for reduction
// __syncthreads();
// }
fmha::Fragment_accumulator acc_dp[Mma_tile_p::MMAS_M][Mma_tile_p::MMAS_N];
#pragma unroll
for (int mi = 0; mi < Mma_tile_p::MMAS_M; ++mi) {
#pragma unroll
for (int ni = 0; ni < Mma_tile_p::MMAS_N; ++ni) {
#pragma unroll
for (int ii = 0; ii < 8; ++ii) {
acc_dp[mi][ni].elt(ii) = -dp_sum[mi * 2 + ((ii / 2) % 2)];
}
}
}
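// At this point each acc_dp element holds -dp_sum for its row; after the dO * V^T GEMM
// below it therefore holds dP - dp_sum, i.e. the (dP - D) factor of the softmax backward.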
// Do this part of dP^T = (dO * V^T)^T.
#pragma unroll
for( int ki = 1; ki < Mma_tile_p::MMAS_K; ++ki ) {
// Trigger the load from shared memory for the next series of dO values.
smem_do.load(frag_do[ki & 1], ki);
if (!Kernel_traits::V_IN_REGS) {
smem_v.load(frag_v[ki & 1], ki);
fmha::gemm_cl<elem_type>(acc_dp, frag_do[(ki - 1) & 1], frag_v[(ki - 1) & 1]);
} else {
fmha::gemm_cl<elem_type>(acc_dp, frag_do[(ki - 1) & 1], frag_v[ki - 1]);
}
// if ((threadIdx.x == 0) && (blockIdx.x == 0) && (blockIdx.y == 0) && (l < 4)) {
// float2 tmp = __half22float2(reinterpret_cast<__half2 &>(frag_do[(ki - 1) & 1]));
// printf("frag_do=%.6f, %.6f\n", tmp.x, tmp.y);
// tmp = __half22float2(reinterpret_cast<__half2 &>(frag_v[(ki - 1) & 1]));
// printf("frag_v=%.6f, %.6f\n", tmp.x, tmp.y);
// }
}
// Do the final stage of math.
{
int ki = Mma_tile_p::MMAS_K;
if (!Kernel_traits::V_IN_REGS) {
fmha::gemm_cl<elem_type>(acc_dp, frag_do[(ki - 1) & 1], frag_v[(ki - 1) & 1]);
} else {
fmha::gemm_cl<elem_type>(acc_dp, frag_do[(ki - 1) & 1], frag_v[(ki - 1)]);
}
}
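// Form dS = P * (dP - dp_sum) element-wise. With dropout, entries that were dropped in
// the forward pass carry a flipped sign bit in P, and for those the lambda multiplies
// by dp_sum instead of (dP - dp_sum).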
auto pointwise_mult = [](float p, float dp, float d) {
return p * ((!Is_dropout) || p >= 0.f ? dp : d);
};
#pragma unroll
for (int mi = 0; mi < Mma_tile_p::MMAS_M; mi++) {
#pragma unroll
for (int ni = 0; ni < Mma_tile_p::MMAS_N; ni++) {
softmax.elt_[2 * mi + 0][4 * ni + 0] = pointwise_mult(softmax.elt_[2 * mi + 0][4 * ni + 0], acc_dp[mi][ni].elt(0), dp_sum[2 * mi + 0]);
softmax.elt_[2 * mi + 0][4 * ni + 1] = pointwise_mult(softmax.elt_[2 * mi + 0][4 * ni + 1], acc_dp[mi][ni].elt(1), dp_sum[2 * mi + 0]);
softmax.elt_[2 * mi + 0][4 * ni + 2] = pointwise_mult(softmax.elt_[2 * mi + 0][4 * ni + 2], acc_dp[mi][ni].elt(4), dp_sum[2 * mi + 0]);
softmax.elt_[2 * mi + 0][4 * ni + 3] = pointwise_mult(softmax.elt_[2 * mi + 0][4 * ni + 3], acc_dp[mi][ni].elt(5), dp_sum[2 * mi + 0]);
softmax.elt_[2 * mi + 1][4 * ni + 0] = pointwise_mult(softmax.elt_[2 * mi + 1][4 * ni + 0], acc_dp[mi][ni].elt(2), dp_sum[2 * mi + 1]);
softmax.elt_[2 * mi + 1][4 * ni + 1] = pointwise_mult(softmax.elt_[2 * mi + 1][4 * ni + 1], acc_dp[mi][ni].elt(3), dp_sum[2 * mi + 1]);
softmax.elt_[2 * mi + 1][4 * ni + 2] = pointwise_mult(softmax.elt_[2 * mi + 1][4 * ni + 2], acc_dp[mi][ni].elt(6), dp_sum[2 * mi + 1]);
softmax.elt_[2 * mi + 1][4 * ni + 3] = pointwise_mult(softmax.elt_[2 * mi + 1][4 * ni + 3], acc_dp[mi][ni].elt(7), dp_sum[2 * mi + 1]);
}
}
// Load the fragments for K^T.
typename Smem_tile_kt::Fragment frag_kt[2][Mma_tile_dq::MMAS_N];
smem_kt.load(frag_kt[0], 0);
// Trigger the load for the next dO values.
if (l + 1 < steps) {
smem_do.move_to_next_write_buffer();
gmem_do.move();
gmem_do.load();
if (Is_first) {
gmem_o.move();
gmem_o.load();
}
}
softmax.template pack<elem_type>(frag_p);
// Store dp to smem for transpose
smem_dp.store(frag_p);
// gmem_s.store(frag_p, mask);
// gmem_s.move();
// Declare the accumulators for the 2nd gemm.
fmha::Fragment_accumulator acc_dq[Mma_tile_dq::MMAS_M][Mma_tile_dq::MMAS_N];
fmha::Clear_accumulator<typename fmha::Accumulator_type, Cta_tile_dq::WARPS_K>::apply(acc_dq);
// Do this part of O = P^T * V^T.
#pragma unroll
for( int ki = 1; ki < Mma_tile_dq::MMAS_K; ++ki ) {
// Trigger the load from shared memory for the next series of Q values.
smem_kt.load(frag_kt[ki & 1], ki);
// Do the math for the values already in registers.
fmha::gemm_cl<elem_type>(acc_dq, frag_p[ki - 1], frag_kt[(ki - 1) & 1]);
// fmha::gemm_cl<elem_type>(acc_dq, frag_p[ki - 1], frag_kt[(ki - 1)]);
}
// Do the final stage of math.
{
int ki = Mma_tile_dq::MMAS_K;
fmha::gemm_cl<elem_type>(acc_dq, frag_p[ki - 1], frag_kt[(ki - 1) & 1]);
// fmha::gemm_cl<elem_type>(acc_dq, frag_p[ki - 1], frag_kt[(ki - 1)]);
}
static_assert(Gmem_tile_dq::LOOPS == 1);
// Swizzle the elements and do the final reduction.
// Need to syncthreads here, otherwise the smem_dq reads from the previous iteration
// might happen after the smem_dq writes in this iteration.
__syncthreads();
smem_dq.store(acc_dq, 0);
typename Smem_tile_dot::Fragment frag_dot[2][Mma_tile_dkv::MMAS_N];
static_assert(Smem_tile_dot::Fragment::NUM_REGS == 4);
static_assert(Mma_tile_dkv::MMAS_K == 1);
smem_dot.load(frag_dot[0], 0);
// Threads in a warp communicate via shared memory (smem_s and smem_dp).
__syncwarp();
typename Smem_tile_st::Fragment frag_s[Mma_tile_dkv::MMAS_K][Mma_tile_dkv::MMAS_M];
smem_s.load(frag_s);
if (Is_dropout) {
#pragma unroll
for( int ki = 0; ki < Mma_tile_dkv::MMAS_K; ki++ ) {
#pragma unroll
for( int mi = 0; mi < Mma_tile_dkv::MMAS_M; mi++ ) {
frag_s[ki][mi].template hrelu_<elem_type>();
}
}
}
#pragma unroll
for( int ki = 1; ki < Mma_tile_dkv::MMAS_K; ++ki ) {
// Trigger the load from shared memory for the next series of Q values.
smem_dot.load(frag_dot[ki & 1], ki);
// Do the math for the values already in registers.
fmha::gemm_cl<elem_type>(acc_dv, frag_s[(ki - 1)], frag_dot[(ki - 1) & 1]);
}
// Do the final stage of math.
{
int ki = Mma_tile_dkv::MMAS_K;
fmha::gemm_cl<elem_type>(acc_dv, frag_s[(ki - 1)], frag_dot[(ki - 1) & 1]);
}
// if ((threadIdx.x == 0) && (blockIdx.x == 0) && (blockIdx.y == 0)) {
// float2 tmp0 = __half22float2(reinterpret_cast<__half2 &>(frag_dot[0][0]));
// printf("frag_dot[0][0]=%.6f, %.6f\n", tmp0.x, tmp0.y);
// float2 tmp1 = __half22float2(reinterpret_cast<__half2 &>(frag_dot[0][1]));
// printf("frag_dot[0][1]=%.6f, %.6f\n", tmp1.x, tmp1.y);
// }
// if ((threadIdx.x == 0) && (blockIdx.x == 0) && (blockIdx.y == 0)) {
// printf("l = %d, acc_dv[0][0]=%.6f, %.6f\n", l, acc_dv[0][0].elt(2), acc_dv[0][0].elt(3));
// printf("l = %d, acc_dv[0][1]=%.6f, %.6f\n", l, acc_dv[0][1].elt(2), acc_dv[0][1].elt(3));
// }
// __syncthreads();
// Commit the values for Q and dO into shared memory.
if (l + 1 < steps) {
gmem_q.commit(gemm_q_k.smem_q);
}
uint4 dq_out[Gmem_tile_dq::STGS_PER_LOOP];
if (!Is_first && !Seq_parallel) { gmem_dq_tmp.load(dq_out, 0); }
// __syncthreads();
// Commit the values for Q and dO into shared memory.
if (l + 1 < steps) {
gmem_do.commit(smem_do);
gmem_softmax_d.move();
if (Is_first) {
dot_do_o<Gmem_tile_do::ROWS, Gmem_tile_do::THREADS_PER_ROW, elem_type>(
gmem_do.fetch_, gmem_o.fetch_, params.p_dropout, gmem_softmax_d, tidx
);
}
gmem_softmax_lse.move();
gmem_softmax_lse.load(reinterpret_cast<uint32_t(&)[Mma_tile_p::MMAS_M * 2]>(p_lse));
}
typename Smem_tile_st::Fragment frag_dpt[Mma_tile_dkv::MMAS_K][Mma_tile_dkv::MMAS_M];
smem_dp.load(frag_dpt);
gemm_q_k.reload_k();
typename Smem_tile_qt::Fragment frag_qt[2][Mma_tile_dkv::MMAS_N];
static_assert(Smem_tile_qt::Fragment::NUM_REGS == 4);
static_assert(Mma_tile_dkv::MMAS_K == 1);
smem_qt.load(frag_qt[0], 0);
#pragma unroll
for( int ki = 1; ki < Mma_tile_dkv::MMAS_K; ++ki ) {
// Trigger the load from shared memory for the next series of Q values.
smem_qt.load(frag_qt[ki & 1], ki);
// Do the math for the values already in registers.
fmha::gemm_cl<elem_type>(acc_dk, frag_dpt[(ki - 1)], frag_qt[(ki - 1) & 1]);
}
// Do the final stage of math.
{
int ki = Mma_tile_dkv::MMAS_K;
fmha::gemm_cl<elem_type>(acc_dk, frag_dpt[(ki - 1)], frag_qt[(ki - 1) & 1]);
}
// Make sure dQ is in shared memory.
__syncthreads();
if (l + 1 < steps) {
gmem_softmax_d.load(reinterpret_cast<uint32_t(&)[Mma_tile_p::MMAS_M * 2]>(dp_sum));
}
// Load from shared memory.
smem_dq.template load</*zero_init=*/Is_first || Seq_parallel>(dq_out);
if (!Seq_parallel) {
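// dQ for this row block is complete once no later column block can contribute: this is
// the last loop step, or this column block already reaches the end of the key sequence,
// or (causal) the rows in this block do not attend past it. Until then, partial results
// go to the temporary dq_tmp buffer instead.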
const bool is_final_write =
Is_last
|| ((loop_step_idx + 1) * Cta_tile_p::N >= binfo.actual_seqlen_k)
|| ((Is_causal) && ((begin + l) * Cta_tile_p::M < (loop_step_idx + 1) * Cta_tile_p::N));
if (is_final_write) {
// if (Is_dropout) {
// dq_out[0] = fmha::fmul4(dq_out[0], params.rp_dropout);
// }
for (int jj = 0; jj < Gmem_tile_dq::STGS_PER_LOOP; ++jj) {
// dq_out[jj] = fmha::fmul4(dq_out[jj], params.scale_bmm1f);
dq_out[jj] = fmha::fmul4(dq_out[jj], params.scale_bmm1_rp_dropout);
}
// Output the values.
gmem_dq.template store<elem_type>(dq_out, 0);
// Move to the next part of the output.
gmem_dq.move();
// TODO: for parallel, need to deal with the dropout scaling
} else {
// Output the values.
gmem_dq_tmp.store(dq_out, 0);
}
} else {
// We always scale dq_out before writing in this case, since we don't want to
// have to scale at the end when copying from dq_tmp to dq.
for (int jj = 0; jj < Gmem_tile_dq::STGS_PER_LOOP; ++jj) {
// dq_out[jj] = fmha::fmul4(dq_out[jj], params.scale_bmm1f);
dq_out[jj] = fmha::fmul4(dq_out[jj], params.scale_bmm1_rp_dropout);
}
gmem_dq_tmp.atomic_add(dq_out, 0);
}
// Move to the next part of the output.
if (!(Is_first && Is_last)) { gmem_dq_tmp.move(); }
// // Make sure the data is in shared memory.
// __syncthreads();
// Commit the values for Q and dO into shared memory.
if (l + 1 < steps) {
gemm_q_k.smem_q.move_to_next_read_buffer();
gemm_q_k.reload_q();
smem_qt.move_to_next_read_buffer();
// smem_qt.load(frag_qt[0], 0);
smem_do.move_to_next_read_buffer();
smem_dot.move_to_next_read_buffer();
// smem_dot.load(frag_dot[0], 0);
}
} // Outer loop over the sequence length.
if (Is_dropout) {
for( int mi = 0; mi < Mma_tile_dkv::MMAS_M; mi++ ) {
for( int ni = 0; ni < Mma_tile_dkv::MMAS_N; ni++ ) {
acc_dv[mi][ni].mul_(params.rp_dropout);
}
}
}
// if ((threadIdx.x == 0) && (blockIdx.x == 0) && (blockIdx.y == 0)) {
// printf("l final, acc_dv[0][0]=%.6f, %.6f\n", acc_dv[0][0].elt(2), acc_dv[0][0].elt(3));
// printf("l final, acc_dv[0][1]=%.6f, %.6f\n", acc_dv[0][1].elt(2), acc_dv[0][1].elt(3));
// }
for( int mi = 0; mi < Mma_tile_dkv::MMAS_M; mi++ ) {
for( int ni = 0; ni < Mma_tile_dkv::MMAS_N; ni++ ) {
// acc_dk[mi][ni].mul_(Is_dropout ? params.rp_dropout * params.scale_bmm1f : params.scale_bmm1f);
// acc_dk[mi][ni].mul_(params.scale_bmm1f);
acc_dk[mi][ni].mul_(params.scale_bmm1_rp_dropout);
}
}
// if ((threadIdx.x == 0) && (blockIdx.x == 0) && (blockIdx.y == 0)) {
// printf("l final, acc_dk=%.6f, %.6f\n", acc_dk[0][0].elt(0), acc_dk[0][0].elt(1));
// }
__syncthreads();
// TODO [TD - 2022-05-04]: Are there cases where the shared mem for dV and dK is larger than
// the total amount of shared mem?
// Epilogue swizzle for dV
Smem_tile_dv smem_dv(&smem_[0], tidx);
smem_dv.template store<elem_type>(acc_dv);
// Epilogue swizzle for dK
Smem_tile_dk smem_dk(&smem_[Smem_tile_dv::BYTES_PER_TILE], tidx);
smem_dk.template store<elem_type>(acc_dk);
__syncthreads();
uint4 dv_out[Smem_tile_dv::NUM_LDS];
smem_dv.load(dv_out);
Gmem_tile_dv gmem_dv(params.dv_ptr, params.dv_row_stride_in_elts, params.dv_head_stride_in_elts,
params.d, binfo, tidx, false);
if (!Is_first) {
gmem_dv.move(loop_step_idx);
}
gmem_dv.store(dv_out);
uint4 dk_out[Smem_tile_dk::NUM_LDS];
smem_dk.load(dk_out);
Gmem_tile_dk gmem_dk(params.dk_ptr, params.dk_row_stride_in_elts, params.dk_head_stride_in_elts,
params.d, binfo, tidx, false);
if (!Is_first) {
gmem_dk.move(loop_step_idx);
}
gmem_dk.store(dk_out);
}
////////////////////////////////////////////////////////////////////////////////////////////////////
// loop_steps = -1 means the number of steps will be params.seqlen_k / Kernel_traits::Cta_tile_p::N.
// This template parameter is there so we can specialize with loop_steps == 1 and loop_steps == 2.
template<typename Kernel_traits, bool Is_dropout, bool Is_causal, int loop_steps=-1, typename Params>
inline __device__ void compute_dq_dk_dv_1xN(const Params &params) {
constexpr int blocksize_c = Kernel_traits::Cta_tile_p::N;
// The block index for the batch.
const int bidb = blockIdx.x;
// The block index for the head.
const int bidh = blockIdx.y;
// The thread index.
const int tidx = threadIdx.x;
auto seed = params.rng_state[0];
auto offset = params.rng_state[1];
Philox ph(seed, 0, offset + (bidb * params.h + bidh) * 32 + tidx % 32);
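// The Philox state depends on the (batch, head) pair and the lane index within a warp;
// the per-tile subsequence passed to apply_dropout_16bits in the inner loop does the
// rest, and has to match the forward pass so the same dropout mask is reproduced.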
if (loop_steps == 1) {
compute_dq_dk_dv_1xN_one_iter<Kernel_traits, Is_dropout, Is_causal, true, true>(params, ph, 0);
} else if (loop_steps == 2) {
compute_dq_dk_dv_1xN_one_iter<Kernel_traits, Is_dropout, Is_causal, true, false>(params, ph, 0);
compute_dq_dk_dv_1xN_one_iter<Kernel_traits, Is_dropout, Is_causal, false, true>(params, ph, 1);
} else {
if (params.seqlen_k == blocksize_c) {
compute_dq_dk_dv_1xN_one_iter<Kernel_traits, Is_dropout, Is_causal, true, true>(params, ph, 0);
} else {
const int max_loop_steps = (params.seqlen_k + blocksize_c - 1) / blocksize_c;
compute_dq_dk_dv_1xN_one_iter<Kernel_traits, Is_dropout, Is_causal, true, false>(params, ph, 0);
for (int loop_step_idx = 1; loop_step_idx < max_loop_steps - 1; loop_step_idx++) {
compute_dq_dk_dv_1xN_one_iter<Kernel_traits, Is_dropout, Is_causal, false, false>(params, ph, loop_step_idx);
}
compute_dq_dk_dv_1xN_one_iter<Kernel_traits, Is_dropout, Is_causal, false, true>(params, ph, max_loop_steps - 1);
}
}
}
////////////////////////////////////////////////////////////////////////////////////////////////////
template<typename Kernel_traits, bool Is_dropout, bool Is_causal, typename Params>
inline __device__ void compute_dq_dk_dv_seqparallel(const Params &params) {
// The block index for the batch.
const int bidb = blockIdx.x;
// The block index for the head.
const int bidh = blockIdx.y;
// The thread index.
const int tidx = threadIdx.x;
auto seed = params.rng_state[0];
auto offset = params.rng_state[1];
Philox ph(seed, 0, offset + (bidb * params.h + bidh) * 32 + tidx % 32);
int loop_step_idx = blockIdx.z;
compute_dq_dk_dv_1xN_one_iter<Kernel_traits, Is_dropout, Is_causal, false, false, /*Seq_parallel=*/true>(params, ph, loop_step_idx);
}
////////////////////////////////////////////////////////////////////////////////////////////////////
} // namespace fmha
/***************************************************************************************************
* Copyright (c) 2022, Tri Dao.
* Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#pragma once
#include "fmha_kernel.h"
#include <fmha/kernel_traits.h>
#include <fmha/gemm.h>
#include <fmha/utils.h>
namespace fmha {
////////////////////////////////////////////////////////////////////////////////////////////////////
template<typename Kernel_traits>
struct Gemm_Q_K_base {
using Smem_tile_o = typename Kernel_traits::Smem_tile_o;
using Smem_tile_q = typename Kernel_traits::Smem_tile_q;
using Smem_tile_k = typename Kernel_traits::Smem_tile_k;
using Fragment_q = typename Smem_tile_q::Fragment;
using Fragment_k = typename Smem_tile_k::Fragment;
// The description of the CTA tile for the 1st batched GEMM.
using Cta_tile_p = typename Kernel_traits::Cta_tile_p;
// The MMA tile for the 1st GEMM.
using Mma_tile_p = fmha::Hmma_tile<Cta_tile_p>;
static constexpr int SMEM_BYTES_SOFTMAX = Cta_tile_p::M * Cta_tile_p::WARPS_N * sizeof(float) * 2;
__device__ inline Gemm_Q_K_base(char * smem_ptr_q, char * smem_ptr_k, const int tidx)
: smem_q(smem_ptr_q, tidx)
, smem_k(smem_ptr_k, tidx) {
}
__device__ inline void load_q() {
smem_q.load(frag_q[0], 0);
}
__device__ inline void reload_q() {
smem_q.load(frag_q[0], 0);
}
Fragment_q frag_q[2][Mma_tile_p::MMAS_M];
Smem_tile_q smem_q;
Smem_tile_k smem_k;
};
template<typename Kernel_traits, bool K_in_regs, typename elem_type_=__half>
struct Gemm_Q_K : public Gemm_Q_K_base<Kernel_traits> {
using Base = Gemm_Q_K_base<Kernel_traits>;
using Smem_tile_o = typename Base::Smem_tile_o;
using Smem_tile_q = typename Base::Smem_tile_q;
using Smem_tile_k = typename Base::Smem_tile_k;
using Fragment_k = typename Base::Fragment_k;
using Mma_tile_p = typename Base::Mma_tile_p;
using elem_type = elem_type_;
static constexpr bool SHARE_SMEM_FOR_K_AND_V = Kernel_traits::SHARE_SMEM_FOR_K_AND_V;
// If V is stored in shared memory, we can't load K using the same shared memory.
static_assert(Kernel_traits::V_IN_REGS);
static constexpr int SMEM_OFFSET_O = Smem_tile_q::BYTES_PER_TILE;
static constexpr int SMEM_OFFSET_SOFTMAX = SMEM_OFFSET_O + Smem_tile_o::BYTES_PER_TILE;
static constexpr int SMEM_OFFSET_V = Smem_tile_q::BYTES_PER_TILE + (SHARE_SMEM_FOR_K_AND_V ? 0 : Smem_tile_k::BYTES_PER_TILE);
// Q | K / V
// | O | SOFTMAX
static constexpr int SMEM_BYTES = Smem_tile_q::BYTES_PER_TILE
+ std::max((SHARE_SMEM_FOR_K_AND_V ? 1 : 2) * Smem_tile_k::BYTES_PER_TILE,
Smem_tile_o::BYTES_PER_TILE + Base::SMEM_BYTES_SOFTMAX);
__device__ inline Gemm_Q_K(char * smem_, const int tidx)
: Base(smem_, smem_ + Smem_tile_q::BYTES_PER_TILE, tidx) {
}
__device__ inline void load_k(){
#pragma unroll
for( int ki = 0; ki < Mma_tile_p::MMAS_K; ++ki ) {
Base::smem_k.load(frag_k[ki], ki);
}
}
template<typename Acc, int M, int N>
__device__ inline void operator()(Acc (&acc_p)[M][N]){
// Do this part of P^T = (Q * K^T)^T.
#pragma unroll
for( int ki = 1; ki < Mma_tile_p::MMAS_K; ++ki ) {
// Trigger the load from shared memory for the next series of Q values.
Base::smem_q.load(Base::frag_q[ki & 1], ki);
// Do the math for the values already in registers.
fmha::gemm_cl<elem_type>(acc_p, Base::frag_q[(ki - 1) & 1], frag_k[(ki - 1)]);
}
// Do the final stage of math.
{
int ki = Mma_tile_p::MMAS_K;
fmha::gemm_cl<elem_type>(acc_p, Base::frag_q[(ki - 1) & 1], frag_k[(ki - 1)]);
}
}
__device__ inline void reload_k(){
// Noop.
}
Fragment_k frag_k[Mma_tile_p::MMAS_K][Mma_tile_p::MMAS_N];
};
template<typename Kernel_traits, typename elem_type_>
struct Gemm_Q_K<Kernel_traits, false, elem_type_> : public Gemm_Q_K_base<Kernel_traits> {
using Base = Gemm_Q_K_base<Kernel_traits>;
using Smem_tile_o = typename Base::Smem_tile_o;
using Smem_tile_q = typename Base::Smem_tile_q;
using Smem_tile_k = typename Base::Smem_tile_k;
using Smem_tile_v = typename Kernel_traits::Smem_tile_v;
using Fragment_k = typename Base::Fragment_k;
using Mma_tile_p = typename Base::Mma_tile_p;
using elem_type = elem_type_;
Fragment_k frag_k[2][Mma_tile_p::MMAS_N];
static constexpr bool SHARE_SMEM_FOR_K_AND_V = Kernel_traits::SHARE_SMEM_FOR_K_AND_V;
static constexpr bool V_IN_REGS = Kernel_traits::V_IN_REGS;
static_assert(V_IN_REGS || !SHARE_SMEM_FOR_K_AND_V);
static constexpr int SMEM_OFFSET_V = Smem_tile_q::BYTES_PER_TILE + (SHARE_SMEM_FOR_K_AND_V ? 0 : Smem_tile_k::BYTES_PER_TILE);
static_assert(Smem_tile_v::BYTES_PER_TILE == (int) Smem_tile_k::BYTES_PER_TILE);
static constexpr int SMEM_OFFSET_O = SMEM_OFFSET_V + Smem_tile_v::BYTES_PER_TILE;
static constexpr int SMEM_OFFSET_SOFTMAX = SMEM_OFFSET_O + Smem_tile_o::BYTES_PER_TILE;
// If V_IN_REGS and SHARE_SMEM_FOR_K_AND_V: Q | K/V | O | SOFTMAX
// If !V_IN_REGS (then !SHARE_SMEM_FOR_K_AND_V): Q | K | V | O | SOFTMAX
static constexpr int SMEM_BYTES = Smem_tile_q::BYTES_PER_TILE
+ (SHARE_SMEM_FOR_K_AND_V ? 1 : 2) * Smem_tile_k::BYTES_PER_TILE
+ Smem_tile_o::BYTES_PER_TILE + Base::SMEM_BYTES_SOFTMAX;
__device__ inline Gemm_Q_K(char * smem_, const int tidx)
: Base(smem_, smem_ + Smem_tile_q::BYTES_PER_TILE, tidx) {
}
__device__ inline void load_k(){
Base::smem_k.load(frag_k[0], 0);
}
template<typename Acc, int M, int N>
__device__ inline void operator()(Acc (&acc_p)[M][N]){
// Do this part of P^T = (Q * K^T)^T.
#pragma unroll
for( int ki = 1; ki < Mma_tile_p::MMAS_K; ++ki ) {
// Trigger the load from shared memory for the next series of Q values.
Base::smem_q.load(Base::frag_q[ki & 1], ki);
Base::smem_k.load(frag_k[ki & 1], ki);
// Do the math for the values already in registers.
fmha::gemm_cl<elem_type>(acc_p, Base::frag_q[(ki - 1) & 1], frag_k[(ki - 1) & 1]);
}
// Do the final stage of math.
{
int ki = Mma_tile_p::MMAS_K;
fmha::gemm_cl<elem_type>(acc_p, Base::frag_q[(ki - 1) & 1], frag_k[(ki - 1) & 1]);
}
}
__device__ inline void reload_k(){
Base::smem_k.load(frag_k[0], 0);
}
};
template<typename Kernel_traits>
constexpr size_t get_dynamic_smem_size(){
return Gemm_Q_K<Kernel_traits, Kernel_traits::K_IN_REGS>::SMEM_BYTES;
}
template<typename Kernel_traits, bool Is_dropout, bool Is_causal, bool Return_softmax, bool Is_first, bool Is_last, typename Params, typename Prng>
inline __device__ void device_1xN_(const Params &params, const int bidb, const int bidh, int steps, Prng &ph, const int loop_step_idx) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
using elem_type = typename Kernel_traits::elem_type;
#else
constexpr bool is_fp16_type = std::is_same<typename Kernel_traits::elem_type, __half>::value;
assert(is_fp16_type);
using elem_type = __half;
#endif
// The description of the CTA tile for the 1st batched GEMM.
using Cta_tile_p = typename Kernel_traits::Cta_tile_p;
// The description of the CTA tile for the 2nd batched GEMM.
using Cta_tile_o = typename Kernel_traits::Cta_tile_o;
// The MMA tile for the 1st GEMM.
using Mma_tile_p = fmha::Hmma_tile<Cta_tile_p>;
// The MMA tile for the 2nd GEMM.
using Mma_tile_o = fmha::Hmma_tile<Cta_tile_o>;
// The global memory tile to load Q.
using Gmem_tile_q = typename Kernel_traits::Gmem_tile_q;
// The global memory tile to load K.
using Gmem_tile_k = typename Kernel_traits::Gmem_tile_k;
// The global memory tile to load V.
using Gmem_tile_v = typename Kernel_traits::Gmem_tile_v;
// The shared memory tile to swizzle V.
using Smem_tile_v = typename Kernel_traits::Smem_tile_v;
// The global memory tile to store O.
using Gmem_tile_o = typename Kernel_traits::Gmem_tile_o;
using Gmem_tile_o_tmp = fmha::Gmem_tile_o<Cta_tile_o, 4>;
// The shared memory tile to swizzle O.
using Smem_tile_o = typename Kernel_traits::Smem_tile_o;
using Gmem_tile_s = typename Kernel_traits::Gmem_tile_s;
using Gmem_softmax_sum = typename Kernel_traits::Gmem_softmax_sum;
using Smem_softmax_sum = typename Kernel_traits::Smem_dp_sum;
using Gemm1 = Gemm_Q_K<Kernel_traits, Kernel_traits::K_IN_REGS, elem_type>;
using Softmax = fmha::Softmax<Cta_tile_p, Kernel_traits>;
// Shared memory.
extern __shared__ char smem_[];
// The thread index.
const int tidx = threadIdx.x;
// How many steps to jump per iteration, which is the same as params.num_splits.
const int step_stride = gridDim.z;
const BlockInfoPadded<Kernel_traits::THREADS> binfo(params, bidb, bidh, tidx);
// if( binfo.stop_early() ) return;
if( binfo.stop_early(loop_step_idx * Cta_tile_p::N) ) return;
Gemm1 gemm_q_k(smem_, tidx);
// Allocate the global memory tile loader for Q.
Gmem_tile_q gmem_q(params.q_ptr, params.q_row_stride_in_elts, params.q_head_stride_in_elts,
params.d, binfo, tidx, true);
// Allocate the global memory tile loader for O.
Gmem_tile_o gmem_o(params.o_ptr, params.o_row_stride_in_elts, params.o_head_stride_in_elts,
params.d, binfo, tidx);
Gmem_tile_o_tmp gmem_o_tmp(params.o_tmp_ptr, params.o_tmp_row_stride_in_elts,
params.o_tmp_head_stride_in_elts, params.d, binfo, tidx);
// Allocate the global memory tile loader for S.
Gmem_tile_s gmem_s(params, binfo, tidx);
Gmem_softmax_sum gmem_softmax_lse(params.softmax_lse_ptr, params, tidx);
// Wind gmem tiles to the correct position.
static_assert(Cta_tile_p::N % Cta_tile_p::M == 0);
int begin = Is_causal ? loop_step_idx * Cta_tile_p::N / Cta_tile_p::M : 0;
// We want begin to be a multiple of gridDim.z
// This is because the row indices processed by each threadblock must align between the
// loop steps, otherwise we have a dependency between the blocks.
// For example, threadblock with blockIdx.z == 1 must process row indices that are
// k * gridDim.z + 1 for integer k.
const int begin_mod_z = begin % gridDim.z;
begin = begin_mod_z <= blockIdx.z ? begin - begin_mod_z : begin + gridDim.z - begin_mod_z;
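// Example: begin = 5, gridDim.z = 4. Blocks z = 1, 2, 3 round begin down to 4 and start
// at row blocks 5, 6, 7; block z = 0 rounds up to 8 and starts at row block 8. Every
// block's first row block is >= the original begin and congruent to z mod gridDim.z.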
// Otherwise we'd be reading out-of-bounds memory before the loop.
if ((begin + blockIdx.z) * Cta_tile_p::M >= binfo.actual_seqlen_q) return;
const int steps_og = steps;
steps -= begin;
gmem_q.move(begin + blockIdx.z);
gmem_o.move(begin + blockIdx.z);
gmem_o_tmp.move(begin + blockIdx.z);
if (Return_softmax) {
gmem_s.move(begin + blockIdx.z);
}
gmem_softmax_lse.move(begin + blockIdx.z);
// if ((threadIdx.x == 0) && (blockIdx.x == 0) && (blockIdx.y == 0)) {
// printf("begin = %d, steps = %d\n", begin, steps);
// }
fmha::Mask<Cta_tile_p, Is_causal> mask(binfo, tidx, loop_step_idx);
// Allocate the global memory tile loader for K.
Gmem_tile_k gmem_k(params.k_ptr, params.k_row_stride_in_elts, params.k_head_stride_in_elts,
params.d, binfo, tidx, false);
// Allocate the global memory tile loader for V.
Gmem_tile_v gmem_v(params.v_ptr, params.v_row_stride_in_elts, params.v_head_stride_in_elts,
params.d, binfo, tidx, false);
// The base pointer of smem_v.
char *smem_v_ = &smem_[Gemm1::SMEM_OFFSET_V];
// Allocate the shared memory tile loader for V. We use the same as K so be careful!!!
Smem_tile_v smem_v(smem_v_, tidx);
// Allocate the shared memory tile loader for O. We use the same as K so be careful!!!
Smem_tile_o smem_o(&smem_[Gemm1::SMEM_OFFSET_O], tidx);
if (!Is_first) {
gmem_k.move(loop_step_idx);
gmem_v.move(loop_step_idx);
if (Return_softmax) { gmem_s.move(loop_step_idx * steps_og); }
}
// Trigger the loads for K.
gmem_k.load();
// Trigger the loads for Q.
gmem_q.load();
// Trigger the loads for V.
gmem_v.load();
if (!Is_first) { __syncthreads(); }
float p_prev_lse[Mma_tile_p::MMAS_M * 2];
if (!Is_first) {
gmem_softmax_lse.load(reinterpret_cast<uint32_t(&)[Mma_tile_p::MMAS_M * 2]>(p_prev_lse));
}
// Commit the data for Q and V to shared memory.
gmem_q.commit(gemm_q_k.smem_q);
gmem_v.commit(smem_v);
// const uint32_t scale_bmm1 = reinterpret_cast<const uint32_t&>(params.scale_bmm1);
// #pragma unroll
// for(int it=0;it < Gmem_tile_k::LDGS;it++){
// gmem_k.fetch_[it] = fmha::hmul8(scale_bmm1, gmem_k.fetch_[it]);
// }
// Commit the data for K to shared memory.
if( !Kernel_traits::SHARE_SMEM_FOR_K_AND_V ) {
gmem_k.commit(gemm_q_k.smem_k);
}
__syncthreads();
// Load the fragments for Q.
gemm_q_k.load_q();
// Load the fragments for V. We keep the data in registers during the entire kernel.
typename Smem_tile_v::Fragment frag_v[Mma_tile_o::MMAS_K][Mma_tile_o::MMAS_N];
#pragma unroll
for( int ki = 0; ki < Mma_tile_o::MMAS_K; ++ki ) {
smem_v.load(frag_v[ki], ki);
}
// Commit the data for V to shared memory if it has not been done already.
if( Kernel_traits::SHARE_SMEM_FOR_K_AND_V ) {
// Make sure we are done loading the fragments for K.
__syncthreads();
// Commit the data to shared memory for V.
gmem_k.commit(gemm_q_k.smem_k);
// Make sure the data is in shared memory.
__syncthreads();
}
// Load the fragments for K.
gemm_q_k.load_k();
// Create the object to do the softmax.
Softmax softmax(params, &smem_[Gemm1::SMEM_OFFSET_SOFTMAX], tidx);
Smem_softmax_sum smem_softmax_lse(reinterpret_cast<float *>(&smem_[Gemm1::SMEM_BYTES]), tidx);
// Load over the entire sequence length.
for (int l = blockIdx.z; l < steps; l += step_stride) {
// if ((threadIdx.x == 0) && (blockIdx.x == 0) && (blockIdx.y == 0) && (blockIdx.z <= 1)) {
// printf("l = %d\n", l);
// }
if ((begin + l) * Cta_tile_p::M >= binfo.actual_seqlen_q) break;
// Declare the accumulators for the 1st gemm.
fmha::Fragment_accumulator acc_p[Mma_tile_p::MMAS_M][Mma_tile_p::MMAS_N];
fmha::Clear_accumulator<typename fmha::Accumulator_type, Cta_tile_p::WARPS_K>::apply(acc_p);
// Do this part of P = Q * K^T.
gemm_q_k(acc_p);
// if ((threadIdx.x == 0) && (blockIdx.x == 0) && (blockIdx.y == 0) && (l == 0)) {
// printf("acc_p=%.6f, %.6f\n", acc_p[0][0].elt(0), acc_p[0][0].elt(1));
// }
uint4 out[Gmem_tile_o::STGS_PER_LOOP];
if (!Is_first) { gmem_o_tmp.load(out, 0); }
// Trigger the load for the next Q values.
if (l + step_stride < steps) {
gemm_q_k.smem_q.move_to_next_write_buffer();
gmem_q.move(step_stride);
gmem_q.load();
}
// Load the mask for that iteration.
mask.load(begin + l);
// Convert from the accumulator type to FP32 for Softmax.
softmax.unpack_noscale(acc_p);
// Apply the mask.
softmax.apply_mask(mask);
if( Kernel_traits::SHARE_SMEM_FOR_K_AND_V && l < step_stride ) {
// if we share K and V, it could be that V was not fully read yet but we write into smem for reduction
__syncthreads();
}
// if (!Is_first) {
// if ((threadIdx.x == 0) && (blockIdx.x == 0) && (blockIdx.y == 0) && (l >= 0)) {
// printf("p_prev_lse=%.6f, %.6f\n", p_prev_lse[0], p_prev_lse[1]);
// }
// }
// Compute the max.
float p_max[Mma_tile_p::MMAS_M * 2];
if (!Is_first) {
smem_softmax_lse.store_pair(p_prev_lse);
// for (int mi = 0; mi < Mma_tile_p::MMAS_M * 2; mi++) { p_max[mi] = p_prev_lse[mi]; }
for (int mi = 0; mi < Mma_tile_p::MMAS_M * 2; mi++) { p_max[mi] = p_prev_lse[mi] / params.scale_bmm1f; }
}
// Trigger the load for the next LSE values.
if (l + step_stride < steps) {
if (!Is_first) {
gmem_softmax_lse.load_next(reinterpret_cast<uint32_t(&)[Mma_tile_p::MMAS_M * 2]>(p_prev_lse),
step_stride);
}
}
softmax.template reduce_max</*zero_init=*/Is_first>(p_max);
// if ((threadIdx.x == 0) && (l == 38)) {
// printf("loop_step_idx %d, p_max = %.6f, %.6f., p_prev_lse = %.6f, %.6f\n", loop_step_idx, p_max[0], p_max[1], Is_first ? -10000.f : p_prev_lse[0], Is_first ? -10000.f : p_prev_lse[1]);
// }
// if (!Is_first) {
// if ((threadIdx.x == 0) && (blockIdx.x == 0) && (blockIdx.y == 0) && (l == 0)) {
// printf("after reduce_max=%.6f, %.6f\n", softmax.elt_[0][0], softmax.elt_[0][1]);
// }
// }
// Compute the exponential value.
// softmax.apply_exp(p_max);
softmax.scale_apply_exp(p_max, params.scale_bmm1f);
// if (!Is_first) {
// if ((threadIdx.x == 0) && (blockIdx.x == 0) && (blockIdx.y == 0) && (l == 0)) {
// printf("after apply_exp=%.6f, %.6f\n", softmax.elt_[0][0], softmax.elt_[0][1]);
// }
// }
// Compute the sum.
float p_sum[Mma_tile_p::MMAS_M * 2];
// if (!Is_first) {
// int warp = tidx / Cta_tile_p::THREADS_PER_WARP;
// int lane = tidx % Cta_tile_p::THREADS_PER_WARP;
// for (int mi = 0; mi < Mma_tile_p::MMAS_M * 2; mi++) {
// p_sum[mi] = ((warp == 0) && (lane % 4 == 0)) ? expf(p_prev_lse[mi] - p_max[mi]) : 0;
// }
// }
// softmax.reduce_sum(p_sum);
softmax.reduce_sum_before_sync_(p_sum);
// softmax.template reduce_sum_before_sync_</*zero_init=*/Is_first>(p_sum);
// float p_sum_log[Mma_tile_p::MMAS_M * 2];
// for (int mi = 0; mi < Mma_tile_p::MMAS_M * 2; ++mi) {
// float sum = p_sum[mi];
// // p_sum_log[mi] = (sum == 0.f || sum != sum) ? INFINITY : p_max[mi] + __logf(sum);
// constexpr float kLog2e = M_LOG2E;
// p_sum_log[mi] = (sum == 0.f || sum != sum) ? INFINITY : p_max[mi] * kLog2e + __log2f(sum);
// }
// // gmem_softmax_lse.store(reinterpret_cast<uint32_t(&)[Mma_tile_p::MMAS_M * 2]>(p_sum));
// gmem_softmax_lse.store(reinterpret_cast<uint32_t(&)[Mma_tile_p::MMAS_M * 2]>(p_sum_log));
// gmem_softmax_lse.move();
// // Finalize softmax on the accumulators of P^T.
// softmax.scale(p_sum);
constexpr bool encode_dropout_in_sign_bit = Return_softmax;
if (Is_dropout) {
// softmax.template apply_dropout<encode_dropout_in_sign_bit>(ph, params.p_dropout_in_uint);
// softmax.template apply_dropout<encode_dropout_in_sign_bit>(ph, ph1, params.p_dropout_in_uint);
// softmax.template apply_dropout_16bits<encode_dropout_in_sign_bit>(ph, ph1, params.p_dropout_in_uint16_t);
unsigned int warp_idx = threadIdx.x / 32;
// TODO: this should change after we rearrange the warps (e.g. cutlass branch)
unsigned int block_col_idx = loop_step_idx * Cta_tile_p::N / 16 + warp_idx;
// We want to use actual_seqlen_k, not seqlen_k, since seqlen_k could be rounded
// differently in the fwd and bwd pass. E.g., for d=128 on A100, fwd rounds seqlen_k
// to multiples of 256 while bwd rounds seqlen_k to multiples of 128.
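// Illustrative example (hypothetical lengths): with actual_seqlen_k = 300, the fwd pass would pad
// to 512 and the bwd pass to 384, so indexing the 16 x 16 blocks with the padded length would use
// a row stride of 32 vs 24 blocks and generate different dropout masks; with actual_seqlen_k the
// stride is 300 / 16 = 18 in both passes.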
unsigned long long philox_subsequence = (begin + l) * (binfo.actual_seqlen_k / 16) + block_col_idx;
softmax.template apply_dropout_16bits<encode_dropout_in_sign_bit>(ph, params.p_dropout_in_uint16_t, philox_subsequence);
}
using Frag_p = fmha::Fragment_a<fmha::Row>;
Frag_p frag_p[Mma_tile_o::MMAS_K][Mma_tile_o::MMAS_M];
static_assert(Mma_tile_o::MMAS_M == Mma_tile_p::MMAS_M);
static_assert(Mma_tile_o::MMAS_K == Mma_tile_p::MMAS_N);
softmax.template pack<elem_type>(frag_p);
if (Return_softmax) {
gmem_s.store(frag_p, mask);
gmem_s.move(step_stride);
}
// Commit the values for Q into shared memory.
if (l + step_stride < steps) {
gmem_q.commit(gemm_q_k.smem_q);
}
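// When the softmax is returned for testing (Return_softmax), apply_dropout_16bits above encodes a
// dropped entry by flipping its sign bit instead of zeroing it, so gmem_s records which entries
// were dropped. hrelu_ below then clamps those negative entries to zero so the fragments fed to
// the 2nd GEMM match ordinary dropout.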
if (Is_dropout && encode_dropout_in_sign_bit) {
#pragma unroll
for( int ki = 0; ki < Mma_tile_o::MMAS_K; ki++ ) {
#pragma unroll
for( int mi = 0; mi < Mma_tile_o::MMAS_M; mi++ ) {
frag_p[ki][mi].template hrelu_<elem_type>();
}
}
}
// Declare the accumulators for the 2nd gemm.
fmha::Fragment_accumulator acc_o[Mma_tile_o::MMAS_M][Mma_tile_o::MMAS_N];
fmha::Clear_accumulator<typename fmha::Accumulator_type, Cta_tile_o::WARPS_K>::apply(acc_o);
// Do this part of O = P^T * V^T.
#pragma unroll
for( int ki = 0; ki < Mma_tile_o::MMAS_K; ++ki ) {
fmha::gemm_cl<elem_type>(acc_o, frag_p[ki], frag_v[ki]);
// if ((threadIdx.x == 4) && (blockIdx.x == 0) && (blockIdx.y == 0) && (l == 0)) {
// float2 tmp_p = __half22float2(reinterpret_cast<__half2 &>(frag_p[ki]));
// float2 tmp_v = __half22float2(reinterpret_cast<__half2 &>(frag_v[ki]));
// printf("Per warp, threadIdx.x = %d, frag_p = %.6f, %.6f, frag_v = %.6f, %.6f, acc_o=%.6f\n", threadIdx.x, tmp_p.x, tmp_p.y, tmp_v.x, tmp_v.y, acc_o[0][0].elt(0));
// }
}
// if ((threadIdx.x % 32 == 16) && (blockIdx.x == 0) && (blockIdx.y == 0) && (l == 0)) {
// printf("Per warp, threadIdx.x = %d, acc_o=%.6f\n", threadIdx.x, acc_o[0][2].elt(0));
// }
// The mapping from tidx to rows changes between the softmax and the
// O-reduction. So we recalculate the max.
float p_max_o[Gmem_tile_o::STGS_PER_LOOP][Mma_tile_o::MMAS_M];
int rows[Gmem_tile_o::STGS_PER_LOOP];
for (int jj = 0; jj < Gmem_tile_o::STGS_PER_LOOP; jj++) {
rows[jj] = tidx / Gmem_tile_o::THREADS_PER_ROW + jj * Gmem_tile_o::ROWS_PER_STG;
}
softmax.reduce_max_after_sync_(p_max_o, rows);
static_assert(Mma_tile_o::MMAS_M == 1);
for (int jj = 0; jj < Gmem_tile_o::STGS_PER_LOOP; jj++) {
p_max_o[jj][0] *= params.scale_bmm1f;
}
float p_prev_scale_o[Gmem_tile_o::STGS_PER_LOOP];
if (!Is_first) {
smem_softmax_lse.load(p_prev_scale_o, rows);
}
// if (!Is_first) {
// if ((threadIdx.x == 0) && (blockIdx.x == 0) && (blockIdx.y == 0) && (l == 0)) {
// printf("p_prev_scale_o=%.6f\n", p_prev_scale_o[0]);
// }
// }
static_assert(Gmem_tile_o::LOOPS == 1);
// Swizzle the elements and do the final reduction.
smem_o.store(acc_o, 0);
// Make sure the data is in shared memory.
__syncthreads();
static_assert(Mma_tile_o::MMAS_M == 1);
float p_sum_o[Gmem_tile_o::STGS_PER_LOOP][Mma_tile_o::MMAS_M];
softmax.reduce_sum_after_sync_(p_sum_o, rows);
if (!Is_first) {
for (int jj = 0; jj < Gmem_tile_o::STGS_PER_LOOP; jj++) {
p_prev_scale_o[jj] = expf(p_prev_scale_o[jj] - p_max_o[jj][0]);
p_sum_o[jj][0] += p_prev_scale_o[jj];
}
}
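// At this point p_sum_o[jj] holds sum_j exp(s_ij - m_i) over all key blocks processed so far
// (the exp(prev_lse - m) term folds in the earlier blocks), so the value stored below is the
// running log-sum-exp lse_i = m_i + log(sum); -INFINITY marks rows whose sum is zero or NaN
// (e.g. fully masked rows).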
float p_sum_log[Gmem_tile_o::STGS_PER_LOOP][Mma_tile_o::MMAS_M];
#pragma unroll
for (int jj = 0; jj < Gmem_tile_o::STGS_PER_LOOP; jj++) {
float sum = p_sum_o[jj][0];
p_sum_log[jj][0] = (sum == 0.f || sum != sum) ? -INFINITY : p_max_o[jj][0] + __logf(sum);
// if (sum == 0.f || sum != sum) {
// printf("loop_step_idx = %d, l = %d, tidx = %d, sum = %.6f, p_max_o = %.6f\n", loop_step_idx, l, tidx, sum, p_max_o[jj][0]);
// }
// if (Is_first) {
// if ((threadIdx.x == 0) && (blockIdx.x == 0) && (blockIdx.y == 0) && (l == 0)) {
// printf("p_sum_log=%.6f\n", p_sum_log[jj][0]);
// }
// }
if (tidx % Gmem_tile_o::THREADS_PER_ROW == 0) {
gmem_softmax_lse.store_row(
reinterpret_cast<uint32_t(&)[Mma_tile_p::MMAS_M]>(p_sum_log[jj]), rows[jj]);
}
}
gmem_softmax_lse.move(step_stride);
// Load from shared memory.
if (!Is_first) {
for (int jj = 0; jj < Gmem_tile_o::STGS_PER_LOOP; jj++) {
out[jj] = fmha::fmul4(out[jj], p_prev_scale_o[jj]);
}
}
smem_o.template load</*zero_init=*/Is_first>(out);
const bool is_final_write =
Is_last
|| ((loop_step_idx + 1) * Cta_tile_p::N >= binfo.actual_seqlen_k)
|| ((Is_causal) && ((begin + l) * Cta_tile_p::M < (loop_step_idx + 1) * Cta_tile_p::N));
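// This is the final write for these rows if no later key block can contribute: this is the last
// outer iteration, this key block already reaches the end of the key sequence, or (with causal
// masking) the query tile starts before the next key block begins; since both boundaries are
// aligned to the M-row tile size, the whole tile then only attends to columns covered by this
// and earlier key blocks.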
#pragma unroll
for (int jj = 0; jj < Gmem_tile_o::STGS_PER_LOOP; jj++) {
float sum = p_sum_o[jj][0];
float inv_sum = (sum == 0.f || sum != sum) ? 1.f : 1.f / sum;
if (Is_dropout && is_final_write) {
inv_sum *= params.rp_dropout;
}
out[jj] = fmha::fmul4(out[jj], inv_sum);
}
// if (Is_dropout && Is_last) {
// for (int jj = 0; jj < Gmem_tile_o::STGS_PER_LOOP; jj++) {
// out[jj] = fmha::fmul4(out[jj], params.rp_dropout);
// }
// }
// Output the values.
if (is_final_write) {
gmem_o.template store<elem_type>(out, 0);
gmem_o.move(step_stride);
} else {
gmem_o_tmp.store(out, 0);
}
// Move to the next part of the output.
if (!(Is_first && Is_last)) { gmem_o_tmp.move(step_stride); }
gemm_q_k.reload_k();
// Make sure we are reading from the correct buffer.
gemm_q_k.smem_q.move_to_next_read_buffer();
// Trigger the load from shared memory for the next series of Q values.
if (l + step_stride < steps) {
gemm_q_k.reload_q();
}
} // Outer loop over the sequence length.
}
////////////////////////////////////////////////////////////////////////////////////////////////////
template<typename Kernel_traits, bool Is_dropout, bool Is_causal, bool Return_softmax, typename Params>
inline __device__ void device_1xN_loop(const Params &params) {
// The block index for the batch.
const int bidb = blockIdx.x;
// The block index for the head.
const int bidh = blockIdx.y;
// The block index.
const int bidx = gridDim.x * bidh + bidb;
// The thread index.
const int tidx = threadIdx.x;
// We want the fwd and bwd to generate the same dropout pattern (RNG), without requiring
// them to have the same number of threads or to traverse the attention matrix
// in the same order.
// In the Philox RNG, we use the offset to store the batch, head, and the lane id
// (within a warp). We use the subsequence to store the location of the 16 x 16 blocks within
// the attention matrix. This way, as long as we have the batch, head, and the location of
// the 16 x 16 block within the attention matrix, we can generate the exact same dropout pattern.
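// Illustrative example (hypothetical sizes): with params.h = 12, thread tidx = 37 of the CTA for
// batch bidb = 1, head bidh = 2 gets offset = base_offset + (1 * 12 + 2) * 32 + (37 % 32)
// = base_offset + 453 (base_offset being std::get<1>(seeds) below), and the 16 x 16 tile in
// row-block 3, column-block 7 of a sequence with actual_seqlen_k = 1024 gets subsequence
// 3 * (1024 / 16) + 7 = 199 (see philox_subsequence earlier in this file). Any kernel that
// reproduces these two numbers draws the same random bits.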
auto seeds = at::cuda::philox::unpack(params.philox_args);
if (bidx == 0 && tidx == 0) {
params.rng_state[0] = std::get<0>(seeds);
params.rng_state[1] = std::get<1>(seeds);
}
Philox ph(std::get<0>(seeds), 0, std::get<1>(seeds) + (bidb * params.h + bidh) * 32 + tidx % 32);
constexpr int M = Kernel_traits::Cta_tile_p::M;
const int STEPS = (params.seqlen_q + M - 1) / M;
constexpr int blocksize_c = Kernel_traits::Cta_tile_p::N;
if (params.seqlen_k == blocksize_c) {
fmha::device_1xN_<Kernel_traits, Is_dropout, Is_causal, Return_softmax, true, true>(params, bidb, bidh, STEPS, ph, 0);
} else {
const int max_loop_steps = (params.seqlen_k + blocksize_c - 1) / blocksize_c;
fmha::device_1xN_<Kernel_traits, Is_dropout, Is_causal, Return_softmax, true, false>(params, bidb, bidh, STEPS, ph, 0);
for (int loop_step_idx = 1; loop_step_idx < max_loop_steps - 1; loop_step_idx++) {
fmha::device_1xN_<Kernel_traits, Is_dropout, Is_causal, Return_softmax, false, false>(params, bidb, bidh, STEPS, ph, loop_step_idx);
}
fmha::device_1xN_<Kernel_traits, Is_dropout, Is_causal, Return_softmax, false, true>(params, bidb, bidh, STEPS, ph, max_loop_steps - 1);
}
}
////////////////////////////////////////////////////////////////////////////////////////////////////
} // namespace fmha
// Copyright (c) 2022, Tri Dao.
// Splitting the different head dimensions to different files to speed up compilation.
#include "fmha_fwd_launch_template.h"
void run_fmha_fwd_hdim128(Launch_params<FMHA_fprop_params> &launch_params) {
FP16_SWITCH(launch_params.params.is_bf16, ([&] {
using Kernel_traits = FMHA_kernel_traits<128, 128, 16, 1, 4, 0x08u, elem_type>;
run_fmha_fwd_loop<Kernel_traits>(launch_params);
}));
}
// Copyright (c) 2022, Tri Dao.
// Splitting the different head dimensions to different files to speed up compilation.
#include "fmha_fwd_launch_template.h"
void run_fmha_fwd_hdim32(Launch_params<FMHA_fprop_params> &launch_params) {
FP16_SWITCH(launch_params.params.is_bf16, ([&] {
if (launch_params.params.seqlen_k == 128) {
using Kernel_traits = FMHA_kernel_traits<128, 32, 16, 1, 4, 0x08u, elem_type>;
run_fmha_fwd_loop<Kernel_traits>(launch_params);
} else if (launch_params.params.seqlen_k >= 256) {
using Kernel_traits = FMHA_kernel_traits<256, 32, 16, 1, 4, 0x08u, elem_type>;
run_fmha_fwd_loop<Kernel_traits>(launch_params);
}
}));
}
// Copyright (c) 2022, Tri Dao.
// Splitting the different head dimensions to different files to speed up compilation.
#include "fmha_fwd_launch_template.h"
void run_fmha_fwd_hdim64(Launch_params<FMHA_fprop_params> &launch_params) {
FP16_SWITCH(launch_params.params.is_bf16, ([&] {
if (launch_params.params.seqlen_k == 128) {
using Kernel_traits = FMHA_kernel_traits<128, 64, 16, 1, 4, 0x08u, elem_type>;
run_fmha_fwd_loop<Kernel_traits>(launch_params);
} else if (launch_params.params.seqlen_k >= 256) {
using Kernel_traits = FMHA_kernel_traits<256, 64, 16, 1, 4, 0x08u, elem_type>;
run_fmha_fwd_loop<Kernel_traits>(launch_params);
}
}));
}
// Copyright (c) 2022, Tri Dao.
#pragma once
#include <algorithm>
#include <cmath>
#include <vector>
#include <cuda_fp16.h>
#include <cuda_bf16.h>
#include "static_switch.h"
#include "fmha.h"
#include "fmha_fprop_kernel_1xN.h"
// Find the number of splits that maximizes the occupancy. For example, if we have
// batch * n_heads = 48 and we have 108 SMs, having 2 splits (efficiency = 0.89) is
// better than having 3 splits (efficiency = 0.67). However, we also don't want too many
// splits as that would incur more HBM reads/writes.
// So we find the best efficiency, then find the smallest number of splits that gets 95%
// of the best efficiency.
// [2022-11-25] TD: Mark this as "inline" otherwise we get "multiple definition" error.
inline int num_splits_heuristic_fwd(int batch_nheads, int num_SMs, int ctas_per_sm, int max_splits) {
float max_efficiency = 0.f;
std::vector<float> efficiency;
efficiency.reserve(max_splits);
for (int num_splits = 1; num_splits <= max_splits; num_splits++) {
float n_waves = float(batch_nheads * num_splits) / (num_SMs * ctas_per_sm);
float eff = n_waves / ceil(n_waves);
// printf("num_splits = %d, eff = %f\n", num_splits, eff);
if (eff > max_efficiency) { max_efficiency = eff; }
efficiency.push_back(eff);
}
for (int num_splits = 1; num_splits <= max_splits; num_splits++) {
if (efficiency[num_splits - 1] > 0.95 * max_efficiency) {
// printf("num_splits chosen = %d\n", num_splits);
return num_splits;
}
}
return 1;
}
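// Illustrative trace (assuming ctas_per_sm = 1 and max_splits = 4) for batch_nheads = 48 on 108 SMs:
//   num_splits = 1: n_waves = 48 / 108  = 0.44, eff = 0.44
//   num_splits = 2: n_waves = 96 / 108  = 0.89, eff = 0.89
//   num_splits = 3: n_waves = 144 / 108 = 1.33, eff = 1.33 / 2 = 0.67
//   num_splits = 4: n_waves = 192 / 108 = 1.78, eff = 1.78 / 2 = 0.89
// max_efficiency = 0.89, and the smallest num_splits with eff > 0.95 * 0.89 = 0.84 is 2.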
template<typename Kernel_traits, bool Is_dropout, bool Is_causal, bool Return_softmax>
__global__ void fmha_fwd_loop_kernel(FMHA_fprop_params params) {
fmha::device_1xN_loop<Kernel_traits, Is_dropout, Is_causal, Return_softmax>(params);
}
template<typename Kernel_traits>
void run_fmha_fwd_loop(Launch_params<FMHA_fprop_params> &launch_params) {
constexpr int blocksize_c = Kernel_traits::Cta_tile_p::N;
const int loop_steps = (launch_params.params.seqlen_k + blocksize_c - 1) / blocksize_c;
constexpr int smem_size_softmax_lse = Kernel_traits::Smem_dp_sum::BYTES_PER_TILE;
// Don't need smem_size_softmax_lse if we're not looping
const int smem_size = fmha::get_dynamic_smem_size<Kernel_traits>()
+ (loop_steps > 1 ? smem_size_softmax_lse : 0);
// Work-around for gcc 7. It doesn't like nested BOOL_SWITCH.
// https://github.com/kokkos/kokkos-kernels/issues/349
// https://github.com/HazyResearch/flash-attention/issues/21
BOOL_SWITCH(launch_params.is_dropout, IsDropoutConst, ([&] {
auto kernel = launch_params.params.is_causal
? (launch_params.return_softmax
? &fmha_fwd_loop_kernel<Kernel_traits, IsDropoutConst, true, true>
: &fmha_fwd_loop_kernel<Kernel_traits, IsDropoutConst, true, false>)
: (launch_params.return_softmax
? &fmha_fwd_loop_kernel<Kernel_traits, IsDropoutConst, false, true>
: &fmha_fwd_loop_kernel<Kernel_traits, IsDropoutConst, false, false>);
if( smem_size >= 48 * 1024 ) {
FMHA_CHECK_CUDA(cudaFuncSetAttribute(
kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size));
}
// Automatically set num_splits to maximize occupancy
if (launch_params.params.num_splits <= 0) {
int ctas_per_sm;
FMHA_CHECK_CUDA(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&ctas_per_sm, kernel, Kernel_traits::THREADS, smem_size));
auto dprops = at::cuda::getCurrentDeviceProperties();
// printf("CTAS_PER_SM = %d, nSMs = %d\n", ctas_per_sm, dprops->multiProcessorCount);
constexpr int M = Kernel_traits::Cta_tile_p::M;
launch_params.params.num_splits = num_splits_heuristic_fwd(
launch_params.params.b * launch_params.params.h, dprops->multiProcessorCount,
ctas_per_sm,
/*max_splits=*/std::min(30, (launch_params.params.seqlen_q + M - 1) / M)
);
}
// printf("smem_size = %d\n", smem_size);
dim3 grid(launch_params.params.b, launch_params.params.h, launch_params.params.num_splits);
kernel<<<grid, Kernel_traits::THREADS, smem_size, launch_params.stream>>>(
launch_params.params);
FMHA_CHECK_CUDA(cudaPeekAtLastError());
}));
}
/******************************************************************************
* Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#pragma once
#include <philox.cuh>
#include <fmha.h>
#include <fmha/utils.h>
#include <fmha/smem_tile.h>
#include <fmha/gmem_tile.h>
#include <fmha/mask.h>
#include <fmha/softmax.h>
namespace fmha {
////////////////////////////////////////////////////////////////////////////////////////////////////
template<int THREADS_PER_CTA>
struct BlockInfoPadded {
template<typename Params>
__device__ BlockInfoPadded(const Params &params,
const int bidb,
const int bidh,
const int tidx)
: bidb(bidb), bidh(bidh), h(params.h) {
// The offsets of this sequence in the packed (varlen) Q/K layout, and its actual lengths.
sum_s_k = params.cu_seqlens_k[bidb];
actual_seqlen_k = params.cu_seqlens_k[bidb + 1] - sum_s_k;
sum_s_q = params.cu_seqlens_q[bidb];
actual_seqlen_q = params.cu_seqlens_q[bidb + 1] - sum_s_q;
tidx_global = (bidb * params.h + bidh) * THREADS_PER_CTA + tidx;
}
__device__ bool stop_early(const int start_col = 0) const {
return actual_seqlen_k <= start_col;
}
int actual_seqlen_q;
int actual_seqlen_k;
int sum_s_q;
int sum_s_k;
int bidh;
int bidb;
int tidx_global;
int h;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
} // namespace fmha
/******************************************************************************
* Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#pragma once
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime_api.h>
#include <cuda_fp16.h>
#include <cuda_bf16.h>
////////////////////////////////////////////////////////////////////////////////////////////////////
#define FMHA_CHECK_CUDA( call ) \
do { \
cudaError_t status_ = call; \
if( status_ != cudaSuccess ) { \
fprintf( stderr, \
"CUDA error (%s:%d): %s\n", \
__FILE__, \
__LINE__, \
cudaGetErrorString( status_ ) ); \
exit( 1 ); \
} \
} while( 0 )
////////////////////////////////////////////////////////////////////////////////////////////////////
enum Data_type { DATA_TYPE_FP16, DATA_TYPE_BF16, DATA_TYPE_FP32, DATA_TYPE_INT32, DATA_TYPE_INT8 };
////////////////////////////////////////////////////////////////////////////////////////////////////
static inline void set_alpha( uint32_t &alpha, float norm, Data_type dtype ) {
if( dtype == DATA_TYPE_FP16 ) {
half x = __float2half_rn( norm );
uint16_t h = reinterpret_cast<const uint16_t &>( x );
ushort2 h2 = { h, h };
alpha = reinterpret_cast<const uint32_t &>( h2 );
} else if( dtype == DATA_TYPE_BF16 ) {
__nv_bfloat16 x = __float2bfloat16( norm );
uint16_t h = reinterpret_cast<const uint16_t &>( x );
ushort2 h2 = { h, h };
alpha = reinterpret_cast<const uint32_t &>( h2 );
} else if( dtype == DATA_TYPE_FP32 ) {
alpha = reinterpret_cast<const uint32_t &>( norm );
} else if( dtype == DATA_TYPE_INT32 ) {
int32_t inorm = static_cast<int32_t>( norm );
alpha = reinterpret_cast<const uint32_t &>( inorm );
} else {
assert( false );
}
}
////////////////////////////////////////////////////////////////////////////////////////////////////
static inline size_t get_size_in_bytes( size_t n, Data_type dtype ) {
switch( dtype ) {
case DATA_TYPE_FP32:
return n * 4;
case DATA_TYPE_FP16:
return n * 2;
case DATA_TYPE_BF16:
return n * 2;
case DATA_TYPE_INT32:
return n * 4;
case DATA_TYPE_INT8:
return n;
default:
assert( false );
return 0;
}
}
////////////////////////////////////////////////////////////////////////////////////////////////////
/******************************************************************************
* Copyright (c) 2023, Tri Dao.
******************************************************************************/
#pragma once
#include "cute/algorithm/copy.hpp"
#include "cutlass/cutlass.h"
#include "cutlass/layout/layout.h"
#include <cutlass/numeric_types.h>
using namespace cute;
template<int kHeadDim_, int kBlockM_, int kBlockN_, int kNWarps_, typename elem_type=cutlass::half_t>
struct Flash_kernel_traits {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
using Element = elem_type;
static constexpr bool Has_cp_async = true;
#else
using Element = cutlass::half_t;
static constexpr bool Has_cp_async = false;
#endif
using ElementAccum = float;
using index_t = uint32_t;
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
using MMA_Atom_Arch = std::conditional_t<
std::is_same_v<elem_type, cutlass::half_t>,
MMA_Atom<SM80_16x8x16_F32F16F16F32_TN>,
MMA_Atom<SM80_16x8x16_F32BF16BF16F32_TN>
>;
using ValLayoutMNK = Layout<Shape<_1, _2, _1>>;
#else
using MMA_Atom_Arch = MMA_Atom<SM75_16x8x8_F32F16F16F32_TN>;
using ValLayoutMNK = Layout<Shape<_1, _2, _2>>;
#endif
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 750
using SmemCopyAtom = Copy_Atom<SM75_U32x4_LDSM_N, elem_type>;
using SmemCopyAtomTransposed = Copy_Atom<SM75_U16x8_LDSM_T, elem_type>;
#else
using SmemCopyAtom = Copy_Atom<DefaultCopy, elem_type>;
using SmemCopyAtomTransposed = Copy_Atom<DefaultCopy, elem_type>;
#endif
};
// If Share_Q_K_smem is true, that forces Is_Q_in_regs to be true
template<int kHeadDim_, int kBlockM_, int kBlockN_, int kNWarps_, bool Is_Q_in_regs_=false, bool Share_Q_K_smem_=false, typename elem_type=cutlass::half_t,
typename Base=Flash_kernel_traits<kHeadDim_, kBlockM_, kBlockN_, kNWarps_, elem_type> >
struct Flash_fwd_kernel_traits : public Base {
using Element = typename Base::Element;
using ElementAccum = typename Base::ElementAccum;
using index_t = typename Base::index_t;
static constexpr bool Has_cp_async = Base::Has_cp_async;
using SmemCopyAtom = typename Base::SmemCopyAtom;
using SmemCopyAtomTransposed = typename Base::SmemCopyAtomTransposed;
static constexpr bool Share_Q_K_smem = Share_Q_K_smem_;
static constexpr bool Is_Q_in_regs = Is_Q_in_regs_ || Share_Q_K_smem;
// The number of threads.
static constexpr int kNWarps = kNWarps_;
static constexpr int kNThreads = kNWarps * 32;
static constexpr int kBlockM = kBlockM_;
static constexpr int kBlockN = kBlockN_;
static constexpr int kHeadDim = kHeadDim_;
static_assert(kHeadDim % 32 == 0);
static constexpr int kBlockKSmem = kHeadDim % 64 == 0 ? 64 : 32;
static constexpr int kBlockKGmem = kHeadDim % 128 == 0 ? 128 : (kHeadDim % 64 == 0 ? 64 : 32);
static constexpr int kSwizzle = kBlockKSmem == 32 ? 2 : 3;
using TiledMma = TiledMMA<
typename Base::MMA_Atom_Arch,
Layout<Shape<Int<kNWarps>,_1,_1>>, // 4x1x1 or 8x1x1 thread group
typename Base::ValLayoutMNK>; // 1x2x1 or 1x2x2 value group for 16x16x16 MMA and LDSM
using SmemLayoutAtomQ = decltype(
composition(Swizzle<kSwizzle, 3, 3>{},
// This has to be kBlockKSmem, using kHeadDim gives wrong results for d=128
Layout<Shape<_8, Int<kBlockKSmem>>,
Stride<Int<kBlockKSmem>, _1>>{}));
using SmemLayoutQ = decltype(tile_to_shape(
SmemLayoutAtomQ{},
Shape<Int<kBlockM>, Int<kHeadDim>>{}));
using SmemLayoutKV = decltype(tile_to_shape(
SmemLayoutAtomQ{},
Shape<Int<kBlockN>, Int<kHeadDim>>{}));
using SmemLayoutAtomVtransposed = decltype(
composition(Swizzle<kSwizzle, 3, 3>{},
// This has to be kBlockN and not 8, otherwise we get wrong results for d=128
Layout<Shape<Int<kBlockKSmem>, Int<kBlockN>>,
Stride<_1, Int<kBlockKSmem>>>{}));
using SmemLayoutVtransposed = decltype(tile_to_shape(
SmemLayoutAtomVtransposed{},
Shape<Int<kHeadDim>, Int<kBlockN>>{}));
// Maybe the VtransposeNoSwizzle just needs to have the right shape
// And the strides don't matter?
using SmemLayoutVtransposedNoSwizzle = decltype(SmemLayoutVtransposed{}.layout_fn());
using SmemLayoutAtomO = decltype(
composition(Swizzle<kSwizzle, 3, 3>{},
Layout<Shape<Int<8>, Int<kBlockKSmem>>,
Stride<Int<kBlockKSmem>, _1>>{}));
using SmemLayoutO = decltype(tile_to_shape(
SmemLayoutAtomO{},
Shape<Int<kBlockM>, Int<kHeadDim>>{}));
using SmemCopyAtomO = Copy_Atom<DefaultCopy, elem_type>;
static constexpr int kSmemQCount = size(SmemLayoutQ{});
static constexpr int kSmemKVCount = size(SmemLayoutKV{}) * 2;
static constexpr int kSmemQSize = kSmemQCount * sizeof(Element);
static constexpr int kSmemKVSize = kSmemKVCount * sizeof(Element);
static constexpr int kSmemSize = Share_Q_K_smem ? std::max(kSmemQSize, kSmemKVSize) : kSmemQSize + kSmemKVSize;
static constexpr int kGmemElemsPerLoad = sizeof(cute::uint128_t) / sizeof(Element);
static_assert(kHeadDim % kGmemElemsPerLoad == 0, "kHeadDim must be a multiple of kGmemElemsPerLoad");
// Using kBlockKSmem here is 6-10% faster than kBlockKGmem for d=128 because of bank conflicts.
// For example, for d=128, smem is split into 2 "pages", one for columns 0-63 and one for
// columns 64-127. If we used 16 threads per row for the gmem read, then when writing to smem,
// threads 0-7 would write to the first page and threads 8-15 to the second page, hitting the
// same banks and causing bank conflicts.
static constexpr int kGmemThreadsPerRow = kBlockKSmem / kGmemElemsPerLoad;
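// For example, with 16-bit elements kGmemElemsPerLoad = 16 / 2 = 8, so this is 64 / 8 = 8
// threads per row when kHeadDim is a multiple of 64 and 32 / 8 = 4 threads per row otherwise.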
static_assert(kNThreads % kGmemThreadsPerRow == 0, "kNThreads must be a multiple of kGmemThreadsPerRow");
using GmemLayoutAtom = Layout<Shape <Int<kNThreads / kGmemThreadsPerRow>, Int<kGmemThreadsPerRow>>,
Stride<Int<kGmemThreadsPerRow>, _1>>;
// We use CACHEGLOBAL instead of CACHEALWAYS for both Q and K/V, since we won't be reading
// from the same address by the same threadblock. This is slightly faster.
using Gmem_copy_struct = std::conditional_t<
Has_cp_async,
SM80_CP_ASYNC_CACHEGLOBAL<cute::uint128_t>,
DefaultCopy
>;
using GmemTiledCopyQKV = decltype(
make_tiled_copy(Copy_Atom<Gmem_copy_struct, elem_type>{},
GmemLayoutAtom{},
Layout<Shape<_1, _8>>{})); // Val layout, 8 vals per read
using GmemTiledCopyO = decltype(
make_tiled_copy(Copy_Atom<DefaultCopy, elem_type>{},
GmemLayoutAtom{},
Layout<Shape<_1, _8>>{})); // Val layout, 8 vals per store
static constexpr int kGmemThreadsPerRowP = kBlockN / kGmemElemsPerLoad;
static_assert(kNThreads % kGmemThreadsPerRowP == 0, "kNThreads must be a multiple of kGmemThreadsPerRowP");
using GmemLayoutAtomP = Layout<Shape <Int<kNThreads / kGmemThreadsPerRowP>, Int<kGmemThreadsPerRowP>>,
Stride<Int<kGmemThreadsPerRowP>, _1>>;
using GmemTiledCopyP = decltype(
make_tiled_copy(Copy_Atom<DefaultCopy, elem_type>{},
GmemLayoutAtomP{},
Layout<Shape<_1, _8>>{})); // Val layout, 8 vals per store
};
// Is_V_in_regs is an option to reduce smem usage, but will increase register pressure.
// No_double_buffer is another option to reduce smem usage, but will slow things down.
template<int kHeadDim_, int kBlockM_, int kBlockN_, int kNWarps_,
int AtomLayoutMSdP_=1, int AtomLayoutNdKV=2, int AtomLayoutMdQ=2,
bool Is_V_in_regs_=false, bool No_double_buffer_=false, typename elem_type=cutlass::half_t,
typename Base=Flash_kernel_traits<kHeadDim_, kBlockM_, kBlockN_, kNWarps_, elem_type> >
struct Flash_bwd_kernel_traits : public Base {
using Element = typename Base::Element;
using ElementAccum = typename Base::ElementAccum;
using index_t = typename Base::index_t;
static constexpr bool Has_cp_async = Base::Has_cp_async;
using SmemCopyAtom = typename Base::SmemCopyAtom;
using SmemCopyAtomTransposed = typename Base::SmemCopyAtomTransposed;
static constexpr bool Is_V_in_regs = Is_V_in_regs_;
static constexpr bool No_double_buffer = No_double_buffer_;
// The number of threads.
static constexpr int kNWarps = kNWarps_;
static constexpr int kNThreads = kNWarps * 32;
static constexpr int kBlockM = kBlockM_;
static constexpr int kBlockN = kBlockN_;
static constexpr int kHeadDim = kHeadDim_;
static_assert(kHeadDim % 32 == 0);
static constexpr int kBlockKSmem = kHeadDim % 64 == 0 ? 64 : 32;
static constexpr int kBlockKGmem = kHeadDim % 128 == 0 ? 128 : (kHeadDim % 64 == 0 ? 64 : 32);
static constexpr int kSwizzle = kBlockKSmem == 32 ? 2 : 3;
static constexpr int AtomLayoutMSdP = AtomLayoutMSdP_;
static_assert(kNWarps % AtomLayoutMSdP == 0);
static_assert(kNWarps % AtomLayoutNdKV == 0);
static_assert(kNWarps % AtomLayoutMdQ == 0);
using TiledMmaSdP = TiledMMA<
typename Base::MMA_Atom_Arch,
Layout<Shape<Int<AtomLayoutMSdP>, Int<kNWarps / AtomLayoutMSdP>, _1>>,
typename Base::ValLayoutMNK>; // 1x2x1 or 1x2x2 value group for 16x16x16 MMA and LDSM
using TiledMmadKV = TiledMMA<
typename Base::MMA_Atom_Arch,
Layout<Shape<Int<AtomLayoutNdKV>, Int<kNWarps / AtomLayoutNdKV>, _1>>,
typename Base::ValLayoutMNK>; // 1x2x1 or 1x2x2 value group for 16x16x16 MMA and LDSM
using TiledMmadQ = TiledMMA<
typename Base::MMA_Atom_Arch,
Layout<Shape<Int<AtomLayoutMdQ>, Int<kNWarps / AtomLayoutMdQ>, _1>>, // 2x4x1 or 4x2x1 thread group
typename Base::ValLayoutMNK>; // 1x2x1 or 1x2x2 value group for 16x16x16 MMA and LDSM
using SmemLayoutAtomQdO = decltype(
composition(Swizzle<kSwizzle, 3, 3>{},
Layout<Shape<_8, Int<kBlockKSmem>>,
Stride<Int<kBlockKSmem>, _1>>{}));
using SmemLayoutQdO = decltype(tile_to_shape(
SmemLayoutAtomQdO{},
make_shape(Int<kBlockM>{}, Int<kHeadDim>{})));
using SmemLayoutAtomKV = decltype(
composition(Swizzle<kSwizzle, 3, 3>{},
Layout<Shape<Int<kBlockM / kNWarps>, Int<kBlockKSmem>>,
Stride<Int<kBlockKSmem>, _1>>{}));
using SmemLayoutKV = decltype(tile_to_shape(
// SmemLayoutAtomQdO{},
SmemLayoutAtomKV{},
make_shape(Int<kBlockN>{}, Int<kHeadDim>{})));
using SmemLayoutAtomKtransposed = decltype(
composition(Swizzle<kSwizzle, 3, 3>{},
Layout<Shape<Int<kBlockKSmem>, Int<kBlockN>>,
Stride<_1, Int<kBlockKSmem>>>{}));
using SmemLayoutKtransposed = decltype(tile_to_shape(
SmemLayoutAtomKtransposed{},
make_shape(Int<kHeadDim>{}, Int<kBlockN>{})));
// Maybe the KtransposeNoSwizzle just needs to have the right shape
// And the strides don't matter?
using SmemLayoutKtransposedNoSwizzle = decltype(SmemLayoutKtransposed{}.layout_fn());
// TODO: generalize to other values of kBlockN
// TODO: what should be the Swizzle here? 3 is faster than 1, and 1 is faster than 2
// static constexpr int kPBlockN = kBlockN;
static_assert(kBlockN >= 64);
// TD [2023-03-19]: Idk why kPBlockN = 16 and kSwizzlePdS=3 is the fastest.
static constexpr int kPBlockN = 64;
static_assert(kPBlockN == 16 || kPBlockN == 32 || kPBlockN == 64);
// static constexpr int kSwizzlePdS = kPBlockN == 16 ? 1 : (kPBlockN == 32 ? 2 : 3);
static constexpr int kSwizzlePdS = 3;
using SmemLayoutAtomPdS = decltype(
composition(Swizzle<kSwizzlePdS, 3, 3>{},
Layout<Shape<Int<kBlockM>, Int<kPBlockN>>,
Stride<Int<kPBlockN>, _1>>{}));
using SmemLayoutPdS = decltype(tile_to_shape(
SmemLayoutAtomPdS{},
make_shape(Int<kBlockM>{}, Int<kBlockN>{})));
using SmemLayoutAtomPdStransposed = decltype(
composition(Swizzle<kSwizzlePdS, 3, 3>{},
Layout<Shape<Int<kPBlockN>, Int<kBlockM>>,
Stride<_1, Int<kPBlockN>>>{}));
using SmemLayoutPdStransposed = decltype(tile_to_shape(
SmemLayoutAtomPdStransposed{},
make_shape(Int<kBlockN>{}, Int<kBlockM>{})));
using SmemLayoutPdStransposedNoSwizzle = decltype(SmemLayoutPdStransposed{}.layout_fn());
using SmemCopyAtomPdS = Copy_Atom<DefaultCopy, elem_type>;
using SmemLayoutAtomQdOtransposed = decltype(
composition(Swizzle<kSwizzle, 3, 3>{},
Layout<Shape<Int<kBlockKSmem>, Int<kBlockM>>,
Stride<_1, Int<kBlockKSmem>>>{}));
using SmemLayoutQdOtransposed = decltype(tile_to_shape(
SmemLayoutAtomQdOtransposed{},
make_shape(Int<kHeadDim>{}, Int<kBlockM>{})));
using SmemLayoutQdOtransposedNoSwizzle = decltype(SmemLayoutQdOtransposed{}.layout_fn());
using SmemLayoutAtomdKV = decltype(
composition(Swizzle<kSwizzle, 3, 3>{},
Layout<Shape<_8, Int<kBlockKSmem>>,
Stride<Int<kBlockKSmem>, _1>>{}));
using SmemLayoutdKV = decltype(tile_to_shape(
SmemLayoutAtomdKV{},
make_shape(Int<kBlockN>{}, Int<kHeadDim>{})));
using SmemCopyAtomdKV = Copy_Atom<DefaultCopy, elem_type>;
using SmemLayoutAtomdQ = decltype(
composition(Swizzle<kSwizzle, 3, 3>{},
Layout<Shape<_8, Int<kBlockKSmem>>,
Stride<Int<kBlockKSmem>, _1>>{}));
using SmemLayoutdQ = decltype(tile_to_shape(
SmemLayoutAtomdQ{},
make_shape(Int<kBlockM>{}, Int<kHeadDim>{})));
using SmemCopyAtomdQ = Copy_Atom<DefaultCopy, elem_type>;
static constexpr int kSmemQdOCount = size(SmemLayoutQdO{}) * (No_double_buffer ? 2 : 3); // Double buffer for sQ
static constexpr int kSmemKVCount = size(SmemLayoutKV{}) * 2;
static constexpr int kSmemdSCount = size(SmemLayoutPdS{});
static constexpr int kSmemPCount = size(SmemLayoutPdS{});
static constexpr int kSmemdQCount = size(SmemLayoutdQ{});
static constexpr int kSmemdPsumCount = kBlockM;
static constexpr int kSmemQdOSize = kSmemQdOCount * sizeof(Element);
static constexpr int kSmemKVSize = kSmemKVCount * sizeof(Element);
static constexpr int kSmemdSSize = kSmemdSCount * sizeof(Element);
static constexpr int kSmemPSize = kSmemPCount * sizeof(Element);
static constexpr int kSmemdQSize = kSmemdQCount * sizeof(Element);
static constexpr int kSmemdPsumSize = kSmemdPsumCount * sizeof(ElementAccum);
static constexpr int kSmemSize = kSmemQdOSize
+ (!Is_V_in_regs
? kSmemKVSize + kSmemdSSize + std::max(kSmemPSize, kSmemdQSize)
: std::max(kSmemKVSize, kSmemKVSize / 2 + kSmemdSSize + std::max(kSmemPSize, kSmemdQSize)));
static constexpr int kSmemSize1colblock = kSmemQdOSize
+ (!Is_V_in_regs
? kSmemKVSize + kSmemdSSize + kSmemPSize
: std::max(kSmemKVSize, kSmemKVSize / 2 + kSmemdSSize + kSmemPSize));
static constexpr int kSmemSize1rowblock = kSmemQdOSize / 3 * 2 + kSmemKVSize / 2 * 3
+ kSmemdSSize + kSmemPSize;
static constexpr int kGmemElemsPerLoad = sizeof(cute::uint128_t) / sizeof(Element);
static_assert(kHeadDim % kGmemElemsPerLoad == 0, "kHeadDim must be a multiple of kGmemElemsPerLoad");
// Using kBlockKSmem instead of kHeadDim here to avoid bank conflicts, but doesn't seem
// to affect speed in practice.
static constexpr int kGmemThreadsPerRow = kBlockKSmem / kGmemElemsPerLoad;
static_assert(kNThreads % kGmemThreadsPerRow == 0, "kNThreads must be a multiple of kGmemThreadsPerRow");
using GmemLayoutAtom = Layout<Shape <Int<kNThreads / kGmemThreadsPerRow>, Int<kGmemThreadsPerRow>>,
Stride<Int<kGmemThreadsPerRow>, _1>>;
// We use CACHEGLOBAL instead of CACHEALWAYS for both Q and K/V, since we won't be reading
// from the same address by the same threadblock. This is slightly faster.
using Gmem_copy_struct = std::conditional_t<
Has_cp_async,
SM80_CP_ASYNC_CACHEGLOBAL<cute::uint128_t>,
DefaultCopy
>;
using GmemTiledCopyQKV = decltype(
make_tiled_copy(Copy_Atom<Gmem_copy_struct, elem_type>{},
GmemLayoutAtom{},
Layout<Shape<_1, _8>>{})); // Val layout, 8 vals per read
using GmemTiledCopydO = decltype(
make_tiled_copy(Copy_Atom<DefaultCopy, elem_type>{},
GmemLayoutAtom{},
Layout<Shape < _1, _8>>{})); // Val layout, 8 vals per store
using GmemTiledCopydKV = decltype(
make_tiled_copy(Copy_Atom<DefaultCopy, elem_type>{},
GmemLayoutAtom{},
Layout<Shape < _1, _8>>{})); // Val layout, 8 vals per store
using GmemTiledCopydQ = decltype(
make_tiled_copy(Copy_Atom<DefaultCopy, elem_type>{},
GmemLayoutAtom{},
Layout<Shape < _1, _8>>{})); // Val layout, 8 vals per store
using GmemLayoutAtomdQaccum = std::conditional_t<
kBlockKSmem == 32,
Layout<Shape <_32, _8>, // Thread layout, 8 threads per row
Stride< _8, _1>>,
Layout<Shape <_16, _16>, // Thread layout, 16 threads per row
Stride< _16, _1>>
>;
using GmemTiledCopydQaccum = decltype(
make_tiled_copy(Copy_Atom<DefaultCopy, ElementAccum>{},
GmemLayoutAtomdQaccum{},
Layout<Shape < _1, _4>>{})); // Val layout, 4 vals per store
using GmemTiledCopydQaccumAtomicAdd = decltype(
make_tiled_copy(Copy_Atom<DefaultCopy, ElementAccum>{},
Layout<Shape <_8, _32>, // Thread layout, 8 threads per row
Stride<_32, _1>>{},
Layout<Shape < _1, _1>>{})); // Val layout, 1 val per store
};
////////////////////////////////////////////////////////////////////////////////////////////////////
/******************************************************************************
* Copyright (c) 2023, Tri Dao.
******************************************************************************/
#pragma once
#include "cute/algorithm/copy.hpp"
#include "cutlass/cutlass.h"
#include "cutlass/layout/layout.h"
#include <cutlass/numeric_types.h>
using namespace cute;
template<int kHeadDim_, int kBlockM_, int kBlockN_, int kNWarps_, typename elem_type=cutlass::half_t>
struct Flash_kernel_traits_sm90 {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
using Element = elem_type;
static constexpr bool Has_cp_async = true;
#else
using Element = cutlass::half_t;
static constexpr bool Has_cp_async = false;
#endif
using ElementAccum = float;
using index_t = uint32_t;
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
using MMA_Atom_Arch = std::conditional_t<
std::is_same_v<elem_type, cutlass::half_t>,
MMA_Atom<SM80_16x8x16_F32F16F16F32_TN>,
MMA_Atom<SM80_16x8x16_F32BF16BF16F32_TN>
>;
using ValLayoutMNK = Layout<Shape<_1, _2, _1>>;
#else
using MMA_Atom_Arch = MMA_Atom<SM75_16x8x8_F32F16F16F32_TN>;
using ValLayoutMNK = Layout<Shape<_1, _2, _2>>;
#endif
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 750
using SmemCopyAtom = Copy_Atom<SM75_U32x4_LDSM_N, elem_type>;
using SmemCopyAtomTransposed = Copy_Atom<SM75_U16x8_LDSM_T, elem_type>;
#else
using SmemCopyAtom = Copy_Atom<DefaultCopy, elem_type>;
using SmemCopyAtomTransposed = Copy_Atom<DefaultCopy, elem_type>;
#endif
};
template<int kHeadDim_, int kBlockM_, int kBlockN_, int kNWarps_, bool Is_Q_in_regs_=false, bool Share_Q_K_smem_=false, typename elem_type=cutlass::half_t,
typename Base=Flash_kernel_traits_sm90<kHeadDim_, kBlockM_, kBlockN_, kNWarps_, elem_type> >
struct Flash_fwd_kernel_traits : public Base {
using Element = typename Base::Element;
using ElementAccum = typename Base::ElementAccum;
using index_t = typename Base::index_t;
static constexpr bool Has_cp_async = Base::Has_cp_async;
using SmemCopyAtom = typename Base::SmemCopyAtom;
using SmemCopyAtomTransposed = typename Base::SmemCopyAtomTransposed;
static constexpr bool Share_Q_K_smem = Share_Q_K_smem_;
static constexpr bool Is_Q_in_regs = Is_Q_in_regs_ || Share_Q_K_smem;
// The number of threads.
static constexpr int kNWarps = kNWarps_;
static constexpr int kNThreads = kNWarps * 32;
static constexpr int kBlockM = kBlockM_;
static constexpr int kBlockN = kBlockN_;
static constexpr int kHeadDim = kHeadDim_;
static_assert(kHeadDim % 32 == 0);
static constexpr int kBlockKSmem = kHeadDim % 64 == 0 ? 64 : 32;
static constexpr int kBlockKGmem = kHeadDim % 128 == 0 ? 128 : (kHeadDim % 64 == 0 ? 64 : 32);
static constexpr int kSwizzle = kBlockKSmem == 32 ? 2 : 3;
using TiledMma = TiledMMA<
typename Base::MMA_Atom_Arch,
Layout<Shape<Int<kNWarps>,_1,_1>>, // 4x1x1 or 8x1x1 thread group
typename Base::ValLayoutMNK>; // 1x2x1 or 1x2x2 value group for 16x16x16 MMA and LDSM
using SmemLayoutAtomQ = decltype(
composition(Swizzle<kSwizzle, 3, 3>{},
// This has to be kBlockKSmem, using kHeadDim gives wrong results for d=128
Layout<Shape<_8, Int<kBlockKSmem>>,
Stride<Int<kBlockKSmem>, _1>>{}));
using SmemLayoutQ = decltype(tile_to_shape(
SmemLayoutAtomQ{},
Shape<Int<kBlockM>, Int<kHeadDim>>{}));
using SmemLayoutKV = decltype(tile_to_shape(
SmemLayoutAtomQ{},
Shape<Int<kBlockN>, Int<kHeadDim>>{}));
using SmemLayoutAtomVtransposed = decltype(
composition(Swizzle<kSwizzle, 3, 3>{},
// This has to be kBlockN and not 8, otherwise we get wrong results for d=128
Layout<Shape<Int<kBlockKSmem>, Int<kBlockN>>,
Stride<_1, Int<kBlockKSmem>>>{}));
using SmemLayoutVtransposed = decltype(tile_to_shape(
SmemLayoutAtomVtransposed{},
Shape<Int<kHeadDim>, Int<kBlockN>>{}));
// Maybe the VtransposeNoSwizzle just needs to have the right shape
// And the strides don't matter?
using SmemLayoutVtransposedNoSwizzle = decltype(SmemLayoutVtransposed{}.layout_fn());
using SmemLayoutAtomO = decltype(
composition(Swizzle<kSwizzle, 3, 3>{},
Layout<Shape<Int<8>, Int<kBlockKSmem>>,
Stride<Int<kBlockKSmem>, _1>>{}));
using SmemLayoutO = decltype(tile_to_shape(
SmemLayoutAtomO{},
Shape<Int<kBlockM>, Int<kHeadDim>>{}));
using SmemCopyAtomO = Copy_Atom<DefaultCopy, elem_type>;
static constexpr int kSmemQCount = size(SmemLayoutQ{});
static constexpr int kSmemKVCount = size(SmemLayoutKV{}) * 2;
static constexpr int kSmemQSize = kSmemQCount * sizeof(Element);
static constexpr int kSmemKVSize = kSmemKVCount * sizeof(Element);
static constexpr int kSmemSize = Share_Q_K_smem ? std::max(kSmemQSize, kSmemKVSize) : kSmemQSize + kSmemKVSize;
static constexpr int kGmemElemsPerLoad = sizeof(cute::uint128_t) / sizeof(Element);
static_assert(kHeadDim % kGmemElemsPerLoad == 0, "kHeadDim must be a multiple of kGmemElemsPerLoad");
// Using kBlockKSmem here is 6-10% faster than kBlockKGmem for d=128 because of bank conflicts.
// For example, for d=128, smem is split into 2 "pages", one for columns 0-63 and one for
// columns 64-127. If we used 16 threads per row for the gmem read, then when writing to smem,
// threads 0-7 would write to the first page and threads 8-15 to the second page, hitting the
// same banks and causing bank conflicts.
static constexpr int kGmemThreadsPerRow = kBlockKSmem / kGmemElemsPerLoad;
static_assert(kNThreads % kGmemThreadsPerRow == 0, "kNThreads must be a multiple of kGmemThreadsPerRow");
using GmemLayoutAtom = Layout<Shape <Int<kNThreads / kGmemThreadsPerRow>, Int<kGmemThreadsPerRow>>,
Stride<Int<kGmemThreadsPerRow>, _1>>;
// We use CACHEGLOBAL instead of CACHEALWAYS for both Q and K/V, since we won't be reading
// from the same address by the same threadblock. This is slightly faster.
using Gmem_copy_struct = std::conditional_t<
Has_cp_async,
SM80_CP_ASYNC_CACHEGLOBAL<cute::uint128_t>,
DefaultCopy
>;
using GmemTiledCopyQKV = decltype(
make_tiled_copy(Copy_Atom<Gmem_copy_struct, elem_type>{},
GmemLayoutAtom{},
Layout<Shape<_1, _8>>{})); // Val layout, 8 vals per read
using GmemTiledCopyO = decltype(
make_tiled_copy(Copy_Atom<DefaultCopy, elem_type>{},
GmemLayoutAtom{},
Layout<Shape<_1, _8>>{})); // Val layout, 8 vals per store
static constexpr int kGmemThreadsPerRowP = kBlockN / kGmemElemsPerLoad;
static_assert(kNThreads % kGmemThreadsPerRowP == 0, "kNThreads must be a multiple of kGmemThreadsPerRowP");
using GmemLayoutAtomP = Layout<Shape <Int<kNThreads / kGmemThreadsPerRowP>, Int<kGmemThreadsPerRowP>>,
Stride<Int<kGmemThreadsPerRowP>, _1>>;
using GmemTiledCopyP = decltype(
make_tiled_copy(Copy_Atom<DefaultCopy, elem_type>{},
GmemLayoutAtomP{},
Layout<Shape<_1, _8>>{})); // Val layout, 8 vals per store
};
////////////////////////////////////////////////////////////////////////////////////////////////////
// Adapted from https://github.com/NVIDIA/apex/blob/master/apex/contrib/csrc/multihead_attn/philox.cuh
// PyTorch also has an implementation of the Philox RNG: https://github.com/pytorch/pytorch/blob/master/torch/csrc/jit/codegen/cuda/runtime/random_numbers.cu
// and another one here: https://github.com/pytorch/pytorch/blob/8ca3c881db3e3510fcb7725389f6a0633c9b992c/torch/csrc/jit/tensorexpr/cuda_random.h
#pragma once
// Philox CUDA.
namespace flash {
struct ull2 {
unsigned long long x;
unsigned long long y;
};
inline __device__ uint2 mulhilo32(const unsigned int a, const unsigned int b) {
uint2 *res;
unsigned long long tmp;
asm ("mul.wide.u32 %0, %1, %2;\n\t"
: "=l"(tmp)
: "r"(a), "r"(b));
res = (uint2*)(&tmp);
return *res;
}
inline __device__ uint4 philox_single_round(const uint4 ctr, const uint2 key) {
constexpr unsigned long kPhiloxSA = 0xD2511F53;
constexpr unsigned long kPhiloxSB = 0xCD9E8D57;
uint2 res0 = mulhilo32(kPhiloxSA, ctr.x);
uint2 res1 = mulhilo32(kPhiloxSB, ctr.z);
uint4 ret = {res1.y ^ ctr.y ^ key.x, res1.x, res0.y ^ ctr.w ^ key.y, res0.x};
return ret;
}
inline __device__ uint4 philox(unsigned long long seed,
unsigned long long subsequence,
unsigned long long offset) {
constexpr unsigned long kPhilox10A = 0x9E3779B9;
constexpr unsigned long kPhilox10B = 0xBB67AE85;
uint2 key = reinterpret_cast<uint2&>(seed);
uint4 counter;
ull2 *tmp = reinterpret_cast<ull2*>(&counter);
tmp->x = offset;
tmp->y = subsequence;
#pragma unroll
for (int i = 0; i < 6; i++) {
counter = philox_single_round(counter, key);
key.x += (kPhilox10A);
key.y += (kPhilox10B);
}
uint4 output = philox_single_round(counter, key);
return output;
}
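// philox is a pure counter-based generator: the same (seed, subsequence, offset) triple always
// produces the same 4 x 32-bit words, which is what lets the fwd and bwd kernels regenerate
// identical dropout masks without ever storing them.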
} // namespace flash
namespace {
class Philox {
public:
__device__ inline Philox(unsigned long long seed,
unsigned long long subsequence,
unsigned long long offset)
: STATE(0)
, seed_(seed)
, offset_(offset)
, key(reinterpret_cast<const uint2&>(seed)) {
//key.x = (unsigned int)seed;
//key.y = (unsigned int)(seed >> 32);
//counter = make_uint4(0, 0, 0, 0);
//STATE = 0;
//incr_n(offset / 4);
// key = reinterpret_cast<const uint2&>(seed);
ull2 * tmp = reinterpret_cast<ull2*>(&counter);
tmp->x = offset / 4;
tmp->y = subsequence;
// printf("Philox counter: %d, %d, %d, %d\n", counter.x, counter.y, counter.z, counter.w);
// }
}
__device__ inline uint4 operator()() {
uint4 counter_ = counter;
uint2 key_ = key;
// 7-round philox
#pragma unroll
for (int i = 0; i < 6; i++) {
counter_ = single_round(counter_, key_);
key_.x += (kPhilox10A);
key_.y += (kPhilox10B);
}
uint4 output = single_round(counter_, key_);
// if ((threadIdx.x == 0) && (blockIdx.x == 0) && (blockIdx.y == 0)) {
// printf("Philox counter: %u, %u, %u, %u\n", counter.x, counter.y, counter.z, counter.w);
// printf("Philox output: %u, %u, %u, %u\n", output.x, output.y, output.z, output.w);
// }
incr();
return output;
}
__device__ inline uint4 operator()(const unsigned long long subsequence) {
uint4 counter_ = counter;
ull2 * tmp = reinterpret_cast<ull2*>(&counter_);
tmp->y = subsequence;
// if ((threadIdx.x % 32 == 0) && (blockIdx.x == 0) && (blockIdx.y == 0)) {
// printf("tidx = %d, counter_: %u, %u, %u, %u\n", threadIdx.x, counter_.x, counter_.y, counter_.z, counter_.w);
// }
uint2 key_ = key;
// 7-round philox
#pragma unroll
for (int i = 0; i < 6; i++) {
counter_ = single_round(counter_, key_);
key_.x += (kPhilox10A);
key_.y += (kPhilox10B);
}
uint4 output = single_round(counter_, key_);
// if ((threadIdx.x == 0) && (blockIdx.x == 0) && (blockIdx.y == 0)) {
// printf("Philox counter: %u, %u, %u, %u\n", counter.x, counter.y, counter.z, counter.w);
// printf("Philox output: %u, %u, %u, %u\n", output.x, output.y, output.z, output.w);
// }
return output;
// // if (STATE == 0) {
// uint4 counter_ = counter;
// uint2 key_ = key;
// // 7-round philox
// #pragma unroll
// for (int i = 0; i < 6; i++) {
// counter_ = flash::philox_single_round(counter_, key_);
// key_.x += (kPhilox10A);
// key_.y += (kPhilox10B);
// }
// // output = philox_single_round(counter_, key_);
// uint4 output = flash::philox_single_round(counter_, key_);
// // if ((threadIdx.x == 0) && (blockIdx.x == 0) && (blockIdx.y == 0)) {
// // printf("Philox counter: %u, %u, %u, %u\n", counter.x, counter.y, counter.z, counter.w);
// // printf("Philox output: %u, %u, %u, %u\n", output.x, output.y, output.z, output.w);
// // }
// incr();
// // }
// // return a float4 directly
// // unsigned long ret;
// // switch(STATE) {
// // case 0: ret = output.x; break;
// // case 1: ret = output.y; break;
// // case 2: ret = output.z; break;
// // case 3: ret = output.w; break;
// //}
// // STATE = (STATE + 1) % 4;
// return output;
return flash::philox(seed_, offset_, offset_);
}
private:
unsigned long long offset_, seed_;
struct ull2 {
uint64_t x;
uint64_t y;
};
uint4 counter;
// uint4 output;
const uint2 key;
unsigned int STATE;
__device__ inline void incr_n(unsigned long long n) {
unsigned int nlo = (unsigned int)(n);
unsigned int nhi = (unsigned int)(n >> 32);
counter.x += nlo;
if (counter.x < nlo)
nhi++;
counter.y += nhi;
if (nhi <= counter.y)
return;
if (++counter.z)
return;
++counter.w;
}
// __device__ inline void incr_n(unsigned long long n) {
// unsigned int nlo = (unsigned int)(n);
// unsigned int nhi = (unsigned int)(n >> 32);
// counter.x += nlo;
// if (counter.x < nlo)
// nhi++;
// counter.y += nhi;
// if (nhi <= counter.y)
// return;
// if (++counter.z)
// return;
// ++counter.w;
// }
__device__ uint4 incr128(uint4 ctr) {
uint4 res;
asm ("add.cc.u32 %0, %4, %8;\n\t"
"addc.cc.u32 %1, %5, %9;\n\t"
"addc.cc.u32 %2, %6, %10;\n\t"
"addc.u32 %3, %7, %11;\n\t"
: "=r"(res.x), "=r"(res.y), "=r"(res.z), "=r"(res.w)
: "r"(ctr.x), "r"(ctr.y), "r"(ctr.z), "r"(ctr.w),
"n"(1), "n"(0), "n"(0), "n"(0));
return res;
}
__device__ inline void incr() {
// if ((threadIdx.x == 0) && (blockIdx.x == 0) && (blockIdx.y == 0)) {
// printf("Counter before: %u, %u, %u, %u\n", counter.x, counter.y, counter.z, counter.w);
// }
counter = incr128(counter);
// if ((threadIdx.x == 0) && (blockIdx.x == 0) && (blockIdx.y == 0)) {
// printf("Counter after: %u, %u, %u, %u\n", counter.x, counter.y, counter.z, counter.w);
// }
}
// __device__ unsigned int mulhilo32(unsigned int a, unsigned int b,
// unsigned int *result_high) {
// *result_high = __umulhi(a, b);
// return a * b;
// }
__device__ uint2 mulhilo32(const unsigned int a, const unsigned int b) {
uint2 *res;
unsigned long long tmp;
asm ("mul.wide.u32 %0, %1, %2;\n\t"
: "=l"(tmp)
: "r"(a), "r"(b));
res = (uint2*)(&tmp);
return *res;
}
__device__ inline uint4 single_round(const uint4 ctr, const uint2 key) {
//unsigned int hi0;
//unsigned int hi1;
//unsigned int lo0 = mulhilo32(kPhiloxSA, ctr.x, &hi0);
//unsigned int lo1 = mulhilo32(kPhiloxSB, ctr.z, &hi1);
//uint4 ret = {hi1 ^ ctr.y ^ key.x, lo1, hi0 ^ ctr.w ^ key.y, lo0};
uint2 res0 = mulhilo32(kPhiloxSA, ctr.x);
uint2 res1 = mulhilo32(kPhiloxSB, ctr.z);
uint4 ret = {res1.y ^ ctr.y ^ key.x, res1.x, res0.y ^ ctr.w ^ key.y, res0.x};
return ret;
}
static const unsigned long kPhilox10A = 0x9E3779B9;
static const unsigned long kPhilox10B = 0xBB67AE85;
static const unsigned long kPhiloxSA = 0xD2511F53;
static const unsigned long kPhiloxSB = 0xCD9E8D57;
// static const unsigned long kPhiloxSA = 0xD2511F53;
// static const unsigned long kPhiloxSB = 0xCD9E8D57;
};
// Inverse of 2^32.
constexpr float M_RAN_INVM32 = 2.3283064e-10f;
__device__ __inline__ float4 uniform4(const uint4 x) {
return make_float4(x.x * M_RAN_INVM32, x.y * M_RAN_INVM32, x.z * M_RAN_INVM32,
x.w * M_RAN_INVM32);
}
} // namespace
/******************************************************************************
* Copyright (c) 2023, Tri Dao.
******************************************************************************/
#pragma once
#include <cmath>
#include <cute/tensor.hpp>
#include <cutlass/cutlass.h>
#include <cutlass/array.h>
#include "philox.cuh"
#include "utils.h"
namespace flash {
using namespace cute;
////////////////////////////////////////////////////////////////////////////////////////////////////
template<bool zero_init=true, typename Engine0, typename Layout0, typename Engine1, typename Layout1, typename Operator>
__device__ inline void thread_reduce_(Tensor<Engine0, Layout0> const &tensor, Tensor<Engine1, Layout1> &summary, Operator &op) {
static_assert(Layout0::rank == 2, "Only support 2D Tensor");
static_assert(Layout1::rank == 1, "Only support 1D Tensor");
CUTE_STATIC_ASSERT_V(size<0>(summary) == size<0>(tensor));
#pragma unroll
for (int mi = 0; mi < size<0>(tensor); mi++) {
summary(mi) = zero_init ? tensor(mi, 0) : op(summary(mi), tensor(mi, 0));
#pragma unroll
for (int ni = 1; ni < size<1>(tensor); ni++) {
summary(mi) = op(summary(mi), tensor(mi, ni));
}
}
}
template<typename Engine0, typename Layout0, typename Engine1, typename Layout1, typename Operator>
__device__ inline void quad_allreduce_(Tensor<Engine0, Layout0> &dst, Tensor<Engine1, Layout1> &src, Operator &op) {
CUTE_STATIC_ASSERT_V(size(dst) == size(src));
#pragma unroll
for (int i = 0; i < size(dst); i++){
dst(i) = Allreduce<4>::run(src(i), op);
}
}
template<bool zero_init=true, typename Engine0, typename Layout0, typename Engine1, typename Layout1, typename Operator>
__device__ inline void reduce_(Tensor<Engine0, Layout0> const& tensor, Tensor<Engine1, Layout1> &summary, Operator &op) {
thread_reduce_<zero_init>(tensor, summary, op);
quad_allreduce_(summary, summary, op);
}
template<bool zero_init=true, typename Engine0, typename Layout0, typename Engine1, typename Layout1>
__device__ inline void reduce_max(Tensor<Engine0, Layout0> const& tensor, Tensor<Engine1, Layout1> &max){
MaxOp<float> max_op;
reduce_<zero_init>(tensor, max, max_op);
}
template<typename Engine0, typename Layout0, typename Engine1, typename Layout1>
__device__ inline void reduce_sum(Tensor<Engine0, Layout0> const& tensor, Tensor<Engine1, Layout1> &sum){
SumOp<float> sum_op;
reduce_(tensor, sum, sum_op);
}
// Apply the exp to all the elements.
template <bool Scale_max=true, typename Engine0, typename Layout0, typename Engine1, typename Layout1>
inline __device__ void scale_apply_exp2(Tensor<Engine0, Layout0> &tensor, Tensor<Engine1, Layout1> const &max, const float scale) {
static_assert(Layout0::rank == 2, "Only support 2D Tensor");
static_assert(Layout1::rank == 1, "Only support 1D Tensor");
CUTE_STATIC_ASSERT_V(size<0>(max) == size<0>(tensor));
#pragma unroll
for (int mi = 0; mi < size<0>(tensor); ++mi) {
// If max is -inf, then all elements must have been -inf (possibly due to masking).
// We don't want (-inf - (-inf)) since that would give NaN.
// If we don't have float around M_LOG2E the multiplication is done in fp64.
const float max_scaled = max(mi) == -INFINITY ? 0.f : max(mi) * (Scale_max ? scale : float(M_LOG2E));
#pragma unroll
for (int ni = 0; ni < size<1>(tensor); ++ni) {
// Instead of computing exp(x - max), we compute exp2(x * log_2(e) -
// max * log_2(e)) This allows the compiler to use the ffma
// instruction instead of fadd and fmul separately.
tensor(mi, ni) = exp2f(tensor(mi, ni) * scale - max_scaled);
}
}
}
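// In other words, exp(x - m) = 2^(x * log2(e) - m * log2(e)); the log2(e) factor is assumed to be
// folded into `scale` by the caller (and applied to `max` via M_LOG2E when Scale_max is false),
// so each element costs one fused multiply-add feeding exp2f.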
// Apply the exp to all the elements.
template <bool zero_init=true, typename Engine0, typename Layout0, typename Engine1, typename Layout1>
inline __device__ void max_scale_exp2_sum(Tensor<Engine0, Layout0> &tensor, Tensor<Engine1, Layout1> &max, Tensor<Engine1, Layout1> &sum, const float scale) {
static_assert(Layout0::rank == 2, "Only support 2D Tensor");
static_assert(Layout1::rank == 1, "Only support 1D Tensor");
CUTE_STATIC_ASSERT_V(size<0>(max) == size<0>(tensor));
#pragma unroll
for (int mi = 0; mi < size<0>(tensor); ++mi) {
MaxOp<float> max_op;
max(mi) = zero_init ? tensor(mi, 0) : max_op(max(mi), tensor(mi, 0));
#pragma unroll
for (int ni = 1; ni < size<1>(tensor); ni++) {
max(mi) = max_op(max(mi), tensor(mi, ni));
}
max(mi) = Allreduce<4>::run(max(mi), max_op);
// If max is -inf, then all elements must have been -inf (possibly due to masking).
// We don't want (-inf - (-inf)) since that would give NaN.
const float max_scaled = max(mi) == -INFINITY ? 0.f : max(mi) * scale;
sum(mi) = 0;
#pragma unroll
for (int ni = 0; ni < size<1>(tensor); ++ni) {
// Instead of computing exp(x - max), we compute exp2(x * log_2(e) -
// max * log_2(e)) This allows the compiler to use the ffma
// instruction instead of fadd and fmul separately.
tensor(mi, ni) = exp2f(tensor(mi, ni) * scale - max_scaled);
sum(mi) += tensor(mi, ni);
}
SumOp<float> sum_op;
sum(mi) = Allreduce<4>::run(sum(mi), sum_op);
}
}
template <typename Engine, typename Layout>
inline __device__ void apply_mask(Tensor<Engine, Layout> &tensor, const uint32_t max_seqlen_k) {
// tensor has shape (ncol=(2, MMA_M), nrow=(2, MMA_N))
static_assert(Layout::rank == 2, "Only support 2D Tensor");
const uint32_t lane_id = threadIdx.x % 32;
#pragma unroll
for (int nj = 0; nj < size<1, 1>(tensor); ++nj) {
#pragma unroll
for (int j = 0; j < size<1, 0>(tensor); ++j) {
const uint32_t col_idx = nj * 8 + j + (lane_id % 4) * 2;
if (col_idx >= max_seqlen_k) {
// Without the "make_coord" we get wrong results
#pragma unroll
for (int mi = 0; mi < size<0>(tensor); ++mi) {
tensor(mi, make_coord(j, nj)) = -INFINITY;
}
}
}
}
}
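// Illustrative mapping (assuming the m16n8 accumulator layout this indexing implies): lane 5 has
// lane_id % 4 == 1, so it owns columns nj * 8 + 2 and nj * 8 + 3 of each 8-column tile; with
// max_seqlen_k = 11 it would keep columns 2, 3 and 10 but set column 11 (and everything beyond)
// to -INFINITY.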
template <typename Engine, typename Layout>
inline __device__ void apply_mask_causal(Tensor<Engine, Layout> &tensor, const uint32_t col_idx_offset_,
const uint32_t max_seqlen_k, const uint32_t row_idx_offset_,
const uint32_t warp_row_stride) {
// tensor has shape (ncol=(2, MMA_M), nrow=(2, MMA_N))
static_assert(Layout::rank == 2, "Only support 2D Tensor");
const uint32_t lane_id = threadIdx.x % 32;
// const uint32_t row_idx_offset = row_idx_offset_ + lane_id / 4;
const uint32_t row_idx_offset = row_idx_offset_;
const uint32_t col_idx_offset = col_idx_offset_ + (lane_id % 4) * 2;
#pragma unroll
for (int mi = 0; mi < size<0, 1>(tensor); ++mi) {
const uint32_t row_idx_base = row_idx_offset + mi * warp_row_stride;
#pragma unroll
for (int i = 0; i < size<0, 0>(tensor); ++i) {
const uint32_t row_idx = row_idx_base + i * 8;
const uint32_t col_idx_limit = std::min(max_seqlen_k, row_idx + 1);
#pragma unroll
for (int nj = 0; nj < size<1, 1>(tensor); ++nj) {
const uint32_t col_idx_base = col_idx_offset + nj * 8;
#pragma unroll
for (int j = 0; j < size<1, 0>(tensor); ++j) {
const uint32_t col_idx = col_idx_base + j;
if (col_idx >= col_idx_limit) {
tensor(make_coord(i, mi), make_coord(j, nj)) = -INFINITY;
}
}
}
// if (cute::thread0()) {
// printf("mi = %d, i = %d, row_idx = %d, max_seqlen_k = %d\n", mi, i, row_idx, max_seqlen_k);
// print(tensor(make_coord(i, mi), _));
// // print(tensor(_, j + nj * size<1, 0>(tensor)));
// }
}
}
}
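
// Causal predicate enforced by apply_mask_causal: entry (row_idx, col_idx) survives only
// when col_idx < min(max_seqlen_k, row_idx + 1). For example, with max_seqlen_k = 128 and
// row_idx = 5, columns 0..5 are kept and columns 6..127 become -INFINITY.
// row_idx_offset_ and col_idx_offset_ translate tile-local MMA coordinates into global
// sequence positions; warp_row_stride is the row stride between successive mi tiles.
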
template <typename Engine0, typename Layout0, typename Engine1, typename Layout1>
inline __device__ void apply_mask_causal_w_idx(
Tensor<Engine0, Layout0> &tensor, Tensor<Engine1, Layout1> const &idx_rowcol,
const uint32_t col_idx_offset_, const uint32_t max_seqlen_k, const uint32_t row_idx_offset_)
{
    // tensor has shape (nrow=(2, MMA_M), ncol=(2, MMA_N))
static_assert(Layout0::rank == 2, "Only support 2D Tensor");
static_assert(Layout1::rank == 2, "Only support 2D Tensor");
CUTE_STATIC_ASSERT_V(size<0>(tensor) == size<0>(idx_rowcol));
CUTE_STATIC_ASSERT_V(size<1>(tensor) == size<1>(idx_rowcol));
#pragma unroll
for (int mi = 0; mi < size<0>(tensor); ++mi) {
const uint32_t col_idx_limit = std::min(max_seqlen_k, 1 + row_idx_offset_ + get<0>(idx_rowcol(mi, 0)));
#pragma unroll
for (int ni = 0; ni < size<1, 1>(tensor); ++ni) {
if (col_idx_offset_ + get<1>(idx_rowcol(0, ni)) >= col_idx_limit) {
tensor(mi, ni) = -INFINITY;
}
}
// if (cute::thread0()) {
// printf("ni = %d, j = %d, col_idx = %d, max_seqlen_k = %d\n", ni, j, col_idx, max_seqlen_k);
// print(tensor(_, make_coord(j, ni)));
// // print(tensor(_, j + ni * size<1, 0>(tensor)));
// }
}
}
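
// apply_mask_causal_w_idx enforces the same predicate as apply_mask_causal, but the
// (row, col) coordinates are read from the precomputed idx_rowcol tensor instead of
// being re-derived from the lane id and offsets; the column limit is still
// min(max_seqlen_k, row_idx + 1).
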
template <bool encode_dropout_in_sign_bit=false, typename Engine, typename Layout>
inline __device__ void apply_dropout(Tensor<Engine, Layout> &tensor, uint8_t p_dropout_in_uint8_t,
unsigned long long seed, unsigned long long offset,
uint32_t block_row_start, uint32_t block_col_start,
uint32_t block_row_stride) {
// tensor has shape (8, MMA_M, MMA_N / 2)
using T = typename Engine::value_type;
auto encode_dropout = [](bool keep, T val) {
return keep ? val : (encode_dropout_in_sign_bit ? -val : T(0));
};
static_assert(decltype(size<2>(tensor))::value % 2 == 0);
const uint16_t p_dropout_8bit_in_uint16_t = uint16_t(p_dropout_in_uint8_t);
const uint32_t p_dropout_8bit_in_uint32_t = (uint32_t(p_dropout_8bit_in_uint16_t) << 16) | uint32_t(p_dropout_8bit_in_uint16_t);
// if (cute::thread0()) { printf("threshold2 = 0x%x\n", p_dropout_8bit_in_uint32_t); }
#pragma unroll
for (int m = 0; m < size<1>(tensor); ++m, block_row_start += block_row_stride) {
uint2 rowcol = make_uint2(block_row_start, block_col_start);
#pragma unroll
for (int n = 0; n < size<2>(tensor) / 2; ++n, ++rowcol.y) {
// if (cute::thread(32, 0)) { printf("m = %d, n = %d, row = %d, col = %d\n", m, n, int(rowcol.x), int(rowcol.y));}
uint4 random_uint4 = flash::philox(seed, reinterpret_cast<unsigned long long&>(rowcol), offset);
// if (cute::thread0()) { printf("philox = %u, %d, %d, %d\n", random_uint4.x, random_uint4.y, random_uint4.z, random_uint4.w);}
uint8_t (&rnd_8)[16] = reinterpret_cast<uint8_t (&)[16]>(random_uint4);
// Special implementation for 16-bit types: we duplicate the threshold to the
// low and high 16 bits of a 32-bit value, then use the f16x2 comparison instruction
// to get a mask. The low 16 bits of the mask will be either 0xffff or 0x0000,
            // and the high 16 bits will be either 0xffff or 0x0000, depending on whether the
            // corresponding random value is less than or equal to the threshold (hence set.le).
            // We then do a bitwise AND between the mask and the original value (in 32-bit).
            // We're exploiting the fact that the floating point comparison is equivalent to an
            // integer comparison, since we're comparing unsigned integers whose top 8 bits are zero.
if (!encode_dropout_in_sign_bit
&& (std::is_same<T, cutlass::half_t>::value || std::is_same<T, cutlass::bfloat16_t>::value)) {
uint16_t rnd_16[16];
#pragma unroll
for (int i = 0; i < 16; i++) { rnd_16[i] = uint16_t(rnd_8[i]); }
uint32_t (&rnd_32)[8] = reinterpret_cast<uint32_t (&)[8]>(rnd_16);
#pragma unroll
for (int j = 0; j < 2; j++) {
Tensor tensor_uint32 = recast<uint32_t>(tensor(_, m, n * 2 + j));
// if (cute::thread0()) { printf("random = 0x%x, 0x%x, 0x%x, 0x%x\n", rnd_32[j * 4 + 0], rnd_32[j * 4 + 1], rnd_32[j * 4 + 2], rnd_32[j * 4 + 3]); }
// if (cute::thread0()) { printf("tensor_uint32 = 0x%x, 0x%x, 0x%x, 0x%x\n", tensor_uint32(0), tensor_uint32(1), tensor_uint32(2), tensor_uint32(3)); }
#pragma unroll
for (int i = 0; i < 4; i++) {
uint32_t mask;
asm volatile("set.le.u32.f16x2 %0, %1, %2;\n" : "=r"(mask) : "r"(rnd_32[j * 4 + i]), "r"(p_dropout_8bit_in_uint32_t));
tensor_uint32(i) &= mask;
}
// if (cute::thread0()) { printf("tensor_uint32 = 0x%x, 0x%x, 0x%x, 0x%x\n", tensor_uint32(0), tensor_uint32(1), tensor_uint32(2), tensor_uint32(3)); }
}
} else {
#pragma unroll
for (int j = 0; j < 2; j++) {
#pragma unroll
for (int i = 0; i < 8; i++) {
tensor(i, m, n * 2 + j) = encode_dropout(rnd_8[j * 8 + i] <= p_dropout_in_uint8_t, tensor(i, m, n * 2 + j));
}
Tensor tensor_uint32 = recast<uint32_t>(tensor(_, m, n * 2 + j));
// if (cute::thread0()) { printf("tensor_uint32 = 0x%x, 0x%x, 0x%x, 0x%x\n", tensor_uint32(0), tensor_uint32(1), tensor_uint32(2), tensor_uint32(3)); }
}
}
// // if ((threadIdx.x == 0) && (blockIdx.x == 0) && (blockIdx.y == 0)) {
// // printf("n = %d, ph Philox: %u, %u, %u, %u\n", n, rnd_8.x, rnd_8.y, rnd_8.z, rnd_8.w);
// // }
}
}
}
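
// Worked example of the packed f16x2 comparison in apply_dropout, with purely
// illustrative values: one 32-bit word holding two zero-extended random bytes and a
// dropout threshold byte of 0x80.
//   rnd_32 = 0x00A30011   // random bytes 0xA3 (high half) and 0x11 (low half)
//   thresh = 0x00800080   // p_dropout_in_uint8_t duplicated into both halves
//   set.le.u32.f16x2 (rnd_32, thresh)  ->  mask = 0x0000FFFF
// Only 0x11 <= 0x80, so the low half is kept and the high half is zeroed when the mask
// is ANDed into the packed fp16/bf16 values; this matches the scalar
// encode_dropout(keep, val) path with encode_dropout_in_sign_bit == false, two elements
// at a time.
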
} // namespace flash