Commit 1fcbe6f0 authored by Tri Dao

First release

Alpha release of FlashAttention.
To compile:
```
cd csrc/stream_attn
python setup.py install
```
Interface: `streaming_attention.py`
Contact: `trid@stanford.edu`
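A quick smoke test after installation (a minimal sketch, assuming the build succeeded; `stream_attn_cuda` is the extension module name declared in `setup.py` below):
```
# Minimal post-install check (illustrative only).
import torch             # import torch first so its shared libraries are loaded
import stream_attn_cuda  # extension module built by setup.py below

print(stream_attn_cuda.__file__)
```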
# Adapted from https://github.com/mlcommons/training_results_v1.1/blob/main/NVIDIA/benchmarks/bert/implementations/pytorch/padding.py
import torch
import torch.nn.functional as F
from einops import rearrange, repeat


class IndexFirstAxis(torch.autograd.Function):

    @staticmethod
    def forward(ctx, input, indices):
        ctx.save_for_backward(indices)
        ctx.first_axis_dim = input.shape[0]
        assert input.ndim == 2
        # TD [2022-03-04] For some reason torch.gather is a bit faster than indexing.
        # return input[indices]
        return torch.gather(input, 0, repeat(indices, 'z -> z d', d=input.shape[1]))

    @staticmethod
    def backward(ctx, grad_output):
        indices, = ctx.saved_tensors
        grad_input = torch.zeros([ctx.first_axis_dim, *grad_output.shape[1:]],
                                 device=grad_output.device, dtype=grad_output.dtype)
        # TD [2022-03-04] For some reason torch.scatter is a bit faster than indexing.
        # grad_input[indices] = grad_output
        grad_input.scatter_(0, repeat(indices, 'z -> z d', d=grad_output.shape[1]), grad_output)
        return grad_input, None


index_first_axis = IndexFirstAxis.apply


class IndexPutFirstAxis(torch.autograd.Function):

    @staticmethod
    def forward(ctx, values, indices, first_axis_dim):
        ctx.save_for_backward(indices)
        assert indices.ndim == 1
        assert values.ndim == 2
        output = torch.zeros(first_axis_dim, values.shape[1], device=values.device,
                             dtype=values.dtype)
        # TD [2022-03-04] For some reason torch.scatter is a bit faster than indexing.
        output[indices] = values
        # output.scatter_(0, repeat(indices, 'z -> z d', d=values.shape[1]), values)
        return output

    @staticmethod
    def backward(ctx, grad_output):
        indices, = ctx.saved_tensors
        # TD [2022-03-04] For some reason torch.gather is a bit faster than indexing.
        grad_values = grad_output[indices]
        # grad_values = torch.gather(grad_output, 0, repeat(indices, 'z -> z d', d=grad_output.shape[1]))
        return grad_values, None, None


index_put_first_axis = IndexPutFirstAxis.apply
def unpad_input(hidden_states, attention_mask):
    """
    Arguments:
        hidden_states: (batch, seqlen, dim)
        attention_mask: (batch, seqlen), bool / int, 1 means valid and 0 means not valid.
    Return:
        hidden_states: (total_nnz, dim), where total_nnz = number of tokens selected in attention_mask.
        indices: (total_nnz), indices of the selected tokens in the flattened (batch * seqlen) axis.
        cu_seqlens: (batch + 1), the cumulative sequence lengths, used to index into hidden_states.
        max_seqlen_in_batch: int
    """
    seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
    indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
    max_seqlen_in_batch = seqlens_in_batch.max().item()
    cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
    # TD [2022-03-04] We don't want to index with a bool mask, because Pytorch will expand the
    # bool mask, then call nonzero to get the indices, then index with those. The indices are `dim`
    # times larger than they need to be, wasting memory. It's faster and more memory-efficient to
    # index with integer indices. Moreover, torch's index is a bit slower than it needs to be,
    # so we write custom forward and backward to make it a bit faster.
    return (index_first_axis(rearrange(hidden_states, 'b s d -> (b s) d'), indices), indices,
            cu_seqlens, max_seqlen_in_batch)
def pad_input(hidden_states, indices, batch, seqlen):
    """
    Arguments:
        hidden_states: (total_nnz, dim), where total_nnz = number of tokens selected in attention_mask.
        indices: (total_nnz)
    Return:
        hidden_states: (batch, seqlen, dim)
    """
    dim = hidden_states.shape[-1]
    # output = torch.zeros((batch * seqlen), dim, device=hidden_states.device, dtype=hidden_states.dtype)
    # output[indices] = hidden_states
    output = index_put_first_axis(hidden_states, indices, batch * seqlen)
    return rearrange(output, '(b s) d -> b s d', b=batch)
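To make the two padding helpers concrete, here is a small round-trip sketch (illustrative shapes and mask values; run in the same module as the helpers above):
```
# Round-trip check for unpad_input / pad_input.
import torch

batch, seqlen, dim = 2, 4, 8
hidden_states = torch.randn(batch, seqlen, dim)
# Two sequences of length 3 and 2; 1 marks a valid token.
attention_mask = torch.tensor([[1, 1, 1, 0],
                               [1, 1, 0, 0]], dtype=torch.int32)

unpadded, indices, cu_seqlens, max_seqlen = unpad_input(hidden_states, attention_mask)
assert unpadded.shape == (5, dim)        # total_nnz = 3 + 2
assert cu_seqlens.tolist() == [0, 3, 5]  # cumulative sequence lengths
assert max_seqlen == 3

# Re-padding restores the original values at valid positions and zeros elsewhere.
repadded = pad_input(unpadded, indices, batch, seqlen)
assert torch.allclose(repadded, hidden_states * attention_mask.unsqueeze(-1))
```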
Our implementation uses Apex's
[FMHA](https://github.com/NVIDIA/apex/tree/master/apex/contrib/csrc/fmha) code
as a starting point.
We thank [Young-jun Ko](https://yjk21.github.io/) for the in-depth explanation of his FMHA implementation
and for his thoughtful answers to our questions about CUDA.
# Adapted from https://github.com/NVIDIA/apex/blob/master/setup.py
import torch
from torch.utils.cpp_extension import BuildExtension, CppExtension, CUDAExtension, CUDA_HOME
from setuptools import setup, find_packages
import subprocess
import sys
import warnings
import os
# ninja build does not work unless include_dirs are absolute paths
this_dir = os.path.dirname(os.path.abspath(__file__))
def get_cuda_bare_metal_version(cuda_dir):
    raw_output = subprocess.check_output([cuda_dir + "/bin/nvcc", "-V"], universal_newlines=True)
    output = raw_output.split()
    release_idx = output.index("release") + 1
    release = output[release_idx].split(".")
    bare_metal_major = release[0]
    bare_metal_minor = release[1][0]
    return raw_output, bare_metal_major, bare_metal_minor


def check_cuda_torch_binary_vs_bare_metal(cuda_dir):
    raw_output, bare_metal_major, bare_metal_minor = get_cuda_bare_metal_version(cuda_dir)
    torch_binary_major = torch.version.cuda.split(".")[0]
    torch_binary_minor = torch.version.cuda.split(".")[1]
    print("\nCompiling cuda extensions with")
    print(raw_output + "from " + cuda_dir + "/bin\n")
    if (bare_metal_major != torch_binary_major) or (bare_metal_minor != torch_binary_minor):
        raise RuntimeError(
            "Cuda extensions are being compiled with a version of Cuda that does "
            "not match the version used to compile Pytorch binaries. "
            "Pytorch binaries were compiled with Cuda {}.\n".format(torch.version.cuda)
            + "In some cases, a minor-version mismatch will not cause later errors: "
            "https://github.com/NVIDIA/apex/pull/323#discussion_r287021798. "
            "You can try commenting out this check (at your own risk)."
        )


def raise_if_cuda_home_none(global_option: str) -> None:
    if CUDA_HOME is not None:
        return
    raise RuntimeError(
        f"{global_option} was requested, but nvcc was not found. Are you sure your environment has nvcc available? "
        "If you're installing within a container from https://hub.docker.com/r/pytorch/pytorch, "
        "only images whose names contain 'devel' will provide nvcc."
    )


def append_nvcc_threads(nvcc_extra_args):
    _, bare_metal_major, bare_metal_minor = get_cuda_bare_metal_version(CUDA_HOME)
    if int(bare_metal_major) >= 11 and int(bare_metal_minor) >= 2:
        return nvcc_extra_args + ["--threads", "4"]
    return nvcc_extra_args
if not torch.cuda.is_available():
    # https://github.com/NVIDIA/apex/issues/486
    # Extension builds after https://github.com/pytorch/pytorch/pull/23408 attempt to query torch.cuda.get_device_capability(),
    # which will fail if you are compiling in an environment without visible GPUs (e.g. during an nvidia-docker build command).
    print(
        "\nWarning: Torch did not find available GPUs on this system.\n",
        "If your intention is to cross-compile, this is not an error.\n"
        "By default, Apex will cross-compile for Pascal (compute capabilities 6.0, 6.1, 6.2),\n"
        "Volta (compute capability 7.0), Turing (compute capability 7.5),\n"
        "and, if the CUDA version is >= 11.0, Ampere (compute capability 8.0).\n"
        "If you wish to cross-compile for a single specific architecture,\n"
        'export TORCH_CUDA_ARCH_LIST="compute capability" before running setup.py.\n',
    )
    if os.environ.get("TORCH_CUDA_ARCH_LIST", None) is None:
        _, bare_metal_major, bare_metal_minor = get_cuda_bare_metal_version(CUDA_HOME)
        if int(bare_metal_major) == 11:
            os.environ["TORCH_CUDA_ARCH_LIST"] = "6.0;6.1;6.2;7.0;7.5;8.0"
            if int(bare_metal_minor) > 0:
                os.environ["TORCH_CUDA_ARCH_LIST"] = "6.0;6.1;6.2;7.0;7.5;8.0;8.6"
        else:
            os.environ["TORCH_CUDA_ARCH_LIST"] = "6.0;6.1;6.2;7.0;7.5"
print("\n\ntorch.__version__ = {}\n\n".format(torch.__version__))
TORCH_MAJOR = int(torch.__version__.split(".")[0])
TORCH_MINOR = int(torch.__version__.split(".")[1])
cmdclass = {}
ext_modules = []
# Check if ATen/CUDAGeneratorImpl.h is found, otherwise use ATen/cuda/CUDAGeneratorImpl.h.
# See https://github.com/pytorch/pytorch/pull/70650
generator_flag = []
torch_dir = torch.__path__[0]
if os.path.exists(os.path.join(torch_dir, "include", "ATen", "CUDAGeneratorImpl.h")):
    generator_flag = ["-DOLD_GENERATOR_PATH"]

raise_if_cuda_home_none("--streamattn")
# Check if CUDA 11 is installed for compute capability 8.0.
cc_flag = []
_, bare_metal_major, _ = get_cuda_bare_metal_version(CUDA_HOME)
if int(bare_metal_major) < 11:
    raise RuntimeError("--streamattn only supported on SM80+")
cc_flag.append("-gencode")
cc_flag.append("arch=compute_80,code=sm_80")
ext_modules.append(
    CUDAExtension(
        name="stream_attn_cuda",
        sources=[
            "fmha_api.cpp",
            "src/fmha_fprop_fp16_kernel.sm80.cu",
            "src/fmha_dgrad_fp16_kernel_loop.sm80.cu",
            "src/fmha_block_fprop_fp16_kernel.sm80.cu",
            "src/fmha_block_dgrad_fp16_kernel_loop.sm80.cu",
        ],
        extra_compile_args={
            "cxx": ["-O3"] + generator_flag,
            "nvcc": append_nvcc_threads(
                [
                    "-O3",
                    "-U__CUDA_NO_HALF_OPERATORS__",
                    "-U__CUDA_NO_HALF_CONVERSIONS__",
                    "--expt-relaxed-constexpr",
                    "--expt-extended-lambda",
                    "--use_fast_math",
                    "--ptxas-options=-v",
                    "-lineinfo",
                ]
                + generator_flag
                + cc_flag
            ),
        },
        include_dirs=[
            this_dir,
            os.path.join(this_dir, "src"),
        ],
    )
)

setup(
    name="stream_attn_cuda",
    version="0.1",
    description="Streaming attention",
    ext_modules=ext_modules,
    cmdclass={"build_ext": BuildExtension} if ext_modules else {},
)
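For reference, `get_cuda_bare_metal_version` keys off the token after `release` in the `nvcc -V` output; a short sketch of that parsing on a representative output line (the sample string is illustrative):
```
# Mirrors get_cuda_bare_metal_version's parsing on a sample `nvcc -V` line.
sample = "Cuda compilation tools, release 11.3, V11.3.109"
tokens = sample.split()
release = tokens[tokens.index("release") + 1].split(".")   # ["11", "3,"]
bare_metal_major, bare_metal_minor = release[0], release[1][0]
assert (bare_metal_major, bare_metal_minor) == ("11", "3")
```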
/******************************************************************************
* Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#pragma once
#include <cuda.h>
#include <vector>
#ifdef OLD_GENERATOR_PATH
#include <ATen/CUDAGeneratorImpl.h>
#else
#include <ATen/cuda/CUDAGeneratorImpl.h>
#endif
#include <ATen/cuda/CUDAGraphsUtils.cuh>
#include <fmha_utils.h>
constexpr int TOTAL_DIM = 0;
constexpr int THREE_DIM = 1;
constexpr int H_DIM = 2;
constexpr int D_DIM = 3;
////////////////////////////////////////////////////////////////////////////////////////////////////
struct Qkv_params {
// The QKV matrices.
void * __restrict__ qkv_ptr;
// The stride between rows of the Q, K and V matrices.
// size_t qkv_stride_in_elts;
// size_t qkv_stride_in_bytes;
// TD [2022-04-16]: We're using 32-bit indexing to save registers.
// The code probably won't work for arrays larger than 2GB.
uint32_t qkv_stride_in_elts;
uint32_t qkv_stride_in_bytes;
// The number of heads.
int h;
};
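The TOTAL_DIM / THREE_DIM / H_DIM / D_DIM constants above imply a packed (total_tokens, 3, h, d) QKV layout, so the 32-bit-stride caveat in the struct translates into a size budget. A back-of-the-envelope sketch (illustrative numbers, not part of the build):
```
# Rough check against the 2 GB / 32-bit indexing caveat, assuming fp16 QKV of shape
# (total_tokens, 3, h, d).
def qkv_bytes(total_tokens, h, d, bytes_per_elt=2):
    return total_tokens * 3 * h * d * bytes_per_elt

print(qkv_bytes(64 * 1024, 16, 64) / 2**20)  # ~384 MiB, well under the 2 GB limit
```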
////////////////////////////////////////////////////////////////////////////////////////////////////
struct Fused_multihead_attention_fprop_params : public Qkv_params {
// The dQKV matrices.
void * __restrict__ dqkv_ptr;
// Temporary for dKV.
void * __restrict__ dkv_ptr;
// The O matrix (output).
void * __restrict__ o_ptr;
// The stride between rows of O.
// size_t o_stride_in_elts;
// size_t o_stride_in_bytes;
uint32_t o_stride_in_elts;
uint32_t o_stride_in_bytes;
// The pointer to the O_tmp matrix, which holds the intermediate value of O during
// the loop.
void *__restrict__ o_tmp_ptr;
// The dO matrix.
void * __restrict__ do_ptr;
// The pointer to the S matrix, overwritten by the dP matrix (bwd).
void * __restrict__ s_ptr;
// The stride between rows of the S matrix.
// int64_t s_stride_in_bytes;
uint32_t s_stride_in_bytes;
// The pointer to the softmax sum.
void * __restrict__ softmax_lse_ptr;
// The pointer to the softmax d sum.
void * __restrict__ dsoftmax_sum;
// The dimensions.
int b, s, d;
// The scaling factors for the kernel.
float scale_bmm1f;
uint32_t scale_bmm1, scale_softmax, scale_bmm2;
// Array of length b+1 holding the starting offset of each sequence.
int * __restrict__ cu_seqlens;
int *__restrict__ blockmask;
// The dropout probability (probability of keeping an activation).
float p_dropout;
uint32_t p_dropout_in_uint;
uint16_t p_dropout_in_uint16_t;
// Scale factor of 1 / (1 - p_dropout).
float rp_dropout;
// Scale factor of 1 / (1 - p_dropout), in half2.
uint32_t scale_dropout;
// Random state.
at::PhiloxCudaState philox_args;
bool is_causal;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
template<typename Kernel_params>
struct Launch_params{
Launch_params(cudaDeviceProp * props_,
cudaStream_t stream_,
bool is_dropout_,
bool return_softmax_)
: elts_per_thread(0)
, props(props_)
, stream(stream_)
, is_dropout(is_dropout_)
, return_softmax(return_softmax_) {
}
size_t elts_per_thread;
cudaDeviceProp * props;
cudaStream_t stream;
bool is_dropout;
bool return_softmax;
Kernel_params params;
int num_full_heads;
int num_main_groups;
int heads_last_wave;
int main_steps;
int rest_steps;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
void run_fmha_fp16_sm80(Launch_params<Fused_multihead_attention_fprop_params> &launch_params, const bool configure);
void run_fmha_dgrad_fp16_sm80(const Fused_multihead_attention_fprop_params &params, cudaStream_t stream);
void run_fmha_block_fp16_sm80(Launch_params<Fused_multihead_attention_fprop_params> &launch_params, const bool configure);
void run_fmha_block_dgrad_fp16_sm80(const Fused_multihead_attention_fprop_params &params, cudaStream_t stream);
/******************************************************************************
* Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#pragma once
#include <fmha/utils.h>
namespace fmha {
////////////////////////////////////////////////////////////////////////////////////////////////////
template< typename Data_type_, int NUM_ELTS_, int BITS_PER_ELT_, int ALIGNMENT_ >
struct Fragment_base_ {
// The data type.
using Data_type = Data_type_;
// default input type
using Input_type_ = Data_type_;
// Does it store the array of elements.
static constexpr bool HAS_ELTS = BITS_PER_ELT_ >= 8;
// The number of elements.
static constexpr int NUM_ELTS = NUM_ELTS_;
// The size of element in bits.
static constexpr int BITS_PER_ELT = BITS_PER_ELT_;
// The size in bytes of a single register.
static constexpr int BYTES_PER_REG = 4;
// The size in bits.
static constexpr int BITS_PER_REG = BYTES_PER_REG * 8;
// The number of registers needed to store the fragment.
static constexpr int NUM_REGS = DivUpConstexpr(NUM_ELTS * BITS_PER_ELT, BITS_PER_REG);
// The size in bytes (as returned by sizeof(Fragment_base<>)).
static constexpr int SIZE_IN_BYTES = NUM_REGS * BYTES_PER_REG;
// The alignment.
static constexpr int ALIGNMENT = ALIGNMENT_ > 0 ? ALIGNMENT_ : MinConstexpr(NUM_REGS * BYTES_PER_REG, 16);
};
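To make the register arithmetic concrete, a small sketch of NUM_REGS / SIZE_IN_BYTES for the two fragment shapes used later in this header (Fragment<uint16_t, 8> for the A/B operands, Fragment<float, 8> for the accumulator); a reading aid only:
```
# NUM_REGS = ceil(NUM_ELTS * BITS_PER_ELT / 32); SIZE_IN_BYTES = NUM_REGS * 4.
def fragment_regs(num_elts, bits_per_elt):
    num_regs = -(-(num_elts * bits_per_elt) // 32)   # ceiling division
    return num_regs, num_regs * 4

print(fragment_regs(8, 16))  # Fragment<uint16_t, 8> -> (4 regs, 16 bytes)
print(fragment_regs(8, 32))  # Fragment<float, 8>    -> (8 regs, 32 bytes)
```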
////////////////////////////////////////////////////////////////////////////////////////////////////
template<
// The type of the elements.
typename Data_type_,
// The number of elements.
int NUM_ELTS_,
// The alignment if you want to force a value -- use 0 otherwise.
int ALIGNMENT_ = 0,
// The base class.
typename Base_ = Fragment_base_<Data_type_, NUM_ELTS_, 8 * sizeof(Data_type_), ALIGNMENT_>
>
struct alignas(static_cast<int>(Base_::ALIGNMENT)) Fragment : public Base_ {
// The size of a load/store.
static constexpr int BYTES_PER_LOAD_STORE = Base_::NUM_REGS * sizeof(uint32_t);
// Clear the fragment. Using PTX in that code seems to produce better SASS...
inline __device__ void clear() {
#pragma unroll
for( int ii = 0; ii < Base_::NUM_REGS; ++ii ) {
asm volatile("mov.u32 %0, 0; \n" : "=r"(this->reg(ii)) : );
}
}
// Immutable access to a register.
inline __device__ const uint32_t& reg(int ii) const {
return this->regs_[ii];
}
// Mutable access to a register.
inline __device__ uint32_t& reg(int ii) {
return this->regs_[ii];
}
uint32_t regs_[Base_::NUM_REGS];
// Immutable access to the elements.
inline __device__ const Data_type_& elt(int ii) const {
return reinterpret_cast<const Data_type_*>(&this->regs_[0])[ii];
}
// Mutable access to the elements.
inline __device__ Data_type_& elt(int ii) {
return reinterpret_cast<Data_type_*>(&this->regs_[0])[ii];
}
// Immutable access to the elements with a cast.
template< typename Cast_type >
inline __device__ const Cast_type& elt_as(int ii) const {
return reinterpret_cast<const Cast_type*>(&this->regs_[0])[ii];
}
// Mutable access to the elements.
template< typename Cast_type >
inline __device__ Cast_type& elt_as(int ii) {
return reinterpret_cast<Cast_type*>(&this->regs_[0])[ii];
}
// Add another fragment.
inline __device__ void add(const Fragment &other) {
// TODO (TD 2022-04-09): Shouldn't this be NUM_REGS instead of NUM_ELTS?
// Also are we doing int addition or __half2 addition?
#pragma unroll
for( int ii = 0; ii < NUM_ELTS_; ++ii ) {
this->elt(ii) += other.elt(ii);
}
}
// Multiply by another fragment.
inline __device__ void hmul(const Fragment &other) {
#pragma unroll
for( int ii = 0; ii < Base_::NUM_REGS; ++ii ) {
this->reg(ii) = fmha::hmul2(this->reg(ii), other.reg(ii));
}
}
inline __device__ void hrelu_() {
#pragma unroll
for( int ii = 0; ii < Base_::NUM_REGS; ++ii ) {
this->reg(ii) = fmha::hrelu2(this->reg(ii));
}
}
};
////////////////////////////////////////////////////////////////////////////////////////////////////
template< typename Layout >
struct Fragment_a : public Fragment<uint16_t, 8> {
};
////////////////////////////////////////////////////////////////////////////////////////////////////
template< typename Layout >
struct Fragment_b : public Fragment<uint16_t, 8> {
};
////////////////////////////////////////////////////////////////////////////////////////////////////
struct Fragment_accumulator : public Fragment<float, 8> {
// The base class.
using Base = Fragment<float, 8>;
// Add two fragments.
template< typename Other_fragment_ >
inline __device__ void add(const Other_fragment_ &other) {
for( int ii = 0; ii < Base::NUM_ELTS; ++ii ) {
this->elt(ii) = this->elt(ii) + other.elt(ii);
}
}
inline __device__ void mul_(const float other) {
for( int ii = 0; ii < Base::NUM_ELTS; ++ii ) {
this->elt(ii) *= other;
}
}
// Do the HMMA.
template< typename Layout_a, typename Layout_b >
inline __device__ void mma(const Fragment_a<Layout_a> &a,
const Fragment_b<Layout_b> &b) {
asm volatile( \
"mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 \n" \
" {%0, %1, %2, %3}, \n" \
" {%4, %5, %6, %7}, \n" \
" {%8, %9}, \n" \
" {%0, %1, %2, %3}; \n" \
: "+f"( elt(0)), "+f"( elt(1)), "+f"( elt(2)), "+f"( elt(3))
: "r"(a.reg(0)), "r"(a.reg(1)), "r"(a.reg(2)), "r"(a.reg(3))
, "r"(b.reg(0)), "r"(b.reg(1)));
asm volatile( \
"mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 \n" \
" {%0, %1, %2, %3}, \n" \
" {%4, %5, %6, %7}, \n" \
" {%8, %9}, \n" \
" {%0, %1, %2, %3}; \n" \
: "+f"( elt(4)), "+f"( elt(5)), "+f"( elt(6)), "+f"( elt(7))
: "r"(a.reg(0)), "r"(a.reg(1)), "r"(a.reg(2)), "r"(a.reg(3))
, "r"(b.reg(2)), "r"(b.reg(3)));
}
};
////////////////////////////////////////////////////////////////////////////////////////////////////
template< typename Fragment, int M, int N >
inline __device__ void clear(Fragment (&frag)[M][N]) {
#pragma unroll
for( int mi = 0; mi < M; ++mi ) {
#pragma unroll
for( int ni = 0; ni < N; ++ni ) {
frag[mi][ni].clear();
}
}
}
////////////////////////////////////////////////////////////////////////////////////////////////////
template< typename Accumulator_type, int WARPS_K >
struct Clear_accumulator {
};
////////////////////////////////////////////////////////////////////////////////////////////////////
template< int WARPS_K >
struct Clear_accumulator<float, WARPS_K> {
template< typename Acc, int M, int N >
static inline __device__ void apply(Acc (&acc)[M][N], bool = false) {
fmha::clear(acc);
}
};
////////////////////////////////////////////////////////////////////////////////////////////////////
template<typename Acc, typename A, typename B, int M, int N>
inline __device__ void gemm(Acc (&acc)[M][N], const A (&a)[M], const B (&b)[N]) {
#pragma unroll
for( int mi = 0; mi < M; ++mi ) {
#pragma unroll
for( int ni = 0; ni < N; ++ni ) {
acc[mi][ni].mma(a[mi], b[ni]);
}
}
}
////////////////////////////////////////////////////////////////////////////////////////////////////
template<
// The number of rows in the CTA tile.
int M_,
// The number of cols in the CTA tile.
int N_,
// The number of elements in the K dimension of the GEMM loop.
int K_,
// The number of rows of warps.
int WARPS_M_,
// The number of cols of warps.
int WARPS_N_,
// The number of warps in the K dimension of the GEMM loop.
int WARPS_K_>
struct Cta_tile_ {
static constexpr int M = M_, N = N_, K = K_;
// The number of warps.
static constexpr int WARPS_M = WARPS_M_, WARPS_N = WARPS_N_, WARPS_K = WARPS_K_;
// The number of warps per CTA.
static constexpr int WARPS_PER_CTA = WARPS_M * WARPS_N * WARPS_K;
// The number of threads per warp.
static constexpr int THREADS_PER_WARP = 32;
// The number of threads per CTA.
static constexpr int THREADS_PER_CTA = WARPS_PER_CTA * THREADS_PER_WARP;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
template<typename Cta_tile>
struct Hmma_tile {
// The number of elements computed with a single warp-MMA.
static constexpr int M_PER_MMA = 16, N_PER_MMA = 16, K_PER_MMA = 16;
// The number of elements computed with a single CTA-MMA.
static constexpr int M_PER_MMA_PER_CTA = M_PER_MMA * Cta_tile::WARPS_M,
N_PER_MMA_PER_CTA = N_PER_MMA * Cta_tile::WARPS_N,
K_PER_MMA_PER_CTA = K_PER_MMA * Cta_tile::WARPS_K;
// The number of MMAs needed to compute the GEMM.
static constexpr int MMAS_M = DivUpConstexpr(Cta_tile::M, M_PER_MMA_PER_CTA),
MMAS_N = DivUpConstexpr(Cta_tile::N, N_PER_MMA_PER_CTA),
MMAS_K = DivUpConstexpr(Cta_tile::K, K_PER_MMA_PER_CTA);
// // The number of elements computed per warp.
// static constexpr int M_PER_WARP = MMAS_M * M_PER_MMA,
// N_PER_WARP = MMAS_N * N_PER_MMA,
// K_PER_WARP = MMAS_K * K_PER_MMA;
};
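As a concrete instance of the tile arithmetic above, a short sketch using one configuration from the launch code later in this commit (the d = 64 blocked forward pass uses FMHA_kernel_traits<256, 64, 16, 1, 4, 0x08u>, so Cta_tile_p has M = 16, N = 256, K = 64 with a 1x4x1 warp layout); illustrative only:
```
# Mirrors the Cta_tile_ / Hmma_tile constexpr arithmetic for Cta_tile_p.
def ceil_div(a, b):
    return -(-a // b)

M, N, K = 16, 256, 64
WARPS_M, WARPS_N, WARPS_K = 1, 4, 1
threads_per_cta = WARPS_M * WARPS_N * WARPS_K * 32   # 128 threads
mmas_m = ceil_div(M, 16 * WARPS_M)                   # 1
mmas_n = ceil_div(N, 16 * WARPS_N)                   # 4
mmas_k = ceil_div(K, 16 * WARPS_K)                   # 4
print(threads_per_cta, mmas_m, mmas_n, mmas_k)       # 128 1 4 4
```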
////////////////////////////////////////////////////////////////////////////////////////////////////
using A_type = uint16_t;
using B_type = uint16_t;
using C_type = uint16_t;
using Accumulator_type = float;
using Epilogue_type = float;
constexpr int BITS_PER_ELEMENT_A = sizeof(A_type) * 8;
constexpr int BITS_PER_ELEMENT_B = sizeof(B_type) * 8;
constexpr int BITS_PER_ELEMENT_C = sizeof(C_type) * 8;
////////////////////////////////////////////////////////////////////////////////////////////////////
template<int M, int N, int K, int WARPS_M, int WARPS_N, int WARPS_K>
using Cta_tile_extd = Cta_tile_<M, N, K, WARPS_M, WARPS_N, WARPS_K>;
////////////////////////////////////////////////////////////////////////////////////////////////////
template<typename Cta_tile_>
using Cta_tile_with_k_with_padding = Cta_tile_extd<Cta_tile_::M,
Cta_tile_::N,
Next_power_of_two<Cta_tile_::K>::VALUE,
Cta_tile_::WARPS_M,
Cta_tile_::WARPS_N,
Cta_tile_::WARPS_K>;
////////////////////////////////////////////////////////////////////////////////////////////////////
} // namespace fmha
/******************************************************************************
* Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#pragma once
////////////////////////////////////////////////////////////////////////////////////////////////////
template<int S, int D, int STEP, int WARPS_M, int WARPS_N, uint32_t FLAGS = 0x08u>
struct FMHA_kernel_traits {
// The CTA description for the 1st GEMM.
using Cta_tile_p = fmha::Cta_tile_extd<STEP, S, D, WARPS_M, WARPS_N, 1>;
// The CTA description for the 2nd GEMM.
using Cta_tile_o = fmha::Cta_tile_extd<STEP, D, S, WARPS_M, 1, WARPS_N>;
// Do we use one buffer for K and V?
static constexpr bool SHARE_SMEM_FOR_K_AND_V = (FLAGS & 0x08u) != 0u;
// Do we keep K in registers?
static constexpr bool K_IN_REGS = (FLAGS & 0x10u) == 0u;
// Do we keep V in registers?
static constexpr bool V_IN_REGS = (FLAGS & 0x100u) == 0u;
// The global memory tile to load Q.
using Gmem_tile_q = fmha::Gmem_tile_qkv<Cta_tile_p, fmha::BITS_PER_ELEMENT_A, STEP, D>;
// The shared memory tile to swizzle Q.
// using Smem_tile_q = fmha::Smem_tile_a<Cta_tile_p, fmha::Row, Gmem_tile_q::BYTES_PER_LDG, 1>;
using Smem_tile_q = fmha::Smem_tile_a<Cta_tile_p, fmha::Row, Gmem_tile_q::BYTES_PER_LDG, 2>;
// The global memory tile to load K.
using Gmem_tile_k = fmha::Gmem_tile_qkv<Cta_tile_p, fmha::BITS_PER_ELEMENT_B, S, D>;
// The shared memory tile to swizzle K.
using Smem_tile_k = fmha::Smem_tile_b<Cta_tile_p, fmha::Col>;
// The global memory tile to load V.
using Gmem_tile_v = fmha::Gmem_tile_qkv<Cta_tile_o, fmha::BITS_PER_ELEMENT_B, S, D>;
// The shared memory tile to swizzle V.
using Smem_tile_v = fmha::Smem_tile_v<Cta_tile_o>;
// The global memory tile to store O.
using Gmem_tile_o = fmha::Gmem_tile_o<Cta_tile_o>;
// The shared memory tile for O.
using Smem_tile_o = fmha::Smem_tile_o<Cta_tile_o>;
// The global memory tile to load/store S.
using Gmem_tile_s = fmha::Gmem_tile_mma_s<Cta_tile_p>;
// The shared memory tile to transpose S.
using Smem_tile_st = fmha::Smem_tile_mma_transposed<Cta_tile_p>;
using Gmem_tile_do = fmha::Gmem_tile_dout<Cta_tile_p>;
using Gmem_tile_dot = fmha::Gmem_tile_dout<Cta_tile_p, fmha::Gmem_tile_qkv<Cta_tile_p, fmha::BITS_PER_ELEMENT_B, S, D> >;
// The global memory tile to store the softmax sum.
using Gmem_softmax_sum = fmha::Gmem_summary_stats<Cta_tile_p>;
// The shared memory tile to store dp sum.
using Smem_dp_sum = fmha::Smem_tile_dp_sum<Gmem_tile_q, 2>;
// Make sure the number of threads match.
static_assert((int)Gmem_tile_o::THREADS_PER_ROW == (int)Smem_tile_o::THREADS_PER_ROW, "");
// The number of threads.
static constexpr int THREADS = Cta_tile_p::THREADS_PER_CTA;
// Make sure the number of threads matches both CTAs.
static_assert(THREADS == Cta_tile_o::THREADS_PER_CTA, "");
// The amount of shared memory needed to load Q and K.
static constexpr int BYTES_PER_SMEM_QK = Smem_tile_q::BYTES_PER_TILE + Smem_tile_k::BYTES_PER_TILE;
// The extra amount of shared memory needed to load V.
static constexpr int BYTES_PER_SMEM_V = SHARE_SMEM_FOR_K_AND_V ? 0u : Smem_tile_v::BYTES_PER_TILE;
// The amount of shared memory needed for Q, K and V.
static constexpr int BYTES_PER_SMEM_QKV = BYTES_PER_SMEM_QK + BYTES_PER_SMEM_V;
// The amount of shared memory needed to load Q and store O.
static constexpr int BYTES_PER_SMEM_QO = Smem_tile_q::BYTES_PER_TILE + Smem_tile_o::BYTES_PER_TILE;
// The amount of shared memory needed for Q, K, V and O.
static constexpr int BYTES_PER_SMEM = fmha::MaxConstexpr(BYTES_PER_SMEM_QKV, BYTES_PER_SMEM_QO);
// Make sure we have enough shared memory.
static_assert(Smem_tile_q::BYTES_PER_TILE + Smem_tile_o::BYTES_PER_TILE <= BYTES_PER_SMEM, "");
};
////////////////////////////////////////////////////////////////////////////////////////////////////
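The FLAGS template parameter above is only probed through three bits. A small decoding sketch for the two flag values used by the launchers later in this commit (0x08u and 0x100u); illustrative only:
```
# Decodes the SHARE_SMEM_FOR_K_AND_V / K_IN_REGS / V_IN_REGS bits of FLAGS.
def decode_flags(flags):
    return {
        "SHARE_SMEM_FOR_K_AND_V": (flags & 0x08) != 0,
        "K_IN_REGS": (flags & 0x10) == 0,
        "V_IN_REGS": (flags & 0x100) == 0,
    }

print(decode_flags(0x08))   # share K/V smem, keep K and V in registers
print(decode_flags(0x100))  # d=64 backward: separate K/V smem, V kept in smem
```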
/******************************************************************************
* Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#pragma once
namespace fmha {
template<typename Cta_tile, bool Is_causal=false>
struct Mask {
using Mma_tile = fmha::Hmma_tile<Cta_tile>;
template<typename BInfo>
__device__ Mask(const BInfo &blockInfo, int tidx, const int loop_step_idx_ = 0)
: actual_seqlen(blockInfo.actual_seqlen - loop_step_idx_ * Cta_tile::N)
, loop_step_idx(loop_step_idx_) {
const int warp = tidx / Cta_tile::THREADS_PER_WARP;
const int lane = tidx % Cta_tile::THREADS_PER_WARP;
static_assert(Cta_tile::WARPS_K == 1, "");
// find the warp in the Cta tile
const int warp_n = (warp / Cta_tile::WARPS_M);
const int warp_m = (warp % Cta_tile::WARPS_M);
// decompose warp into 8x4 tile
const int quad = lane / 4;
const int tid = (lane % 4) * 2;
row = warp_m * 16 + quad;
col = warp_n * 16 + tid;
}
inline __device__ bool is_valid(const int mi, const int ni, const int ii, const int jj) const {
// ii and jj iterate over the 2x4 fragment
// const int current_col = (Is_causal ? loop_step_idx * Cta_tile::N : 0) + ni * Mma_tile::N_PER_MMA_PER_CTA + col + (jj & 2) * 4 + (jj & 1);
const int current_col = ni * Mma_tile::N_PER_MMA_PER_CTA + col + (jj & 2) * 4 + (jj & 1);
const int current_row = row_offset + ii * 8;
const bool col_valid = current_col < actual_seqlen;
// const bool col_valid = (ni * Mma_tile::N_PER_MMA_PER_CTA + col + (jj & 2) * 4 + (jj & 1)) < actual_seqlen;
//&& (row + mi * Mma_tile::M_PER_MMA_PER_CTA + ii * 8) < actual_seqlen;
bool all_valid = Is_causal ? col_valid && (current_col + loop_step_idx * Cta_tile::N <= current_row) : col_valid;
// if ((threadIdx.x == 0) && (blockIdx.x == 0) && (blockIdx.y == 0)) {
// printf("current_col=%d, current_row=%d, actual_seqlen=%d, col_valid=%d, all_valid=%d\n", current_col, current_row, actual_seqlen, col_valid, all_valid);
// }
return Is_causal ? col_valid && (current_col + loop_step_idx * Cta_tile::N <= current_row) : col_valid;
// return row_valid && col_valid;
}
// BERT mask: if the upper-left element is invalid, none are valid.
inline __device__ bool any_valid(const int mi, const int ni) const {
return is_valid(mi, ni, 0, 0) || is_valid(mi, ni, 1, 0);
}
inline __device__ void load(const int it) {
row_offset = it * Cta_tile::M + row;
}
int row_offset;
int row;
int col;
const int loop_step_idx;
const int actual_seqlen;
};
} // namespace fmha
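To visualize the thread-to-element mapping used by `is_valid`, a small sketch that mirrors the arithmetic in the constructor, `load()` and `is_valid()` for one thread and fragment element (a reading aid, not part of the kernels; WARPS_M = 1 and Cta_tile::N = 256 as in the forward configuration):
```
# Reproduces Mask's coordinate math for a single thread / fragment element.
THREADS_PER_WARP, WARPS_M, N_PER_MMA_PER_CTA, CTA_N = 32, 1, 64, 256

def element_coords(tidx, it, ni, ii, jj, loop_step_idx, cta_m=16):
    warp, lane = tidx // THREADS_PER_WARP, tidx % THREADS_PER_WARP
    warp_n, warp_m = warp // WARPS_M, warp % WARPS_M
    quad, tid = lane // 4, (lane % 4) * 2          # 8x4 thread tile inside the warp
    row = warp_m * 16 + quad                       # as in the Mask constructor
    col = warp_n * 16 + tid
    row_offset = it * cta_m + row                  # as in Mask::load(it)
    current_row = row_offset + ii * 8
    current_col = ni * N_PER_MMA_PER_CTA + col + (jj & 2) * 4 + (jj & 1)
    # Causal part only; the full check also requires current_col < actual_seqlen.
    causal_ok = current_col + loop_step_idx * CTA_N <= current_row
    return current_row, current_col, causal_ok

print(element_coords(tidx=0, it=0, ni=0, ii=0, jj=0, loop_step_idx=0))  # (0, 0, True)
```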
/* Copyright (c) 2022, Tri Dao.
*/
#include "fmha.h"
#include "fmha_block_dgrad_kernel_1xN_loop.h"
template<typename Kernel_traits, bool Is_dropout, bool Is_causal, int loop_steps=-1>
__global__ void fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernel(Fused_multihead_attention_fprop_params params) {
fmha::compute_block_dq_dk_dv_1xN<Kernel_traits, Is_dropout, Is_causal, loop_steps>(params);
}
template<typename Kernel_traits>
void run_fmha_block_dgrad_fp16_sm80_loop_(const Fused_multihead_attention_fprop_params &params, cudaStream_t stream) {
constexpr int smem_size_softmax = Kernel_traits::Cta_tile_p::M * Kernel_traits::Cta_tile_p::WARPS_N * sizeof(float);
constexpr int smem_size_q = Kernel_traits::Smem_tile_q::BYTES_PER_TILE;
constexpr int smem_size_v = Kernel_traits::Smem_tile_v::BYTES_PER_TILE;
constexpr int smem_size_dq = Kernel_traits::Smem_tile_o::BYTES_PER_TILE;
constexpr int smem_size_dp_sum = Kernel_traits::Smem_dp_sum::BYTES_PER_TILE;
using Smem_tile_s = fmha::Smem_tile_mma_transposed<typename Kernel_traits::Cta_tile_p>;
constexpr int smem_size_s = Smem_tile_s::BYTES_PER_TILE;
static_assert(smem_size_s == 16 * Kernel_traits::Cta_tile_p::N * 2);
static_assert(smem_size_dq == 16 * Kernel_traits::Cta_tile_p::K * 4 * Kernel_traits::Cta_tile_p::WARPS_N);
static_assert(smem_size_dp_sum == 16 * 4 * 2);
constexpr int smem_size_dq_dk_dv = smem_size_q * 2 + smem_size_v * (Kernel_traits::V_IN_REGS ? 1 : 2) + smem_size_dq + smem_size_s * 2 + smem_size_dp_sum;
bool is_dropout = params.p_dropout < 1.f; // params.p_dropout is the probability of "keeping"
bool is_causal = params.is_causal;
auto kernel = is_dropout
? (is_causal ? &fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernel<Kernel_traits, true, true> : &fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernel<Kernel_traits, true, false>)
: (is_causal ? &fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernel<Kernel_traits, false, true> : &fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernel<Kernel_traits, false, false>);
constexpr int N = Kernel_traits::Cta_tile_p::N;
if (params.s == N) {
kernel = is_dropout
? (is_causal ? &fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernel<Kernel_traits, true, true, /*loop_steps=*/1> : &fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernel<Kernel_traits, true, false, /*loop_steps=*/1>)
: (is_causal ? &fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernel<Kernel_traits, false, true, /*loop_steps=*/1> : &fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernel<Kernel_traits, false, false, /*loop_steps=*/1>);
} else if (params.s == N * 2) {
kernel = is_dropout
? (is_causal ? &fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernel<Kernel_traits, true, true, /*loop_steps=*/2> : &fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernel<Kernel_traits, true, false, /*loop_steps=*/2>)
: (is_causal ? &fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernel<Kernel_traits, false, true, /*loop_steps=*/2> : &fmha_block_dgrad_fp16_sm80_dq_dk_dv_loop_kernel<Kernel_traits, false, false, /*loop_steps=*/2>);
}
if( smem_size_dq_dk_dv >= 48 * 1024 ) {
FMHA_CHECK_CUDA(cudaFuncSetAttribute(
kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size_dq_dk_dv));
}
dim3 grid(params.h, params.b);
kernel<<<grid, Kernel_traits::THREADS, smem_size_dq_dk_dv, stream>>>(params);
FMHA_CHECK_CUDA(cudaPeekAtLastError());
}
void run_fmha_block_dgrad_fp16_sm80(const Fused_multihead_attention_fprop_params &params, cudaStream_t stream) {
if (params.d == 16) {
using Kernel_traits = FMHA_kernel_traits<256, 16, 16, 1, 8, 0x08u>;
run_fmha_block_dgrad_fp16_sm80_loop_<Kernel_traits>(params, stream);
} else if (params.d == 32) {
using Kernel_traits = FMHA_kernel_traits<256, 32, 16, 1, 8, 0x08u>;
run_fmha_block_dgrad_fp16_sm80_loop_<Kernel_traits>(params, stream);
} else if (params.d == 64) {
using Kernel_traits = FMHA_kernel_traits<256, 64, 16, 1, 8, 0x100u>;
run_fmha_block_dgrad_fp16_sm80_loop_<Kernel_traits>(params, stream);
}
}
/******************************************************************************
* Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#include "fmha.h"
#include "fmha_block_fprop_kernel_1xN.h"
template<typename Kernel_traits, bool Is_dropout, bool Is_causal, bool Return_softmax>
__global__ void fmha_block_fprop_fp16_sm80_loop_kernel(Fused_multihead_attention_fprop_params params) {
fmha::device_block_1xN_loop<Kernel_traits, Is_dropout, Is_causal, Return_softmax>(params);
}
template<typename Kernel_traits>
void run_fmha_block_fp16_sm80_loop_(Launch_params<Fused_multihead_attention_fprop_params> &launch_params,
const bool configure) {
bool is_causal = launch_params.params.is_causal;
// TD [2022-04-27]: This case work is pretty ugly, maybe there's a better way?
auto kernel = launch_params.is_dropout
? (is_causal
? (launch_params.return_softmax ? &fmha_block_fprop_fp16_sm80_loop_kernel<Kernel_traits, true, true, true> : &fmha_block_fprop_fp16_sm80_loop_kernel<Kernel_traits, true, true, false>)
: (launch_params.return_softmax ? &fmha_block_fprop_fp16_sm80_loop_kernel<Kernel_traits, true, false, true> : &fmha_block_fprop_fp16_sm80_loop_kernel<Kernel_traits, true, false, false>))
: (is_causal
? (launch_params.return_softmax ? &fmha_block_fprop_fp16_sm80_loop_kernel<Kernel_traits, false, true, true> : &fmha_block_fprop_fp16_sm80_loop_kernel<Kernel_traits, false, true, false>)
: (launch_params.return_softmax ? &fmha_block_fprop_fp16_sm80_loop_kernel<Kernel_traits, false, false, true> : &fmha_block_fprop_fp16_sm80_loop_kernel<Kernel_traits, false, false, false>));
constexpr int N = Kernel_traits::Cta_tile_p::N;
const int loop_steps = (launch_params.params.s + N - 1) / N;
constexpr int smem_size_softmax_lse = Kernel_traits::Smem_dp_sum::BYTES_PER_TILE;
// Don't need smem_size_softmax_lse if we're not looping
const int smem_size = fmha::get_dynamic_smem_size<Kernel_traits>()
+ (loop_steps > 1 ? smem_size_softmax_lse : 0);
if( smem_size >= 48 * 1024 ) {
FMHA_CHECK_CUDA(cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size));
}
if (configure) {
using Mma_tile_p = fmha::Hmma_tile<typename Kernel_traits::Cta_tile_p>;
constexpr int M = Kernel_traits::Cta_tile_p::M;
size_t STEPS = (launch_params.params.s + M - 1) / M;
constexpr size_t MMAS_M = Mma_tile_p::MMAS_M;
constexpr size_t MMAS_N = Mma_tile_p::MMAS_N;
size_t elts_per_head = STEPS * MMAS_M * MMAS_N * 8 * loop_steps;
launch_params.elts_per_thread = elts_per_head;
return;
}
dim3 grid(launch_params.params.h, launch_params.params.b);
kernel<<<grid, Kernel_traits::THREADS, smem_size, launch_params.stream>>>(
launch_params.params);
FMHA_CHECK_CUDA(cudaPeekAtLastError());
}
void run_fmha_block_fp16_sm80(Launch_params<Fused_multihead_attention_fprop_params> &launch_params,
const bool configure) {
if (launch_params.params.d == 16) {
using Kernel_traits = FMHA_kernel_traits<256, 16, 16, 1, 4, 0x08u>;
run_fmha_block_fp16_sm80_loop_<Kernel_traits>(launch_params, configure);
} else if (launch_params.params.d == 32) {
using Kernel_traits = FMHA_kernel_traits<256, 32, 16, 1, 4, 0x08u>;
run_fmha_block_fp16_sm80_loop_<Kernel_traits>(launch_params, configure);
} else if (launch_params.params.d == 64) {
using Kernel_traits = FMHA_kernel_traits<256, 64, 16, 1, 4, 0x08u>;
run_fmha_block_fp16_sm80_loop_<Kernel_traits>(launch_params, configure);
}
}
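As a worked example of the `configure` branch above: for s = 256 and d = 64 (Cta_tile_p = <16, 256, 64> with 1x4x1 warps, so MMAS_M = 1 and MMAS_N = 4 as in the tile sketch earlier), the per-thread element count comes out as follows; illustrative only:
```
# elts_per_head = STEPS * MMAS_M * MMAS_N * 8 * loop_steps for s=256, d=64.
def ceil_div(a, b):
    return -(-a // b)

s, M, N = 256, 16, 256          # sequence length, Cta_tile_p::M, Cta_tile_p::N
mmas_m, mmas_n = 1, 4
steps = ceil_div(s, M)          # 16
loop_steps = ceil_div(s, N)     # 1
print(steps * mmas_m * mmas_n * 8 * loop_steps)   # 512 elements per thread
```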
/******************************************************************************
* Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#pragma once
#include <fmha.h>
#include <fmha/utils.h>
#include <fmha/smem_tile.h>
#include <fmha/gmem_tile.h>
#include <fmha/mask.h>
#include <fmha/softmax.h>
namespace fmha {
////////////////////////////////////////////////////////////////////////////////////////////////////
struct Blockmask {
template<typename Params>
__device__ Blockmask(const Params &params, int loop_step_idx) :
blockmask_ptr(params.blockmask + loop_step_idx * params.s / 16) {
}
__device__ int mask_val(int block_row_idx) const {
return blockmask_ptr[block_row_idx];
}
const int *blockmask_ptr;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
} // namespace fmha
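The Blockmask constructor above only does pointer arithmetic; a minimal sketch of the implied indexing, under the assumption (not stated in this header) that `params.blockmask` holds `s / 16` ints per loop step:
```
# Illustrative indexing of a flattened blockmask (assumed layout: loop_steps x (s // 16)).
s = 256
blocks_per_step = s // 16                       # 16 ints per loop step
blockmask = list(range(2 * blocks_per_step))    # dummy contents for two loop steps

def mask_val(loop_step_idx, block_row_idx):
    # blockmask_ptr = params.blockmask + loop_step_idx * params.s / 16
    return blockmask[loop_step_idx * blocks_per_step + block_row_idx]

print(mask_val(1, 3))   # 19
```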
/******************************************************************************
* Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#include "fmha.h"
#include "fmha_fprop_kernel_1xN.h"
// #include "fmha_dgrad_kernel_1xN_reload.h"
#include "fmha_dgrad_kernel_1xN_reload_recompute.h"
using Kernel_traits = FMHA_kernel_traits<512, 64, 16, 1, 8, 0x08u>;
// extern "C" __global__ void fmha_dgrad_fp16_512_64_sm80_dv_kernel(Fused_multihead_attention_fprop_params params) {
// fmha::compute_dv_1xN<Kernel_traits>(params);
// }
// extern "C" __global__ void fmha_dgrad_fp16_512_64_sm80_dq_dk_kernel(Fused_multihead_attention_fprop_params params) {
// fmha::compute_dq_dk_1xN<Kernel_traits>(params);
// }
extern "C" __global__ void fmha_dgrad_fp16_512_64_sm80_dp_dq_kernel(Fused_multihead_attention_fprop_params params) {
fmha::compute_dp_dq_1xN<Kernel_traits>(params);
}
extern "C" __global__ void fmha_dgrad_fp16_512_64_sm80_dv_dk_kernel(Fused_multihead_attention_fprop_params params) {
fmha::compute_dv_dk_1xN<Kernel_traits>(params);
}
void run_fmha_dgrad_fp16_512_64_sm80(const Fused_multihead_attention_fprop_params &params, cudaStream_t stream) {
constexpr int smem_size_softmax = Kernel_traits::Cta_tile_p::M * Kernel_traits::Cta_tile_p::WARPS_N * sizeof(float);
constexpr int smem_size_q = Kernel_traits::Smem_tile_q::BYTES_PER_TILE;
constexpr int smem_size_v = Kernel_traits::Smem_tile_v::BYTES_PER_TILE;
constexpr int smem_size_o = Kernel_traits::Smem_tile_o::BYTES_PER_TILE;
using Smem_tile_s = fmha::Smem_tile_mma_transposed< Kernel_traits::Cta_tile_p>;
constexpr int smem_size_s = Smem_tile_s::BYTES_PER_TILE;
static_assert(smem_size_s == 16 * 512 * 2);
static_assert(smem_size_o == 16 * 64 * 4 * Kernel_traits::Cta_tile_p::WARPS_N);
// constexpr int smem_size_dp_dq = smem_size_s + 2 * smem_size_q + smem_size_v + smem_size_softmax;
// constexpr int smem_size_dv_dk = smem_size_s + smem_size_o + smem_size_q + smem_size_v;
constexpr int smem_size_dp_dq = smem_size_q * 2 + smem_size_q + smem_size_v + smem_size_o;
constexpr int smem_size_dv_dk = smem_size_q + smem_size_q + smem_size_v + smem_size_o + smem_size_s;
if( smem_size_dp_dq >= 48 * 1024 ) {
FMHA_CHECK_CUDA(cudaFuncSetAttribute(
// fmha_dgrad_fp16_512_64_sm80_dv_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size_dp_dq));
fmha_dgrad_fp16_512_64_sm80_dp_dq_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size_dp_dq));
}
if( smem_size_dv_dk >= 48 * 1024 ) {
FMHA_CHECK_CUDA(cudaFuncSetAttribute(
fmha_dgrad_fp16_512_64_sm80_dv_dk_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size_dv_dk));
}
dim3 grid(params.h, params.b);
// fmha_dgrad_fp16_512_64_sm80_dv_kernel<<<grid, Kernel_traits::THREADS, smem_size_dp_dq, stream>>>(params);
// fmha_dgrad_fp16_512_64_sm80_dp_dq_kernel<<<grid, Kernel_traits::THREADS, smem_size_dp_dq, stream>>>(params);
fmha_dgrad_fp16_512_64_sm80_dp_dq_kernel<<<grid, Kernel_traits::THREADS, smem_size_dp_dq, stream>>>(params);
fmha_dgrad_fp16_512_64_sm80_dv_dk_kernel<<<grid, Kernel_traits::THREADS, smem_size_dv_dk, stream>>>(params);
FMHA_CHECK_CUDA(cudaPeekAtLastError());
}