Unverified commit cc92a4b4 authored by Jithun Nair, committed by GitHub

Merge pull request #55 from ROCmSoftwarePlatform/IFU-master-2021-10-15

IFU-2021-10-15 (+ remove redundant defines + C10_CUDA_CHECK)
parents 1e0f9bc6 fec3141c
@@ -4,6 +4,147 @@ build
docs/build
*~
__pycache__
.vscode
# Copied from https://raw.githubusercontent.com/github/gitignore/master/Python.gitignore
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
*.hip
*_hip.*
*hip*
@@ -2,3 +2,6 @@
path = apex/contrib/csrc/multihead_attn/cutlass
url = https://github.com/NVIDIA/cutlass.git
branch = v1.2.0
[submodule "apex/contrib/csrc/cudnn-frontend"]
path = apex/contrib/csrc/cudnn-frontend
url = https://github.com/NVIDIA/cudnn-frontend.git
@@ -21,3 +21,4 @@ from . import pyprof
# Common utilities to run tests on ROCm.
from . import testing
from . import transformer
import torch
def _cast_if_autocast_enabled(*args):
if not torch.is_autocast_enabled():
return args
else:
return torch.cuda.amp.autocast_mode._cast(args, torch.get_autocast_gpu_dtype())
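A minimal usage sketch (illustration only; FusedAdd and fused_add are hypothetical names, not part of this change) of the pattern this helper supports: cast the inputs once, then run a custom autograd function with autocast disabled so it sees uniformly typed arguments.
class FusedAdd(torch.autograd.Function):
    # Hypothetical custom op standing in for a real fused kernel.
    @staticmethod
    def forward(ctx, x, y):
        return x + y

    @staticmethod
    def backward(ctx, grad_out):
        return grad_out, grad_out

def fused_add(x, y):
    args = _cast_if_autocast_enabled(x, y)
    with torch.cuda.amp.autocast(enabled=False):
        return FusedAdd.apply(*args)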
from .bottleneck import Bottleneck, SpatialBottleneck
import os
import torch
from maskrcnn_benchmark.modeling.backbone.resnet import Bottleneck
from maskrcnn_benchmark.layers.nhwc import nhwc_to_nchw_transform, nchw_to_nhwc_transform
from maskrcnn_benchmark.layers.nhwc.batch_norm import FrozenBatchNorm2d_NHWC
from apex.contrib.bottleneck import Bottleneck as FastBottleneck
from apex.contrib.bottleneck import SpatialBottleneck
def single_module_test(ref, rank, world_size, numtype, device, shape, fast, spatial_group_size, in_channels, bottleneck_channels, out_channels, num_groups, stride_in_1x1, stride, dilation, norm_func, nhwc):
# inputs + modules
with torch.no_grad():
input_shape = [1, in_channels] + list(shape)
x = torch.randn(input_shape, dtype=numtype, device=device)
if nhwc:
x = nchw_to_nhwc_transform(x).contiguous()
x.requires_grad = True
print(x.shape, x.stride())
#if spatial_group_size > 1:
# fast = False # hack so fast bottleneck can be run against distributed bottleneck
#if spatial_group_size == 1:
# fast = False
if fast:
if spatial_group_size == 1:
bottleneck = FastBottleneck(
in_channels=in_channels,
bottleneck_channels=bottleneck_channels,
out_channels=out_channels,
stride=stride,
dilation=dilation,
explicit_nhwc=nhwc,
use_cudnn=True)
else:
bottleneck = SpatialBottleneck(
in_channels=in_channels,
bottleneck_channels=bottleneck_channels,
out_channels=out_channels,
stride=stride,
dilation=dilation,
explicit_nhwc=nhwc,
use_cudnn=True,
spatial_group_size=spatial_group_size)
else:
bottleneck = Bottleneck(
in_channels,
bottleneck_channels,
out_channels,
num_groups,
stride_in_1x1,
stride,
dilation,
norm_func,
nhwc,
spatial_group_size)
bottleneck = bottleneck.to(dtype=numtype,device=device)
weights = dict(bottleneck.named_parameters())
if ref is not None:
ref_x, _, ref_weights = ref
Hs,H = x.shape[1], ref_x.shape[1]
assert Hs * spatial_group_size == H, "H must equal Hs * spatial_group_size"
ref_x = ref_x[:,rank*Hs:(rank+1)*Hs,:,:]
x.copy_(ref_x)
assert(len(weights) == len(ref_weights)), "Reference weights and weights don't match"
for k in weights.keys():
weights[k].copy_(ref_weights[k])
# forward
out = bottleneck(x)
# gradient output
with torch.no_grad():
grad_out = torch.randn_like(out)
if ref is not None:
_, ref_grad_out, _ = ref
Hs,H = grad_out.shape[1], ref_grad_out.shape[1]
assert Hs * spatial_group_size == H, "H must equal Hs * spatial_group_size"
ref_grad_out = ref_grad_out[:,rank*Hs:(rank+1)*Hs,:,:]
grad_out.copy_(ref_grad_out)
# backward
out.backward(grad_out)
with torch.no_grad():
dgrad = x.grad.detach()
wgrad = {}
for n,p in bottleneck.named_parameters():
wgrad[n] = p.grad.detach()
if world_size > 1:
if spatial_group_size == 1:
# broadcast x, grad_out and weights from rank 0
with torch.no_grad():
torch.distributed.broadcast(x,0)
torch.distributed.broadcast(grad_out,0)
for k in weights.keys():
torch.distributed.broadcast(weights[k],0)
else:
# gather dgrad (x.grad), sum wgrad (weights) and out
N,Hs,W,C = dgrad.shape
H = Hs * spatial_group_size
dgrad_gathered = torch.empty((N,H,W,C),dtype=dgrad.dtype,device=dgrad.device)
dgrad_tensors = [dgrad_gathered[:,i*Hs:(i+1)*Hs,:,:] for i in range(spatial_group_size)]
torch.distributed.all_gather(dgrad_tensors, dgrad)
dgrad = dgrad_gathered
N,Hs,W,C = list(out.shape)
H = Hs * spatial_group_size
out_gathered = torch.empty((N,H,W,C),dtype=dgrad.dtype,device=dgrad.device)
out_tensors = [out_gathered[:,i*Hs:(i+1)*Hs,:,:] for i in range(spatial_group_size)]
torch.distributed.all_gather(out_tensors, out)
out = out_gathered
for k in wgrad.keys():
w = wgrad[k].to(dtype=torch.float64)
torch.distributed.all_reduce(w)
wgrad[k].copy_(w.to(dtype=wgrad[k].dtype))
#torch.distributed.all_reduce(wgrad[k])
return x, out, grad_out, weights, dgrad, wgrad
def module_tests(rank, world_size, numtype, device, fast, spatial_group_sizes, init_args):
r = []
for ia in init_args:
shape = ia[0:4]
args = ia[4:]
rr = []
ref = None
for spatial_group_size in spatial_group_sizes:
N,H,W,C = shape
H = H//spatial_group_size
x, out, grad_out, weights, dgrad, wgrad = single_module_test(ref, rank, world_size, numtype, device, [H,W], fast, spatial_group_size, *args)
if ref is None:
assert(spatial_group_size == 1), "Wrong reference weights"
ref = x, grad_out, weights
if rank == 0:
rr.append( (out, dgrad, wgrad) )
if world_size > 1: torch.distributed.barrier()
r.append(rr)
return r
def main():
total_num_gpus = int(os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 1
distributed = total_num_gpus > 1
ngpus = torch.cuda.device_count()
if distributed:
torch.distributed.init_process_group("nccl")
rank, world_size = torch.distributed.get_rank(), torch.distributed.get_world_size()
is_master = (rank == 0)
local_rank = rank % ngpus
torch.cuda.set_device(local_rank)
spatial_group_size = total_num_gpus
else:
rank, local_rank, is_master, world_size, spatial_group_size = 0, 0, True, 1, 1
torch.use_deterministic_algorithms(True)
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True
torch.backends.cuda.matmul.allow_tf32 = False
torch.backends.cudnn.allow_tf32 = False
norm_func = FrozenBatchNorm2d_NHWC
init_args = [
(1, 200, 336, 64, 64, 64, 256, 1, True, 1, 1, norm_func, True),
(1, 200, 336, 256, 256, 64, 256, 1, True, 1, 1, norm_func, True),
(1, 200, 336, 256, 256, 128, 512, 1, True, 2, 1, norm_func, True),
(1, 100, 168, 512, 512, 128, 512, 1, True, 1, 1, norm_func, True),
(1, 100, 168, 512, 512, 256, 1024, 1, True, 2, 1, norm_func, True),
(1, 50, 84, 1024, 1024, 256, 1024, 1, True, 1, 1, norm_func, True),
(1, 50, 84, 1024, 1024, 512, 2048, 1, True, 2, 1, norm_func, True),
(1, 25, 42, 2048, 2048, 512, 2048, 1, True, 1, 1, norm_func, True),
(1, 336, 200, 64, 64, 64, 256, 1, True, 1, 1, norm_func, True),
(1, 336, 200, 256, 256, 64, 256, 1, True, 1, 1, norm_func, True),
(1, 336, 200, 256, 256, 128, 512, 1, True, 2, 1, norm_func, True),
(1, 168, 100, 512, 512, 128, 512, 1, True, 1, 1, norm_func, True),
(1, 168, 100, 512, 512, 256, 1024, 1, True, 2, 1, norm_func, True),
(1, 84, 50, 1024, 1024, 256, 1024, 1, True, 1, 1, norm_func, True),
(1, 84, 50, 1024, 1024, 512, 2048, 1, True, 2, 1, norm_func, True),
(1, 42, 25, 2048, 2048, 512, 2048, 1, True, 1, 1, norm_func, True),
]
init_args = init_args[0:1]
# pad H to account for spatial distribution
padded_init_args = []
for ia in init_args:
N,H,W,C = ia[0:4]
m = spatial_group_size * H // (25 if H < W else 42)
H = ((H + m - 1) // m) * m
args = tuple( [N,H,W,C] + list(ia[4:]) )
padded_init_args.append(args)
init_args = padded_init_args
if rank == 0:
for ia in init_args:
print(ia)
spatial_group_sizes = [1]
if spatial_group_size > 1:
spatial_group_sizes.append(spatial_group_size)
numtype, device, fast = torch.float16, 'cuda', True
r = module_tests(rank, world_size, numtype, device, fast, spatial_group_sizes, init_args)
if world_size > 1: torch.distributed.barrier()
if rank == 0:
for rr in r:
print("***")
for out, dgrad, wgrad in rr:
gr = [("out",out.norm(p=2,dtype=torch.float64).item())]
gr = gr + [("dgrad",dgrad.norm(p=2,dtype=torch.float64).item())]
gr = gr + [(k+".wgrad",wgrad[k].norm(p=2,dtype=torch.float64).item()) for k in wgrad.keys()]
print(gr)
if len(rr) == 2:
out1, dgrad1, wgrad1 = rr[0]
out2, dgrad2, wgrad2 = rr[1]
rtol = 1e-1
out_atol = out1.abs().max().item() * rtol
dgrad_atol = dgrad1.abs().max().item() * rtol
wgrad_atol = {}
for k in wgrad1.keys():
wgrad_atol[k] = wgrad1[k].abs().max().item() * rtol
gr = [("out",torch.allclose(out1,out2,rtol,out_atol,equal_nan=True))]
gr = gr + [("dgrad",torch.allclose(dgrad1,dgrad2,rtol,dgrad_atol,equal_nan=True))]
gr = gr + [(k+".wgrad",torch.allclose(wgrad1[k],wgrad2[k],rtol,wgrad_atol[k],equal_nan=True)) for k in wgrad1.keys()]
print(gr)
gr = [("out",(out1-out2).norm(p=2,dtype=torch.float64).item())]
gr = gr + [("dgrad",(dgrad1-dgrad2).norm(p=2,dtype=torch.float64).item())]
gr = gr + [(k+".wgrad",(wgrad1[k]-wgrad2[k]).norm(p=2,dtype=torch.float64).item()) for k in wgrad1.keys()]
print(gr)
N,H,W,C = out1.shape
Hs = H // spatial_group_size
Ht = Hs-2
print("out1@%d:%d=%s" % (Ht,H,str(out1[0,Ht,:8,:5])))
print("out2@%d:%d=%s" % (Ht,H,str(out2[0,Ht,:8,:5])))
Ht = Hs-1
print("out1@%d:%d=%s" % (Ht,H,str(out1[0,Ht,:8,:5])))
print("out2@%d:%d=%s" % (Ht,H,str(out2[0,Ht,:8,:5])))
Ht = Hs
print("out1@%d:%d=%s" % (Ht,H,str(out1[0,Ht,:8,:5])))
print("out2@%d:%d=%s" % (Ht,H,str(out2[0,Ht,:8,:5])))
Ht = Hs+1
print("out1@%d:%d=%s" % (Ht,H,str(out1[0,Ht,:8,:5])))
print("out2@%d:%d=%s" % (Ht,H,str(out2[0,Ht,:8,:5])))
N,H,W,C = dgrad1.shape
Hs = H // spatial_group_size
Ht = Hs-2
print("dgrad1@%d:%d=%s" % (Ht,H,str(dgrad1[0,Ht,:8,:5])))
print("dgrad2@%d:%d=%s" % (Ht,H,str(dgrad2[0,Ht,:8,:5])))
Ht = Hs-1
print("dgrad1@%d:%d=%s" % (Ht,H,str(dgrad1[0,Ht,:8,:5])))
print("dgrad2@%d:%d=%s" % (Ht,H,str(dgrad2[0,Ht,:8,:5])))
Ht = Hs
print("dgrad1@%d:%d=%s" % (Ht,H,str(dgrad1[0,Ht,:8,:5])))
print("dgrad2@%d:%d=%s" % (Ht,H,str(dgrad2[0,Ht,:8,:5])))
Ht = Hs+1
print("dgrad1@%d:%d=%s" % (Ht,H,str(dgrad1[0,Ht,:8,:5])))
print("dgrad2@%d:%d=%s" % (Ht,H,str(dgrad2[0,Ht,:8,:5])))
if world_size > 1: torch.distributed.barrier()
if __name__ == "__main__":
main()
import torch
from bottleneck import Bottleneck
torch.manual_seed(23337)
# Set DEBUG = True to print layerwise sums for all outputs in the reference code path.
DEBUG = False
for stride, o_channel in [(1,32), (1,128), (2,32)]:
print("testing stride ==", stride, ", in_channel == 32 , out_channel ==", o_channel)
a_ = torch.randn(17,32,28,28)
a = a_.cuda().half().to(memory_format=torch.channels_last).requires_grad_()
model = Bottleneck(32,8,o_channel,stride=stride).cuda().half().to(memory_format=torch.channels_last)
# test model
b = model(a)
b.mean().backward()
d_grad = a.grad.float()
a.grad = None
torch.cuda.synchronize()
if DEBUG:
print("[DEBUG] ref dx :", d_grad.sum().item())
# Print wgrad. No need to reset gradients here, since the later C++ path prints before accumulation.
for i, w in enumerate(model.w_conv):
print("[DEBUG] ref wgrad{} :".format(i+1), w.grad.sum().item())
wgrads = []
for w in model.w_conv:
wgrads.append(w.grad.float())
model.use_cudnn = True
model.zero_grad()
c = model(a)
c.mean().backward()
torch.cuda.synchronize()
print("comparing native and channels_last:")
print("max error fprop:", (b-c).abs().max().item(), "max elem:", b.abs().max().item())
print("max error dgrad:", (d_grad-a.grad.float()).abs().max().item(), "max elem:", d_grad.abs().max().item())
for i, (w, wgrad) in enumerate(zip(model.w_conv, wgrads)):
print("max error wgrad{}:".format(i+1), (wgrad - w.grad.float()).abs().max().item(), "max elem:", wgrad.abs().max().item())
nhwc_a = a_.permute(0,2,3,1).contiguous().cuda().half().requires_grad_()
nhwc_model = Bottleneck(32,8,o_channel,stride=stride,explicit_nhwc=True, use_cudnn=True).cuda().half()
for p,q in zip(model.parameters(), nhwc_model.parameters()):
# The model's storage is already NHWC; clone its parameters and assign them to the explicit-NHWC model.
q.data.copy_(p.data.permute(0,2,3,1).contiguous())
for p,q in zip(model.buffers(), nhwc_model.buffers()):
q.data.copy_(p.data)
d = nhwc_model(nhwc_a)
d.mean().backward()
torch.cuda.synchronize()
# Rebuild the reference tensors from the channels_last cuDNN run, permuted to explicit NHWC for comparison.
#c_s = c.storage().tolist()
#d_s = d.storage().tolist()
#print(max([x-y for x,y in zip(c_s,d_s)]))
c = c.contiguous(memory_format=torch.contiguous_format).permute(0,2,3,1).contiguous()
d_grad = a.grad.float().permute(0,2,3,1).contiguous()
wgrads = []
for w in model.w_conv:
wgrads.append(w.grad.float().permute(0,2,3,1).contiguous())
torch.cuda.synchronize()
print("comparing nhwc and channels_last:")
print("max error fprop:", (d-c).abs().max().item(), "max elem:", c.abs().max().item())
print("max error dgrad:", (d_grad-nhwc_a.grad.float()).abs().max().item(), "max elem:", d_grad.abs().max().item())
for i, (w, wgrad) in enumerate(zip(nhwc_model.w_conv, wgrads)):
print("max error wgrad{}:".format(i+1), (wgrad - w.grad.float()).abs().max().item(), "max elem:", wgrad.abs().max().item())
Subproject commit b4e1ad9613b89199982c9baf6ee91f6f98f5606d
/******************************************************************************
* Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#include <torch/extension.h>
#include <ATen/cuda/CUDAContext.h>
#include "fmha.h"
void set_params(Fused_multihead_attention_fprop_params &params,
// sizes
const size_t b,
const size_t s,
const size_t h,
const size_t d,
// device pointers
void *qkv_packed_d,
void *cu_seqlens_d,
void *o_packed_d,
void *s_d,
float p_dropout) {
Data_type acc_type = DATA_TYPE_FP32;
Data_type data_type = DATA_TYPE_FP16;
// Reset the parameters
memset(&params, 0, sizeof(params));
// Set the pointers and strides.
params.qkv_ptr = qkv_packed_d;
params.qkv_stride_in_bytes = get_size_in_bytes(h * 3 * d, data_type);
params.o_ptr = o_packed_d;
params.o_stride_in_bytes = get_size_in_bytes(h * d, data_type);
params.cu_seqlens = static_cast<int *>(cu_seqlens_d);
// S = softmax(P)
params.s_ptr = s_d;
params.s_stride_in_bytes = get_size_in_bytes(b * h * s, data_type);
// Set the dimensions.
params.b = b;
params.h = h;
params.s = s;
params.d = d;
// Set the different scale values.
const float scale_bmm1 = 1.f / sqrtf(d);
constexpr float scale_softmax = 1.f;
constexpr float scale_bmm2 = 1.f;
set_alpha(params.scale_bmm1, scale_bmm1, acc_type);
set_alpha(params.scale_softmax, scale_softmax, acc_type);
set_alpha(params.scale_bmm2, scale_bmm2, data_type);
// Set this to probability of keeping an element to simplify things.
params.p_dropout = 1.f - p_dropout;
params.rp_dropout = 1.f / params.p_dropout;
TORCH_CHECK(p_dropout < 1.f);
set_alpha(params.scale_dropout, params.rp_dropout, data_type);
}
std::vector<at::Tensor>
mha_fwd(const at::Tensor &qkv, // total x num_heads x 3 x head_size, total := \sum_{i=0}^{b} s_i
const at::Tensor &cu_seqlens, // b+1
const float p_dropout,
const int max_seq_len,
const bool is_training,
c10::optional<at::Generator> gen_) {
auto dprops = at::cuda::getCurrentDeviceProperties();
TORCH_CHECK(dprops->major == 8 && dprops->minor == 0);
int seq_len = 512;
auto launch = &run_fmha_fp16_512_64_sm80;
if( max_seq_len <= 128 ) {
seq_len = 128;
launch = &run_fmha_fp16_128_64_sm80;
} else if( max_seq_len <= 256 ) {
seq_len = 256;
launch = &run_fmha_fp16_256_64_sm80;
} else if( max_seq_len <= 384 ) {
seq_len = 384;
launch = &run_fmha_fp16_384_64_sm80;
} else if( max_seq_len <= 512 ) {
seq_len = 512;
launch = &run_fmha_fp16_512_64_sm80;
} else {
TORCH_CHECK(false);
}
constexpr int warps_m = 1;
constexpr int warps_n = 4; // this leads to an upper bound
const int mmas_m = seq_len / 16 / warps_m;
const int mmas_n = seq_len / 16 / warps_n;
const int elts_per_thread = 8 * mmas_m * mmas_n;
auto stream = at::cuda::getCurrentCUDAStream().stream();
TORCH_CHECK(qkv.dtype() == torch::kFloat16);
TORCH_CHECK(cu_seqlens.dtype() == torch::kInt32);
TORCH_CHECK(qkv.is_cuda())
TORCH_CHECK(cu_seqlens.is_cuda())
TORCH_CHECK(qkv.is_contiguous())
TORCH_CHECK(cu_seqlens.is_contiguous())
TORCH_CHECK(cu_seqlens.dim() == 1);
TORCH_CHECK(qkv.dim() == 4);
const auto sizes = qkv.sizes();
TORCH_CHECK(sizes[THREE_DIM] == 3);
const int batch_size = cu_seqlens.numel() - 1;
const int total = sizes[TOTAL_DIM];
const int num_heads = sizes[H_DIM];
const int head_size = sizes[D_DIM];
TORCH_CHECK(batch_size > 0);
TORCH_CHECK(head_size == 64);
auto opts = qkv.options();
auto ctx = torch::empty({ total, num_heads, head_size }, opts);
auto s = torch::empty({ batch_size, num_heads, seq_len, seq_len }, opts);
auto gen = at::get_generator_or_default<at::CUDAGeneratorImpl>(
gen_, at::cuda::detail::getDefaultCUDAGenerator());
Fused_multihead_attention_fprop_params params;
set_params(params,
batch_size,
seq_len,
num_heads,
head_size,
qkv.data_ptr(),
cu_seqlens.data_ptr(),
ctx.data_ptr(),
s.data_ptr(),
p_dropout);
// Number of random values generated per thread, used to offset the Philox counter in the RNG state.
int64_t counter_offset = elts_per_thread;
at::PhiloxCudaState rng_engine_inputs;
if( is_training ) {
// See Note [Acquire lock when using random generators]
std::lock_guard<std::mutex> lock(gen->mutex_);
params.philox_args = gen->philox_cuda_state(counter_offset);
}
launch(params, is_training, stream);
return { ctx, s };
}
std::vector<at::Tensor>
mha_bwd(const at::Tensor &dout, // total x num_heads, x head_size
const at::Tensor &qkv, // total x num_heads x 3 x head_size, total := \sum_{i=0}^{b} s_i
at::Tensor &softmax, // b x h x s x s softmax and dmask - will be overwritten with dP
const at::Tensor &cu_seqlens, // b+1
const float p_dropout, // probability to drop
const int max_seq_len // max sequence length to choose the kernel
) {
auto dprops = at::cuda::getCurrentDeviceProperties();
TORCH_CHECK(dprops->major == 8 && dprops->minor == 0);
int seq_len = 512;
auto launch = &run_fmha_dgrad_fp16_512_64_sm80;
if( max_seq_len <= 128 ) {
seq_len = 128;
launch = &run_fmha_dgrad_fp16_128_64_sm80;
} else if( max_seq_len <= 256 ) {
seq_len = 256;
launch = &run_fmha_dgrad_fp16_256_64_sm80;
} else if( max_seq_len <= 384 ) {
seq_len = 384;
launch = &run_fmha_dgrad_fp16_384_64_sm80;
} else if( max_seq_len <= 512 ) {
seq_len = 512;
launch = &run_fmha_dgrad_fp16_512_64_sm80;
} else {
TORCH_CHECK(false);
}
auto stream = at::cuda::getCurrentCUDAStream().stream();
TORCH_CHECK(qkv.dtype() == torch::kFloat16);
TORCH_CHECK(dout.dtype() == torch::kFloat16);
TORCH_CHECK(softmax.dtype() == torch::kFloat16);
TORCH_CHECK(cu_seqlens.dtype() == torch::kInt32);
TORCH_CHECK(qkv.is_cuda());
TORCH_CHECK(cu_seqlens.is_cuda());
TORCH_CHECK(qkv.is_contiguous());
TORCH_CHECK(cu_seqlens.is_contiguous());
TORCH_CHECK(cu_seqlens.dim() == 1);
TORCH_CHECK(qkv.dim() == 4);
const auto sizes = qkv.sizes();
TORCH_CHECK(sizes[THREE_DIM] == 3);
const int batch_size = cu_seqlens.numel() - 1;
const int num_heads = sizes[H_DIM];
const int head_size = sizes[D_DIM];
TORCH_CHECK(batch_size > 0);
TORCH_CHECK(head_size == 64);
auto dqkv = torch::empty_like(qkv);
Fused_multihead_attention_fprop_params params;
set_params(params,
batch_size,
seq_len,
num_heads,
head_size,
qkv.data_ptr(),
cu_seqlens.data_ptr(),
dout.data_ptr(), // we set o_ptr to dout
softmax.data_ptr(), // softmax gets overwritten by dP!
p_dropout);
// we're re-using these scales
Data_type acc_type = DATA_TYPE_FP32;
set_alpha(params.scale_bmm1, 1.f, acc_type);
set_alpha(params.scale_softmax, 1.f / sqrtf(head_size), acc_type);
set_alpha(params.scale_bmm2, 1.f, DATA_TYPE_FP16);
params.dqkv_ptr = dqkv.data_ptr();
launch(params, stream);
return { dqkv, softmax };
}
std::vector<at::Tensor> mha_fwd_nl(const at::Tensor &qkv, // total x num_heads x 3 x head_size, total := \sum_{i=0}^{b} s_i
const at::Tensor &cu_seqlens, // b+1
const float p_dropout,
const int max_seq_len,
const bool is_training,
c10::optional<at::Generator> gen_) {
int seq_len = 512;
auto launch = &run_fmha_fp16_512_64_sm80_nl;
TORCH_CHECK(max_seq_len == seq_len);
constexpr int warps_m = 1;
constexpr int warps_n = 4; // this leads to an upper bound
const int mmas_m = seq_len / 16 / warps_m;
const int mmas_n = seq_len / 16 / warps_n;
// static_assert( mmas_m == 32 );
// static_assert( mmas_n == 4 );
const int elts_per_thread = 8 * mmas_m * mmas_n;
auto stream = at::cuda::getCurrentCUDAStream().stream();
TORCH_CHECK(qkv.is_cuda())
TORCH_CHECK(cu_seqlens.is_cuda())
TORCH_CHECK(qkv.is_contiguous())
TORCH_CHECK(cu_seqlens.is_contiguous())
TORCH_CHECK(cu_seqlens.dim() == 1);
TORCH_CHECK(qkv.dim() == 4);
const auto sizes = qkv.sizes();
TORCH_CHECK(sizes[THREE_DIM] == 3);
const int batch_size = cu_seqlens.numel() - 1;
const int total = sizes[TOTAL_DIM];
const int num_heads = sizes[H_DIM];
const int head_size = sizes[D_DIM];
TORCH_CHECK(batch_size > 0);
TORCH_CHECK(head_size == 64);
auto opts = qkv.options();
auto ctx = torch::empty({ total, num_heads, head_size }, opts);
auto s = torch::empty({ batch_size, num_heads, seq_len, seq_len }, opts);
auto gen = at::get_generator_or_default<at::CUDAGeneratorImpl>(gen_, at::cuda::detail::getDefaultCUDAGenerator());
Fused_multihead_attention_fprop_params params;
set_params(params,
batch_size,
seq_len,
num_heads,
head_size,
qkv.data_ptr(),
cu_seqlens.data_ptr(),
ctx.data_ptr(),
s.data_ptr(),
p_dropout);
// Number of random values generated per thread, used to offset the Philox counter in the RNG state.
int64_t counter_offset = elts_per_thread;
at::PhiloxCudaState rng_engine_inputs;
if( is_training ) {
// See Note [Acquire lock when using random generators]
std::lock_guard<std::mutex> lock(gen->mutex_);
params.philox_args = gen->philox_cuda_state(counter_offset);
}
int num_chunks = 3;
if(batch_size == 3) {
num_chunks = 2;
}
launch(params, is_training, num_chunks, stream);
return { ctx, s };
}
std::vector<at::Tensor> mha_bwd_nl(const at::Tensor &dout, // total x num_heads, x head_size
const at::Tensor &qkv, // total x num_heads x 3 x head_size, total := \sum_{i=0}^{b} s_i
at::Tensor &softmax, // b x h x s x s softmax and dmask - will be overwritten with dP
const at::Tensor &cu_seqlens, // b+1
const float p_dropout, // probability to drop
const int max_seq_len // max sequence length to choose the kernel
) {
auto stream = at::cuda::getCurrentCUDAStream().stream();
TORCH_CHECK(qkv.is_cuda())
TORCH_CHECK(cu_seqlens.is_cuda())
TORCH_CHECK(qkv.is_contiguous())
TORCH_CHECK(cu_seqlens.is_contiguous())
TORCH_CHECK(cu_seqlens.dim() == 1);
TORCH_CHECK(qkv.dim() == 4);
const auto sizes = qkv.sizes();
TORCH_CHECK(sizes[THREE_DIM] == 3);
const int batch_size = cu_seqlens.numel() - 1;
const int total = sizes[TOTAL_DIM];
const int num_heads = sizes[H_DIM];
const int head_size = sizes[D_DIM];
TORCH_CHECK(batch_size > 0);
TORCH_CHECK(head_size == 64);
int seq_len = 512;
auto launch = &run_fmha_dgrad_fp16_512_64_sm80_nl;
auto opts = qkv.options();
auto dqkv = torch::empty_like(qkv);
int num_chunks = 2;
if( batch_size == 1 ) {
num_chunks = 4;
} else if( batch_size == 2 ) {
num_chunks = 3;
}
auto dkv = torch::empty({total, num_chunks, 2, num_heads, head_size}, opts);
Fused_multihead_attention_fprop_params params;
set_params(params,
batch_size,
seq_len,
num_heads,
head_size,
qkv.data_ptr(),
cu_seqlens.data_ptr(),
dout.data_ptr(), // o_ptr = dout
softmax.data_ptr(), // softmax gets overwritten by dP!
p_dropout);
params.dkv_ptr = dkv.data_ptr();
Data_type acc_type = DATA_TYPE_FP32;
set_alpha(params.scale_bmm1, 1.f, acc_type);
set_alpha(params.scale_softmax, 1.f / sqrtf(head_size), acc_type);
set_alpha(params.scale_bmm2, 1.f, DATA_TYPE_FP16);
params.dqkv_ptr = dqkv.data_ptr();
launch(params, num_chunks, stream);
// Split-K reduction of num_chunks dK, dV parts.
// The equivalent of the following Pytorch code:
// using namespace torch::indexing;
// at::Tensor view_out = dqkv.index({Slice(), Slice(1, None, None)});
// torch::sum_out(view_out, dkv, 1);
const int hidden_size = num_heads * head_size;
fmha_run_noloop_reduce(
dqkv.data_ptr(), dkv.data_ptr(), cu_seqlens.data_ptr<int>(), hidden_size, batch_size, total, num_chunks, stream);
return { dqkv, softmax, dkv };
}
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.doc() = "Fused Multi-head Self-attention for BERT";
m.def("fwd", &mha_fwd, "Forward pass");
m.def("bwd", &mha_bwd, "Backward pass");
m.def("fwd_nl", &mha_fwd_nl, "Forward pass (small-batch)");
m.def("bwd_nl", &mha_bwd_nl, "Backward pass (small-batch)");
}
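For reference, a hedged usage sketch of these bindings from the Python side. The module name fmha_cuda is an assumption (whatever TORCH_EXTENSION_NAME resolves to), the qkv shape follows the TORCH_CHECKs above (sizes[THREE_DIM] == 3), and an SM80 GPU with head_size == 64 is required.
import torch
import fmha_cuda  # hypothetical name for the compiled extension module

b, s, h, d = 4, 512, 16, 64
cu_seqlens = torch.arange(0, (b + 1) * s, s, dtype=torch.int32, device="cuda")
qkv = torch.randn(b * s, 3, h, d, dtype=torch.float16, device="cuda")

# Forward: returns the context (total x h x d) and the S tensor (b x h x s x s).
ctx, S = fmha_cuda.fwd(qkv, cu_seqlens, 0.1, s, True, None)

# Backward: S is overwritten with dP; dout matches the context layout.
dout = torch.randn_like(ctx)
dqkv, _ = fmha_cuda.bwd(dout, qkv, S, cu_seqlens, 0.1, s)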
/******************************************************************************
* Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#pragma once
#include <cuda.h>
#include <vector>
#include <ATen/CUDAGeneratorImpl.h>
#include <ATen/cuda/CUDAGraphsUtils.cuh>
#include <fmha_utils.h>
constexpr int TOTAL_DIM = 0;
constexpr int THREE_DIM = 1;
constexpr int H_DIM = 2;
constexpr int D_DIM = 3;
////////////////////////////////////////////////////////////////////////////////////////////////////
struct Qkv_params {
// The QKV matrices.
void *qkv_ptr;
// The stride between rows of the Q, K and V matrices.
size_t qkv_stride_in_bytes;
// The number of heads.
int h;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
struct Fused_multihead_attention_fprop_params : public Qkv_params {
// The dQKV matrices.
void *dqkv_ptr;
// Temporary for dKV.
void *dkv_ptr;
// The O matrix (output).
void *o_ptr;
// The stride between rows of O.
int64_t o_stride_in_bytes;
// The pointer to the S matrix, overwritten by the dP matrix (bwd).
void *s_ptr;
// The stride between rows of the S matrix.
int64_t s_stride_in_bytes;
// The dimensions.
int b, s, d;
// The scaling factors for the kernel.
uint32_t scale_bmm1, scale_softmax, scale_bmm2;
// Array of length b+1 holding the starting offset of each sequence.
int *cu_seqlens;
// The dropout probability (probability of keeping an activation).
float p_dropout;
// Scale factor of 1 / (1 - p_dropout).
float rp_dropout;
// Scale factor of 1 / (1 - p_dropout), in half2.
uint32_t scale_dropout;
// Random state.
at::PhiloxCudaState philox_args;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
void run_fmha_fp16_128_64_sm80(const Fused_multihead_attention_fprop_params &params, bool is_training, cudaStream_t stream);
void run_fmha_fp16_256_64_sm80(const Fused_multihead_attention_fprop_params &params, bool is_training, cudaStream_t stream);
void run_fmha_fp16_384_64_sm80(const Fused_multihead_attention_fprop_params &params, bool is_training, cudaStream_t stream);
void run_fmha_fp16_512_64_sm80(const Fused_multihead_attention_fprop_params &params, bool is_training, cudaStream_t stream);
void run_fmha_dgrad_fp16_128_64_sm80(const Fused_multihead_attention_fprop_params &params, cudaStream_t stream);
void run_fmha_dgrad_fp16_256_64_sm80(const Fused_multihead_attention_fprop_params &params, cudaStream_t stream);
void run_fmha_dgrad_fp16_384_64_sm80(const Fused_multihead_attention_fprop_params &params, cudaStream_t stream);
void run_fmha_dgrad_fp16_512_64_sm80(const Fused_multihead_attention_fprop_params &params, cudaStream_t stream);
void run_fmha_fp16_512_64_sm80_nl(const Fused_multihead_attention_fprop_params &params, const bool is_training, const int num_chunks, cudaStream_t stream);
void run_fmha_dgrad_fp16_512_64_sm80_nl(const Fused_multihead_attention_fprop_params &params, const int num_chunks, cudaStream_t stream);
void fmha_run_noloop_reduce(void *out,
const void *in,
const int *cu_seqlens,
const int hidden_size,
const int batch_size,
const int total,
const int num_chunks,
cudaStream_t stream);
/******************************************************************************
* Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#pragma once
#include <fmha/utils.h>
#define FMHA_DIV_UP(m, n) (((m) + (n)-1) / (n))
namespace fmha {
////////////////////////////////////////////////////////////////////////////////////////////////////
template< typename Data_type_, int NUM_ELTS_, int BITS_PER_ELT_, int ALIGNMENT_ >
struct Fragment_base_ {
// The data type.
using Data_type = Data_type_;
// default input type
using Input_type_ = Data_type_;
// Does it store the array of elements.
enum { HAS_ELTS = BITS_PER_ELT_ >= 8 };
// The number of elements.
enum { NUM_ELTS = NUM_ELTS_ };
// The size of element in bits.
enum { BITS_PER_ELT = BITS_PER_ELT_ };
// The size in bytes of a single register.
enum { BYTES_PER_REG = 4 };
// The size in bits.
enum { BITS_PER_REG = BYTES_PER_REG * 8 };
// The number of registers needed to store the fragment.
enum { NUM_REGS = Div_up<NUM_ELTS * BITS_PER_ELT, BITS_PER_REG>::VALUE };
// The size in bytes (as returned by sizeof(Fragment_base<>)).
enum { SIZE_IN_BYTES = NUM_REGS * BYTES_PER_REG };
// The alignment.
enum { ALIGNMENT = ALIGNMENT_ > 0 ? ALIGNMENT_ : Min<NUM_REGS * BYTES_PER_REG, 16>::VALUE };
};
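// Illustrative arithmetic for the Fragment<uint16_t, 8> instantiation used below:
// NUM_ELTS = 8 and BITS_PER_ELT = 16, so NUM_REGS = Div_up<128, 32> = 4,
// SIZE_IN_BYTES = 16 and ALIGNMENT = Min<16, 16> = 16.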
////////////////////////////////////////////////////////////////////////////////////////////////////
template<
// The type of the elements.
typename Data_type_,
// The number of elements.
int NUM_ELTS_,
// The alignment if you want to force a value -- use 0 otherwise.
int ALIGNMENT_ = 0,
// The base class.
typename Base_ = Fragment_base_<Data_type_, NUM_ELTS_, 8 * sizeof(Data_type_), ALIGNMENT_>
>
struct alignas(static_cast<int>(Base_::ALIGNMENT)) Fragment : public Base_ {
// The size of a load/store.
enum { BYTES_PER_LOAD_STORE = Base_::NUM_REGS * sizeof(uint32_t) };
// Clear the fragment. Using PTX in that code seems to produce better SASS...
inline __device__ void clear() {
#pragma unroll
for( int ii = 0; ii < Base_::NUM_REGS; ++ii ) {
asm volatile("mov.u32 %0, 0; \n" : "=r"(this->reg(ii)) : );
}
}
// Immutable access to a register.
inline __device__ const uint32_t& reg(int ii) const {
return this->regs_[ii];
}
// Mutable access to a register.
inline __device__ uint32_t& reg(int ii) {
return this->regs_[ii];
}
uint32_t regs_[Base_::NUM_REGS];
// Immutable access to the elements.
inline __device__ const Data_type_& elt(int ii) const {
return reinterpret_cast<const Data_type_*>(&this->regs_[0])[ii];
}
// Mutable access to the elements.
inline __device__ Data_type_& elt(int ii) {
return reinterpret_cast<Data_type_*>(&this->regs_[0])[ii];
}
// Immutable access to the elements with a cast.
template< typename Cast_type >
inline __device__ const Cast_type& elt_as(int ii) const {
return reinterpret_cast<const Cast_type*>(&this->regs_[0])[ii];
}
// Mutable access to the elements.
template< typename Cast_type >
inline __device__ Cast_type& elt_as(int ii) {
return reinterpret_cast<Cast_type*>(&this->regs_[0])[ii];
}
// Add another fragment.
inline __device__ void add(const Fragment &other) {
#pragma unroll
for( int ii = 0; ii < NUM_ELTS_; ++ii ) {
this->elt(ii) += other.elt(ii);
}
}
};
////////////////////////////////////////////////////////////////////////////////////////////////////
template< typename Layout >
struct Fragment_a : public Fragment<uint16_t, 8> {
};
////////////////////////////////////////////////////////////////////////////////////////////////////
template< typename Layout >
struct Fragment_b : public Fragment<uint16_t, 8> {
};
////////////////////////////////////////////////////////////////////////////////////////////////////
struct Fragment_accumulator : public Fragment<float, 8> {
// The base class.
using Base = Fragment<float, 8>;
// Add two fragments.
template< typename Other_fragment_ >
inline __device__ void add(const Other_fragment_ &other) {
for( int ii = 0; ii < Base::NUM_ELTS; ++ii ) {
this->elt(ii) = this->elt(ii) + other.elt(ii);
}
}
// Do the HMMA.
template< typename Layout_a, typename Layout_b >
inline __device__ void mma(const Fragment_a<Layout_a> &a,
const Fragment_b<Layout_b> &b) {
asm volatile( \
"mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 \n" \
" {%0, %1, %2, %3}, \n" \
" {%4, %5, %6, %7}, \n" \
" {%8, %9}, \n" \
" {%0, %1, %2, %3}; \n" \
: "+f"( elt(0)), "+f"( elt(1)), "+f"( elt(2)), "+f"( elt(3))
: "r"(a.reg(0)), "r"(a.reg(1)), "r"(a.reg(2)), "r"(a.reg(3))
, "r"(b.reg(0)), "r"(b.reg(1)));
asm volatile( \
"mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 \n" \
" {%0, %1, %2, %3}, \n" \
" {%4, %5, %6, %7}, \n" \
" {%8, %9}, \n" \
" {%0, %1, %2, %3}; \n" \
: "+f"( elt(4)), "+f"( elt(5)), "+f"( elt(6)), "+f"( elt(7))
: "r"(a.reg(0)), "r"(a.reg(1)), "r"(a.reg(2)), "r"(a.reg(3))
, "r"(b.reg(2)), "r"(b.reg(3)));
}
};
////////////////////////////////////////////////////////////////////////////////////////////////////
template< typename Fragment, int M, int N >
inline __device__ void clear(Fragment (&frag)[M][N]) {
#pragma unroll
for( int mi = 0; mi < M; ++mi ) {
#pragma unroll
for( int ni = 0; ni < N; ++ni ) {
frag[mi][ni].clear();
}
}
}
////////////////////////////////////////////////////////////////////////////////////////////////////
template< typename Accumulator_type, int WARPS_K >
struct Clear_accumulator {
};
////////////////////////////////////////////////////////////////////////////////////////////////////
template< int WARPS_K >
struct Clear_accumulator<float, WARPS_K> {
template< typename Acc, int M, int N >
static inline __device__ void apply(Acc (&acc)[M][N], bool = false) {
fmha::clear(acc);
}
};
////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////
template<typename Acc, typename A, typename B, int M, int N>
inline __device__ void gemm(Acc (&acc)[M][N], const A (&a)[M], const B (&b)[N]) {
#pragma unroll
for( int mi = 0; mi < M; ++mi ) {
#pragma unroll
for( int ni = 0; ni < N; ++ni ) {
acc[mi][ni].mma(a[mi], b[ni]);
}
}
}
////////////////////////////////////////////////////////////////////////////////////////////////////
template<
// The number of rows in the CTA tile.
int M_,
// The number of cols in the CTA tile.
int N_,
// The number of elements in the K dimension of the GEMM loop.
int K_,
// The number of rows of warps.
int WARPS_M_,
// The number of cols of warps.
int WARPS_N_,
// The number of warps in the K dimension of the GEMM loop.
int WARPS_K_>
struct Cta_tile_ {
enum { M = M_, N = N_, K = K_ };
// The number of warps.
enum { WARPS_M = WARPS_M_, WARPS_N = WARPS_N_, WARPS_K = WARPS_K_ };
// The number of warps per CTA.
enum { WARPS_PER_CTA = WARPS_M * WARPS_N * WARPS_K };
// The number of threads per warp.
enum { THREADS_PER_WARP = 32 };
// The number of threads per CTA.
enum { THREADS_PER_CTA = WARPS_PER_CTA * THREADS_PER_WARP };
};
////////////////////////////////////////////////////////////////////////////////////////////////////
template<typename Cta_tile>
struct Hmma_tile {
// The number of elements computed with a single warp-MMA.
enum { M_PER_MMA = 16, N_PER_MMA = 16, K_PER_MMA = 16 };
// The number of elements computed with a single CTA-MMA.
enum {
M_PER_MMA_PER_CTA = M_PER_MMA * Cta_tile::WARPS_M,
N_PER_MMA_PER_CTA = N_PER_MMA * Cta_tile::WARPS_N,
K_PER_MMA_PER_CTA = K_PER_MMA * Cta_tile::WARPS_K
};
// The number of MMAs needed to compute the GEMM.
enum {
MMAS_M = Div_up<Cta_tile::M, M_PER_MMA_PER_CTA>::VALUE,
MMAS_N = Div_up<Cta_tile::N, N_PER_MMA_PER_CTA>::VALUE,
MMAS_K = Div_up<Cta_tile::K, K_PER_MMA_PER_CTA>::VALUE,
};
// The number of elements computed per warp.
enum {
M_PER_WARP = MMAS_M * M_PER_MMA,
N_PER_WARP = MMAS_N * N_PER_MMA,
K_PER_WARP = MMAS_K * K_PER_MMA,
};
};
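// Illustrative configuration (an assumed tile, not taken from a specific kernel in
// this diff): Cta_tile_<16, 512, 64, 1, 4, 1> gives WARPS_PER_CTA = 4 and
// THREADS_PER_CTA = 128; the corresponding Hmma_tile has M_PER_MMA_PER_CTA = 16,
// N_PER_MMA_PER_CTA = 64 and K_PER_MMA_PER_CTA = 16, hence MMAS_M = 1, MMAS_N = 8
// and MMAS_K = 4.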
////////////////////////////////////////////////////////////////////////////////////////////////////
using A_type = uint16_t;
using B_type = uint16_t;
using C_type = uint16_t;
using Accumulator_type = float;
using Epilogue_type = float;
constexpr int BITS_PER_ELEMENT_A = sizeof(A_type) * 8;
constexpr int BITS_PER_ELEMENT_B = sizeof(B_type) * 8;
constexpr int BITS_PER_ELEMENT_C = sizeof(C_type) * 8;
////////////////////////////////////////////////////////////////////////////////////////////////////
template<int M, int N, int K, int WARPS_M, int WARPS_N, int WARPS_K>
using Cta_tile_extd = Cta_tile_<M, N, K, WARPS_M, WARPS_N, WARPS_K>;
////////////////////////////////////////////////////////////////////////////////////////////////////
template<typename Cta_tile_>
using Cta_tile_with_k_with_padding = Cta_tile_extd<Cta_tile_::M,
Cta_tile_::N,
Next_power_of_two<Cta_tile_::K>::VALUE,
Cta_tile_::WARPS_M,
Cta_tile_::WARPS_N,
Cta_tile_::WARPS_K>;
////////////////////////////////////////////////////////////////////////////////////////////////////
} // namespace fmha
/******************************************************************************
* Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#pragma once
namespace fmha {
////////////////////////////////////////////////////////////////////////////////////////////////////
template<
// The dimensions of the tile computed by the CTA.
typename Cta_tile,
// The number of bits per element.
int BITS_PER_ELEMENT,
// The number of rows of Q, K or V loaded by this tile.
int ROWS,
// The number of columns.
int COLS,
// The number of matrices.
int NUM_MATS = 3
>
struct Gmem_tile_qkv {
// The size of each LDG.
enum { BYTES_PER_LDG = 16 };
// The size of a row in bytes.
enum { BYTES_PER_ROW = COLS * BITS_PER_ELEMENT / 8 };
// The number of threads to load a "row" of the matrix.
enum { THREADS_PER_ROW = BYTES_PER_ROW / BYTES_PER_LDG };
// The number of "rows" loaded per LDG.
enum { ROWS_PER_LDG = Cta_tile::THREADS_PER_CTA / THREADS_PER_ROW };
// The number of LDGs needed to load a chunk of the Q matrix.
enum { LDGS = fmha::Div_up<ROWS, ROWS_PER_LDG>::VALUE };
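// Illustrative numbers (assuming 16-bit elements, COLS = 64 and a 128-thread CTA):
// BYTES_PER_ROW = 64 * 16 / 8 = 128, THREADS_PER_ROW = 128 / 16 = 8 and
// ROWS_PER_LDG = 128 / 8 = 16, so LDGS = 1 for ROWS = 16 and LDGS = 32 for ROWS = 512.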
// Ctor.
template< typename Params, typename BInfo >
inline __device__ Gmem_tile_qkv(const Params &params, int qkv_offset, const BInfo &binfo, int tidx)
: params_qkv_stride_in_bytes_(params.qkv_stride_in_bytes)
, actual_seqlen(binfo.actual_seqlen)
, qkv_ptr_(reinterpret_cast<char *>(params.qkv_ptr)) {
// Compute the position in the sequence (within the CTA for the moment).
int row = tidx / THREADS_PER_ROW;
// Compute the position of the thread in the row.
int col = tidx % THREADS_PER_ROW;
// Store the row as we need it to disable the loads.
row_ = row;
// The row offset in the batched GEMM. For each seq element, we store QKV in that order.
int64_t row_offset = (int64_t)row * params.qkv_stride_in_bytes;
// Add the block index.
row_offset += (int64_t)((binfo.sum_s * NUM_MATS + qkv_offset) * binfo.h + binfo.bidh) * BYTES_PER_ROW;
// Assemble the final pointer.
qkv_ptr_ += row_offset + col * BYTES_PER_LDG;
}
// Store data to shared memory.
template< typename Smem_tile >
inline __device__ void commit(Smem_tile &smem_tile) {
smem_tile.store(fetch_);
}
// Load data from memory.
template< typename Smem_tile >
inline __device__ void load(Smem_tile &smem_tile) {
const void *ptrs[LDGS];
uint32_t preds[LDGS];
#pragma unroll
for( int ii = 0; ii < LDGS; ++ii ) {
ptrs[ii] = qkv_ptr_ + (int64_t)ii * ROWS_PER_LDG * params_qkv_stride_in_bytes_;
preds[ii] = ((row_ + ii * ROWS_PER_LDG) < min(ROWS, actual_seqlen));
fetch_[ii] = make_uint4(0, 0, 0, 0);
}
// not packing predicates removes restrictions (e.g. FP16 384, 4 warps)
Ldg_functor<uint4, LDGS> fct(fetch_, ptrs);
#pragma unroll
for( int ii = 0; ii < LDGS; ++ii ) {
fct.load(ii, preds[ii]);
}
}
// Store data to memory.
inline __device__ void store(const uint4 (&data)[LDGS]) {
#pragma unroll
for( int ii = 0; ii < LDGS; ++ii ) {
char *ptr = qkv_ptr_ + (int64_t)ii * ROWS_PER_LDG * params_qkv_stride_in_bytes_;
if( (row_ + ii * ROWS_PER_LDG) < min(ROWS, actual_seqlen) ) {
fmha::stg(ptr, data[ii]);
}
}
}
// Move the pointer to the next location.
inline __device__ void move() {
qkv_ptr_ += (int64_t)ROWS * params_qkv_stride_in_bytes_;
actual_seqlen -= ROWS;
}
// The stride between rows of the QKV matrix.
int64_t params_qkv_stride_in_bytes_;
// The pointer.
char *qkv_ptr_;
// The fetch registers.
uint4 fetch_[LDGS];
// Keep track of the row the thread is processing as we move the tile.
int row_;
// The length of the sequence loaded by that memory tile.
int actual_seqlen;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
template< typename Cta_tile >
struct Gmem_tile_o {
// The mma tile.
using Mma_tile = fmha::Hmma_tile<Cta_tile>;
// The size of each element.
enum { BYTES_PER_ELEMENT = 2 };
// The size of a row in bytes.
enum { BYTES_PER_ROW = Cta_tile::N * BYTES_PER_ELEMENT };
// The number of threads to store a "row" of the matrix.
enum { THREADS_PER_ROW = 16 };
// The size of each STG.
enum { BYTES_PER_STG = BYTES_PER_ROW / THREADS_PER_ROW };
// The number of "rows" stored per iteration of the loop. The output of 1 MMA.
enum { ROWS = Cta_tile::M };
// The number of "rows" stored per iteration of the loop. The output of 1 MMA.
enum { ROWS_PER_LOOP = ROWS <= 64 ? ROWS : (int)Mma_tile::M_PER_MMA_PER_CTA };
// The number of outer loops for the stores.
enum { LOOPS = ROWS / ROWS_PER_LOOP };
// The number of "rows" stored per STG.
enum { ROWS_PER_STG = Cta_tile::THREADS_PER_CTA / THREADS_PER_ROW };
// Do we have to guard against partial writes/reads.
enum { HAS_INCOMPLETE_STG = Cta_tile::M % ROWS_PER_STG != 0 };
// The number of STGs needed to store a chunk of the Q matrix.
enum { STGS_PER_LOOP = fmha::Div_up<ROWS_PER_LOOP, ROWS_PER_STG>::VALUE };
// The number of STGs needed to store a chunk of the Q matrix in total.
enum { STGS = STGS_PER_LOOP * LOOPS };
// Ctor.
template<typename Params, typename BInfo>
inline __device__ Gmem_tile_o(const Params &params, const BInfo &binfo, int tidx)
: params_o_stride_in_bytes_(params.o_stride_in_bytes)
, actual_seqlen_(binfo.actual_seqlen)
, o_ptr_(reinterpret_cast<char *>(params.o_ptr)) {
// Compute the position in the sequence (within the CTA for the moment).
int row = tidx / THREADS_PER_ROW;
// Compute the position of the thread in the row.
int col = tidx % THREADS_PER_ROW;
// Store the row as we need it to disable loads.
row_ = row;
// The row offset in the batched GEMM.
int64_t row_offset = (int64_t)row * params.o_stride_in_bytes + binfo.bidx * BYTES_PER_ROW;
// Assemble the final pointer.
o_ptr_ += row_offset + col * BYTES_PER_STG;
// Is that thread active on the last STG?
if( HAS_INCOMPLETE_STG ) {
is_active_for_last_stg_ = row + (STGS - 1) * ROWS_PER_STG < Cta_tile::M;
}
}
// Store data to global memory.
inline __device__ void store(const uint4 (&src)[STGS_PER_LOOP], int mi) {
#pragma unroll
for( int ii = 0; ii < STGS_PER_LOOP; ++ii ) {
int jj = mi * STGS_PER_LOOP + ii;
if( this->row_ + jj * ROWS_PER_STG >= this->actual_seqlen_ ) {
break;
}
float x = reinterpret_cast<const float &>(src[ii].x);
float y = reinterpret_cast<const float &>(src[ii].y);
float z = reinterpret_cast<const float &>(src[ii].z);
float w = reinterpret_cast<const float &>(src[ii].w);
uint2 out = float4_to_half4(x, y, z, w);
if( !HAS_INCOMPLETE_STG || (jj < STGS - 1 || this->is_active_for_last_stg_) ) {
fmha::stg(this->o_ptr_ + jj * ROWS_PER_STG * this->params_o_stride_in_bytes_, out);
}
}
}
// Move the pointer to the next location.
inline __device__ void move() {
row_ += ROWS;
o_ptr_ += (int64_t)ROWS * params_o_stride_in_bytes_;
}
// The stride between rows of the O matrix.
int64_t params_o_stride_in_bytes_;
// The pointer.
char *o_ptr_;
// Is the thread active for the last STG?
int is_active_for_last_stg_;
// Keep track of the row to disable loads.
int row_;
// The length of the sequence loaded by that memory tile.
int actual_seqlen_;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
template< typename Cta_tile, int BYTES_PER_ELEMENT >
struct Gmem_tile_mma_sd {
// The mma tile.
using Mma_tile = fmha::Hmma_tile<Cta_tile>;
// Each STG stores 8 elements.
enum { BYTES_PER_STG = BYTES_PER_ELEMENT * 8 };
// The number of MMAs in the M dimension.
enum { MMAS_M = Mma_tile::MMAS_M };
// The number of MMAs in the N dimension.
enum { MMAS_N = Mma_tile::MMAS_N };
// The number of rows computed per MMA per thread block.
enum { M_PER_MMA_PER_CTA = Mma_tile::M_PER_MMA_PER_CTA };
// The number of cols computed per MMA per thread block.
enum { N_PER_MMA_PER_CTA = Mma_tile::N_PER_MMA_PER_CTA };
// The number of threads per block.
enum { THREADS_PER_CTA = Cta_tile::THREADS_PER_CTA };
// The size of each "row" in bytes, i.e. how many bytes the whole CTA stores per STG pass.
enum { BYTES_PER_ROW = THREADS_PER_CTA * BYTES_PER_STG };
// The fixed sequence length.
enum { SEQLEN = Cta_tile::N };
// The distance between two blocks (in bytes).
enum { BLOCK_STRIDE_BYTES = SEQLEN * SEQLEN * BYTES_PER_ELEMENT };
// The distance between elements stored per loop (in bytes).
enum { LOOP_STRIDE_BYTES = MMAS_M * MMAS_N * BYTES_PER_ROW };
// The type of elements stored per STG.
using Type = typename fmha::Uint_from_size_in_bytes<BYTES_PER_STG>::Type;
// Ctor.
template<typename Params>
inline __device__ Gmem_tile_mma_sd(void *ptr, const Params &params, const int tidx)
: ptr_(static_cast<char *>(ptr)) {
// The block index for the batch.
const int bidb = blockIdx.y;
// The block index for the head.
const int bidh = blockIdx.x;
// The block index.
size_t bidx = bidb * params.h + bidh;
// Set store location for each thread at the beginning of the loop
ptr_ += bidx * BLOCK_STRIDE_BYTES + tidx * BYTES_PER_STG;
}
// Store to global memory.
inline __device__ void store(const Type &data, const int mi, const int ni) {
size_t offset = (mi * MMAS_N + ni) * BYTES_PER_ROW;
fmha::stg(ptr_ + offset, data);
}
// Load from global memory.
inline __device__ void load(Type &data, const int mi, const int ni) {
size_t offset = (mi * MMAS_N + ni) * BYTES_PER_ROW;
fmha::ldg(data, ptr_ + offset);
}
// Move to the next tile.
inline __device__ void move() {
ptr_ += LOOP_STRIDE_BYTES;
}
// The pointer in global memory.
char *ptr_;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
template< typename Cta_tile, typename Base = Gmem_tile_mma_sd<Cta_tile, sizeof(uint16_t)> >
struct Gmem_tile_mma_s : public Base {
// The number of mmas in the vertical dimension.
enum { M = Base::MMAS_M };
// The number of mmas in the horizontal dimension.
enum { N = Base::MMAS_N };
// The type of the vectors stored by each STG.
using Type = typename Base::Type;
// Ctor.
template< typename Params >
inline __device__ Gmem_tile_mma_s(void *ptr, const Params &params, const int tidx)
: Base(ptr, params, tidx) {
}
// Store to global memory.
template<typename Mask>
inline __device__ void store(const float (&softmax)[2 * M][4 * N], const Mask &mask) {
#pragma unroll
for( int mi = 0; mi < M; mi++ ) {
#pragma unroll
for( int ni = 0; ni < N; ni++ ) {
float tmp00 = softmax[2 * mi + 0][4 * ni + 0];
float tmp01 = softmax[2 * mi + 0][4 * ni + 1];
float tmp02 = softmax[2 * mi + 0][4 * ni + 2];
float tmp03 = softmax[2 * mi + 0][4 * ni + 3];
float tmp10 = softmax[2 * mi + 1][4 * ni + 0];
float tmp11 = softmax[2 * mi + 1][4 * ni + 1];
float tmp12 = softmax[2 * mi + 1][4 * ni + 2];
float tmp13 = softmax[2 * mi + 1][4 * ni + 3];
uint4 dst;
dst.x = fmha::float2_to_half2(tmp00, tmp01);
dst.y = fmha::float2_to_half2(tmp02, tmp03);
dst.z = fmha::float2_to_half2(tmp10, tmp11);
dst.w = fmha::float2_to_half2(tmp12, tmp13);
if( mask.is_valid(mi, ni, 0, 0) ) {
Base::store(dst, mi, ni);
}
}
}
}
// Load from global memory.
template<typename Mask>
inline __device__ void load(uint4 (&regs)[M][N], const Mask &mask) {
#pragma unroll
for( int mi = 0; mi < M; mi++ ) {
#pragma unroll
for( int ni = 0; ni < N; ni++ ) {
regs[mi][ni] = make_uint4(0, 0, 0, 0);
if( mask.is_valid(mi, ni, 0, 0) ) {
Base::load(regs[mi][ni], mi, ni);
}
}
}
}
};
////////////////////////////////////////////////////////////////////////////////////////////////////
template<
// The dimensions of the tile computed by the CTA.
typename Cta_tile,
// The base class.
typename Base = fmha::Gmem_tile_qkv<Cta_tile, fmha::BITS_PER_ELEMENT_A, Cta_tile::M, Cta_tile::K>
>
struct Gmem_tile_dout : public Base {
// Ctor.
template<typename Params, typename BInfo>
inline __device__ Gmem_tile_dout(const Params &params, const BInfo &binfo, int tidx)
: Base(params, 0, binfo, tidx) {
this->qkv_ptr_ = reinterpret_cast<char *>(params.o_ptr);
this->params_qkv_stride_in_bytes_ = params.o_stride_in_bytes; // needed for move
// Compute the position of the thread in the row.
int col = tidx % Base::THREADS_PER_ROW;
// The row offset in the batched GEMM. For each seq element, we store O in that order.
int64_t row_offset = (int64_t)this->row_ * params.o_stride_in_bytes + binfo.bidx * Base::BYTES_PER_ROW;
// Assemble the final pointer.
this->qkv_ptr_ += row_offset + col * Base::BYTES_PER_LDG;
}
};
////////////////////////////////////////////////////////////////////////////////////////////////////
template< typename Cta_tile, typename Base = fmha::Gmem_tile_o<Cta_tile> >
struct Gmem_tile_dq : public Base {
// Ctor.
template<typename Params, typename BInfo>
inline __device__ Gmem_tile_dq(const Params &params, const BInfo &binfo, int tidx)
: Base(params, binfo, tidx) {
this->o_ptr_ = reinterpret_cast<char *>(params.dqkv_ptr);
this->params_o_stride_in_bytes_ = params.qkv_stride_in_bytes; // needed for move
// Compute the position of the thread in the row.
int col = tidx % Base::THREADS_PER_ROW;
// The row offset in the batched GEMM. dQ is written into the interleaved dQKV tensor, hence the 3 * h factor.
int64_t row_offset = (int64_t)this->row_ * params.qkv_stride_in_bytes +
(binfo.sum_s * 3 * binfo.h + binfo.bidh) * Base::BYTES_PER_ROW;
// Assemble the final pointer.
this->o_ptr_ += row_offset + col * Base::BYTES_PER_STG;
}
};
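// Illustrative note, not part of the original source: assuming the packed
// [total_s, 3, h, d] dQKV layout implied by the 3 * binfo.h factor above, the
// byte offset of a row is
//   row * qkv_stride_in_bytes + (sum_s * 3 * h + bidh) * BYTES_PER_ROW,
// where sum_s skips the tokens of earlier sequences in the batch and bidh
// selects the head within the current token's dQ slice.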
////////////////////////////////////////////////////////////////////////////////////////////////////
} // namespace fmha
/******************************************************************************
* Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#pragma once
////////////////////////////////////////////////////////////////////////////////////////////////////
template<int S, int D, int STEP, int WARPS_M, int WARPS_N, uint32_t FLAGS = 0x8u>
struct FMHA_kernel_traits {
// The CTA description for the 1st GEMM.
using Cta_tile_p = fmha::Cta_tile_extd<STEP, S, D, WARPS_M, WARPS_N, 1>;
// The CTA description for the 2nd GEMM.
using Cta_tile_o = fmha::Cta_tile_extd<STEP, D, S, WARPS_M, 1, WARPS_N>;
// Do we use one buffer for K and V?
enum { SHARE_SMEM_FOR_K_AND_V = (FLAGS & 0x8u) != 0u };
// The global memory tile to load Q.
using Gmem_tile_q = fmha::Gmem_tile_qkv<Cta_tile_p, fmha::BITS_PER_ELEMENT_A, STEP, D>;
// The shared memory tile to swizzle Q.
using Smem_tile_q = fmha::Smem_tile_a<Cta_tile_p, fmha::Row, Gmem_tile_q::BYTES_PER_LDG, 1>;
// The global memory tile to load K.
using Gmem_tile_k = fmha::Gmem_tile_qkv<Cta_tile_p, fmha::BITS_PER_ELEMENT_B, S, D>;
// The shared memory tile to swizzle K.
using Smem_tile_k = fmha::Smem_tile_b<Cta_tile_p, fmha::Col>;
// The global memory tile to load V.
using Gmem_tile_v = fmha::Gmem_tile_qkv<Cta_tile_o, fmha::BITS_PER_ELEMENT_B, S, D>;
// The shared memory tile to swizzle V.
using Smem_tile_v = fmha::Smem_tile_v<Cta_tile_o>;
// The global memory tile to store O.
using Gmem_tile_o = fmha::Gmem_tile_o<Cta_tile_o>;
// The shared memory tile for O.
using Smem_tile_o = fmha::Smem_tile_o<Cta_tile_o>;
// The global memory tile to load/store S.
using Gmem_tile_s = fmha::Gmem_tile_mma_s<Cta_tile_p>;
// The shared memory tile to transpose S.
using Smem_tile_st = fmha::Smem_tile_mma_transposed<Cta_tile_p>;
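// The global memory tile to load dO.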
using Gmem_tile_do = fmha::Gmem_tile_dout<Cta_tile_p>;
// Make sure the number of threads match.
static_assert((int)Gmem_tile_o::THREADS_PER_ROW == (int)Smem_tile_o::THREADS_PER_ROW, "");
// The number of threads.
enum { THREADS = Cta_tile_p::THREADS_PER_CTA };
// Make sure the number of threads matches both CTAs.
static_assert((int)THREADS == (int)Cta_tile_o::THREADS_PER_CTA, "");
// The amount of shared memory needed to load Q and K.
enum { BYTES_PER_SMEM_QK = Smem_tile_q::BYTES_PER_TILE + Smem_tile_k::BYTES_PER_TILE };
// The extra amount of shared memory needed to load V.
enum { BYTES_PER_SMEM_V = SHARE_SMEM_FOR_K_AND_V ? 0u : Smem_tile_v::BYTES_PER_TILE };
// The amount of shared memory needed for Q, K and V.
enum { BYTES_PER_SMEM_QKV = BYTES_PER_SMEM_QK + BYTES_PER_SMEM_V };
// The amount of shared memory needed to load Q and store O.
enum { BYTES_PER_SMEM_QO = Smem_tile_q::BYTES_PER_TILE + Smem_tile_o::BYTES_PER_TILE };
// The amount of shared memory needed for Q, K, V and O.
enum { BYTES_PER_SMEM = fmha::Max<BYTES_PER_SMEM_QKV, BYTES_PER_SMEM_QO>::VALUE };
// Make sure we have enough shared memory.
static_assert(Smem_tile_q::BYTES_PER_TILE + Smem_tile_o::BYTES_PER_TILE <= BYTES_PER_SMEM, "");
};
////////////////////////////////////////////////////////////////////////////////////////////////////
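// Illustrative only, not part of the original source: a plausible
// instantiation of the traits above for sequence length 128, head dimension
// 64, a 16-row step and a 1x4 warp layout; the 0x08u flag shares the shared
// memory buffer between K and V. The configurations actually compiled by the
// library may differ.
//
//   using Kernel_traits = FMHA_kernel_traits<128, 64, 16, 1, 4, 0x08u>;
//   // WARPS_M = 1 and WARPS_N = 4 give 4 warps, i.e. THREADS == 128.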
/******************************************************************************
* Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#pragma once
namespace fmha {
template<typename Cta_tile>
struct Mask {
using Mma_tile = fmha::Hmma_tile<Cta_tile>;
template<typename Params, typename BInfo>
__device__ Mask(const Params &params, const BInfo &blockInfo, int tidx) {
actual_seqlen = blockInfo.actual_seqlen;
const int warp = tidx / Cta_tile::THREADS_PER_WARP;
const int lane = tidx % Cta_tile::THREADS_PER_WARP;
static_assert(Cta_tile::WARPS_K == 1, "");
// Find the position of the warp within the CTA tile.
const int warp_n = (warp / Cta_tile::WARPS_M);
const int warp_m = (warp % Cta_tile::WARPS_M);
// Decompose the lane index into the warp's 8x4 thread layout.
const int quad = lane / 4;
const int tid = (lane % 4) * 2;
row = warp_m * 16 + quad;
col = warp_n * 16 + tid;
}
inline __device__ bool is_valid(const int mi, const int ni, const int ii, const int jj) const {
// ii and jj iterate over the 2x4 fragment
const bool col_valid = (ni * Mma_tile::N_PER_MMA_PER_CTA + col + (jj & 2) * 4 + (jj & 1)) < actual_seqlen;
//&& (row + mi * Mma_tile::M_PER_MMA_PER_CTA + ii * 8) < actual_seqlen;
return col_valid;
// return row_valid && col_valid;
}
inline __device__ void load(int it) {
row_offset = it * Cta_tile::M + row;
}
int row_offset;
int row;
int col;
int actual_seqlen;
};
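// Illustrative worked example, not part of the original source: with
// warp_m = warp_n = 0 and lane = 5, quad = 1 and tid = 2, so row = 1 and
// col = 2. The four columns covered by this thread's fragment within one MMA
// tile are col + {0, 1, 8, 9} = {2, 3, 10, 11}, which is what is_valid()
// compares against actual_seqlen via the (jj & 2) * 4 + (jj & 1) term.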
} // namespace fmha