"...dcu-process-montor.git" did not exist on "4ff719ad57b6c16bbb7d104dbeb3c3c9cfa789cb"
Unverified commit 96850dfa authored by Jithun Nair, committed by GitHub

Merge pull request #80 from ROCmSoftwarePlatform/IFU-master-2022-07-29

IFU-master-2022-07-29
parents 87fc4125 cc5f83b5
import random
import unittest

import torch

from apex.contrib.clip_grad import clip_grad_norm_


def make_params(
        num_params,
        sizes=[1,2,3,4,5],
        num_dims=[1,2,3],
        dtypes=[torch.float32],
        devices=['cuda'],
        make_copy=False,
):
    """Construct parameters with random configurations"""

    # Construct parameters
    params = []
    for _ in range(num_params):
        dims = [random.choice(sizes) for _ in range(random.choice(num_dims))]
        dtype = random.choice(dtypes)
        device = random.choice(devices)
        p = torch.nn.Parameter(torch.randn(dims, dtype=dtype, device=device))
        p.grad = torch.randn_like(p)
        params.append(p)

    # Copy parameters if needed
    if make_copy:
        params_copy = []
        for p in params:
            p_copy = p.clone().detach()
            p_copy.grad = p.grad.clone().detach()
            params_copy.append(p_copy)
        return params, params_copy
    else:
        return params


class ClipGradNormTest(unittest.TestCase):

    def setUp(self, seed=1234):
        random.seed(seed)
        torch.manual_seed(seed)

    def test_matches_pytorch(
            self,
            num_params=41,
            dtypes=[torch.float32, torch.float16, torch.float64],
            devices=['cuda', 'cpu'],
            max_norm=0.54321,
            norm_type=2.0,
            rtol=1e-3,
            atol=1e-20,
    ):
        """Make sure PyTorch and Apex gradient clipping produce same results"""

        # Construct identical sets of parameters
        torch_params, apex_params = make_params(
            num_params,
            dtypes=dtypes,
            devices=devices,
            make_copy=True,
        )

        # Apply gradient clipping
        torch_norm = torch.nn.utils.clip_grad_norm_(
            torch_params,
            max_norm,
            norm_type=norm_type,
        )
        apex_norm = clip_grad_norm_(
            apex_params,
            max_norm,
            norm_type=norm_type,
        )

        # Make sure PyTorch and Apex get same results
        torch.testing.assert_close(
            apex_norm, torch_norm,
            rtol=rtol,
            atol=atol,
            check_dtype=False,
        )
        for torch_p, apex_p in zip(torch_params, apex_params):
            torch.testing.assert_close(
                apex_p, torch_p,
                rtol=0,
                atol=0,
            )  # Params should be unaffected
            torch.testing.assert_close(
                apex_p.grad, torch_p.grad,
                rtol=rtol,
                atol=atol,
            )

    def test_matches_pytorch_fp16(self):
        self.test_matches_pytorch(num_params=11, dtypes=[torch.float16])

    def test_matches_pytorch_fp32(self):
        self.test_matches_pytorch(dtypes=[torch.float32], rtol=1e-6)

    def test_matches_pytorch_fp64(self):
        self.test_matches_pytorch(dtypes=[torch.float64], rtol=1e-15)

    def test_matches_pytorch_cpu(self):
        self.test_matches_pytorch(devices=['cpu'])

    def test_matches_pytorch_infnorm(self):
        self.test_matches_pytorch(norm_type=float('inf'))

    def test_matches_pytorch_1norm(self):
        self.test_matches_pytorch(norm_type=1.0)

    def test_raises_on_mismatch(self):

        # Construct different sets of parameters
        torch_params, apex_params = make_params(7, make_copy=True)
        with torch.no_grad():
            torch_params[0].grad.view(-1)[0] = 1.23
            apex_params[0].grad.view(-1)[0] = 3.21

        # Apply gradient clipping
        torch_norm = torch.nn.utils.clip_grad_norm_(
            torch_params,
            0.54321,
        )
        apex_norm = clip_grad_norm_(
            apex_params,
            0.54321,
        )

        # Make sure PyTorch and Apex get different results
        self.assertRaises(
            AssertionError,
            torch.testing.assert_close,
            apex_norm, torch_norm,
            rtol=1e-3,
            atol=1e-20,
            check_dtype=False,
        )
        for torch_p, apex_p in zip(torch_params, apex_params):
            self.assertRaises(
                AssertionError,
                torch.testing.assert_close,
                apex_p.grad, torch_p.grad,
                rtol=1e-3,
                atol=1e-20,
            )

    def test_raises_on_nan(self):
        params = make_params(5, num_dims=[1])
        params[2].grad[-1] = float('NaN')
        self.assertRaises(
            RuntimeError, clip_grad_norm_, params, 1.0, error_if_nonfinite=True)

    def test_raises_on_inf(self):
        params = make_params(5, num_dims=[1])
        params[2].grad[-1] = float('inf')
        self.assertRaises(
            RuntimeError, clip_grad_norm_, params, 1.0, error_if_nonfinite=True)


if __name__ == "__main__":
    unittest.main()
import copy
import math
import random
import unittest

import torch
import torch.nn.functional as F

HAS_CONV_BIAS_RELU = None
try:
    from apex.contrib.conv_bias_relu import ConvBiasReLU, ConvBias, ConvBiasMaskReLU
except ImportError as e:
    HAS_CONV_BIAS_RELU = False
else:
    HAS_CONV_BIAS_RELU = True


@unittest.skipIf(not HAS_CONV_BIAS_RELU, "`apex.contrib.conv_bias_relu` is not found.")
class FusedDenseTest(unittest.TestCase):
    def setUp(self, seed=0):
        torch.manual_seed(seed)

        self.batch_size = random.randint(1, 64)
        self.in_channels = random.randint(1, 64) * 8
        self.out_channels = random.randint(1, 64) * 8
        self.in_height = self.in_width = random.randint(5, 100)
        self.conv_kernel_size = random.randint(1, 5)
        self.conv_pad = random.randint(0, int(self.conv_kernel_size / 2))
        self.conv_stride = random.randint(1, 5)
        self.conv_dilation = 1
        self.out_height = self.out_width = \
            math.floor((self.in_height + 2 * self.conv_pad - \
                        self.conv_dilation * (self.conv_kernel_size - 1) - 1) / self.conv_stride + 1)

        self.x = torch.randint(low=-16, high=16,
                               size=[self.batch_size, self.in_channels, self.in_height, self.in_width]) \
                      .cuda().to(memory_format=torch.channels_last).float()
        self.x_ = self.x.clone()
        self.x.requires_grad_()
        self.x_.requires_grad_()

        self.mask = torch.randn([self.batch_size, self.out_channels, self.out_height, self.out_width]).cuda().to(memory_format=torch.channels_last)
        self.mask = (self.mask > 0).to(torch.int8)
        self.mask_ = self.mask.clone()

        self.conv1 = torch.nn.Conv2d(self.in_channels, self.out_channels, self.conv_kernel_size,
                                     stride=self.conv_stride, padding=self.conv_pad).cuda().to(memory_format=torch.channels_last)
        self.conv1_ = copy.deepcopy(self.conv1)

        print()
        print('> input=[{}, {}, {}, {}]'.format(self.batch_size, self.in_channels, self.in_height, self.in_width))
        print('> kernel=[{}, {}, {}, {}], stride={}, pad={}'.format(self.out_channels, self.in_channels,
                                                                    self.conv_kernel_size, self.conv_kernel_size,
                                                                    self.conv_stride, self.conv_pad))

    def test_conv_bias_relu(self):
        with torch.cuda.amp.autocast(dtype=torch.half):
            out = ConvBiasReLU(self.x, self.conv1.weight, self.conv1.bias.reshape(1, -1, 1, 1), self.conv_pad, self.conv_stride)
            loss = (out.float()**2).sum() / out.numel()
        loss.backward()
        with torch.cuda.amp.autocast(dtype=torch.half):
            out_ = F.relu(self.conv1_(self.x_))
            loss_ = (out_**2).sum() / out_.numel()
        loss_.backward()

        self.assertTrue(torch.allclose(self.x_, self.x, atol=1e-3, rtol=1e-3, equal_nan=True))
        self.assertTrue(torch.allclose(self.conv1_.bias.grad, self.conv1.bias.grad, atol=1e-3, rtol=1e-3, equal_nan=True))
        self.assertTrue(torch.allclose(self.conv1_.weight.grad, self.conv1.weight.grad, atol=1e-3, rtol=1e-3, equal_nan=True))
        self.assertTrue(torch.allclose(self.x_.grad, self.x.grad, atol=1e-3, rtol=1e-3, equal_nan=True))

    def test_conv_bias(self):
        with torch.cuda.amp.autocast(dtype=torch.half):
            out = ConvBias(self.x, self.conv1.weight, self.conv1.bias.reshape(1, -1, 1, 1), self.conv_pad, self.conv_stride)
            loss = (out.float()**2).sum() / out.numel()
        loss.backward()
        with torch.cuda.amp.autocast(dtype=torch.half):
            out_ = self.conv1_(self.x_)
            loss_ = (out_**2).sum() / out_.numel()
        loss_.backward()

        self.assertTrue(torch.allclose(self.x_, self.x, atol=1e-3, rtol=1e-3, equal_nan=True))
        self.assertTrue(torch.allclose(self.conv1_.bias.grad, self.conv1.bias.grad, atol=1e-3, rtol=1e-3, equal_nan=True))
        self.assertTrue(torch.allclose(self.conv1_.weight.grad, self.conv1.weight.grad, atol=1e-3, rtol=1e-3, equal_nan=True))
        self.assertTrue(torch.allclose(self.x_.grad, self.x.grad, atol=1e-3, rtol=1e-3, equal_nan=True))

    def test_conv_bias_mask_relu(self):
        with torch.cuda.amp.autocast(dtype=torch.half):
            out = ConvBiasMaskReLU(self.x, self.conv1.weight, self.conv1.bias.reshape(1, -1, 1, 1), self.mask, self.conv_pad, self.conv_stride)
            loss = (out.float()**2).sum() / out.numel()
        loss.backward()
        with torch.cuda.amp.autocast(dtype=torch.half):
            out_ = F.relu(self.conv1_(self.x_) * self.mask_)
            loss_ = (out_**2).sum() / out_.numel()
        loss_.backward()

        self.assertTrue(torch.allclose(self.x_, self.x, atol=1e-3, rtol=1e-3, equal_nan=True))
        self.assertTrue(torch.allclose(self.conv1_.bias.grad, self.conv1.bias.grad, atol=1e-3, rtol=1e-3, equal_nan=True))
        self.assertTrue(torch.allclose(self.conv1_.weight.grad, self.conv1.weight.grad, atol=1e-3, rtol=1e-3, equal_nan=True))
        self.assertTrue(torch.allclose(self.x_.grad, self.x.grad, atol=1e-3, rtol=1e-3, equal_nan=True))


if __name__ == '__main__':
    unittest.main()
@@ -25,15 +25,22 @@
#
###############################################################################

import math
import sys
import unittest

import torch
import numpy as np

import fmhalib as mha


def _get_device_properties(device = torch.device("cuda")):
    # type: (str or torch.device) -> Tuple[int, int]
    properties = torch.cuda.get_device_properties(device)
    return properties.major, properties.minor


def py_mha(qkv, amask, b, s, h, d):
    qkv = qkv.view(b, s, h, 3, d)
    q = qkv[:, :, :, 0, :].permute(0,2,1,3)
@@ -49,10 +56,11 @@ def py_mha(qkv, amask, b, s, h, d):
    return ctx


@unittest.skipIf(not _get_device_properties() == (8, 0), "FMHA only supports sm80")
class TestFMHA(unittest.TestCase):

    def run_test(self, s: int, b: int, zero_tensors: bool):
        print(f'Test s={s} b={b}, zero_tensors={zero_tensors}')
        torch.manual_seed(1234)
        torch.cuda.manual_seed(1234)
@@ -77,9 +85,9 @@ class TestFMHA(unittest.TestCase):
        qkv.requires_grad = True

        if b < 4:
            ctx, S_ = mha.fwd(qkv_vs, cu_seqlens, 0.0, s, True, True, zero_tensors, None)
        else:
            ctx, S_ = mha.fwd(qkv_vs, cu_seqlens, 0.0, s, True, False, zero_tensors, None)
        ctx = ctx.view(b,s,h,d)

        ctx_ref = py_mha(qkv, amask, b,s,h,d)
@@ -95,27 +103,34 @@ class TestFMHA(unittest.TestCase):
        dw2 = dw.permute(0,2,1,3).clone().detach().contiguous()

        if b < 4:
            dqkv2, _, _ = mha.bwd_nl(dw2, qkv_vs, S_, cu_seqlens, 0.0, s, zero_tensors)
        else:
            dqkv2, _ = mha.bwd(dw2, qkv_vs, S_, cu_seqlens, 0.0, s, zero_tensors)

        dqkv2 = dqkv2.permute(0,2,1,3).view(b,s, h,3,d)

        self.assertTrue(torch.allclose(qkv.grad.float(), dqkv2.float(), atol=1e-3))

    def test_128(self):
        self.run_test(128, 32, False)
        self.run_test(128, 32, True)

    def test_256(self):
        self.run_test(256, 32, False)
        self.run_test(256, 32, True)

    def test_384(self):
        self.run_test(384, 32, False)
        self.run_test(384, 32, True)

    def test_512(self):
        self.run_test(512, 32, False)
        self.run_test(512, 32, True)
        self.run_test(512, 2, False)
        self.run_test(512, 2, True)
        self.run_test(512, 3, False)
        self.run_test(512, 3, True)


if __name__ == '__main__':
    unittest.main()
import unittest

import torch
import torch.nn.functional as F

reference_available = True
try:
    from torchvision.ops.focal_loss import sigmoid_focal_loss
except ImportError:
    reference_available = False

from apex.contrib.focal_loss import focal_loss


@unittest.skipIf(not reference_available, "Reference implementation `torchvision.ops.focal_loss.sigmoid_focal_loss` is not available.")
class FocalLossTest(unittest.TestCase):

    N_SAMPLES = 12
    N_CLASSES = 8
    ALPHA = 0.24
    GAMMA = 2.0
    REDUCTION = "sum"

    def test_focal_loss(self) -> None:
        if not reference_available:
            self.skipTest("This test needs `torchvision` for `torchvision.ops.focal_loss.sigmoid_focal_loss`.")
        else:
            x = torch.randn(FocalLossTest.N_SAMPLES, FocalLossTest.N_CLASSES).cuda()
            with torch.no_grad():
                x_expected = x.clone()
                x_actual = x.clone()
            x_expected.requires_grad_()
            x_actual.requires_grad_()
            classes = torch.randint(0, FocalLossTest.N_CLASSES, (FocalLossTest.N_SAMPLES,)).cuda()
            with torch.no_grad():
                y = F.one_hot(classes, FocalLossTest.N_CLASSES).float()

            expected = sigmoid_focal_loss(
                x_expected,
                y,
                alpha=FocalLossTest.ALPHA,
                gamma=FocalLossTest.GAMMA,
                reduction=FocalLossTest.REDUCTION,
            )

            actual = sum([focal_loss.FocalLoss.apply(
                x_actual[i:i+1],
                classes[i:i+1].long(),
                torch.ones([], device="cuda"),
                FocalLossTest.N_CLASSES,
                FocalLossTest.ALPHA,
                FocalLossTest.GAMMA,
                0.0,
            ) for i in range(FocalLossTest.N_SAMPLES)])

            # forward parity
            torch.testing.assert_close(expected, actual)

            expected.backward()
            actual.backward()

            # grad parity
            torch.testing.assert_close(x_expected.grad, x_actual.grad)


if __name__ == "__main__":
    torch.manual_seed(42)
    unittest.main()
@@ -216,6 +216,7 @@ class TestFastLayerNorm(unittest.TestCase):
            10240,
            12288,
            12800,
            14336,
            15360,
            16384,
            18432,
...
@@ -40,7 +40,7 @@ class EncdecMultiheadAttnTest(unittest.TestCase):
                                              impl='fast')
        self.tst_layer.cuda().half()
        self.tst_layer.reset_parameters()

        self.tst_inputs_q = torch.randn(self.seq_length, self.sequences, self.hidden_dim,
                                        dtype=torch.float16, device=torch.device("cuda")).requires_grad_(True)
        self.tst_inputs_k = torch.randn(self.seq_length, self.sequences, self.hidden_dim,
@@ -49,22 +49,22 @@ class EncdecMultiheadAttnTest(unittest.TestCase):
    def test_encdec_multihead_attn(self) :
        grads = torch.randn_like(self.tst_inputs_q)

        ref_outputs,_ = self.ref_layer.forward(self.ref_inputs_q,
                                               self.ref_inputs_k,
                                               self.ref_inputs_k,
                                               key_padding_mask=None,
                                               need_weights=False,
                                               attn_mask=None,
                                               is_training=True)

        tst_outputs,_ = self.tst_layer.forward(self.tst_inputs_q,
                                               self.tst_inputs_k,
                                               self.tst_inputs_k,
                                               key_padding_mask=None,
                                               need_weights=False,
                                               attn_mask=None,
                                               is_training=True)

        self.ref_inputs_q.backward(grads)
        self.tst_inputs_q.backward(grads)
...
@@ -48,25 +48,26 @@ class EncdecMultiheadAttnNormAddTest(unittest.TestCase):
    def test_encdec_multihead_attn_norm_add(self) :
        grads = torch.randn_like(self.tst_inputs_q)

        for _ in range(5) :
            ref_outputs,_ = self.ref_layer.forward(self.ref_inputs_q,
                                                   self.ref_inputs_k,
                                                   self.ref_inputs_k,
                                                   key_padding_mask=None,
                                                   need_weights=False,
                                                   attn_mask=None,
                                                   is_training=True)

            tst_outputs,_ = self.tst_layer.forward(self.tst_inputs_q,
                                                   self.tst_inputs_k,
                                                   self.tst_inputs_k,
                                                   key_padding_mask=None,
                                                   need_weights=False,
                                                   attn_mask=None,
                                                   is_training=True)

            self.ref_inputs_q.backward(grads)
            self.tst_inputs_q.backward(grads)

        self.assertTrue(torch.allclose(self.ref_inputs_q, self.tst_inputs_q, atol=1e-5, rtol=1e-5))
        self.assertTrue(torch.allclose(self.ref_inputs_k, self.tst_inputs_k, atol=1e-5, rtol=1e-5))
...
@@ -15,52 +15,52 @@ class SelfMultiheadAttnTest(unittest.TestCase):
        self.heads = 16
        self.dropout_prob = 0.0

        self.ref_layer = SelfMultiheadAttn(self.hidden_dim,
                                           self.heads,
                                           dropout=self.dropout_prob,
                                           bias=False,
                                           include_norm_add=False,
                                           impl='default')
        self.ref_layer.cuda().half()
        self.ref_layer.reset_parameters()

        self.ref_inputs = torch.randn(self.seq_length, self.sequences, self.hidden_dim,
                                      dtype=torch.float16, device=torch.device("cuda")).requires_grad_(True)

        # Reset seed so parameters are identical
        torch.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)

        self.tst_layer = SelfMultiheadAttn(self.hidden_dim,
                                           self.heads,
                                           dropout=self.dropout_prob,
                                           bias=False,
                                           include_norm_add=False,
                                           impl='fast')
        self.tst_layer.cuda().half()
        self.tst_layer.reset_parameters()

        self.tst_inputs = torch.randn(self.seq_length, self.sequences, self.hidden_dim,
                                      dtype=torch.float16, device=torch.device("cuda")).requires_grad_(True)

    def test_self_multihead_attn(self):
        grads = torch.randn_like(self.tst_inputs)

        ref_outputs,_ = self.ref_layer.forward(self.ref_inputs,
                                               self.ref_inputs,
                                               self.ref_inputs,
                                               key_padding_mask=None,
                                               need_weights=False,
                                               attn_mask=None,
                                               is_training=True)

        tst_outputs,_ = self.tst_layer.forward(self.tst_inputs,
                                               self.tst_inputs,
                                               self.tst_inputs,
                                               key_padding_mask=None,
                                               need_weights=False,
                                               attn_mask=None,
                                               is_training=True)

        self.ref_inputs.backward(grads)
        self.tst_inputs.backward(grads)
@@ -73,23 +73,23 @@ class SelfMultiheadAttnTest(unittest.TestCase):
        time_mask_byte= torch.triu(torch.ones(self.tst_inputs.size(0), self.tst_inputs.size(0), device=torch.device("cuda"), dtype=torch.uint8), 1)
        time_mask_bool= time_mask_byte.to(torch.bool)

        ref_outputs,_ = self.ref_layer.forward(self.ref_inputs,
                                               self.ref_inputs,
                                               self.ref_inputs,
                                               key_padding_mask=None,
                                               need_weights=False,
                                               attn_mask=time_mask_bool,
                                               is_training=True)

        tst_outputs,_ = self.tst_layer.forward(self.tst_inputs,
                                               self.tst_inputs,
                                               self.tst_inputs,
                                               key_padding_mask=None,
                                               need_weights=False,
                                               attn_mask=time_mask_byte,
                                               is_training=True)

        self.ref_inputs.backward(grads)
        self.tst_inputs.backward(grads)
@@ -102,23 +102,23 @@ class SelfMultiheadAttnTest(unittest.TestCase):
        pad_mask_byte = torch.tril(torch.ones(self.tst_inputs.size(1), self.tst_inputs.size(0), device=torch.device("cuda"), dtype=torch.uint8), 1)
        pad_mask_bool = pad_mask_byte.to(torch.bool)

        ref_outputs,_ = self.ref_layer.forward(self.ref_inputs,
                                               self.ref_inputs,
                                               self.ref_inputs,
                                               key_padding_mask=pad_mask_bool,
                                               need_weights=False,
                                               attn_mask=None,
                                               is_training=True)

        tst_outputs,_ = self.tst_layer.forward(self.tst_inputs,
                                               self.tst_inputs,
                                               self.tst_inputs,
                                               key_padding_mask=pad_mask_byte,
                                               need_weights=False,
                                               attn_mask=None,
                                               is_training=True)

        self.ref_inputs.backward(grads)
        self.tst_inputs.backward(grads)
...
@@ -45,24 +45,25 @@ class SelfMultiheadAttnNormAddTest(unittest.TestCase):
    def test_self_multihead_attn_norm_add(self) :
        grads = torch.randn_like(self.tst_inputs)

        for _ in range(0, 5) :
            ref_outputs,_ = self.ref_layer.forward(self.ref_inputs,
                                                   self.ref_inputs,
                                                   self.ref_inputs,
                                                   key_padding_mask=None,
                                                   need_weights=False,
                                                   attn_mask=None,
                                                   is_training=True)

            tst_outputs,_ = self.tst_layer.forward(self.tst_inputs,
                                                   self.tst_inputs,
                                                   self.tst_inputs,
                                                   key_padding_mask=None,
                                                   need_weights=False,
                                                   attn_mask=None,
                                                   is_training=True)

            self.ref_inputs.backward(grads)
            self.tst_inputs.backward(grads)

        self.assertTrue(torch.allclose(self.ref_inputs, self.tst_inputs, atol=1e-5, rtol=1e-5))
        self.assertTrue(torch.allclose(ref_outputs, tst_outputs, atol=1e-3, rtol=1e-3))
...
from contextlib import contextmanager
import io
import os

import torch
from torch.testing._internal import common_utils

from apex.contrib.optimizers.distributed_fused_adam import DistributedFusedAdam
from apex.transformer.testing.distributed_test_base import NcclDistributedTestBase


class SimpleModel(torch.nn.Module):
    def __init__(self, num_layers, size):
        super().__init__()
        self.layers = torch.nn.ModuleList([
            torch.nn.Linear(size, size, bias=(i%3==0))
            for i in range(num_layers)
        ])
    def forward(self, x):
        y = 0
        for i, l in enumerate(self.layers):
            y += (i+1) * l(x)
        return y


def make_models(
        num_layers,
        size,
        dtype=torch.float32,
        param_sync_dtype=None,
        device='cuda',
        overlap_communication=True,
):

    # Construct models with same parameters
    ref_model = SimpleModel(num_layers, size).to(dtype=dtype, device=device)
    dist_model = SimpleModel(num_layers, size).to(dtype=dtype, device=device)
    with torch.no_grad():
        for ref_param, dist_param in zip(dist_model.parameters(),
                                         ref_model.parameters()):
            dist_param.copy_(ref_param)

    # Initialize reference model with data-parallelism
    rank = torch.distributed.get_rank()
    ref_model = torch.nn.parallel.DistributedDataParallel(
        ref_model,
        device_ids=[rank] if device=='cuda' else None,
        output_device=rank if device=='cuda' else None,
    )

    # Construct optimizers with same hyperparameters
    optim_args = dict(lr=0.1, betas=(0.1,0.2), eps=0.25, weight_decay=0.1)
    ref_optim = torch.optim.AdamW(
        [
            {'params': list(ref_model.parameters())[1::2], 'lr': 0.2},
            {'params': list(ref_model.parameters())[0::2]},
        ],
        **optim_args,
    )
    dist_optim = DistributedFusedAdam(
        [
            {'params': list(dist_model.parameters())[1::2], 'lr': 0.2},
            {'params': list(dist_model.parameters())[0::2]},
        ],
        overlap_grad_sync=overlap_communication,
        bucket_cap_mb=71/(4*1024*1024),
        dtype=torch.float32,
        param_sync_dtype=param_sync_dtype,
        **optim_args,
    )

    return ref_model, ref_optim, dist_model, dist_optim


@contextmanager
def dummy_context():
    try:
        yield
    finally:
        pass


class TestDistributedFusedAdam(NcclDistributedTestBase):

    seed = 1234

    def test_matches_pytorch(
            self,
            num_layers=11,
            layer_size=7,
            batch_size=3,
            num_steps=3,
            micro_batch_steps=3,
            overlap_communication=True,
            use_nosync=True,
            dtype=torch.float32,
            param_sync_dtype=None,
            device='cuda',
            rtol=None,
            atol=None,
    ):

        torch.manual_seed(self.seed + self.rank)

        # Identical models with data-parallel and ZeRO
        ref_model, ref_optim, dist_model, dist_optim = make_models(
            num_layers,
            layer_size,
            dtype=dtype,
            param_sync_dtype=param_sync_dtype,
            device=device,
            overlap_communication=overlap_communication,
        )

        # Training loop
        for step in range(num_steps):

            # Reset gradients
            ref_optim.zero_grad()
            dist_optim.zero_grad()

            # Forward and backward passes
            for micro_step in range(micro_batch_steps):

                # Synthetic data
                x = torch.rand(batch_size, layer_size) - 0.5
                dy = torch.rand_like(x) - 0.5
                x = x.to(dtype=dtype, device=device)
                dy = dy.to(dtype=dtype, device=device)

                # Reference implementation
                x_ref = x.detach().clone().requires_grad_(True)
                y_ref = ref_model(x_ref)
                y_ref.backward(dy)

                # Distributed implementation
                x_dist = x.detach().clone().requires_grad_(True)
                y_dist = dist_model(x_dist)
                backward_context = dummy_context
                if use_nosync and micro_step < micro_batch_steps-1:
                    backward_context = dist_optim.no_sync
                with backward_context():
                    y_dist.backward(dy)

                # Check that data tensors match
                torch.testing.assert_close(
                    y_dist, y_ref, rtol=rtol, atol=atol)
                torch.testing.assert_close(
                    x_dist.grad, x_ref.grad, rtol=rtol, atol=atol)

            # Optimization step
            ref_optim.step()
            dist_optim.step()

            # Check that parameters match
            for ref_param, dist_param in zip(ref_model.parameters(),
                                             dist_model.parameters()):
                torch.testing.assert_close(
                    dist_param, ref_param, rtol=rtol, atol=atol)

    def test_matches_pytorch_no_overlap(self):
        self.test_matches_pytorch(
            overlap_communication=False,
            use_nosync=False,
        )

    def test_matches_pytorch_sync_every_step(self):
        self.test_matches_pytorch(use_nosync=False)

    def test_matches_pytorch_fp64(self):
        self.test_matches_pytorch(
            dtype=torch.float64,
            rtol=1.3e-6,
            atol=1e-5,
        )

    def test_matches_pytorch_fp16(self):
        self.test_matches_pytorch(
            dtype=torch.float16,
            rtol=1e-2,
            atol=1e-2,
        )

    def test_matches_pytorch_allgather_fp16(self):
        self.test_matches_pytorch(
            dtype=torch.float32,
            param_sync_dtype=torch.float16,
            rtol=1e-2,
            atol=1e-2,
        )

    def test_raises_on_mismatch(self):

        torch.manual_seed(self.seed + self.rank)

        # Identical models with data-parallel and ZeRO
        num_layers = 11
        layer_size = 7
        ref_model, ref_optim, dist_model, dist_optim = make_models(
            num_layers,
            layer_size,
        )

        # Only perform training step with distributed model
        dist_optim.zero_grad()
        x = torch.rand(3, layer_size) + 0.5
        x = x.to(dtype=torch.float32, device='cuda')
        dy = torch.rand_like(x) + 0.5
        y = dist_model(x)
        y.backward(dy)
        dist_optim.step()

        # Check that parameters do not match
        for ref_param, dist_param in zip(ref_model.parameters(),
                                         dist_model.parameters()):
            self.assertRaises(
                AssertionError,
                torch.testing.assert_close,
                dist_param, ref_param,
            )

    def test_clip_grad_norm(self):

        torch.manual_seed(self.seed + self.rank)

        # Identical models with data-parallel and ZeRO
        ref_model, ref_optim, dist_model, dist_optim = make_models(1, 1)

        # Training steps with pre-determined gradients
        xs = [3, 1, 4, 1, 5, 9]
        dys = [1, -1, 1, -1, 1, -1]
        for x, dy in zip(xs, dys):
            x = torch.tensor([x], dtype=torch.float32, device='cuda')
            dy = torch.tensor([dy], dtype=torch.float32, device='cuda')

            # Reference implementation
            ref_optim.zero_grad()
            y_ref = ref_model(x.detach())
            y_ref.backward(dy.detach())
            ref_grad_norm = torch.nn.utils.clip_grad_norm_(ref_model.parameters(), 3.5)
            ref_optim.step()

            # Distributed implementation
            dist_optim.zero_grad()
            y_dist = dist_model(x.detach())
            y_dist.backward(dy.detach())
            dist_grad_norm = dist_optim.clip_grad_norm(3.5)
            dist_optim.step()

            # Check that parameters match
            torch.testing.assert_close(dist_grad_norm, ref_grad_norm)
            for ref_param, dist_param in zip(ref_model.parameters(),
                                             dist_model.parameters()):
                torch.testing.assert_close(dist_param, ref_param)

    def test_grad_scaler(self):

        torch.manual_seed(self.seed + self.rank)

        # Identical models with data-parallel and ZeRO
        ref_model, ref_optim, dist_model, dist_optim = make_models(1, 1)
        grad_scaler_args = dict(
            init_scale=3.21,
            growth_factor=1.23,
            backoff_factor=0.876,
            growth_interval=1,
        )
        ref_scaler = torch.cuda.amp.GradScaler(**grad_scaler_args)
        dist_scaler = torch.cuda.amp.GradScaler(**grad_scaler_args)

        # Training steps with pre-determined gradients
        xs = [3, 1, 4, 1, 5, 9]
        dys = [1, float('inf'), 1, 1, float('nan'), -1]
        for x, dy in zip(xs, dys):
            x = torch.tensor([x], dtype=torch.float32, device='cuda')
            dy = torch.tensor([dy], dtype=torch.float32, device='cuda')

            # Reference implementation
            ref_optim.zero_grad()
            y_ref = ref_model(x.detach())
            ref_scaler.scale(y_ref).backward(dy.detach())
            ref_scaler.step(ref_optim)
            ref_scaler.update()

            # Distributed implementation
            dist_optim.zero_grad()
            y_dist = dist_model(x.detach())
            dist_scaler.scale(y_dist).backward(dy.detach())
            dist_scaler.step(dist_optim)
            dist_scaler.update()

            # Check that parameters match
            for ref_param, dist_param in zip(ref_model.parameters(),
                                             dist_model.parameters()):
                torch.testing.assert_close(dist_param, ref_param)

    def test_checkpoint(self):

        # Construct two models with same config and different params
        num_layers = 5
        layer_size = 2
        torch.manual_seed(self.seed + self.rank)
        _, _, model_save, optim_save = make_models(num_layers, layer_size)
        _, _, model_load, optim_load = make_models(num_layers, layer_size)

        # Train one of the models
        num_steps = 3
        micro_batch_steps = 2
        batch_size = 4
        for step in range(num_steps):
            optim_save.zero_grad()
            for micro_step in range(micro_batch_steps):
                x = torch.rand(batch_size, layer_size) - 0.5
                dy = torch.rand_like(x) - 0.5
                x = x.cuda()
                dy = dy.cuda()
                y = model_save(x)
                y.backward(dy)
            optim_save.step()

        # Make sure models are different
        for param_save, param_load in zip(model_save.parameters(),
                                          model_load.parameters()):
            self.assertRaises(
                AssertionError,
                torch.testing.assert_close,
                param_load, param_save,
            )

        # Save state on root rank and load on all ranks
        state_dict = {
            'model': model_save.state_dict(),
            'optim': optim_save.state_dict(),
        }
        if self.rank == 0:
            state_bytes = io.BytesIO()
            torch.save(state_dict, state_bytes)
            state_bytes = [state_bytes.getvalue()]
        else:
            state_bytes = [None]
        torch.distributed.broadcast_object_list(state_bytes, src=0)
        state_bytes = io.BytesIO(state_bytes[0])
        state_dict = torch.load(state_bytes, map_location='cuda')
        model_load.load_state_dict(state_dict['model'])
        optim_load.load_state_dict(state_dict['optim'])

        # Make sure models are identical
        for param_save, param_load in zip(model_save.parameters(),
                                          model_load.parameters()):
            torch.testing.assert_close(param_load, param_save)

        # Train both models
        num_steps = 3
        micro_batch_steps = 3
        batch_size = 5
        for step in range(num_steps):

            # Reset gradients
            optim_save.zero_grad()
            optim_load.zero_grad()

            # Forward and backward passes
            for micro_step in range(micro_batch_steps):

                # Synthetic data
                x = torch.rand(batch_size, layer_size) - 0.5
                dy = torch.rand_like(x) - 0.5
                x = x.cuda()
                dy = dy.cuda()

                # Forward and backward pass
                x_save = x.detach().clone().requires_grad_(True)
                y_save = model_save(x_save)
                y_save.backward(dy)
                x_load = x.detach().clone().requires_grad_(True)
                y_load = model_load(x_load)
                y_load.backward(dy)

                # Check that data tensors match
                torch.testing.assert_close(y_load, y_save)
                torch.testing.assert_close(x_load.grad, x_save.grad)

            # Optimizer step
            optim_save.step()
            optim_load.step()

            # Check that parameters match
            for param_save, param_load in zip(model_save.parameters(),
                                              model_load.parameters()):
                torch.testing.assert_close(param_load, param_save)


if __name__ == "__main__":
    # Assume script has been run with torchrun
    common_utils.run_tests()
@@ -115,6 +115,7 @@ class FusedAdam(torch.optim.Optimizer):

            # create lists for multi-tensor apply
            g_16, p_16, m_16, v_16 = [], [], [], []
            g_bf, p_bf, m_bf, v_bf = [], [], [], []
            g_32, p_32, m_32, v_32 = [], [], [], []

            for p in group['params']:
@@ -136,6 +137,11 @@ class FusedAdam(torch.optim.Optimizer):
                    p_16.append(p.data)
                    m_16.append(state['exp_avg'])
                    v_16.append(state['exp_avg_sq'])
                elif p.dtype == torch.bfloat16:
                    g_bf.append(p.grad)
                    p_bf.append(p)
                    m_bf.append(state['exp_avg'])
                    v_bf.append(state['exp_avg_sq'])
                elif p.dtype == torch.float32:
                    g_32.append(p.grad.data)
                    p_32.append(p.data)
@@ -156,6 +162,20 @@ class FusedAdam(torch.optim.Optimizer):
                                     self.adam_w_mode,
                                     bias_correction,
                                     group['weight_decay'])
            if g_bf:
                multi_tensor_applier(
                    self.multi_tensor_adam,
                    self._dummy_overflow_buf,
                    [g_bf, p_bf, m_bf, v_bf],
                    group['lr'],
                    beta1,
                    beta2,
                    group['eps'],
                    group['step'],
                    self.adam_w_mode,
                    bias_correction,
                    group['weight_decay'],
                )
            if(len(g_32) > 0):
                multi_tensor_applier(self.multi_tensor_adam,
                                     self._dummy_overflow_buf,
...
1. How do I intercept the Adam optimizer in APEX?
```python
from apex import pyprof
import fused_adam_cuda
pyprof.nvtx.wrap(fused_adam_cuda, 'adam')
```
2. If you are using JIT and/or AMP, the correct initialization sequence is (see the sketch after this list):
   1. Let any JIT compilation finish.
   2. Initialize pyprof: `pyprof.nvtx.init()`.
   3. Initialize AMP.
3. How do I profile with `torch.distributed.launch`?
```sh
nvprof -f -o net%p.sql \
    --profile-from-start off \
    --profile-child-processes \
    python -m torch.distributed.launch net.py
```
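For item 2, here is a minimal sketch of that ordering. It assumes the `apex.amp` API; the toy model, optimizer, and `opt_level` are placeholders, not part of the original instructions.
```python
import torch
from apex import amp, pyprof

# Placeholder model/optimizer; any JIT scripting or tracing is assumed to be done by now (step 1).
model = torch.nn.Linear(10, 10).cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

# Step 2: initialize pyprof before AMP (per the ordering above).
pyprof.nvtx.init()

# Step 3: initialize AMP last.
model, optimizer = amp.initialize(model, optimizer, opt_level="O1")
```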
## PyProf - PyTorch Profiling tool
### What does this tool do?
Analyzing the performance of deep neural networks is hard. Getting kernels out of [NvProf](https://developer.nvidia.com/nvidia-visual-profiler) or [NSight Compute](https://developer.nvidia.com/nsight-compute) provides the generic kernel name and its execution time, but no detailed information regarding the following:
- Which layer launched it: e.g. the association of `ComputeOffsetsKernel` with a concrete PyTorch layer or API is not obvious.
- What the tensor dimensions and precision were: without knowing the tensor dimensions and precision, it's impossible to reason about whether the actual (silicon) kernel time is close to maximum performance of such a kernel on the GPU. Knowing the tensor dimensions and precision, we can figure out the FLOPs and bandwidth required by a layer, and then determine how close to maximum performance the kernel is for that operation.
- Forward-backward correlation: currently it's very hard to determine what the forward pass step was that resulted in the particular weight and data gradients (wgrad, dgrad), which makes it difficult to determine the tensor dimensions required by these backprop steps to assess their performance.
- Did the kernel use [Tensor Cores](https://www.youtube.com/watch?v=yyR0ZoCeBO8)?
- Which line in the user's code resulted in launching this particular kernel (program trace)?
PyProf addresses all of the issues above by:
1. Instrumenting PyTorch operations to capture the tensor dimensions and precision using [NVTX](https://devblogs.nvidia.com/cuda-pro-tip-generate-custom-application-profile-timelines-nvtx). This information is recorded at profile capture time, e.g. using [NvProf](https://developer.nvidia.com/nvidia-visual-profiler). (A rough illustration of the NVTX mechanism follows this list.)
2. Querying the record produced by the profiler to correlate the kernel name and duration with PyTorch API/layer name, tensor dimensions, tensor precision, as well as calculating FLOPs and bandwidth for common operations. In addition, extra information from the profile is added for use by CUDA professionals, such as CUDA launch parameters (block/grid dimensions).
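The sketch below only illustrates the NVTX primitive that step 1 builds on; it is not pyprof's internal code, and the marker string and tensor shapes are invented for the example.
```python
import torch

x = torch.randn(64, 1024, device="cuda", dtype=torch.float16)
w = torch.randn(1024, 1024, device="cuda", dtype=torch.float16)

# A marker pushed around an op is what lets the parser later associate the
# launched kernels with an op name and its tensor shapes/precision.
torch.cuda.nvtx.range_push("linear: M=64, N=1024, K=1024, dtype=fp16")
y = torch.matmul(x, w)
torch.cuda.nvtx.range_pop()
```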
Regarding FLOP and bandwidth implementations, these are usually quite straightforward. For example, for matrices A<sub>MxK</sub> and B<sub>KxN</sub>, the FLOP count for a matrix multiplication is 2 * M * N * K, and bandwidth is M * K + N * K + M * N. Note that these numbers are based on the algorithm, not the actual performance of the specific kernel. For more details, see NVIDIA's [Deep Learning Performance Guide](https://docs.nvidia.com/deeplearning/sdk/dl-performance-guide/index.html).
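As a quick back-of-the-envelope sketch of those two formulas (illustrative only; pyprof performs these calculations itself, and the fp16 byte size is an added assumption):
```python
# GEMM C = A @ B with A of shape MxK and B of shape KxN
M, N, K = 1024, 1024, 1024

flops = 2 * M * N * K             # one multiply and one add per output element, per K
elements = M * K + K * N + M * N  # read A and B, write C
bytes_moved = 2 * elements        # assuming fp16, i.e. 2 bytes per element

print(flops, bytes_moved)         # 2147483648 FLOPs, 6291456 bytes
```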
Armed with such information, the user can determine various issues to help them tune the network. For instance, according to the [Tensor Core Performance Guide](https://docs.nvidia.com/deeplearning/sdk/dl-performance-guide/index.html), the M, N and K dimensions that result in Tensor Core usage need to be divisible by 8. In fact, PyProf comes with a flag that lets the user obtain information regarding whether Tensor Cores were used by the kernel. Other useful information might include knowing that a particular kernel did not exploit much thread parallelism, as determined by the grid/block dimensions. Since many PyTorch kernels are open-source (or even custom written by the user, as in [CUDA Extensions](https://pytorch.org/tutorials/advanced/cpp_extension.html)), this provides the user with information that helps root cause performance issues and prioritize optimization work.
### How to get started?
1. Add the following lines to your PyTorch network:
```python
import torch.cuda.profiler as profiler
from apex import pyprof
pyprof.nvtx.init()
```
Run the training/inference loop with PyTorch's [NVTX context manager](https://pytorch.org/docs/stable/_modules/torch/autograd/profiler.html#emit_nvtx)
`with torch.autograd.profiler.emit_nvtx()`. Optionally, you can
use `profiler.start()` and `profiler.stop()` to pick an iteration
(say after warm-up) for which you would like to capture data.
Here's an example:
```python
iters = 500
iter_to_capture = 100
# Define network, loss function, optimizer etc.
# PyTorch NVTX context manager
with torch.autograd.profiler.emit_nvtx():
for iter in range(iters):
if iter == iter_to_capture:
profiler.start()
output = net(images)
loss = criterion(output, labels)
loss.backward()
optimizer.step()
if iter == iter_to_capture:
profiler.stop()
```
2. Run NVprof to generate a SQL (NVVP) file. This file can be opened with NVVP, as usual.
```sh
# If you used profiler.start() and profiler.stop() in net.py
nvprof -f -o net.sql --profile-from-start off -- python net.py
# Profile everything
nvprof -f -o net.sql -- python net.py
```
**Note:** if you're experiencing issues with hardware counters and you get a message such as `**_ERR_NVGPUCTRPERM The user running <tool_name/application_name> does not have permission to access NVIDIA GPU Performance Counters on the target device_**`, please follow the steps described in [Hardware Counters](#hardware-counters).
3. Run parser on the SQL file. The output is an ASCII file. Each line
is a python dictionary which contains information about the kernel name,
duration, parameters etc. This file can be used as input to other custom
scripts as well.
```sh
python -m apex.pyprof.parse net.sql > net.dict
```
4. Run the profiler. The input is the python dictionary created above. The tool can produce a CSV output, a columnated output (similar to `column -t`, for terminal readability) and a space-separated output (for post-processing with AWK, for instance). The tool produces 20 columns of information for every GPU kernel, but you can select a subset of columns using the `-c` flag. Note that a few columns might have the value "na", implying the column is either a work in progress or the tool was unable to extract that information. Assuming the directory is `prof`, here are a few examples of how to use `prof.py`.
```sh
# Print usage and help. Lists all available output columns.
python -m apex.pyprof.prof -h
# Columnated output of width 150 with some default columns.
python -m apex.pyprof.prof -w 150 net.dict
# CSV output.
python -m apex.pyprof.prof --csv net.dict
# Space separated output.
python -m apex.pyprof.prof net.dict
# Columnated output of width 130 with columns index,direction,kernel name,parameters,silicon time.
python -m apex.pyprof.prof -w 130 -c idx,dir,kernel,params,sil net.dict
# CSV output with columns index,direction,kernel name,parameters,silicon time.
python -m apex.pyprof.prof --csv -c idx,dir,kernel,params,sil net.dict
# Space separated output with columns index,direction,kernel name,parameters,silicon time.
python -m apex.pyprof.prof -c idx,dir,kernel,params,sil net.dict
# Input redirection.
python -m apex.pyprof.prof < net.dict
```
5. Profile-guided optimization
If kernels that do matrix multiplication/GEMM or convolution use half precision (fp16) data but do not use Tensor Cores (the TC column in the profile analysis output doesn't show a "1"), one can follow some basic steps to increase the likelihood that a Tensor Core-compatible kernel will be chosen. For example, for GEMMs, M, N and K should be divisible by 8, and for convolutions, the number of input and output channels should be divisible by 8 (see the short sketch after the links below). For more information, see detailed Tensor Core guides such as:
- Blog Post: [Tips for Optimizing GPU Performance Using Tensor Cores](https://devblogs.nvidia.com/optimizing-gpu-performance-tensor-cores/)
- GTC Talk: [Tensor Core Deep Learning Performance Guide](https://developer.download.nvidia.com/video/gputechconf/gtc/2019/presentation/s9926-tensor-core-performance-the-ultimate-guide.pdf)
For both Tensor Core and non-Tensor Core Deep Learning performance optimization tips, see NVIDIA's [Deep Learning Performance Guide](https://docs.nvidia.com/deeplearning/sdk/dl-performance-guide/index.html).
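The following is a small sketch of the divisibility heuristic above; the helper function is hypothetical and not part of apex or pyprof.
```python
import torch

def round_up_to_multiple_of_8(n: int) -> int:
    """Hypothetical helper: round a dimension up to the next multiple of 8."""
    return ((n + 7) // 8) * 8

# 30 input / 61 output channels would miss the heuristic; padding the channel
# counts to 32 / 64 makes Tensor Core eligible fp16 kernels much more likely.
in_ch = round_up_to_multiple_of_8(30)    # 32
out_ch = round_up_to_multiple_of_8(61)   # 64
conv = torch.nn.Conv2d(in_ch, out_ch, kernel_size=3, padding=1).cuda().half()
```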
### TODOs
1. The support for conv transpose is currently missing.
2. PyProf currently works only with NvProf, but Nsight Compute support will be added in the future.
### Example
1. Run `nvprof` on the LeNet model in `examples/lenet.py`. This will output a SQL file called `net.sql`.
```sh
nvprof -f -o net.sql --profile-from-start off -- python examples/lenet.py
```
**Note**: DO NOT add --analysis-metrics since that will change which table nvprof writes the kernels to (`CUPTI_ACTIVITY_KIND_KERNEL` instead of the usual `CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL`). Support for running with metrics may be added in the future.
If you don't care about a full correlation analysis and you'd just like to view the timeline with detailed NVTX annotations, you can do so, e.g. in the NVIDIA Visual Profiler (NVVP). For example, you can call `nvvp net.sql` to view the annotated timeline.
2. Run the `parse.py` script on `net.sql` to extract kernel and runtime information and
save it as `net.dict`.
```sh
python -m apex.pyprof.parse net.sql > net.dict
```
This will produce a text file, which can be parsed by any external tool, but it can also be directly read one line at a time by Python by calling `eval` on the line being read.
**Note: you do not need to process this output manually.** Here the output is just shown as an example of modularity - you can process the raw data yourself, or let the next step enrich the information further and dump a CSV.
The output of this step will look as follows. Note that the dictionary has a lot more keys than the ones shown in the example.
```
>>> with open('torchvision.resnet50.adam.64.dict') as f:
... for line in f:
... d = eval(line)
... print(d['kShortName'], d['op'], d['kDuration'], d['block'], d['grid'], d['device'], d['stream'], d['trace'])
...
nchwToNhwc3To4Kernel ['conv2d'] 376324 (256, 1, 1) (1568, 1, 64) 0 7 ['imagenet.py:137', 'imagenet.py:129', '/opt/conda/lib/python3.6/site-packages/torchvision/models/resnet.py:195']
generic4Channel_kernel ['conv2d'] 10720 (512, 1, 1) (19, 1, 1) 0 7 ['imagenet.py:137', 'imagenet.py:129', '/opt/conda/lib/python3.6/site-packages/torchvision/models/resnet.py:195']
first_layer_fwd_kernel ['conv2d'] 411204 (128, 1, 1) (2, 7, 64) 0 7 ['imagenet.py:137', 'imagenet.py:129', '/opt/conda/lib/python3.6/site-packages/torchvision/models/resnet.py:195']
nhwcToNchwKernel ['conv2d'] 342371 (256, 1, 1) (392, 2, 64) 0 7 ['imagenet.py:137', 'imagenet.py:129', '/opt/conda/lib/python3.6/site-packages/torchvision/models/resnet.py:195']
elementwise_kernel ['__iadd__'] 2816 (128, 1, 1) (1, 1, 1) 0 7 ['imagenet.py:137', 'imagenet.py:129', '/opt/conda/lib/python3.6/site-packages/torchvision/models/resnet.py:196']
batch_norm_collect_statistics_kernel ['batch_norm', 'batch_norm'] 929513 (512, 1, 1) (64, 1, 1) 0 7 ['imagenet.py:137', 'imagenet.py:129', '/opt/conda/lib/python3.6/site-packages/torchvision/models/resnet.py:196']
```
3. Run the `prof.py` script on `net.dict` to summarize the results into a CSV file, or to display the pretty-printed results on the screen. This step processes the raw output from step 2 to generate a nice output, but it also adds a lot of extra useful information inferred from the previous step, such as:
- FLOPs
- bandwidth (bytes in and out of GPU DRAM)
- tensor core usage
```sh
python -m apex.pyprof.prof --csv net.dict > results.csv
```
You can choose which columns you'd like to display. Here's a list from calling `python -m apex.pyprof.prof -h`:
```
idx: Index
seq: PyTorch Sequence Id
altseq: PyTorch Alternate Sequence Id
tid: Thread Id
layer: User annotated NVTX string (can be nested)
trace: Function Call Trace
dir: Direction
sub: Sub Sequence Id
mod: Module
op: Operation
kernel: Kernel Name
params: Parameters
sil: Silicon Time (in ns)
tc: Tensor Core Usage
device: GPU Device Id
stream: Stream Id
grid: Grid Dimensions
block: Block Dimensions
flops: Floating point ops (FMA = 2 FLOPs)
bytes: Number of bytes in and out of DRAM
```
Let's have a look at the pretty-printed output:
```
python -m apex.pyprof.prof -w 100 -c kernel,op,sil,tc,flops,bytes,device,stream,block,grid torchvision.resnet50.adam.64.dict
Kernel Op Sil(ns) TC FLOPs Bytes Dev Str Block Grid
elementwise_kernel relu 381028 - 51380224 205520896 0 7 512,1,1 100352,1,1
volta_fp16_s884cudn conv2d 160002 1 1644167168 51388416 0 7 256,1,1 784,1,1
elementwise_kernel relu 96545 - 12845056 51380224 0 7 512,1,1 25088,1,1
volta_fp16_s884cudn conv2d 346083 1 6576668672 128483328 0 7 256,1,1 784,2,1
```
Not using the pretty-print width (`-w`) option and adding `--csv` results in a CSV output instead:
```
python -m apex.pyprof.prof --csv -c kernel,mod,op,dir,sil,tc,flops,bytes,device,stream,block,grid torchvision.resnet50.adam.64.dict
"Kernel","Module","Op","Direction","Sil(ns)","TC","FLOPs","Bytes","Device","Stream","Block","Grid"
"nchwToNhwc3To4Kernel","torch.nn.functional","conv2d","fprop","376324","-","0","0","0","7","256,1,1","1568,1,64"
"generic4Channel_kernel","torch.nn.functional","conv2d","fprop","10720","-","0","0","0","7","512,1,1","19,1,1"
"first_layer_fwd_kernel","torch.nn.functional","conv2d","fprop","411204","-","0","0","0","7","128,1,1","2,7,64"
"nhwcToNchwKernel","torch.nn.functional","conv2d","fprop","342371","-","0","0","0","7","256,1,1","392,2,64"
"elementwise_kernel","Tensor","__iadd__","fprop","2816","-","1.0","8","0","7","128,1,1","1,1,1"
"batch_norm_collect_statistics_kernel","torch.nn.functional","batch_norm","fprop","929513","-","411041792","411041792","0","7","512,1,1","64,1,1"
"batch_norm_transform_input_kernel","torch.nn.functional","batch_norm","fprop","377539","-","411041792","411041792","0","7","512,1,1","64,64,1"
"elementwise_kernel","torch.nn.functional","relu","fprop","381028","-","51380224","205520896","0","7","512,1,1","100352,1,1"
"MaxPoolForward","torch.nn.functional","max_pool2d","fprop","406531","-","0","0","0","7","256,1,1","50176,1,1"
"cudnn::gemm::computeOffsetsKernel","torch.nn.functional","conv2d","fprop","2464","-","0","0","0","7","128,1,1","25,1,1"
```
### Hardware Counters
Profiling GPU workloads may require access to [hardware performance counters](https://en.wikipedia.org/wiki/Hardware_performance_counter). Due to a [fix](https://nvidia.custhelp.com/app/answers/detail/a_id/4738) in recent NVIDIA drivers addressing [CVE‑2018‑6260](https://nvd.nist.gov/vuln/detail/CVE-2018-6260), the hardware counters are disabled by default, and require elevated privileges to be enabled again. If you're using a recent driver, you may see the following message when trying to run nvprof:
```**_ERR_NVGPUCTRPERM The user running <tool_name/application_name> does not have permission to access NVIDIA GPU Performance Counters on the target device._**```
For details, see [here](https://developer.nvidia.com/nvidia-development-tools-solutions-ERR_NVGPUCTRPERM-permission-issue-performance-counters).
_Permanent solution_
Follow the steps [here](https://developer.nvidia.com/nvidia-development-tools-solutions-ERR_NVGPUCTRPERM-permission-issue-performance-counters). The current steps for Linux are:
```
sudo systemctl isolate multi-user
sudo modprobe -r nvidia_uvm nvidia_drm nvidia_modeset nvidia-vgpu-vfio nvidia
sudo modprobe nvidia NVreg_RestrictProfilingToAdminUsers=0
sudo systemctl isolate graphical
```
The above steps should result in a permanent change.
_Temporary solution_
When running on bare metal, you can run nvprof with `sudo`.
If you're running in a Docker image, you can temporarily elevate your privileges with one of the following (oldest to newest syntax):
<pre>
nvidia-docker run <b>--privileged</b>
docker run --runtime nvidia <b>--privileged</b>
docker run --gpus all <b>--privileged</b>
</pre>
import warnings
from . import nvtx, prof
__pycache__
*.sql
*.dict
*.csv
This directory has examples of how to use `pyprof` with APEX extensions e.g. `fused_adam_cuda` and `fused_layer_norm_cuda`.
import torch
import fused_adam_cuda
from apex.optimizers import FusedAdam, FP16_Optimizer
from apex import pyprof
pyprof.nvtx.init()
pyprof.nvtx.wrap(fused_adam_cuda, 'adam')
model = torch.nn.Linear(10, 20).cuda().half()
criterion = torch.nn.CrossEntropyLoss().cuda()
optimizer = FusedAdam(model.parameters())
optimizer = FP16_Optimizer(optimizer)
x = torch.ones(32, 10).cuda().half()
target = torch.empty(32, dtype=torch.long).random_(20).cuda()
y = model(x)
loss = criterion(y, target)
optimizer.zero_grad()
loss.backward()
optimizer.step()
import torch
import fused_layer_norm_cuda
from apex.normalization import FusedLayerNorm
from apex import pyprof
pyprof.nvtx.init()
pyprof.nvtx.wrap(fused_layer_norm_cuda, 'forward')
pyprof.nvtx.wrap(fused_layer_norm_cuda, 'backward')
pyprof.nvtx.wrap(fused_layer_norm_cuda, 'forward_affine')
pyprof.nvtx.wrap(fused_layer_norm_cuda, 'backward_affine')
input = torch.randn(20, 5, 10, 10).cuda()
# With Learnable Parameters
m = FusedLayerNorm(input.size()[1:]).cuda()
output = m(input)
# Without Learnable Parameters
m = FusedLayerNorm(input.size()[1:], elementwise_affine=False).cuda()
output = m(input)
# Normalize over last two dimensions
m = FusedLayerNorm([10, 10]).cuda()
output = m(input)
# Normalize over last dimension of size 10
m = FusedLayerNorm(10).cuda()
output = m(input)
#!/bin/bash
set -e
SCRIPT=`realpath $0`
SCRIPTPATH=`dirname $SCRIPT`
PYPROF="$SCRIPTPATH/../.."
parse="python $PYPROF/parse/parse.py"
prof="python $PYPROF/prof/prof.py"
for f in *.py
do
    base=`basename $f .py`
    sql=$base.sql
    dict=$base.dict

    #NVprof
    echo "nvprof -fo $sql python $f"
    nvprof -fo $sql python $f

    #Parse
    echo $parse $sql
    $parse $sql > $dict

    #Prof
    echo $prof $dict
    $prof -w 130 $dict
    \rm $sql $dict
done
This directory has examples which show how to intercept (monkey patch) custom functions and modules with `pyprof`. No changes are required in `pyprof/parse`; however, users can add support for bytes and flops calculation for custom functions and modules in `pyprof/prof` by extending the `OperatorLayerBase` class, as roughly sketched below.
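The sketch below is only a rough illustration of what such an extension might look like. The import path, the constructor argument, and the exact set of methods shown are assumptions modeled on the built-in operator classes; check `pyprof/prof/base.py` for the authoritative interface.
```python
# Hypothetical sketch; the method names below are assumed, not a verified API.
from apex.pyprof.prof.base import OperatorLayerBase  # assumed module path

class MyCustomOp(OperatorLayerBase):
    """Illustrative flop/byte accounting for a custom elementwise op."""

    def __init__(self, d):
        # 'd' is assumed to be the parsed kernel record (one line of net.dict).
        self.n = 1024  # a real handler would derive the element count from the marker in 'd'

    def op(self):     return "my_custom_op"
    def mod(self):    return "my_module"
    def tc(self):     return "-"                 # no Tensor Core usage
    def params(self): return {"n": self.n}
    def flops(self):  return self.n              # one FLOP per element (illustrative)
    def bytes(self):  return 2 * self.n * 4      # fp32 read + write (illustrative)
```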