Commit aee5aff4 authored by Deyu Fu

Merge branch 'master' into deyuf/fused_optimizer_v2

parents 007c5947 880ab925
import sys
import argparse
def parseArgs():
"""
Print usage and parse arguments.
"""
def check_cols(value):
valid = ["idx", "seq", "altseq", "tid", "layer", "trace", "dir", "sub", "mod", "op", "kernel", "params", "sil", "tc", "device", "stream", "grid", "block", "flops", "bytes"]
cols = value.split(",")
for col in cols:
if col not in valid:
raise argparse.ArgumentTypeError("{} is not a valid column name. Valid column names are {}.".format(col, ",".join(valid)))
return cols
def openFile(f):
try:
d = open(f, "r")
return d
except IOError:
print("Error opening file {}. Exiting.".format(f), file=sys.stderr)
sys.exit(1)
parser = argparse.ArgumentParser(prog=sys.argv[0], description="PyTorch Profiler", formatter_class=argparse.RawTextHelpFormatter)
parser.add_argument("file",
nargs='?',
type=str,
default=None,
help="Output of parse.py (Python dictionary).")
parser.add_argument("-c",
type=check_cols,
default="idx,dir,sub,mod,op,kernel,params,sil",
help='''Comma separated names of columns to print.
idx: Index
seq: PyTorch Sequence Id
altseq: PyTorch Alternate Sequence Id
tid: Thread Id
layer: User annotated NVTX string (can be nested)
trace: Function Call Trace
dir: Direction
sub: Sub Sequence Id
mod: Module
op: Operation
kernel: Kernel Name
params: Parameters
sil: Silicon Time (in ns)
tc: Tensor Core Usage
device: GPU Device Id
stream: Stream Id
grid: Grid Dimensions
block: Block Dimensions
flops: Floating point ops (FMA = 2 FLOPs)
bytes: Number of bytes in and out of DRAM
e.g. -c idx,kernel,sil''')
group = parser.add_mutually_exclusive_group()
group.add_argument("--csv",
action="store_true",
default=False,
help="Print a CSV output.")
group.add_argument("-w",
type=int,
default=0,
help="Width of columnated output.")
args = parser.parse_args()
if args.file is None:
args.file = sys.stdin
else:
args.file = openFile(args.file)
return args
from functools import reduce
class Utility(object):
@staticmethod
def numElems(shape):
assert (type(shape) == tuple)
return reduce(lambda x,y: x*y, shape, 1)
@staticmethod
def typeToBytes(t):
if (t in ["uint8", "int8", "byte", "char"]):
return 1
elif (t in ["float16", "half", "int16", "short"]):
return 2
elif (t in ["float32", "float", "int32", "int"]):
return 4
elif (t in ["int64", "long", "float64", "double"]):
return 8
assert False
@staticmethod
def typeToString(t):
if (t in ["uint8", "byte", "char"]):
return "uint8"
elif (t in ["int8",]):
return "int8"
elif (t in ["int16", "short",]):
return "int16"
elif (t in ["float16", "half"]):
return "fp16"
elif (t in ["float32", "float"]):
return "fp32"
elif (t in ["int32", "int",]):
return "int32"
elif (t in ["int64", "long"]):
return "int64"
elif (t in ["float64", "double",]):
return "fp64"
assert False
@staticmethod
def hasNVTX(marker):
if type(marker) is str:
try:
marker = eval(marker)
except:
return False
if type(marker) is dict:
keys = marker.keys()
return ("mod" in keys) and ("op" in keys) and ("args" in keys)
else:
return False
@staticmethod
def isscalar(t):
return (t in ["float", "int"])
@@ -795,11 +795,13 @@ void cuda_layer_norm_gradient(
invvar->data<accscalar_t>(),
input,
n1,n2,
gamma->data<scalar_t_0>(),
beta->data<scalar_t_0>(),
// TMJ pass NULL argument for gamma, beta, grad_gamma and grad_beta
// if gamma Tensor is NULL on input.
gamma != NULL ? gamma->data<scalar_t_0>() : NULL,
gamma != NULL ? beta->data<scalar_t_0>() : NULL,
epsilon,
grad_input->data<scalar_t_0>(),
grad_gamma->data<scalar_t_0>(),
grad_beta->data<scalar_t_0>());
gamma != NULL ? grad_gamma->data<scalar_t_0>() : NULL,
gamma != NULL ? grad_beta->data<scalar_t_0>() : NULL);
)
}
Under construction...
# Mixed Precision DCGAN Training in PyTorch
`main_amp.py` is based on [https://github.com/pytorch/examples/tree/master/dcgan](https://github.com/pytorch/examples/tree/master/dcgan).
It implements Automatic Mixed Precision (Amp) training of the DCGAN example for different datasets. Command-line flags forwarded to `amp.initialize` are used to easily manipulate and switch between various pure and mixed precision "optimization levels" or `opt_level`s. For a detailed explanation of `opt_level`s, see the [updated API guide](https://nvidia.github.io/apex/amp.html).
We introduce these changes to the PyTorch DCGAN example as described in the [Multiple models/optimizers/losses](https://nvidia.github.io/apex/advanced.html#multiple-models-optimizers-losses) section of the documentation:
```
# Added after models and optimizers construction
[netD, netG], [optimizerD, optimizerG] = amp.initialize(
    [netD, netG], [optimizerD, optimizerG], opt_level=opt.opt_level, num_losses=3)
...
# loss.backward() changed to:
with amp.scale_loss(errD_real, optimizerD, loss_id=0) as errD_real_scaled:
    errD_real_scaled.backward()
...
with amp.scale_loss(errD_fake, optimizerD, loss_id=1) as errD_fake_scaled:
    errD_fake_scaled.backward()
...
with amp.scale_loss(errG, optimizerG, loss_id=2) as errG_scaled:
    errG_scaled.backward()
```
Note that we use different `loss_scalers` for each computed loss.
Using a separate loss scaler per loss is [optional, not required](https://nvidia.github.io/apex/advanced.html#optionally-have-amp-use-a-different-loss-scaler-per-loss).
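If per-loss scaling is not needed, the `loss_id` argument can simply be omitted and Amp falls back to a single default loss scaler. A minimal sketch, reusing the `errD_real`/`optimizerD` names from the snippet above:
```
# One shared loss scaler for every backward pass (loss_id omitted)
with amp.scale_loss(errD_real, optimizerD) as scaled_loss:
    scaled_loss.backward()
```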
To improve numerical stability, we replaced `nn.Sigmoid() + nn.BCELoss()` with `nn.BCEWithLogitsLoss()`, as sketched below.
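Concretely, the final `nn.Sigmoid()` layer is dropped from the discriminator so that it outputs raw logits, and the sigmoid is folded into the loss. A minimal before/after sketch (the commented lines reflect the original DCGAN example):
```
# Original example: discriminator ends with nn.Sigmoid(), plain BCE on probabilities
#   nn.Conv2d(ndf * 8, 1, 4, 1, 0, bias=False),
#   nn.Sigmoid()
# criterion = nn.BCELoss()

# This example: discriminator ends with the raw convolution output (logits)
#   nn.Conv2d(ndf * 8, 1, 4, 1, 0, bias=False),
criterion = nn.BCEWithLogitsLoss()
```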
With the new Amp API **you never need to explicitly convert your model, or the input data, to half().**
"Pure FP32" training:
```
$ python main_amp.py --opt_level O0
```
Recommended mixed precision training:
```
$ python main_amp.py --opt_level O1
```
Have a look at the original [DCGAN example](https://github.com/pytorch/examples/tree/master/dcgan) for more information about the remaining arguments.
To enable mixed precision training, we introduce the `--opt_level` argument.
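Amp also defines the `O2` ("almost FP16" mixed precision) and `O3` (pure FP16) optimization levels described in the API guide linked above; they can be selected the same way, for example:
```
$ python main_amp.py --opt_level O2
```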
from __future__ import print_function
import argparse
import os
import random
import torch
import torch.nn as nn
import torch.nn.parallel
import torch.backends.cudnn as cudnn
import torch.optim as optim
import torch.utils.data
import torchvision.datasets as dset
import torchvision.transforms as transforms
import torchvision.utils as vutils
try:
from apex import amp
except ImportError:
raise ImportError("Please install apex from https://www.github.com/nvidia/apex to run this example.")
parser = argparse.ArgumentParser()
parser.add_argument('--dataset', default='cifar10', help='cifar10 | lsun | mnist | imagenet | folder | lfw | fake')
parser.add_argument('--dataroot', default='./', help='path to dataset')
parser.add_argument('--workers', type=int, help='number of data loading workers', default=2)
parser.add_argument('--batchSize', type=int, default=64, help='input batch size')
parser.add_argument('--imageSize', type=int, default=64, help='the height / width of the input image to network')
parser.add_argument('--nz', type=int, default=100, help='size of the latent z vector')
parser.add_argument('--ngf', type=int, default=64)
parser.add_argument('--ndf', type=int, default=64)
parser.add_argument('--niter', type=int, default=25, help='number of epochs to train for')
parser.add_argument('--lr', type=float, default=0.0002, help='learning rate, default=0.0002')
parser.add_argument('--beta1', type=float, default=0.5, help='beta1 for adam. default=0.5')
parser.add_argument('--ngpu', type=int, default=1, help='number of GPUs to use')
parser.add_argument('--netG', default='', help="path to netG (to continue training)")
parser.add_argument('--netD', default='', help="path to netD (to continue training)")
parser.add_argument('--outf', default='.', help='folder to output images and model checkpoints')
parser.add_argument('--manualSeed', type=int, help='manual seed')
parser.add_argument('--classes', default='bedroom', help='comma separated list of classes for the lsun data set')
parser.add_argument('--opt_level', default='O1', help='amp opt_level, default="O1"')
opt = parser.parse_args()
print(opt)
try:
os.makedirs(opt.outf)
except OSError:
pass
if opt.manualSeed is None:
opt.manualSeed = 2809
print("Random Seed: ", opt.manualSeed)
random.seed(opt.manualSeed)
torch.manual_seed(opt.manualSeed)
cudnn.benchmark = True
if opt.dataset in ['imagenet', 'folder', 'lfw']:
# folder dataset
dataset = dset.ImageFolder(root=opt.dataroot,
transform=transforms.Compose([
transforms.Resize(opt.imageSize),
transforms.CenterCrop(opt.imageSize),
transforms.ToTensor(),
transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
]))
nc=3
elif opt.dataset == 'lsun':
classes = [ c + '_train' for c in opt.classes.split(',')]
dataset = dset.LSUN(root=opt.dataroot, classes=classes,
transform=transforms.Compose([
transforms.Resize(opt.imageSize),
transforms.CenterCrop(opt.imageSize),
transforms.ToTensor(),
transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
]))
nc=3
elif opt.dataset == 'cifar10':
dataset = dset.CIFAR10(root=opt.dataroot, download=True,
transform=transforms.Compose([
transforms.Resize(opt.imageSize),
transforms.ToTensor(),
transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
]))
nc=3
elif opt.dataset == 'mnist':
dataset = dset.MNIST(root=opt.dataroot, download=True,
transform=transforms.Compose([
transforms.Resize(opt.imageSize),
transforms.ToTensor(),
transforms.Normalize((0.5,), (0.5,)),
]))
nc=1
elif opt.dataset == 'fake':
dataset = dset.FakeData(image_size=(3, opt.imageSize, opt.imageSize),
transform=transforms.ToTensor())
nc=3
assert dataset
dataloader = torch.utils.data.DataLoader(dataset, batch_size=opt.batchSize,
shuffle=True, num_workers=int(opt.workers))
device = torch.device("cuda:0")
ngpu = int(opt.ngpu)
nz = int(opt.nz)
ngf = int(opt.ngf)
ndf = int(opt.ndf)
# custom weights initialization called on netG and netD
def weights_init(m):
classname = m.__class__.__name__
if classname.find('Conv') != -1:
m.weight.data.normal_(0.0, 0.02)
elif classname.find('BatchNorm') != -1:
m.weight.data.normal_(1.0, 0.02)
m.bias.data.fill_(0)
class Generator(nn.Module):
def __init__(self, ngpu):
super(Generator, self).__init__()
self.ngpu = ngpu
self.main = nn.Sequential(
# input is Z, going into a convolution
nn.ConvTranspose2d( nz, ngf * 8, 4, 1, 0, bias=False),
nn.BatchNorm2d(ngf * 8),
nn.ReLU(True),
# state size. (ngf*8) x 4 x 4
nn.ConvTranspose2d(ngf * 8, ngf * 4, 4, 2, 1, bias=False),
nn.BatchNorm2d(ngf * 4),
nn.ReLU(True),
# state size. (ngf*4) x 8 x 8
nn.ConvTranspose2d(ngf * 4, ngf * 2, 4, 2, 1, bias=False),
nn.BatchNorm2d(ngf * 2),
nn.ReLU(True),
# state size. (ngf*2) x 16 x 16
nn.ConvTranspose2d(ngf * 2, ngf, 4, 2, 1, bias=False),
nn.BatchNorm2d(ngf),
nn.ReLU(True),
# state size. (ngf) x 32 x 32
nn.ConvTranspose2d( ngf, nc, 4, 2, 1, bias=False),
nn.Tanh()
# state size. (nc) x 64 x 64
)
def forward(self, input):
if input.is_cuda and self.ngpu > 1:
output = nn.parallel.data_parallel(self.main, input, range(self.ngpu))
else:
output = self.main(input)
return output
netG = Generator(ngpu).to(device)
netG.apply(weights_init)
if opt.netG != '':
netG.load_state_dict(torch.load(opt.netG))
print(netG)
class Discriminator(nn.Module):
def __init__(self, ngpu):
super(Discriminator, self).__init__()
self.ngpu = ngpu
self.main = nn.Sequential(
# input is (nc) x 64 x 64
nn.Conv2d(nc, ndf, 4, 2, 1, bias=False),
nn.LeakyReLU(0.2, inplace=True),
# state size. (ndf) x 32 x 32
nn.Conv2d(ndf, ndf * 2, 4, 2, 1, bias=False),
nn.BatchNorm2d(ndf * 2),
nn.LeakyReLU(0.2, inplace=True),
# state size. (ndf*2) x 16 x 16
nn.Conv2d(ndf * 2, ndf * 4, 4, 2, 1, bias=False),
nn.BatchNorm2d(ndf * 4),
nn.LeakyReLU(0.2, inplace=True),
# state size. (ndf*4) x 8 x 8
nn.Conv2d(ndf * 4, ndf * 8, 4, 2, 1, bias=False),
nn.BatchNorm2d(ndf * 8),
nn.LeakyReLU(0.2, inplace=True),
# state size. (ndf*8) x 4 x 4
nn.Conv2d(ndf * 8, 1, 4, 1, 0, bias=False),
)
def forward(self, input):
if input.is_cuda and self.ngpu > 1:
output = nn.parallel.data_parallel(self.main, input, range(self.ngpu))
else:
output = self.main(input)
return output.view(-1, 1).squeeze(1)
netD = Discriminator(ngpu).to(device)
netD.apply(weights_init)
if opt.netD != '':
netD.load_state_dict(torch.load(opt.netD))
print(netD)
criterion = nn.BCEWithLogitsLoss()
fixed_noise = torch.randn(opt.batchSize, nz, 1, 1, device=device)
real_label = 1
fake_label = 0
# setup optimizer
optimizerD = optim.Adam(netD.parameters(), lr=opt.lr, betas=(opt.beta1, 0.999))
optimizerG = optim.Adam(netG.parameters(), lr=opt.lr, betas=(opt.beta1, 0.999))
[netD, netG], [optimizerD, optimizerG] = amp.initialize(
[netD, netG], [optimizerD, optimizerG], opt_level=opt.opt_level, num_losses=3)
for epoch in range(opt.niter):
for i, data in enumerate(dataloader, 0):
############################
# (1) Update D network: maximize log(D(x)) + log(1 - D(G(z)))
###########################
# train with real
netD.zero_grad()
real_cpu = data[0].to(device)
batch_size = real_cpu.size(0)
label = torch.full((batch_size,), real_label, device=device)
output = netD(real_cpu)
errD_real = criterion(output, label)
with amp.scale_loss(errD_real, optimizerD, loss_id=0) as errD_real_scaled:
errD_real_scaled.backward()
D_x = output.mean().item()
# train with fake
noise = torch.randn(batch_size, nz, 1, 1, device=device)
fake = netG(noise)
label.fill_(fake_label)
output = netD(fake.detach())
errD_fake = criterion(output, label)
with amp.scale_loss(errD_fake, optimizerD, loss_id=1) as errD_fake_scaled:
errD_fake_scaled.backward()
D_G_z1 = output.mean().item()
errD = errD_real + errD_fake
optimizerD.step()
############################
# (2) Update G network: maximize log(D(G(z)))
###########################
netG.zero_grad()
label.fill_(real_label) # fake labels are real for generator cost
output = netD(fake)
errG = criterion(output, label)
with amp.scale_loss(errG, optimizerG, loss_id=2) as errG_scaled:
errG_scaled.backward()
D_G_z2 = output.mean().item()
optimizerG.step()
print('[%d/%d][%d/%d] Loss_D: %.4f Loss_G: %.4f D(x): %.4f D(G(z)): %.4f / %.4f'
% (epoch, opt.niter, i, len(dataloader),
errD.item(), errG.item(), D_x, D_G_z1, D_G_z2))
if i % 100 == 0:
vutils.save_image(real_cpu,
'%s/real_samples.png' % opt.outf,
normalize=True)
fake = netG(fixed_noise)
vutils.save_image(fake.detach(),
'%s/amp_fake_samples_epoch_%03d.png' % (opt.outf, epoch),
normalize=True)
# do checkpointing
torch.save(netG.state_dict(), '%s/netG_epoch_%d.pth' % (opt.outf, epoch))
torch.save(netD.state_dict(), '%s/netD_epoch_%d.pth' % (opt.outf, epoch))
cxxfilt>=0.2.0
tqdm>=4.28.1
numpy>=1.15.3
PyYAML>=5.1
pytest>=3.5.1
@@ -2,7 +2,9 @@ import torch
from setuptools import setup, find_packages
import subprocess
from pip._internal import main as pipmain
import sys
import warnings
if not torch.cuda.is_available():
print("\nWarning: Torch did not find available GPUs on this system.\n",
@@ -19,6 +21,17 @@ if TORCH_MAJOR == 0 and TORCH_MINOR < 4:
cmdclass = {}
ext_modules = []
if "--pyprof" in sys.argv:
with open('requirements.txt') as f:
required_packages = f.read().splitlines()
pipmain(["install"] + required_packages)
try:
sys.argv.remove("--pyprof")
except:
pass
else:
warnings.warn("Option --pyprof not specified. Not installing PyProf dependencies!")
if "--cpp_ext" in sys.argv or "--cuda_ext" in sys.argv:
if TORCH_MAJOR == 0:
raise RuntimeError("--cpp_ext requires Pytorch 1.0 or later, "
@@ -4,38 +4,39 @@ import random
import torch
import apex
from torch.autograd import Variable
class TestFusedLayerNorm(unittest.TestCase):
def setUp(self):
self.module = apex.normalization.FusedLayerNorm(normalized_shape=[32, 64], elementwise_affine=False)
self.input_ = torch.randn(16, 32, 64)
# bias and weight are set to 0 and 1 respectively, so no need to copy parameters from cpu module to the gpu one
self.module_cpu_ = apex.normalization.FusedLayerNorm(normalized_shape=[32, 16], elementwise_affine=False).cpu()
self.module_cuda_ = apex.normalization.FusedLayerNorm(normalized_shape=[32, 16], elementwise_affine=False).cuda()
def _test_same_output(self, batch_size):
torch.cuda.manual_seed(42)
def forward_cpu(self, input_):
self.module.cpu()
return self.module(input_.cpu())
def forward_cuda(self, input_):
self.module.cuda()
return self.module(input_.cuda())
def test_forward_cuda(self):
out_ = self.forward_cuda(self.input_)
assert out_.is_cuda == True
def test_forward_cpu(self):
out_ = self.forward_cpu(self.input_)
assert out_.is_cuda == False
def test_same_output(self):
out_cpu = self.forward_cpu(self.input_)
out_cuda = self.forward_cuda(self.input_)
torch.testing.assert_allclose(out_cpu, out_cuda.cpu())
self.input_ = torch.randn((batch_size, *self.module_cpu_.normalized_shape), device="cpu").requires_grad_(True)
self.input_cuda_ = self.input_.cuda().detach().requires_grad_(True)
out_cpu_ = self.module_cpu_(self.input_)
gO = torch.rand_like(out_cpu_)
out_cpu_.backward(gO)
out_cuda_ = self.module_cuda_(self.input_cuda_)
gO = gO.cuda()
out_cuda_.backward(gO)
assert out_cpu_.is_cuda == False
assert out_cuda_.is_cuda == True
torch.testing.assert_allclose(out_cpu_, out_cuda_.cpu())
torch.testing.assert_allclose(self.input_.grad, self.input_cuda_.grad.cpu())
def test_layer_norm(self):
self._test_same_output(16)
def test_large_batch(self):
self._test_same_output(65536)
class TestFusedLayerNormElemWise(TestFusedLayerNorm):
def setUp(self):
self.module = apex.normalization.FusedLayerNorm(normalized_shape=[32, 64], elementwise_affine=True)
self.input_ = torch.randn(16, 32, 64)
torch.cuda.manual_seed(42)
\ No newline at end of file
self.module_cpu_ = apex.normalization.FusedLayerNorm(normalized_shape=[32, 16], elementwise_affine=True).cpu()
self.module_cuda_ = apex.normalization.FusedLayerNorm(normalized_shape=[32, 16], elementwise_affine=True).cuda()
import test_pyprof_nvtx.TestPyProfNvtx as TestPyProfNvtx
import inspect
import os
import torch
import torch.nn.functional as F
import unittest
from apex import pyprof
pyprof.nvtx.init()
# TODO: add tests for:
# F.bilinear, F.l1_loss, F.multilabel_soft_margin_loss, F.multi_margin_loss
class TestPyProfNvtx(unittest.TestCase):
def __init__(self, testName, dtype=torch.float16):
super().__init__(testName)
self.dtype = dtype
def setUp(self):
pass
def tearDown(self):
pass
def test_conv1d(self):
# Data and weight tensors
tensor1d_in_conv = torch.randn(32, 3, 224, device='cuda', dtype=self.dtype)
tensor1d_in_conv_grouped = torch.randn(32, 6, 224, device='cuda', dtype=self.dtype)
conv1d_filter = torch.randn(16, 3, 3, device='cuda', dtype=self.dtype)
conv1d_bias = torch.ones(16, device='cuda', dtype=self.dtype)
# Vanilla conv1d
conv1d_out_vanilla = F.conv1d(tensor1d_in_conv, conv1d_filter)
# conv1d with bias
conv1d_out_with_bias = F.conv1d(tensor1d_in_conv, conv1d_filter, bias=conv1d_bias)
# conv1d - stride > 1
conv1d_out_strided = F.conv1d(tensor1d_in_conv, conv1d_filter, stride=2)
# conv1d - dilation > 1
conv1d_out_dilated = F.conv1d(tensor1d_in_conv, conv1d_filter, dilation=2)
# conv1d - groups > 1
conv1d_out_grouped = F.conv1d(tensor1d_in_conv_grouped, conv1d_filter, groups=2)
# conv1d - padding with zeros
conv1d_out_padding_zeros = F.conv1d(tensor1d_in_conv, conv1d_filter, padding=6)
def test_conv2d(self):
# Data and weight tensors
tensor2d_in_conv = torch.randn(32, 3, 224, 224, device='cuda', dtype=self.dtype)
tensor2d_in_conv_grouped = torch.randn(32, 6, 224, 224, device='cuda', dtype=self.dtype)
conv2d_filter = torch.randn(16, 3, 3, 3, device='cuda', dtype=self.dtype)
conv2d_bias = torch.ones(16, device='cuda', dtype=self.dtype)
# Vanilla conv2d
conv2d_out_vanilla = F.conv2d(tensor2d_in_conv, conv2d_filter)
# conv2d with bias
conv2d_with_bias = F.conv2d(tensor2d_in_conv, conv2d_filter, bias=conv2d_bias)
# conv2d - stride > 1
conv2d_out_strided = F.conv2d(tensor2d_in_conv, conv2d_filter, stride=2)
# conv2d - dilation > 1
conv2d_out_dilated = F.conv2d(tensor2d_in_conv, conv2d_filter, dilation=2)
# conv2d - groups > 1
conv2d_out_grouped = F.conv2d(tensor2d_in_conv_grouped, conv2d_filter, groups=2)
# conv2d - padding with zeros
conv2d_out_padding_zeros = F.conv2d(tensor2d_in_conv, conv2d_filter, padding=6)
def test_conv3d(self):
# Data and weight tensors
tensor3d_in_conv = torch.randn(32, 3, 16, 224, 224, device='cuda', dtype=self.dtype)
tensor3d_in_conv_grouped = torch.randn(32, 6, 16, 224, 224, device='cuda', dtype=self.dtype)
conv3d_filter = torch.randn(16, 3, 3, 3, 3, device='cuda', dtype=self.dtype)
conv3d_bias = torch.ones(16, device='cuda', dtype=self.dtype)
# Vanilla conv3d
conv3d_out_vanilla = F.conv3d(tensor3d_in_conv, conv3d_filter)
# conv3d - stride > 1
conv3d_out_strided = F.conv3d(tensor3d_in_conv, conv3d_filter, stride=2)
# conv3d - dilation > 1
conv3d_out_dilated = F.conv3d(tensor3d_in_conv, conv3d_filter, dilation=2)
# conv3d - groups > 1
conv3d_out_grouped = F.conv3d(tensor3d_in_conv_grouped, conv3d_filter, groups=2)
# conv3d - padding with zeros
conv3d_out_padding_zeros = F.conv3d(tensor3d_in_conv, conv3d_filter, padding=6)
def test_conv_transpose1d(self):
# Data and weight tensors
conv_transpose1d_tensor = torch.randn(64, 16, 64, device='cuda', dtype=self.dtype)
conv_transpose1d_filter = torch.randn(16, 32, 3, device='cuda', dtype=self.dtype)
conv_transpose1d_bias = torch.randn(32, device='cuda', dtype=self.dtype)
# Conv transpose runs
conv_transpose1d_out = F.conv_transpose1d(conv_transpose1d_tensor, conv_transpose1d_filter)
conv_transpose1d_out_biased = F.conv_transpose1d(conv_transpose1d_tensor, conv_transpose1d_filter, bias=conv_transpose1d_bias)
conv_transpose1d_out_strided = F.conv_transpose1d(conv_transpose1d_tensor, conv_transpose1d_filter, stride=2)
conv_transpose1d_out_padded = F.conv_transpose1d(conv_transpose1d_tensor, conv_transpose1d_filter, padding=3)
conv_transpose1d_out2_padded = F.conv_transpose1d(conv_transpose1d_tensor, conv_transpose1d_filter, output_padding=2, dilation=3)
conv_transpose1d_out_grouped = F.conv_transpose1d(conv_transpose1d_tensor, conv_transpose1d_filter, groups=2)
conv_transpose1d_out_dilated = F.conv_transpose1d(conv_transpose1d_tensor, conv_transpose1d_filter, dilation=2)
def test_conv_transpose2d(self):
# Data and weight tensors
conv_transpose2d_tensor = torch.randn(64, 8, 5, 5, device='cuda', dtype=self.dtype)
conv_transpose2d_filter = torch.randn(8, 16, 3, 3, device='cuda', dtype=self.dtype)
conv_transpose2d_bias = torch.randn(16, device='cuda', dtype=self.dtype)
# Conv transpose runs
conv_transpose2d_out = F.conv_transpose2d(conv_transpose2d_tensor, conv_transpose2d_filter)
conv_transpose2d_out_biased = F.conv_transpose2d(conv_transpose2d_tensor, conv_transpose2d_filter, bias=conv_transpose2d_bias)
conv_transpose2d_out_strided = F.conv_transpose2d(conv_transpose2d_tensor, conv_transpose2d_filter, stride=2)
conv_transpose2d_out_padded = F.conv_transpose2d(conv_transpose2d_tensor, conv_transpose2d_filter, padding=3)
conv_transpose2d_out2_padded = F.conv_transpose2d(conv_transpose2d_tensor, conv_transpose2d_filter, output_padding=2, dilation=3)
conv_transpose2d_out_grouped = F.conv_transpose2d(conv_transpose2d_tensor, conv_transpose2d_filter, groups=2)
conv_transpose2d_out_dilated = F.conv_transpose2d(conv_transpose2d_tensor, conv_transpose2d_filter, dilation=2)
def test_conv_transpose3d(self):
# Data and weight tensors
conv_transpose3d_tensor = torch.randn(20, 16, 50, 10, 20, device='cuda', dtype=self.dtype)
conv_transpose3d_filter = torch.randn(16, 33, 3, 3, 3, device='cuda', dtype=self.dtype)
conv_transpose3d_bias = torch.randn(33, device='cuda', dtype=self.dtype)
# Conv transpose runs
conv_transpose3d_out = F.conv_transpose3d(conv_transpose3d_tensor, conv_transpose3d_filter)
conv_transpose3d_out_biased = F.conv_transpose3d(conv_transpose3d_tensor, conv_transpose3d_filter, bias=conv_transpose3d_bias)
conv_transpose3d_out_strided = F.conv_transpose3d(conv_transpose3d_tensor, conv_transpose3d_filter, stride=2)
conv_transpose3d_out_padded = F.conv_transpose3d(conv_transpose3d_tensor, conv_transpose3d_filter, padding=3)
conv_transpose3d_out2_padded = F.conv_transpose3d(conv_transpose3d_tensor, conv_transpose3d_filter, output_padding=2, dilation=3)
conv_transpose3d_out_grouped = F.conv_transpose3d(conv_transpose3d_tensor, conv_transpose3d_filter, groups=2)
conv_transpose3d_out_dilated = F.conv_transpose3d(conv_transpose3d_tensor, conv_transpose3d_filter, dilation=2)
def test_unfold(self):
inp = torch.randn(1, 3, 32, 32, device='cuda', dtype=self.dtype)
kernel_size = (4, 5)
inp_unf_dilated = F.unfold(inp, kernel_size, dilation=2)
inp_unf_padded = F.unfold(inp, kernel_size, padding=2)
inp_unf_strided = F.unfold(inp, kernel_size, stride=2)
def test_fold(self):
inp = torch.randn(3, 20, 20, device='cuda', dtype=self.dtype)
inp_folded = F.fold(inp, (4, 5), (1, 1))
def test_avg_pool1d(self):
inp = torch.randn(1, 1, 28, device='cuda', dtype=self.dtype)
out = F.avg_pool1d(inp, kernel_size=5, stride=2, padding=2, ceil_mode=True, count_include_pad=False)
def test_avg_pool2d(self):
inp = torch.randn(1, 3, 224, 224, device='cuda', dtype=self.dtype)
out = F.avg_pool2d(inp, kernel_size=5, stride=2, padding=2, ceil_mode=True, count_include_pad=False)
def test_avg_pool3d(self):
inp = torch.randn(1, 3, 16, 224, 224, device='cuda', dtype=self.dtype)
out = F.avg_pool3d(inp, kernel_size=5, stride=2, padding=2, ceil_mode=True, count_include_pad=False)
def test_adaptive_avg_pool1d(self):
inp = torch.randn(1, 1, 28, device='cuda', dtype=self.dtype)
out = F.adaptive_avg_pool1d(inp, output_size=5)
def test_adaptive_avg_pool2d(self):
inp = torch.randn(1, 16, 32, 32, device='cuda', dtype=self.dtype)
out = F.adaptive_avg_pool2d(inp, output_size=5)
def test_adaptive_avg_pool3d(self):
inp = torch.randn(1, 16, 16, 32, 32, device='cuda', dtype=self.dtype)
out = F.adaptive_avg_pool3d(inp, output_size=5)
def test_max_pool1d(self):
inp = torch.randn(1, 16, 32, device='cuda', dtype=self.dtype)
out = F.max_pool1d(inp, kernel_size=5, stride=2, padding=2, return_indices=True, ceil_mode=True)
def test_max_pool2d(self):
inp = torch.randn(1, 16, 32, 32, device='cuda', dtype=self.dtype)
out = F.max_pool2d(inp, kernel_size=5, stride=2, padding=2, return_indices=True, ceil_mode=True)
def test_max_pool3d(self):
inp = torch.randn(1, 16, 16, 32, 32, device='cuda', dtype=self.dtype)
out = F.max_pool3d(inp, kernel_size=5, stride=2, padding=2, return_indices=True, ceil_mode=True)
def test_adaptive_max_pool1d(self):
inp = torch.randn(1, 16, 28, device='cuda', dtype=self.dtype)
out = F.adaptive_max_pool1d(inp, output_size=5, return_indices=True)
def test_adaptive_max_pool2d(self):
inp = torch.randn(1, 16, 32, 32, device='cuda', dtype=self.dtype)
out = F.adaptive_max_pool2d(inp, output_size=5, return_indices=True)
def test_adaptive_max_pool3d(self):
inp = torch.randn(1, 16, 16, 32, 32, device='cuda', dtype=self.dtype)
out = F.adaptive_max_pool3d(inp, output_size=5, return_indices=True)
def test_max_unpool1d(self):
inp = torch.randn(1, 16, 32, device='cuda', dtype=self.dtype)
output, indices = F.max_pool1d(inp, kernel_size=5, stride=2, padding=2, return_indices=True, ceil_mode=True)
output = F.max_unpool1d(output, indices, kernel_size=2, stride=2, padding=2)
def test_max_unpool2d(self):
inp = torch.randn(1, 16, 32, 32, device='cuda', dtype=self.dtype)
output, indices = F.max_pool2d(inp, kernel_size=5, stride=2, padding=2, return_indices=True, ceil_mode=True)
output = F.max_unpool2d(output, indices, kernel_size=2, stride=2, padding=2)
def test_max_unpool3d(self):
inp = torch.randn(1, 16, 8, 32, 32, device='cuda', dtype=self.dtype)
output, indices = F.max_pool3d(inp, kernel_size=5, stride=2, padding=2, return_indices=True, ceil_mode=True)
output = F.max_unpool3d(output, indices, kernel_size=2, stride=2, padding=2)
def test_lp_pool1d(self):
inp = torch.randn(1, 32, 64, device='cuda', dtype=self.dtype)
output = F.lp_pool1d(inp, 2, 3, stride=2, ceil_mode=True)
def test_lp_pool2d(self):
#torch.nn.LPPool2d(norm_type, kernel_size, stride=None, ceil_mode=False)
inp = torch.randn(1, 32, 64, 64, device='cuda', dtype=self.dtype)
output = F.lp_pool2d(inp, 2, 3, stride=2, ceil_mode=True)
def test_threshold(self):
inp = torch.randn(1, 8, 32, 32, device='cuda', dtype=self.dtype)
output = F.threshold(inp, 6, 6, inplace=False)
def test_threshold_(self):
inp = torch.randn(1, 8, 32, 32, device='cuda', dtype=self.dtype)
output = F.threshold_(inp, 6, 6)
def test_relu(self):
inp = torch.randn(1, 3, 32, 32, device='cuda', dtype=self.dtype)
output = F.relu(inp, inplace=False)
def test_relu_(self):
inp = torch.randn(1, 3, 32, 32, device='cuda', dtype=self.dtype)
output = F.relu_(inp)
def test_hardtanh(self):
inp = torch.randn(1, 3, 32, 32, device='cuda', dtype=self.dtype)
output = F.hardtanh(inp, min_val=-1., max_val=1., inplace=False)
def test_hardtanh_(self):
inp = torch.randn(1, 3, 32, 32, device='cuda', dtype=self.dtype)
output = F.hardtanh_(inp, min_val=-1., max_val=1.)
def test_relu6(self):
inp = torch.randn(1, 3, 32, 32, device='cuda', dtype=self.dtype)
output = F.relu6(inp, inplace=False)
def test_elu(self):
inp = torch.randn(1, 3, 32, 32, device='cuda', dtype=self.dtype)
output = F.elu(inp, alpha=1.0, inplace=False)
def test_elu_(self):
inp = torch.randn(1, 3, 32, 32, device='cuda', dtype=self.dtype)
output = F.elu_(inp, alpha=1.0)
def test_selu(self):
inp = torch.randn(1, 3, 32, 32, device='cuda', dtype=self.dtype)
output = F.selu(inp)
def test_celu(self):
inp = torch.randn(1, 3, 32, 32, device='cuda', dtype=self.dtype)
output = F.celu(inp, alpha=1.0, inplace=False)
def test_leaky_relu(self):
inp = torch.randn(1, 3, 32, 32, device='cuda', dtype=self.dtype)
output = F.leaky_relu(inp, negative_slope=0.01, inplace=False)
def test_leaky_relu_(self):
inp = torch.randn(1, 3, 32, 32, device='cuda', dtype=self.dtype)
output = F.leaky_relu_(inp, negative_slope=0.01)
def test_prelu(self):
inp = torch.randn(1, 3, 32, 32, device='cuda', dtype=self.dtype)
weight = torch.randn(1, device='cuda', dtype=self.dtype)
output = F.prelu(inp, weight)
def test_rrelu(self):
inp = torch.randn(1, 3, 32, 32, device='cuda', dtype=self.dtype)
output = F.rrelu(inp, lower=1./8, upper=1./3, training=False, inplace=False)
def test_rrelu_(self):
inp = torch.randn(1, 3, 32, 32, device='cuda', dtype=self.dtype)
output = F.rrelu(inp, lower=1./8, upper=1./3, training=False)
def test_glu(self):
inp = torch.randn(1, 3, 32, 32, device='cuda', dtype=self.dtype)
output = F.glu(inp, dim=-1)
def test_logsigmoid(self):
inp = torch.randn(1, 3, 32, 32, device='cuda', dtype=self.dtype)
output = F.logsigmoid(inp)
def test_hardshrink(self):
inp = torch.randn(1, 3, 32, 32, device='cuda', dtype=self.dtype)
output = F.hardshrink(inp, lambd=0.5)
def test_tanhshrink(self):
inp = torch.randn(1, 3, 32, 32, device='cuda', dtype=self.dtype)
output = F.tanhshrink(inp)
def test_softsign(self):
inp = torch.randn(1, 3, 32, 32, device='cuda', dtype=self.dtype)
output = F.softsign(inp)
def test_softplus(self):
inp = torch.randn(1, 3, 32, 32, device='cuda', dtype=self.dtype)
output = F.softplus(inp, beta=1, threshold=20)
def test_softmin(self):
inp = torch.randn(16, 1024, device='cuda', dtype=self.dtype)
output = F.softmin(inp, dim=1, _stacklevel=3, dtype=self.dtype)
def test_softmax(self):
inp = torch.randn(16, 1024, device='cuda', dtype=self.dtype)
output = F.softmax(inp, dim=1, _stacklevel=3, dtype=self.dtype)
def test_softshrink(self):
inp = torch.randn(1, 3, 32, 32, device='cuda', dtype=self.dtype)
output = F.softshrink(inp, lambd=0.5)
def test_gumbel_softmax(self):
inp = torch.randn(16, 1024, device='cuda', dtype=self.dtype)
output = F.gumbel_softmax(inp, tau=1, hard=False, eps=1e-10, dim=-1)
def test_log_softmax(self):
inp = torch.randn(16, 1024, device='cuda', dtype=self.dtype)
output = F.log_softmax(inp, dim=-1, _stacklevel=3)
def test_tanh(self):
inp = torch.randn(1, 3, 32, 32, device='cuda', dtype=self.dtype)
output = torch.tanh(inp)
def test_sigmoid(self):
inp = torch.randn(1, 3, 32, 32, device='cuda', dtype=self.dtype)
output = torch.sigmoid(inp)
def test_batch_norm(self):
inp = torch.randn(1, 3, 32, 32, device='cuda', dtype=self.dtype)
# running_mean, running_var
running_mean = torch.randn(3, device='cuda', dtype=self.dtype)
running_var = torch.randn(3, device='cuda', dtype=self.dtype)
output = F.batch_norm(inp, running_mean, running_var, weight=None, bias=None, training=False, momentum=0.1, eps=1e-05)
def test_instance_norm(self):
inp = torch.randn(1, 3, 32, 32, device='cuda', dtype=self.dtype)
running_mean = torch.randn(3, device='cuda', dtype=self.dtype)
running_var = torch.randn(3, device='cuda', dtype=self.dtype)
output = F.instance_norm(inp, running_mean=running_mean, running_var=running_var, weight=None, bias=None, use_input_stats=True, momentum=0.1, eps=1e-05)
def test_layer_norm(self):
inp = torch.randn(1, 3, 32, 32, device='cuda', dtype=self.dtype)
output = F.layer_norm(inp, inp.size()[1:], weight=None, bias=None, eps=1e-05)
def test_local_response_norm(self):
inp = torch.randn(16, 8, 64, 64, device='cuda', dtype=self.dtype)
output = F.local_response_norm(inp, 2, alpha=0.0001, beta=0.75, k=1.0)
def test_normalize(self):
inp = torch.randn(16, 8, 64, 64, device='cuda', dtype=self.dtype)
output = F.normalize(inp, p=2, dim=1, eps=1e-12, out=None)
def test_linear(self):
inp = torch.randn(32, 64, 128, device='cuda', dtype=self.dtype)
weight = torch.randn(256, 128, device='cuda', dtype=self.dtype)
output = F.linear(inp, weight, bias=None)
def test_dropout(self):
inp = torch.randn(16, 8, 64, 64, device='cuda', dtype=self.dtype)
output = F.dropout(inp, p=0.5, training=True, inplace=False)
def test_alpha_dropout(self):
inp = torch.randn(16, 8, 64, 64, device='cuda', dtype=self.dtype)
output = F.alpha_dropout(inp, p=0.5, training=True, inplace=False)
def test_dropout2d(self):
inp = torch.randn(16, 8, 64, 64, device='cuda', dtype=self.dtype)
output = F.dropout2d(inp, p=0.5, training=True, inplace=False)
def test_dropout3d(self):
inp = torch.randn(16, 8, 32, 64, 64, device='cuda', dtype=self.dtype)
output = F.dropout3d(inp, p=0.5, training=True, inplace=False)
def test_embedding(self):
pre_embed_dim = 1024
post_embed_dim = 32
inp = torch.randint(0, pre_embed_dim, (128, 16), device='cuda')
weight = torch.randn(pre_embed_dim, post_embed_dim, device='cuda', dtype=self.dtype)
output = F.embedding(inp, weight, padding_idx=None, max_norm=None, norm_type=2.0, scale_grad_by_freq=False, sparse=False)
def test_embedding_bag(self):
pre_embed_dim = 1024
post_embed_dim = 32
inp = torch.randint(0, pre_embed_dim, (128, 16), device='cuda')
weight = torch.randn(pre_embed_dim, post_embed_dim, device='cuda', dtype=self.dtype)
output = F.embedding_bag(inp, weight, offsets=None, max_norm=None, norm_type=2,
scale_grad_by_freq=False, mode='mean', sparse=False)
def test_one_hot(self):
num_classes = 10
inp = torch.randint(0, num_classes, (128, 16), device='cuda')
output = F.one_hot(inp, num_classes=10)
def test_pairwise_distance(self):
inp1 = torch.randn(1024, 128, device='cuda', dtype=self.dtype)
inp2 = torch.randn(1024, 128, device='cuda', dtype=self.dtype)
output = F.pairwise_distance(inp1, inp2, p=2.0, eps=1e-06, keepdim=False)
def test_cosine_similarity(self):
inp1 = torch.randn(1024, 128, device='cuda', dtype=self.dtype)
inp2 = torch.randn(1024, 128, device='cuda', dtype=self.dtype)
output = F.cosine_similarity(inp1, inp2, dim=1, eps=1e-8)
def test_pdist(self):
# pdist is not implemented for fp16
inp = torch.randn(128, 128, device='cuda', dtype=torch.float32)
output = F.pdist(inp, p=2)
def test_binary_cross_entropy(self):
# binary_cross_entropy is not implemented for fp16
inp = torch.randn(32, 128, device='cuda', dtype=torch.float32, requires_grad=True)
target = torch.randn(32, 128, device='cuda', dtype=torch.float32, requires_grad=False)
output = F.binary_cross_entropy(torch.sigmoid(inp), target)
def test_binary_cross_entropy_with_logits(self):
inp = torch.randn(32, 128, device='cuda', dtype=self.dtype, requires_grad=True)
target = torch.empty_like(inp).random_(2)
output = F.binary_cross_entropy_with_logits(inp, target)
def test_poisson_nll_loss(self):
inp = torch.randn(32, 128, device='cuda', dtype=self.dtype, requires_grad=True)
target = torch.randn(32, 128, device='cuda', dtype=self.dtype, requires_grad=False)
output = F.poisson_nll_loss(inp, target, log_input=True, full=False,
size_average=None, eps=1e-08, reduce=None, reduction='mean')
def test_cosine_embedding_loss(self):
inp1 = torch.randn(32, 128, device='cuda', dtype=self.dtype, requires_grad=True)
inp2 = torch.randn(32, 128, device='cuda', dtype=self.dtype, requires_grad=True)
target = torch.randn(32, device='cuda', dtype=self.dtype, requires_grad=False)
output = F.cosine_embedding_loss(inp1, inp2, target, margin=0,
size_average=None, reduce=None, reduction='mean')
def test_cross_entropy(self):
inp = torch.randn(32, 128, device='cuda', dtype=self.dtype, requires_grad=True)
target = torch.randint(0, 100, (32,), device='cuda', dtype=torch.long, requires_grad=False)
output = F.cross_entropy(inp, target, weight=None, size_average=None,
ignore_index=-100, reduce=None, reduction='mean')
def test_ctc_loss(self):
# force fp32 because _th_normal_ (used by the next line) is not supported for fp16
log_probs = torch.randn(50, 16, 20, device='cuda', dtype=torch.float32).log_softmax(2).detach().requires_grad_()
targets = torch.randint(1, 20, (16, 30), device='cuda', dtype=torch.long)
input_lengths = torch.full((16,), 50, dtype=torch.long)
target_lengths = torch.randint(10, 30, (16,), dtype=torch.long)
loss = F.ctc_loss(log_probs, targets, input_lengths, target_lengths)
def test_hinge_embedding_loss(self):
inp = torch.randn(128, 32, device='cuda', dtype=self.dtype)
target = torch.randint(0, 1, (32,), device='cuda') - 1
output = F.hinge_embedding_loss(inp, target, margin=1.0, size_average=None, reduce=None, reduction='mean')
def test_kl_div(self):
inp = torch.randn(32, 128, device='cuda', dtype=self.dtype, requires_grad=True)
target = torch.randn(32, 128, device='cuda', dtype=self.dtype, requires_grad=True)
output = F.kl_div(inp, target, size_average=None, reduce=None, reduction='batchmean')
def test_mse_loss(self):
inp = torch.randn(32, 128, device='cuda', dtype=self.dtype, requires_grad=True)
target = torch.randn(32, 128, device='cuda', dtype=self.dtype, requires_grad=True)
output = F.mse_loss(inp, target, size_average=None, reduce=None, reduction='mean')
def test_margin_ranking_loss(self):
inp1 = torch.randn(32, 128, device='cuda', dtype=self.dtype, requires_grad=True)
inp2 = torch.randn(32, 128, device='cuda', dtype=self.dtype, requires_grad=True)
target = (torch.randint(0, 1, (128,), device='cuda') - 1).type_as(inp1)
output = F.margin_ranking_loss(inp1, inp2, target, margin=0, size_average=None, reduce=None, reduction='mean')
def test_multilabel_margin_loss(self):
inp = torch.randn(1024, device='cuda', dtype=self.dtype, requires_grad=True)
target = torch.randint(0, 10, (1024,), dtype=torch.long, device='cuda')
output = F.multilabel_margin_loss(inp, target, size_average=None, reduce=None, reduction='mean')
def test_nll_loss(self):
inp = torch.randn(64, 128, device='cuda', dtype=self.dtype, requires_grad=True)
target = torch.randint(0, 10, (64,), device='cuda', dtype=torch.long)
output = F.nll_loss(inp, target, weight=None, size_average=None, ignore_index=-100, reduce=None, reduction='mean')
def test_smooth_l1_loss(self):
inp = torch.randn(32, 128, device='cuda', dtype=self.dtype, requires_grad=True)
target = torch.randn(32, 128, device='cuda', dtype=self.dtype, requires_grad=False)
output = F.smooth_l1_loss(inp, target, size_average=None, reduce=None, reduction='mean')
def test_soft_margin_loss(self):
inp = torch.randn(32, 128, device='cuda', dtype=self.dtype, requires_grad=True)
target = torch.randn(32, 128, device='cuda', dtype=self.dtype, requires_grad=False)
output = F.soft_margin_loss(inp, target, size_average=None, reduce=None, reduction='mean')
def test_triplet_margin_loss(self):
inp1 = torch.randn(32, 128, device='cuda', dtype=self.dtype, requires_grad=True)
inp2 = torch.randn(32, 128, device='cuda', dtype=self.dtype, requires_grad=True)
inp3 = torch.randn(32, 128, device='cuda', dtype=self.dtype, requires_grad=True)
output = F.triplet_margin_loss(inp1, inp2, inp3, margin=1.0, p=2,
eps=1e-06, swap=False, size_average=None, reduce=None, reduction='mean')
def test_pixel_shuffle(self):
inp = torch.randn(16, 8, 64, 64, device='cuda', dtype=self.dtype)
output = torch.nn.functional.pixel_shuffle(inp, 2)
def test_pad(self):
inp = torch.randn(16, 8, 64, 64, device='cuda', dtype=self.dtype)
pad = (3, 3)
output = F.pad(inp, pad, mode='constant', value=0)
def test_interpolate(self):
inp = torch.randn(16, 8, 64, 64, device='cuda', dtype=self.dtype)
output = F.interpolate(inp, size=None, scale_factor=2, mode='nearest', align_corners=None)
def test_grid_sample(self):
inp = torch.randn(16, 8, 64, 64, device='cuda', dtype=self.dtype)
grid = torch.randn(16, 32, 32, 2, device='cuda', dtype=self.dtype)
output = F.grid_sample(inp, grid, mode='bilinear', padding_mode='zeros')
def test_affine_grid(self):
theta = torch.randn(32, 2, 3, device='cuda', dtype=self.dtype)
size = (32, 8, 32, 32)
output = F.affine_grid(theta, size)
def run_tests(precision):
dummy = TestPyProfNvtx('test_affine_grid', None)
test_cases = list(filter(lambda x: 'test_' in x, map(lambda x: x[0], inspect.getmembers(dummy, predicate=inspect.ismethod))))
print("Running tests for {}".format(precision))
suite = unittest.TestSuite()
for test_case in test_cases:
suite.addTest(TestPyProfNvtx(test_case, precision))
unittest.TextTestRunner().run(suite)
if __name__ == '__main__':
run_tests(torch.float32)
run_tests(torch.float16)
import unittest
import sys
test_dirs = ["run_amp", "run_fp16util", "run_fused_layer_norm", "run_optimizers"]
test_dirs = ["run_amp", "run_fp16util", "run_optimizers", "run_fused_layer_norm", "run_pyprof_nvtx"]
runner = unittest.TextTestRunner(verbosity=2)