Remove `pyprof` and `reparameterization` (#1404)

* remove pyprof
* remove reparameterization
* remove pyprof test
* clean up

Remove `pyprof` and `reparameterization` (#1404)
* remove pyprof
* remove reparameterization
* remove pyprof test
* clean up
8a7a3325 · Masaki Kozuki · GitHub · cd499737 · cd499737 · cd499737
Unverified Commit 8a7a3325 authored Jul 06, 2022 by Masaki Kozuki Committed by GitHub Jul 06, 2022
13 changed files
--- a/apex/pyprof/prof/usage.py
+++ b/apex/pyprof/prof/usage.py
-import sys
-import argparse
-
-def parseArgs():
-	"""
-	Print usage and parse arguments.
-	"""
-
-	def check_cols(value):
-		valid = ["idx", "seq", "altseq", "tid", "layer", "trace", "dir", "sub", "mod", "op", "kernel", "params", "sil", "tc", "device", "stream", "grid", "block", "flops", "bytes"]
-		cols = value.split(",")
-		for col in cols:
-			if col not in valid:
-				raise argparse.ArgumentTypeError("{} is not a valid column name. Valid column names are {}.".format(col, ",".join(valid)))
-		return cols
-
-	def openFile(f):
-		try:
-			d = open(f, "r")
-			return d
-		except IOError:
-			print("Error opening file {}. Exiting.".format(f), file=sys.stderr)
-			sys.exit(1)
-
-	parser = argparse.ArgumentParser(prog=sys.argv[0], description="PyTorch Profiler", formatter_class=argparse.RawTextHelpFormatter)
-	parser.add_argument("file",
-		nargs='?',
-		type=str,
-		default=None,
-		help="Output of parse.py (Python dictionary).")
-
-	parser.add_argument("-c",
-		type=check_cols,
-		default="idx,dir,sub,mod,op,kernel,params,sil",
-		help='''Comma seperated names of columns to print.
-idx:      Index
-seq:      PyTorch Sequence Id
-altseq:   PyTorch Alternate Sequence Id
-tid:      Thread Id
-layer:    User annotated NVTX string (can be nested)
-trace:    Function Call Trace
-dir:      Direction
-sub:      Sub Sequence Id
-mod:      Module
-op:       Operattion
-kernel:   Kernel Name
-params:   Parameters
-sil:      Silicon Time (in ns)
-tc:       Tensor Core Usage
-device:   GPU Device Id
-stream:   Stream Id
-grid:     Grid Dimensions
-block:    Block Dimensions
-flops:    Floating point ops (FMA = 2 FLOPs)
-bytes:    Number of bytes in and out of DRAM
-e.g. -c idx,kernel,sil''')
-
-	group = parser.add_mutually_exclusive_group()
-	group.add_argument("--csv",
-		action="store_true",
-		default=False,
-		help="Print a CSV output.")
-	group.add_argument("-w",
-		type=int,
-		default=0,
-		help="Width of columnated output.")
-
-	args = parser.parse_args()
-	if args.file is None:
-		args.file = sys.stdin
-	else:
-		args.file = openFile(args.file)
-	return args
--- a/apex/pyprof/prof/utility.py
+++ b/apex/pyprof/prof/utility.py
-from functools import reduce
-
-class Utility(object):
-
-	@staticmethod
-	def numElems(shape):
-		assert (type(shape) == tuple)
-		return reduce(lambda x,y: x*y, shape, 1)
-
-	@staticmethod
-	def typeToBytes(t):
-		if (t in ["uint8", "int8", "byte", "char", "bool"]):
-			return 1
-		elif (t in ["float16", "half", "int16", "short"]):
-			return 2
-		elif (t in ["float32", "float", "int32", "int"]):
-			return 4
-		elif (t in ["int64", "long", "float64", "double"]):
-			return 8
-		assert False
-
-	@staticmethod
-	def typeToString(t):
-		if (t in ["uint8", "byte", "char",]):
-			return "uint8"
-		elif (t in ["int8",]):
-			return "int8"
-		elif (t in ["int16", "short",]):
-			return "int16"
-		elif (t in ["float16", "half"]):
-			return "fp16"
-		elif (t in ["float32", "float"]):
-			return "fp32"
-		elif (t in ["int32", "int",]):
-			return "int32"
-		elif (t in ["int64", "long"]):
-			return "int64"
-		elif (t in ["float64", "double",]):
-			return "fp64"
-		elif (t in ["bool",]):
-			return "bool"
-		assert False
-
-	@staticmethod
-	def hasNVTX(marker):
-		if type(marker) is str:
-			try:
-				marker = eval(marker)
-			except:
-				return False
-
-		if type(marker) is dict:
-			keys  = marker.keys()
-			return ("mod" in keys) and ("op" in keys) and ("args" in keys)
-		else:
-			return False
-
-	@staticmethod
-	def isscalar(t):
-		return (t in ["float", "int"])
--- a/apex/reparameterization/README.md
+++ b/apex/reparameterization/README.md
-Under construction...
--- a/apex/reparameterization/__init__.py
+++ b/apex/reparameterization/__init__.py
-import warnings
-warnings.warn("reparameterization will be removed by the end of June, 2022", FutureWarning)
-
-from .weight_norm import WeightNorm
-from .reparameterization import Reparameterization
-
-def apply_weight_norm(module, name='', dim=0, hook_child=True):
-    r"""
-    Applies weight normalization to a parameter in the given module.
-    If no parameter is provided, applies weight normalization to all
-    parameters in model (except 1-d vectors and scalars).
-
-    .. math::
-         \mathbf{w} = g \dfrac{\mathbf{v}}{\|\mathbf{v}\|}
-
-    Weight normalization is a reparameterization that decouples the magnitude
-    of a weight tensor from its direction. This replaces the parameter specified
-    by `name` (e.g. "weight") with two parameters: one specifying the magnitude
-    (e.g. "weight_g") and one specifying the direction (e.g. "weight_v").
-    Weight normalization is implemented via a hook that recomputes the weight
-    tensor from the magnitude and direction before every :meth:`~Module.forward`
-    call.
-
-    By default, with `dim=0`, the norm is computed independently per output
-    channel/plane. To compute a norm over the entire weight tensor, use
-    `dim=None`.
-
-    See https://arxiv.org/abs/1602.07868
-
-    Args:
-        module (nn.Module): containing module
-        name (str, optional): name of weight parameter
-        dim (int, optional): dimension over which to compute the norm
-        hook_child (boolean, optional): adds reparameterization hook to direct parent of the 
-            parameters. If False, it's added to `module` instead. Default: True
-
-    Returns:
-        The original module with the weight norm hook
-
-    Example::
-
-        >>> m = apply_weight_norm(nn.Linear(20, 40), name='weight')
-        Linear (20 -> 40)
-        >>> m.weight_g.size()
-        torch.Size([40, 1])
-        >>> m.weight_v.size()
-        torch.Size([40, 20])
-
-    """
-    return apply_reparameterization(module, reparameterization=WeightNorm, hook_child=hook_child,
-                                    name=name, dim=dim)
-
-def remove_weight_norm(module, name='', remove_all=False):
-    """
-    Removes the weight normalization reparameterization of a parameter from a module.
-    If no parameter is supplied then all weight norm parameterizations are removed.
-    Args:
-        module (nn.Module): containing module
-        name (str, optional): name of weight parameter
-    Example:
-        >>> m = apply_weight_norm(nn.Linear(20, 40))
-        >>> remove_weight_norm(m)
-    """
-    return remove_reparameterization(module, reparameterization=WeightNorm,
-                                    name=name, remove_all=remove_all)
-
-def apply_reparameterization(module, reparameterization=None, name='', dim=0, hook_child=True):
-    """
-    Applies a given weight reparameterization (such as weight normalization) to
-    a parameter in the given module. If no parameter is given, applies the reparameterization
-    to all parameters in model (except 1-d vectors and scalars).
-
-    Args:
-        module (nn.Module): containing module
-        reparameterization (Reparameterization): reparamaterization class to apply
-        name (str, optional): name of weight parameter
-        dim (int, optional): dimension over which to perform reparameterization op
-        hook_child (boolean, optional): adds reparameterization hook to direct parent of the 
-            parameters. If False, it's added to `module` instead. Default: True
-
-    Returns:
-        The original module with the reparameterization hook
-
-    Example::
-
-        >>> m = apply_reparameterization(nn.Linear(20, 40), WeightNorm)
-        Linear (20 -> 40)
-
-    """
-    assert reparameterization is not None
-    if name != '':
-        Reparameterization.apply(module, name, dim, reparameterization, hook_child)
-    else:
-        names = list(module.state_dict().keys())
-        for name in names:
-            apply_reparameterization(module, reparameterization, name, dim, hook_child)
-    return module
-
-def remove_reparameterization(module, reparameterization=Reparameterization,
-                                name='', remove_all=False):
-    """
-    Removes the given reparameterization of a parameter from a module.
-    If no parameter is supplied then all reparameterizations are removed.
-    Args:
-        module (nn.Module): containing module
-        reparameterization (Reparameterization): reparamaterization class to apply
-        name (str, optional): name of weight parameter
-        remove_all (bool, optional): if True, remove all reparamaterizations of given type. Default: False
-    Example:
-        >>> m = apply_reparameterization(nn.Linear(20, 40),WeightNorm)
-        >>> remove_reparameterization(m)
-    """
-    if name != '' or remove_all:
-        to_remove = []
-        for k, hook in module._forward_pre_hooks.items():
-            if isinstance(hook, reparameterization) and (hook.name == name or remove_all):
-                hook.remove(module)
-                to_remove.append(k)
-        if len(to_remove) > 0:
-            for k in to_remove:
-                del module._forward_pre_hooks[k]
-            return module
-        if not remove_all:
-            raise ValueError("reparameterization of '{}' not found in {}"
-                             .format(name, module))
-    else:
-        modules = [module]+[x for x in module.modules()]
-        for m in modules:
-            remove_reparameterization(m, reparameterization=reparameterization, remove_all=True)
-        return module
--- a/apex/reparameterization/reparameterization.py
+++ b/apex/reparameterization/reparameterization.py
-import torch
-from torch.nn.parameter import Parameter
-import sys
-class Reparameterization(object):
-    """
-    Class interface for performing weight reparameterizations
-    Arguments:
-        name (str): name of weight parameter
-        dim (int): dimension over which to compute the norm
-        module (nn.Module): parent module to which param `name` is registered to
-        retain_forward (bool, optional): if False deletes weight on call to 
-            module.backward. Used to avoid memory leaks with DataParallel Default: True
-    Attributes:
-        reparameterization_names (list, str): contains names of all parameters 
-            needed to compute reparameterization.
-        backward_hook_key (int): torch.utils.hooks.RemovableHandle.id for hook used in module backward pass.
-    """
-
-    def __init__(self, name, dim, module, retain_forward=True):
-        self.name = name
-        self.dim = dim
-        self.evaluated = False
-        self.retain_forward = retain_forward
-        self.reparameterization_names = []
-        self.backward_hook_key = None
-        self.module = module
-
-    def compute_weight(self, module=None, name=None):
-        """
-        Computes reparameterized weight value to assign value to module attribute
-        with name `name`.
-        See WeightNorm class for example.
-        Arguments:
-            module (nn.Module): module with weight we'd like to reparameterize
-        Returns:
-            w (Tensor): Tensor object containing value of reparameterized weight
-        """
-        raise NotImplementedError
-
-    def reparameterize(self, name, weight, dim):
-        """
-        Creates Parameters to be used for reparameterization and creates names that
-        for attributes for the module these Parameters will correspond to.
-        The parameters will be registered according to the names provided.
-        See WeightNorm class for example.
-        Arguments:
-            module (nn.Module): module with weight we'd like to reparameterize
-            name (str, optional): name of weight parameter
-            dim (int, optional): dimension over which to compute parameterization
-        Returns:
-            names (list, str): names of Parameters to be used for reparameterization
-            params (list, Parameter): Parameters to be used for reparameterization
-        """
-        raise NotImplementedError
-
-    @staticmethod
-    def apply(module, name, dim, reparameterization=None, hook_child=True):
-        """
-        Applies reparametrization to module's `name` parameter and modifies instance attributes as appropriate.
-        `hook_child` adds reparameterization hook to direct parent of the parameters. If False, it's added to `module` instead.
-        """
-        if reparameterization is None:
-            reparameterization = Reparameterization
-        module2use, name2use = Reparameterization.get_module_and_name(module, name)
-        # does not work on sparse
-        if name2use is None or isinstance(module2use, (torch.nn.Embedding, torch.nn.EmbeddingBag)):
-            return
-
-        if hook_child:
-            fn = reparameterization(name2use, dim, module2use)
-        else:
-            fn = reparameterization(name, dim, module)
-
-        weight = getattr(module2use, name2use)
-        if weight.dim() <= 1:
-            return
-
-        # remove weight from parameter list
-        del module2use._parameters[name2use]
-
-        # add parameters of reparameterization of parameter to module
-        names, params = fn.reparameterize(name2use, weight, dim)
-        for n, p in zip(names, params):
-            module2use.register_parameter(n, p)
-
-        # add parameters to reparameterization so they can be removed later
-        fn.reparameterization_names = names
-
-        setattr(module2use, name2use, None)
-
-        hook_module = module2use
-        if not hook_child:
-            hook_module = module
-        # recompute weight before every forward()
-        hook_module.register_forward_pre_hook(fn)
-
-        # remove weight during backward
-        handle = hook_module.register_backward_hook(fn.backward_hook)
-        # get hook key so we can delete it later
-        fn.backward_hook_key = handle.id
-
-        return fn
-
-    @staticmethod
-    def get_module_and_name(module, name):
-        """
-        recursively fetches (possible) child module and name of weight to be reparameterized
-        """
-        name2use = None
-        module2use = None
-        names = name.split('.')
-        if len(names) == 1 and names[0] != '':
-            name2use = names[0]
-            module2use = module
-        elif len(names) > 1:
-            module2use = module
-            name2use = names[0]
-            for i in range(len(names)-1):
-                module2use = getattr(module2use, name2use)
-                name2use = names[i+1]
-        return module2use, name2use
-
-    def get_params(self, module):
-        """gets params of reparameterization based on known attribute names"""
-        return [getattr(module, n) for n in self.reparameterization_names]
-
-    def remove(self, module):
-        """removes reparameterization and backward hook (does not remove forward hook)"""
-        module2use, name2use = Reparameterization.get_module_and_name(module, self.name)
-        for p in self.get_params(module2use):
-            p.requires_grad = False
-        weight = self.compute_weight(module2use, name2use)
-        delattr(module2use, name2use)
-        for n in self.reparameterization_names:
-            del module2use._parameters[n]
-        module2use.register_parameter(name2use, Parameter(weight.data))
-        del module._backward_hooks[self.backward_hook_key]
-
-    def __call__(self, module, inputs):
-        """callable hook for forward pass"""
-        module2use, name2use = Reparameterization.get_module_and_name(module, self.name)
-        _w = getattr(module2use, name2use)
-        if not self.evaluated or _w is None:
-            setattr(module2use, name2use, self.compute_weight(module2use, name2use))
-            self.evaluated = True
-
-    def backward_hook(self, module, grad_input, grad_output):
-        """callable hook for backward pass"""
-        module2use, name2use = Reparameterization.get_module_and_name(module, self.name)
-        wn = getattr(module2use, name2use)
-        self.evaluated = False
--- a/apex/reparameterization/weight_norm.py
+++ b/apex/reparameterization/weight_norm.py
-import torch
-from torch.nn.parameter import Parameter
-from ..fp16_utils import Fused_Weight_Norm
-import time
-
-from .reparameterization import Reparameterization
-
-def _norm(p, dim):
-    """Computes the norm over all dimensions except dim"""
-    if dim is None:
-        return p.norm()
-    elif dim == 0:
-        output_size = (p.size(0),) + (1,) * (p.dim() - 1)
-        return p.contiguous().view(p.size(0), -1).norm(dim=1).view(*output_size)
-    elif dim == p.dim() - 1:
-        output_size = (1,) * (p.dim() - 1) + (p.size(-1),)
-        return p.contiguous().view(-1, p.size(-1)).norm(dim=0).view(*output_size)
-    return _norm(p.transpose(0, dim), 0).transpose(0, dim)
-
-HALF_TYPES = (torch.cuda.HalfTensor, torch.HalfTensor)
-
-class WeightNorm(Reparameterization):
-    r"""
-    Weight normalization is a reparameterization that decouples the magnitude
-    of a weight tensor from its direction. This replaces the parameter specified
-    by `name` (e.g. "weight") with two parameters: one specifying the magnitude
-    (e.g. "weight_g") and one specifying the direction (e.g. "weight_v").
-    Weight normalization is implemented via a hook that recomputes the weight
-    tensor from the magnitude and direction before every :meth:`~Module.forward`
-    call.
-
-    .. math::
-         \mathbf{w} = g \dfrac{\mathbf{v}}{\|\mathbf{v}\|}
-
-    By default, with `dim=0`, the norm is computed independently per output
-    channel/plane. To compute a norm over the entire weight tensor, use
-    `dim=None`.
-    """
-    def compute_weight(self, module=None, name=None):
-        """
-        Computes weight normalized weight value to assign value to module attribute
-        with name `name`.
-        Arguments:
-            module (nn.Module): module with weight we'd like to reparameterize
-        Returns:
-            w (Tensor): Tensor object containing value of reparameterized weight
-        """
-        if module is None:
-            module = self.module
-        if name is None:
-            name = self.name
-        module, name = Reparameterization.get_module_and_name(module, name)
-        g = getattr(module, name + '_g')
-        v = getattr(module, name + '_v')
-
-        fused_weight_norm = Fused_Weight_Norm.apply
-        v = v.contiguous()
-        w = fused_weight_norm(v, g, self.dim)
-
-        return w
-
-    def reparameterize(self, name, weight, dim):
-        """
-        Creates Parameters v and gto be used for weight normalization
-        and creates names that for attributes for the module these Parameters
-        will correspond to. The parameters will be registered according to the names
-        provided.
-        Arguments:
-            module (nn.Module): module with weight we'd like to reparameterize
-            name (str, optional): name of weight parameter
-            dim (int, optional): dimension over which to compute parameterization
-        Returns:
-            names (list, str): names of Parameters to be used for reparameterization
-            params (list, Parameter): Parameters to be used for reparameterization
-        """
-        names = [name + '_g', name + '_v']
-        params = [Parameter(_norm(weight, dim).data), Parameter(weight.data)]
-        return names, params
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -44,7 +44,6 @@ Some other useful material, including GTC 2019 and Pytorch DevCon 2019 Slides, c
     :caption: Deprecated mixed precision API
     fp16_util

-..   reparameterization
 ..   RNN
   
 Indices and tables

--- a/setup.py
+++ b/setup.py
@@ -105,20 +105,6 @@ cmdclass = {}
 ext_modules = []

 extras = {}
-if "--pyprof" in sys.argv:
-    string = (
-        "\n\nPyprof has been moved to its own dedicated repository and will "
-        "soon be removed from Apex.  Please visit\n"
-        "https://github.com/NVIDIA/PyProf\n"
-        "for the latest version."
-    )
-    warnings.warn(string, DeprecationWarning)
-    with open("requirements.txt") as f:
-        required_packages = f.read().splitlines()
-        extras["pyprof"] = required_packages
-    sys.argv.remove("--pyprof")
-else:
-    warnings.warn("Option --pyprof not specified. Not installing PyProf dependencies!")

 if "--cpp_ext" in sys.argv or "--cuda_ext" in sys.argv:
    if TORCH_MAJOR == 0:

--- a/tests/L0/run_pyprof_data/__init__.py
+++ b/tests/L0/run_pyprof_data/__init__.py
--- a/tests/L0/run_pyprof_data/test_pyprof_data.py
+++ b/tests/L0/run_pyprof_data/test_pyprof_data.py
-import inspect
-import unittest
-
-from apex.pyprof.prof.data import Data
-from apex.pyprof.prof.prof import foo
-
-
-class TestPyProfData(unittest.TestCase):
-
-	def __init__(self, testName):
-		super().__init__(testName)
-
-	def setUp(self):
-		pass
-
-	def tearDown(self):
-		pass
-
-	def test_data(self):
-		kernels = [
-			{'kShortName': 'elementwise_kernel', 'kDuration': 2848, 'layer': [], 'trace': [], 'reprMarkers': [], 'marker': ["{'mod': 'Tensor', 'op': 'float', 'args': [{'name': '', 'type': 'tensor', 'shape': (18, 104, 160), 'dtype': 'bool'}]}"], 'seqMarker': ['to, seq = 60471'], 'seqId': [60471], 'subSeqId': 0, 'altSeqId': [], 'dir': 'fprop', 'mod': ['Tensor'], 'op': ['float'], 'tid': 1431533376, 'device': 0, 'stream': 7, 'grid': (585, 1, 1), 'block': (512, 1, 1), 'kLongName': 'void at::native::elementwise_kernel<512, 1, void at::native::gpu_kernel_impl<void at::native::copy_kernel_impl<float, bool>(at::TensorIterator&)::{lambda(bool)#1}>(at::TensorIterator&, void at::native::copy_kernel_impl<float, bool>(at::TensorIterator&)::{lambda(bool)#1} const&)::{lambda(int)#1}>(int, void at::native::gpu_kernel_impl<void at::native::copy_kernel_impl<float, bool>(at::TensorIterator&)::{lambda(bool)#1}>(at::TensorIterator&, void at::native::copy_kernel_impl<float, bool>(at::TensorIterator&)::{lambda(bool)#1} const&)::{lambda(int)#1})'},
-			{'kShortName': 'elementwise_kernel', 'kDuration': 201182, 'layer': [], 'trace': [], 'reprMarkers': [], 'marker': ["{'mod': 'Tensor', 'op': 'clone', 'args': [{'name': '', 'type': 'tensor', 'shape': (18, 4, 416, 640), 'dtype': 'float32'}]}"], 'seqMarker': ['clone, seq = 60161'], 'seqId': [60161], 'subSeqId': 0, 'altSeqId': [], 'dir': 'fprop', 'mod': ['Tensor'], 'op': ['clone'], 'tid': 1431533376, 'device': 0, 'stream': 7, 'grid': (37440, 1, 1), 'block': (128, 1, 1), 'kLongName': 'void at::native::elementwise_kernel<128, 4, void at::native::gpu_kernel_impl<void at::native::copy_kernel_impl<float, float>(at::TensorIterator&)::{lambda(float)#1}>(at::TensorIterator&, void at::native::copy_kernel_impl<float, float>(at::TensorIterator&)::{lambda(float)#1} const&)::{lambda(int)#2}>(int, void at::native::gpu_kernel_impl<void at::native::copy_kernel_impl<float, float>(at::TensorIterator&)::{lambda(float)#1}>(at::TensorIterator&, void at::native::copy_kernel_impl<float, float>(at::TensorIterator&)::{lambda(float)#1} const&)::{lambda(int)#2})'},
-		]
-
-		for k in kernels:
-			d = Data(k)
-			mod = k['mod']
-			op = k['op']
-			xx = foo(mod, op, d)
-			d.setParams(xx.params())
-
-
-def run_tests(test_name):
-	dummy = TestPyProfData(test_name)
-	test_cases = list(filter(lambda x: 'test_' in x, map(lambda x: x[0], inspect.getmembers(dummy, predicate=inspect.ismethod))))
-	print(f'Running tests for {test_name}')
-	suite = unittest.TestSuite()
-	for test_case in test_cases:
-		suite.addTest(TestPyProfData(test_case))
-	unittest.TextTestRunner().run(suite)
-
-if __name__ == '__main__':
-	run_tests('test_data')
--- a/tests/L0/run_pyprof_nvtx/__init__.py
+++ b/tests/L0/run_pyprof_nvtx/__init__.py
-import test_pyprof_nvtx.TestPyProfNvtx as TestPyProfNvtx
--- a/tests/L0/run_pyprof_nvtx/test_pyprof_nvtx.py
+++ b/tests/L0/run_pyprof_nvtx/test_pyprof_nvtx.py
-import inspect
-import os
-import torch
-import torch.nn.functional as F
-import unittest
-
-from apex import pyprof
-pyprof.nvtx.init()
-
-# TODO: add tests for:
-# F.bilinear, F.l1_loss, F.multilabel_soft_margin_loss, F.multi_margin_loss
-
-class TestPyProfNvtx(unittest.TestCase):
-
-    def __init__(self, testName, dtype=torch.float16):
-        super().__init__(testName) 
-        self.dtype = dtype
-
-    def setUp(self):
-        pass
-
-    def tearDown(self):
-        pass
-
-    def test_conv1d(self):
-        # Data and weight tensors
-        tensor1d_in_conv = torch.randn(32, 3, 224, device='cuda', dtype=self.dtype)
-        tensor1d_in_conv_grouped = torch.randn(32, 6, 224, device='cuda', dtype=self.dtype)
-        conv1d_filter = torch.randn(16, 3, 3, device='cuda', dtype=self.dtype)
-        conv1d_bias = torch.ones(16, device='cuda', dtype=self.dtype)
-        # Vanilla conv1d
-        conv1d_out_vanilla = F.conv1d(tensor1d_in_conv, conv1d_filter)
-        # conv1d with bias
-        conv1d_out_with_bias = F.conv1d(tensor1d_in_conv, conv1d_filter, bias=conv1d_bias)
-        # conv1d - stride > 1
-        conv1d_out_strided = F.conv1d(tensor1d_in_conv, conv1d_filter, stride=2)
-        # conv1d - dilation > 1
-        conv1d_out_dilated = F.conv1d(tensor1d_in_conv, conv1d_filter, dilation=2)
-        # conv1d - groups > 1
-        conv1d_out_grouped = F.conv1d(tensor1d_in_conv_grouped, conv1d_filter, groups=2)
-        # conv1d - padding with zeros
-        conv1d_out_padding_zeros = F.conv1d(tensor1d_in_conv, conv1d_filter, padding=6)
-    
-    def test_conv2d(self):
-        # Data and weight tensors
-        tensor2d_in_conv = torch.randn(32, 3, 224, 224, device='cuda', dtype=self.dtype)
-        tensor2d_in_conv_grouped = torch.randn(32, 6, 224, 224, device='cuda', dtype=self.dtype)
-        conv2d_filter = torch.randn(16, 3, 3, 3, device='cuda', dtype=self.dtype)
-        conv2d_bias = torch.ones(16, device='cuda', dtype=self.dtype)
-        # Vanilla conv2d
-        conv2d_out_vanilla = F.conv2d(tensor2d_in_conv, conv2d_filter)
-        # conv2d with bias
-        conv2d_with_bias = F.conv2d(tensor2d_in_conv, conv2d_filter, bias=conv2d_bias)
-        # conv2d - stride > 1
-        conv2d_out_strided = F.conv2d(tensor2d_in_conv, conv2d_filter, stride=2)
-        # conv2d - dilation > 1
-        conv2d_out_dilated = F.conv2d(tensor2d_in_conv, conv2d_filter, dilation=2)
-        # conv2d - groups > 1
-        conv2d_out_grouped = F.conv2d(tensor2d_in_conv_grouped, conv2d_filter, groups=2)
-        # conv2d - padding with zeros
-        conv2d_out_padding_zeros = F.conv2d(tensor2d_in_conv, conv2d_filter, padding=6)
-    
-    
-    def test_conv3d(self):
-        # Data and weight tensors
-        tensor3d_in_conv = torch.randn(32, 3, 16, 224, 224, device='cuda', dtype=self.dtype)
-        tensor3d_in_conv_grouped = torch.randn(32, 6, 16, 224, 224, device='cuda', dtype=self.dtype)
-        conv3d_filter = torch.randn(16, 3, 3, 3, 3, device='cuda', dtype=self.dtype)
-        conv3d_bias = torch.ones(16, device='cuda', dtype=self.dtype)
-        # Vanilla conv3d
-        conv3d_out_vanilla = F.conv3d(tensor3d_in_conv, conv3d_filter)
-        # conv3d - stride > 1
-        conv3d_out_strided = F.conv3d(tensor3d_in_conv, conv3d_filter, stride=2)
-        # conv3d - dilation > 1
-        conv3d_out_dilated = F.conv3d(tensor3d_in_conv, conv3d_filter, dilation=2)
-        # conv3d - groups > 1
-        conv3d_out_grouped = F.conv3d(tensor3d_in_conv_grouped, conv3d_filter, groups=2)
-        # conv3d - padding with zeros
-        conv3d_out_padding_zeros = F.conv3d(tensor3d_in_conv, conv3d_filter, padding=6)
-    
-    def test_conv_transpose1d(self):
-        # Data and weight tensors
-        conv_transpose1d_tensor = torch.randn(64, 16, 64, device='cuda', dtype=self.dtype)
-        conv_transpose1d_filter = torch.randn(16, 32, 3, device='cuda', dtype=self.dtype)
-        conv_transpose1d_bias = torch.randn(32, device='cuda', dtype=self.dtype)
-        # Conv transpose runs
-        conv_transpose1d_out = F.conv_transpose1d(conv_transpose1d_tensor, conv_transpose1d_filter)
-        conv_transpose1d_out_biased = F.conv_transpose1d(conv_transpose1d_tensor, conv_transpose1d_filter, bias=conv_transpose1d_bias)
-        conv_transpose1d_out_strided = F.conv_transpose1d(conv_transpose1d_tensor, conv_transpose1d_filter, stride=2)
-        conv_transpose1d_out_padded = F.conv_transpose1d(conv_transpose1d_tensor, conv_transpose1d_filter, padding=3)
-        conv_transpose1d_out2_padded = F.conv_transpose1d(conv_transpose1d_tensor, conv_transpose1d_filter, output_padding=2, dilation=3)
-        conv_transpose1d_out_grouped = F.conv_transpose1d(conv_transpose1d_tensor, conv_transpose1d_filter, groups=2)
-        conv_transpose1d_out_dilated = F.conv_transpose1d(conv_transpose1d_tensor, conv_transpose1d_filter, dilation=2)
-    
-    
-    def test_conv_transpose2d(self):
-        # Data and weight tensors
-        conv_transpose2d_tensor = torch.randn(64, 8, 5, 5, device='cuda', dtype=self.dtype)
-        conv_transpose2d_filter = torch.randn(8, 16, 3, 3, device='cuda', dtype=self.dtype)
-        conv_transpose2d_bias = torch.randn(16, device='cuda', dtype=self.dtype)
-        # Conv transpose runs
-        conv_transpose2d_out = F.conv_transpose2d(conv_transpose2d_tensor, conv_transpose2d_filter)
-        conv_transpose2d_out_biased = F.conv_transpose2d(conv_transpose2d_tensor, conv_transpose2d_filter, bias=conv_transpose2d_bias)
-        conv_transpose2d_out_strided = F.conv_transpose2d(conv_transpose2d_tensor, conv_transpose2d_filter, stride=2)
-        conv_transpose2d_out_padded = F.conv_transpose2d(conv_transpose2d_tensor, conv_transpose2d_filter, padding=3)
-        conv_transpose2d_out2_padded = F.conv_transpose2d(conv_transpose2d_tensor, conv_transpose2d_filter, output_padding=2, dilation=3)
-        conv_transpose2d_out_grouped = F.conv_transpose2d(conv_transpose2d_tensor, conv_transpose2d_filter, groups=2)
-        conv_transpose2d_out_dilated = F.conv_transpose2d(conv_transpose2d_tensor, conv_transpose2d_filter, dilation=2)
-    
-    def test_conv_transpose3d(self):
-        # Data and weight tensors
-        conv_transpose3d_tensor = torch.randn(20, 16, 50, 10, 20, device='cuda', dtype=self.dtype)
-        conv_transpose3d_filter = torch.randn(16, 33, 3, 3, 3, device='cuda', dtype=self.dtype)
-        conv_transpose3d_bias = torch.randn(33, device='cuda', dtype=self.dtype)
-        # Conv transpose runs
-        conv_transpose3d_out = F.conv_transpose3d(conv_transpose3d_tensor, conv_transpose3d_filter)
-        conv_transpose3d_out_biased = F.conv_transpose3d(conv_transpose3d_tensor, conv_transpose3d_filter, bias=conv_transpose3d_bias)
-        conv_transpose3d_out_strided = F.conv_transpose3d(conv_transpose3d_tensor, conv_transpose3d_filter, stride=2)
-        conv_transpose3d_out_padded = F.conv_transpose3d(conv_transpose3d_tensor, conv_transpose3d_filter, padding=3)
-        conv_transpose3d_out2_padded = F.conv_transpose3d(conv_transpose3d_tensor, conv_transpose3d_filter, output_padding=2, dilation=3)
-        conv_transpose3d_out_grouped = F.conv_transpose3d(conv_transpose3d_tensor, conv_transpose3d_filter, groups=2)
-        conv_transpose3d_out_dilated = F.conv_transpose3d(conv_transpose3d_tensor, conv_transpose3d_filter, dilation=2)
-    
-    def test_unfold(self):
-        inp = torch.randn(1, 3, 32, 32, device='cuda', dtype=self.dtype)
-        kernel_size = (4, 5)
-        inp_unf_dilated = F.unfold(inp, kernel_size, dilation=2)
-        inp_unf_padded = F.unfold(inp, kernel_size, padding=2)
-        inp_unf_strided = F.unfold(inp, kernel_size, stride=2)
-    
-    def test_fold(self):
-        inp = torch.randn(3, 20, 20, device='cuda', dtype=self.dtype)
-        inp_folded = F.fold(inp, (4, 5), (1, 1))
-    
-    def test_avg_pool1d(self):
-        inp = torch.randn(1, 1, 28, device='cuda', dtype=self.dtype)
-        out = F.avg_pool1d(inp, kernel_size=5, stride=2, padding=2, ceil_mode=True, count_include_pad=False)
-    
-    def test_avg_pool2d(self):
-        inp = torch.randn(1, 3, 224, 224, device='cuda', dtype=self.dtype)
-        out = F.avg_pool2d(inp, kernel_size=5, stride=2, padding=2, ceil_mode=True, count_include_pad=False)
-    
-    def test_avg_pool3d(self):
-        inp = torch.randn(1, 3, 16, 224, 224, device='cuda', dtype=self.dtype)
-        out = F.avg_pool3d(inp, kernel_size=5, stride=2, padding=2, ceil_mode=True, count_include_pad=False)
-    
-    def test_adaptive_avg_pool1d(self):
-        inp = torch.randn(1, 1, 28, device='cuda', dtype=self.dtype)
-        out = F.adaptive_avg_pool1d(inp, output_size=5) 
-    
-    def test_adaptive_avg_pool2d(self):
-        inp = torch.randn(1, 16, 32, 32, device='cuda', dtype=self.dtype)
-        out = F.adaptive_avg_pool2d(inp, output_size=5) 
-    
-    def test_adaptive_avg_pool3d(self):
-        inp = torch.randn(1, 16, 16, 32, 32, device='cuda', dtype=self.dtype)
-        out = F.adaptive_avg_pool3d(inp, output_size=5) 
-    
-    def test_max_pool1d(self):
-        inp = torch.randn(1, 16, 32, device='cuda', dtype=self.dtype)
-        out = F.max_pool1d(inp, kernel_size=5, stride=2, padding=2, return_indices=True, ceil_mode=True)
-    
-    def test_max_pool2d(self):
-        inp = torch.randn(1, 16, 32, 32, device='cuda', dtype=self.dtype)
-        out = F.max_pool2d(inp, kernel_size=5, stride=2, padding=2, return_indices=True, ceil_mode=True)
-    
-    def test_max_pool3d(self):
-        inp = torch.randn(1, 16, 16, 32, 32, device='cuda', dtype=self.dtype)
-        out = F.max_pool3d(inp, kernel_size=5, stride=2, padding=2, return_indices=True, ceil_mode=True)
-    
-    def test_adaptive_max_pool1d(self):
-        inp = torch.randn(1, 16, 28, device='cuda', dtype=self.dtype)
-        out = F.adaptive_max_pool1d(inp, output_size=5, return_indices=True) 
-    
-    def test_adaptive_max_pool2d(self):
-        inp = torch.randn(1, 16, 32, 32, device='cuda', dtype=self.dtype)
-        out = F.adaptive_max_pool2d(inp, output_size=5, return_indices=True) 
-    
-    def test_adaptive_max_pool3d(self):
-        inp = torch.randn(1, 16, 16, 32, 32, device='cuda', dtype=self.dtype)
-        out = F.adaptive_max_pool3d(inp, output_size=5, return_indices=True) 
-    
-    def test_max_unpool1d(self):
-        inp = torch.randn(1, 16, 32, device='cuda', dtype=self.dtype)
-        output, indices = F.max_pool1d(inp, kernel_size=5, stride=2, padding=2, return_indices=True, ceil_mode=True)
-        output = F.max_unpool1d(output, indices, kernel_size=2, stride=2, padding=2)
-    
-    def test_max_unpool2d(self):
-        inp = torch.randn(1, 16, 32, 32, device='cuda', dtype=self.dtype)
-        output, indices = F.max_pool2d(inp, kernel_size=5, stride=2, padding=2, return_indices=True, ceil_mode=True)
-        output = F.max_unpool2d(output, indices, kernel_size=2, stride=2, padding=2)
-    
-    def test_max_unpool3d(self):
-        inp = torch.randn(1, 16, 8, 32, 32, device='cuda', dtype=self.dtype)
-        output, indices = F.max_pool3d(inp, kernel_size=5, stride=2, padding=2, return_indices=True, ceil_mode=True)
-        output = F.max_unpool3d(output, indices, kernel_size=2, stride=2, padding=2)
-    
-    def test_lp_pool1d(self):
-        inp = torch.randn(1, 32, 64, device='cuda', dtype=self.dtype)
-        output = F.lp_pool1d(inp, 2, 3, stride=2, ceil_mode=True)
-    
-    def test_lp_pool2d(self):
-        #torch.nn.LPPool2d(norm_type, kernel_size, stride=None, ceil_mode=False)
-        inp = torch.randn(1, 32, 64, 64, device='cuda', dtype=self.dtype)
-        output = F.lp_pool2d(inp, 2, 3, stride=2, ceil_mode=True)
-    
-    def test_threshold(self):
-        inp = torch.randn(1, 8, 32, 32, device='cuda', dtype=self.dtype)
-        output = F.threshold(inp, 6, 6, inplace=False)
-    
-    def test_threshold_(self):
-        inp = torch.randn(1, 8, 32, 32, device='cuda', dtype=self.dtype)
-        output = F.threshold_(inp, 6, 6)
-    
-    def test_relu(self):
-        inp = torch.randn(1, 3, 32, 32, device='cuda', dtype=self.dtype)
-        output = F.relu(inp, inplace=False)
-    
-    def test_relu_(self):
-        inp = torch.randn(1, 3, 32, 32, device='cuda', dtype=self.dtype)
-        output = F.relu_(inp)
-    
-    def test_hardtanh(self):
-        inp = torch.randn(1, 3, 32, 32, device='cuda', dtype=self.dtype)
-        output = F.hardtanh(inp, min_val=-1., max_val=1., inplace=False)
-    
-    def test_hardtanh_(self):
-        inp = torch.randn(1, 3, 32, 32, device='cuda', dtype=self.dtype)
-        output = F.hardtanh_(inp, min_val=-1., max_val=1.)
-    
-    def test_relu6(self):
-        inp = torch.randn(1, 3, 32, 32, device='cuda', dtype=self.dtype)
-        output = F.relu6(inp, inplace=False)
-    
-    def test_elu(self):
-        inp = torch.randn(1, 3, 32, 32, device='cuda', dtype=self.dtype)
-        output = F.elu(inp, alpha=1.0, inplace=False)
-    
-    def test_elu_(self):
-        inp = torch.randn(1, 3, 32, 32, device='cuda', dtype=self.dtype)
-        output = F.elu_(inp, alpha=1.0)
-    
-    def test_selu(self):
-        inp = torch.randn(1, 3, 32, 32, device='cuda', dtype=self.dtype)
-        output = F.selu(inp)
-    
-    def test_celu(self):
-        inp = torch.randn(1, 3, 32, 32, device='cuda', dtype=self.dtype)
-        output = F.celu(inp, alpha=1.0, inplace=False)
-    
-    def test_leaky_relu(self):
-        inp = torch.randn(1, 3, 32, 32, device='cuda', dtype=self.dtype)
-        output = F.leaky_relu(inp, negative_slope=0.01, inplace=False)
-    
-    def test_leaky_relu_(self):
-        inp = torch.randn(1, 3, 32, 32, device='cuda', dtype=self.dtype)
-        output = F.leaky_relu_(inp, negative_slope=0.01)
-    
-    def test_prelu(self):
-        inp = torch.randn(1, 3, 32, 32, device='cuda', dtype=self.dtype)
-        weight = torch.randn(1, device='cuda', dtype=self.dtype)
-        output = F.prelu(inp, weight)
-    
-    def test_rrelu(self):
-        inp = torch.randn(1, 3, 32, 32, device='cuda', dtype=self.dtype)
-        output = F.rrelu(inp, lower=1./8, upper=1./3, training=False, inplace=False)
-    
-    def test_rrelu_(self):
-        inp = torch.randn(1, 3, 32, 32, device='cuda', dtype=self.dtype)
-        output = F.rrelu(inp, lower=1./8, upper=1./3, training=False)
-    
-    def test_glu(self):
-        inp = torch.randn(1, 3, 32, 32, device='cuda', dtype=self.dtype)
-        output = F.glu(inp, dim=-1)
-    
-    def test_logsigmoid(self):
-        inp = torch.randn(1, 3, 32, 32, device='cuda', dtype=self.dtype)
-        output = F.logsigmoid(inp)
-    
-    def test_hardshrink(self):
-        inp = torch.randn(1, 3, 32, 32, device='cuda', dtype=self.dtype)
-        output = F.hardshrink(inp, lambd=0.5)
-    
-    def test_tanhshrink(self):
-        inp = torch.randn(1, 3, 32, 32, device='cuda', dtype=self.dtype)
-        output = F.tanhshrink(inp)
-    
-    def test_softsign(self):
-        inp = torch.randn(1, 3, 32, 32, device='cuda', dtype=self.dtype)
-        output = F.softsign(inp)
-    
-    def test_softplus(self):
-        inp = torch.randn(1, 3, 32, 32, device='cuda', dtype=self.dtype)
-        output = F.softplus(inp, beta=1, threshold=20)
-    
-    def test_softmin(self):
-        inp = torch.randn(16, 1024, device='cuda', dtype=self.dtype)
-        output = F.softmin(inp, dim=1,  _stacklevel=3, dtype=self.dtype)
-    
-    def test_softmax(self):
-        inp = torch.randn(16, 1024, device='cuda', dtype=self.dtype)
-        output = F.softmax(inp, dim=1, _stacklevel=3, dtype=self.dtype)
-    
-    def test_softshrink(self):
-        inp = torch.randn(1, 3, 32, 32, device='cuda', dtype=self.dtype)
-        output = F.softshrink(inp, lambd=0.5)
-    
-    def test_gumbel_softmax(self):
-        inp = torch.randn(16, 1024, device='cuda', dtype=self.dtype)
-        output = F.gumbel_softmax(inp, tau=1, hard=False, eps=1e-10, dim=-1)
-    
-    def test_log_softmax(self):
-        inp = torch.randn(16, 1024, device='cuda', dtype=self.dtype)
-        output = F.log_softmax(inp, dim=-1, _stacklevel=3)
-    
-    def test_tanh(self):
-        inp = torch.randn(1, 3, 32, 32, device='cuda', dtype=self.dtype)
-        output = torch.tanh(inp)
-    
-    def test_sigmoid(self):
-        inp = torch.randn(1, 3, 32, 32, device='cuda', dtype=self.dtype)
-        output = torch.sigmoid(inp)
-    
-    def test_batch_norm(self):
-        inp = torch.randn(1, 3, 32, 32, device='cuda', dtype=self.dtype)
-        # running_mean, running_var
-        running_mean = torch.randn(3, device='cuda', dtype=self.dtype)
-        running_var = torch.randn(3, device='cuda', dtype=self.dtype)
-        output = F.batch_norm(inp, running_mean, running_var, weight=None, bias=None, training=False, momentum=0.1, eps=1e-05)
-    
-    def test_instance_norm(self):
-        inp = torch.randn(1, 3, 32, 32, device='cuda', dtype=self.dtype)
-        running_mean = torch.randn(3, device='cuda', dtype=self.dtype)
-        running_var = torch.randn(3, device='cuda', dtype=self.dtype)
-        output = F.instance_norm(inp, running_mean=running_mean, running_var=running_var, weight=None, bias=None, use_input_stats=True, momentum=0.1, eps=1e-05)
-    
-    def test_layer_norm(self):
-        inp = torch.randn(1, 3, 32, 32, device='cuda', dtype=self.dtype)
-        output = F.layer_norm(inp, inp.size()[1:], weight=None, bias=None, eps=1e-05)
-    
-    def test_local_response_norm(self):
-        inp = torch.randn(16, 8, 64, 64, device='cuda', dtype=self.dtype)
-        output = F.local_response_norm(inp, 2, alpha=0.0001, beta=0.75, k=1.0)
-    
-    def test_normalize(self):
-        inp = torch.randn(16, 8, 64, 64, device='cuda', dtype=self.dtype)
-        output = F.normalize(inp, p=2, dim=1, eps=1e-12, out=None)
-    
-    def test_linear(self):
-        inp = torch.randn(32, 64, 128, device='cuda', dtype=self.dtype)
-        weight = torch.randn(256, 128, device='cuda', dtype=self.dtype)
-        output = F.linear(inp, weight, bias=None)
-    
-    def test_dropout(self):
-        inp = torch.randn(16, 8, 64, 64, device='cuda', dtype=self.dtype)
-        output = F.dropout(inp, p=0.5, training=True, inplace=False)
-    
-    def test_alpha_dropout(self):
-        inp = torch.randn(16, 8, 64, 64, device='cuda', dtype=self.dtype)
-        output = F.alpha_dropout(inp, p=0.5, training=True, inplace=False)
-    
-    def test_dropout2d(self):
-        inp = torch.randn(16, 8, 64, 64, device='cuda', dtype=self.dtype)
-        output = F.dropout2d(inp, p=0.5, training=True, inplace=False)
-    
-    def test_dropout3d(self):
-        inp = torch.randn(16, 8, 32, 64, 64, device='cuda', dtype=self.dtype)
-        output = F.dropout3d(inp, p=0.5, training=True, inplace=False)
-    
-    def test_embedding(self):
-        pre_embed_dim = 1024
-        post_embed_dim = 32
-        inp = torch.randint(0, pre_embed_dim, (128, 16), device='cuda')    
-        weight = torch.randn(pre_embed_dim, post_embed_dim, device='cuda', dtype=self.dtype)
-        output = F.embedding(inp, weight, padding_idx=None, max_norm=None, norm_type=2.0, scale_grad_by_freq=False, sparse=False)
-    
-    def test_embedding_bag(self):
-        pre_embed_dim = 1024
-        post_embed_dim = 32
-        inp = torch.randint(0, pre_embed_dim, (128, 16), device='cuda')    
-        weight = torch.randn(pre_embed_dim, post_embed_dim, device='cuda', dtype=self.dtype)
-        output = F.embedding_bag(inp, weight, offsets=None, max_norm=None, norm_type=2,
-            scale_grad_by_freq=False, mode='mean', sparse=False)
-    
-    def test_one_hot(self):
-        num_classes = 10
-        inp = torch.randint(0, num_classes, (128, 16), device='cuda')    
-        output = F.one_hot(inp, num_classes=10) 
-    
-    def test_pairwise_distance(self):
-        inp1 = torch.randn(1024, 128, device='cuda', dtype=self.dtype)
-        inp2 = torch.randn(1024, 128, device='cuda', dtype=self.dtype)
-        output = F.pairwise_distance(inp1, inp2, p=2.0, eps=1e-06, keepdim=False) 
-    
-    def test_cosine_similarity(self):
-        inp1 = torch.randn(1024, 128, device='cuda', dtype=self.dtype)
-        inp2 = torch.randn(1024, 128, device='cuda', dtype=self.dtype)
-        output = F.cosine_similarity(inp1, inp2, dim=1, eps=1e-8)
-    
-    def test_pdist(self):
-        # pdist is not implemented for fp16
-        inp = torch.randn(128, 128, device='cuda', dtype=torch.float32)
-        output = F.pdist(inp, p=2)
-    
-    def test_binary_cross_entropy(self):
-        # binary_cross_entropy is not implemented for fp16
-        inp = torch.randn(32, 128, device='cuda', dtype=torch.float32, requires_grad=True)
-        target = torch.randn(32, 128, device='cuda', dtype=torch.float32, requires_grad=False)
-        output = F.binary_cross_entropy(torch.sigmoid(inp), target)
-    
-    def test_binary_cross_entropy_with_logits(self):
-        inp = torch.randn(32, 128, device='cuda', dtype=self.dtype, requires_grad=True)
-        target = torch.empty_like(inp).random_(2)
-        output = F.binary_cross_entropy_with_logits(inp, target)
-    
-    def test_poisson_nll_loss(self):
-        inp = torch.randn(32, 128, device='cuda', dtype=self.dtype, requires_grad=True)
-        target = torch.randn(32, 128, device='cuda', dtype=self.dtype, requires_grad=False)
-        output = F.poisson_nll_loss(inp, target, log_input=True, full=False,
-            size_average=None, eps=1e-08, reduce=None, reduction='mean')
-    
-    def test_cosine_embedding_loss(self):
-        inp1 = torch.randn(32, 128, device='cuda', dtype=self.dtype, requires_grad=True)
-        inp2 = torch.randn(32, 128, device='cuda', dtype=self.dtype, requires_grad=True)
-        target = torch.randn(32, device='cuda', dtype=self.dtype, requires_grad=False)
-        output = F.cosine_embedding_loss(inp1, inp2, target, margin=0,
-            size_average=None, reduce=None, reduction='mean')
-    
-    def test_cross_entropy(self):
-        inp = torch.randn(32, 128, device='cuda', dtype=self.dtype, requires_grad=True)
-        target = torch.randint(0, 100, (32,), device='cuda', dtype=torch.long, requires_grad=False)
-        output = F.cross_entropy(inp, target, weight=None, size_average=None,
-            ignore_index=-100, reduce=None, reduction='mean')
-    
-    def test_ctc_loss(self):
-        # force fp32 because _th_normal_ (used by next line is not supported for fp16)
-        log_probs = torch.randn(50, 16, 20, device='cuda', dtype=torch.float32).log_softmax(2).detach().requires_grad_()
-        targets = torch.randint(1, 20, (16, 30), device='cuda', dtype=torch.long)
-        input_lengths = torch.full((16,), 50, dtype=torch.long)
-        target_lengths = torch.randint(10, 30, (16,), dtype=torch.long)
-        loss = F.ctc_loss(log_probs, targets, input_lengths, target_lengths)
-    
-    def test_hinge_embedding_loss(self):
-        inp = torch.randn(128, 32, device='cuda', dtype=self.dtype)
-        target = torch.randint(0, 1, (32,), device='cuda') - 1
-        output = F.hinge_embedding_loss(inp, target, margin=1.0, size_average=None, reduce=None, reduction='mean') 
-    
-    def test_kl_div(self):
-        inp = torch.randn(32, 128, device='cuda', dtype=self.dtype, requires_grad=True)
-        target = torch.randn(32, 128, device='cuda', dtype=self.dtype, requires_grad=True)
-        output = F.kl_div(inp, target, size_average=None, reduce=None, reduction='batchmean')
-    
-    def test_mse_loss(self):
-        inp = torch.randn(32, 128, device='cuda', dtype=self.dtype, requires_grad=True)
-        target = torch.randn(32, 128, device='cuda', dtype=self.dtype, requires_grad=True)
-        output = F.mse_loss(inp, target, size_average=None, reduce=None, reduction='mean')
-    
-    def test_margin_ranking_loss(self):
-        inp1 = torch.randn(32, 128, device='cuda', dtype=self.dtype, requires_grad=True)
-        inp2 = torch.randn(32, 128, device='cuda', dtype=self.dtype, requires_grad=True)
-        target = (torch.randint(0, 1, (128,), device='cuda') - 1).type_as(inp1)
-        output = F.margin_ranking_loss(inp1, inp2, target, margin=0, size_average=None, reduce=None, reduction='mean')
-    
-    def test_multilabel_margin_loss(self):
-        inp = torch.randn(1024, device='cuda', dtype=self.dtype, requires_grad=True)
-        target = torch.randint(0, 10, (1024,), dtype=torch.long, device='cuda')
-        output = F.multilabel_margin_loss(inp, target, size_average=None, reduce=None, reduction='mean')
-    
-    def test_nll_loss(self):
-        inp = torch.randn(64, 128, device='cuda', dtype=self.dtype, requires_grad=True)
-        target = torch.randint(0, 10, (64,), device='cuda', dtype=torch.long) 
-        output = F.nll_loss(inp, target, weight=None, size_average=None, ignore_index=-100, reduce=None, reduction='mean')
-    
-    def test_smooth_l1_loss(self):
-        inp = torch.randn(32, 128, device='cuda', dtype=self.dtype, requires_grad=True)
-        target = torch.randn(32, 128, device='cuda', dtype=self.dtype, requires_grad=False)
-        output = F.smooth_l1_loss(inp, target, size_average=None, reduce=None, reduction='mean')
-    
-    def test_soft_margin_loss(self):
-        inp = torch.randn(32, 128, device='cuda', dtype=self.dtype, requires_grad=True)
-        target = torch.randn(32, 128, device='cuda', dtype=self.dtype, requires_grad=False)
-        output = F.soft_margin_loss(inp, target, size_average=None, reduce=None, reduction='mean') 
-    
-    def test_triplet_margin_loss(self):
-        inp1 = torch.randn(32, 128, device='cuda', dtype=self.dtype, requires_grad=True)
-        inp2 = torch.randn(32, 128, device='cuda', dtype=self.dtype, requires_grad=True)
-        inp3 = torch.randn(32, 128, device='cuda', dtype=self.dtype, requires_grad=True)
-        output = F.triplet_margin_loss(inp1, inp2, inp3, margin=1.0, p=2,
-             eps=1e-06, swap=False, size_average=None, reduce=None, reduction='mean')
-    
-    def test_pixel_shuffle(self):
-        inp = torch.randn(16, 8, 64, 64, device='cuda', dtype=self.dtype)
-        output = torch.nn.functional.pixel_shuffle(inp, 2)
-    
-    def test_pad(self):
-        inp = torch.randn(16, 8, 64, 64, device='cuda', dtype=self.dtype)
-        pad = (3, 3)
-        output = F.pad(inp, pad, mode='constant', value=0)
-    
-    def test_interpolate(self):
-        inp = torch.randn(16, 8, 64, 64, device='cuda', dtype=self.dtype)
-        output = F.interpolate(inp, size=None, scale_factor=2, mode='nearest', align_corners=None)
-    
-    def test_grid_sample(self):
-        inp = torch.randn(16, 8, 64, 64, device='cuda', dtype=self.dtype)
-        grid = torch.randn(16, 32, 32, 2, device='cuda', dtype=self.dtype)
-        output = F.grid_sample(inp, grid, mode='bilinear', padding_mode='zeros')
-    
-    def test_affine_grid(self):
-        theta = torch.randn(32, 2, 3, device='cuda', dtype=self.dtype)
-        size = (32, 8, 32, 32)
-        output = F.affine_grid(theta, size)
-
-
-def run_tests(precision):
-    dummy = TestPyProfNvtx('test_affine_grid', None)
-    test_cases = list(filter(lambda x: 'test_' in x, map(lambda x: x[0], inspect.getmembers(dummy, predicate=inspect.ismethod))))
-    print("Running tests for {}".format(precision))
-    suite = unittest.TestSuite()
-    for test_case in test_cases:
-        suite.addTest(TestPyProfNvtx(test_case, precision))
-    unittest.TextTestRunner().run(suite)
-
-if __name__ == '__main__':
-    run_tests(torch.float32)
-    run_tests(torch.float16)
--- a/tests/L0/run_test.py
+++ b/tests/L0/run_test.py
@@ -20,8 +20,6 @@ TEST_DIRS = [
    "run_fp16util",
    "run_optimizers",
    "run_fused_layer_norm",
-    "run_pyprof_nvtx",
-    "run_pyprof_data",
    "run_mlp",
    "run_transformer",
 ]