Unverified Commit 8a7a3325 authored by Masaki Kozuki, committed by GitHub

Remove `pyprof` and `reparameterization` (#1404)

* remove pyprof

* remove reparameterization

* remove pyprof test

* clean up
parent cd499737
#!/usr/bin/env python3
import torch
import torch.cuda.profiler as profiler
from apex import pyprof
class Foo(torch.nn.Module):
def __init__(self, size):
super(Foo, self).__init__()
self.n = torch.nn.Parameter(torch.ones(size))
self.m = torch.nn.Parameter(torch.ones(size))
def forward(self, input):
return self.n*input + self.m
foo = Foo(4)
foo.cuda()
x = torch.ones(4).cuda()
#JIT the class using tracing
traced_foo = torch.jit.trace(foo, x)
#Initialize pyprof after the JIT step
pyprof.nvtx.init()
#Assign a name to the object "traced_foo"
traced_foo.__dict__['__name__'] = "foo"
#Hook up the forward function to pyprof
pyprof.nvtx.wrap(traced_foo, 'forward')
with torch.autograd.profiler.emit_nvtx():
profiler.start()
z = traced_foo(x)
profiler.stop()
print(z)
#!/bin/bash
set -e
SCRIPT=`realpath $0`
SCRIPTPATH=`dirname $SCRIPT`
PYPROF="$SCRIPTPATH/../.."
parse="python $PYPROF/parse/parse.py"
prof="python $PYPROF/prof/prof.py"
for f in *.py
do
base=`basename $f .py`
sql=$base.sql
dict=$base.dict
#NVprof
echo "nvprof -fo $sql python $f"
nvprof -fo $sql python $f
#Parse
echo $parse $sql
$parse $sql > $dict
#Prof
echo $prof $dict
$prof -w 130 $dict
\rm $sql $dict
done
#!/usr/bin/env python3
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.cuda.profiler as profiler
import torch.optim as optim
from apex import pyprof
pyprof.nvtx.init()
class LeNet5(nn.Module):
def __init__(self):
super(LeNet5, self).__init__()
# 1 input image channel, 6 output channels, 5x5 square convolution
# kernel
self.conv1 = nn.Conv2d(1, 6, 5)
self.conv2 = nn.Conv2d(6, 16, 5)
# an affine operation: y = Wx + b
self.fc1 = nn.Linear(16 * 5 * 5, 120)
self.fc2 = nn.Linear(120, 84)
self.fc3 = nn.Linear(84, 10)
def forward(self, x):
# Max pooling over a (2, 2) window
x = F.max_pool2d(F.relu(self.conv1(x)), (2, 2))
# If the size is a square you can only specify a single number
x = F.max_pool2d(F.relu(self.conv2(x)), 2)
x = x.view(-1, self.num_flat_features(x))
x = F.relu(self.fc1(x))
x = F.relu(self.fc2(x))
x = self.fc3(x)
return x
def num_flat_features(self, x):
size = x.size()[1:] # all dimensions except the batch dimension
num_features = 1
for s in size:
num_features *= s
return num_features
with torch.autograd.profiler.emit_nvtx():
net = LeNet5().cuda()
input = torch.randn(1, 1, 32, 32).cuda()
out = net(input)
target = torch.randn(10) # a dummy target, for example
target = target.view(1, -1).cuda() # make it the same shape as output
criterion = nn.MSELoss()
# create your optimizer
optimizer = optim.SGD(net.parameters(), lr=0.01)
# in your training loop:
optimizer.zero_grad() # zero the gradient buffers
profiler.start()
output = net(input)
loss = criterion(output, target)
loss.backward()
optimizer.step() # Does the update
profiler.stop()
#!/usr/bin/env python3
"""
This file exercises the Python operators (via the `operator` module) on tensors.
"""
import sys
import torch
import torch.cuda.profiler as profiler
import operator
import inspect
#Import and initialize pyprof
from apex import pyprof
pyprof.nvtx.init()
X = 1024
Y = 1024
fa = torch.rand(X, Y).cuda()
fb = torch.rand(X, Y).cuda()
fc = torch.rand(X, Y).cuda()
ia = torch.randint(0, 100, (X, Y)).cuda()
ib = torch.randint(0, 100, (X, Y)).cuda()
sa = torch.ones(1,1).cuda()
sb = torch.ones(1,1).cuda()
ba = fa.byte()
unaryOps = ["abs", "__abs__", "neg", "__neg__",]
invertOps = ["inv", "invert", "__inv__", "__invert__",] #implemented only for byte tensors
#pos, __pos__ is not implemented for tensors
binaryOps = []
binaryOps += [ "lt", "__lt__", "le", "__le__", "eq", "__eq__", "ne", "__ne__", "ge", "__ge__", "gt", "__gt__" ]
binaryOps += [ "add", "__add__", "sub", "__sub__", "mul", "__mul__", "floordiv", "__floordiv__", "truediv", "__truediv__", "pow", "__pow__", "mod", "__mod__"]
binaryOps += [ "and_", "__and__", "or_", "__or__", "xor", "__xor__", "lshift", "__lshift__", "rshift", "__rshift__"]
inplaceOps = []
inplaceOps += ["iadd", "__iadd__", "isub", "__isub__", "imul", "__imul__", "ifloordiv", "__ifloordiv__", "itruediv", "__itruediv__", "imod", "__imod__",]
#ipow, __ipow__ is not implemented in pytorch
inplaceOps += [ "iand", "__iand__", "ior", "__ior__", "ixor", "__ixor__", "ilshift", "__ilshift__", "irshift", "__irshift__",]
matmulOps = [ "matmul", "__matmul__" ]
inplacematmulOps = [ "imatmul", "__imatmul__" ]
reverseIntBinaryOps = ["__radd__", "__rsub__", "__rmul__", "__rfloordiv__", "__rpow__",]
reverseFloatBinaryOps = ["__radd__", "__rsub__", "__rmul__", "__rdiv__", "__rtruediv__", "__rfloordiv__", "__rpow__",]
'''
TODO
.concat(a, b)
.__concat__(a, b)
.contains(a, b)
.__contains__(a, b)
.countOf(a, b)
.delitem(a, b)
.__delitem__(a, b)
.getitem(a, b)
.__getitem__(a, b)
.indexOf(a, b)
.setitem(a, b, c)
.__setitem__(a, b, c)
.length_hint(obj, default=0)
.iconcat(a, b)
.__iconcat__(a, b)
.index(a)
.__index__(a)
'''
#Context manager
with torch.autograd.profiler.emit_nvtx():
#Start profiler
profiler.start()
for op in unaryOps:
assert hasattr(operator, op)
f = getattr(operator, op)
assert inspect.isbuiltin(f)
c = f(ia)
for op in invertOps:
assert hasattr(operator, op)
f = getattr(operator, op)
assert inspect.isbuiltin(f)
c = f(ba)
for op in binaryOps:
assert hasattr(operator, op)
f = getattr(operator, op)
assert inspect.isbuiltin(f)
c = f(ia, ib)
c = f(ia, 2)
for op in inplaceOps:
assert hasattr(operator, op)
f = getattr(operator, op)
assert inspect.isbuiltin(f)
ia = f(ia, ib)
ia = f(ia, 2)
for op in matmulOps:
assert hasattr(operator, op)
f = getattr(operator, op)
assert inspect.isbuiltin(f)
c = f(fa, fb)
for op in inplacematmulOps:
assert hasattr(operator, op)
f = getattr(operator, op)
assert inspect.isbuiltin(f)
fa = f(fa, fb)
for op in reverseIntBinaryOps:
assert hasattr(torch.Tensor, op)
f = getattr(torch.Tensor, op)
ia = f(ia, ib)
for op in reverseFloatBinaryOps:
assert hasattr(torch.Tensor, op)
f = getattr(torch.Tensor, op)
fa = f(fa, fb)
'''
#c = fa[3]
#c = fa[3][3]
#c = torch.min(fa, 3)
c = torch.sum(fa)
c = torch.max(fa)
c = -fa
#fc[2][2] = fa[2][2]
c = a_scalar and b_scalar
c = a_scalar or b_scalar
c = not a_scalar
c = a is b
c = a is not b
'''
#Stop profiler
profiler.stop()
#!/usr/bin/env python3
"""
This simple file provides an example of how to
- import the pyprof library and initialize it
- use the emit_nvtx context manager
- start and stop the profiler
Only kernels within profiler.start and profiler.stop calls are profiled.
To profile
$ nvprof -f -o simple.sql --profile-from-start off ./simple.py
"""
import sys
import torch
import torch.cuda.profiler as profiler
#Import and initialize pyprof
from apex import pyprof
pyprof.nvtx.init()
a = torch.randn(5, 5).cuda()
b = torch.randn(5, 5).cuda()
#Context manager
with torch.autograd.profiler.emit_nvtx():
#Start profiler
profiler.start()
c = a + b
c = torch.mul(a,b)
c = torch.matmul(a,b)
c = torch.argmax(a, dim=1)
c = torch.nn.functional.pad(a, (1,1))
#Stop profiler
profiler.stop()
NVIDIA NVTX range markers (https://docs.nvidia.com/gameworks/content/gameworkslibrary/nvtx/nvidia_tools_extension_library_nvtx.htm) are a useful tool for capturing and observing events and code ranges.
Using the PyTorch APIs `torch.cuda.nvtx.range_push("xxx")` and `torch.cuda.nvtx.range_pop()`, users can easily add their own NVTX range markers, which can then be observed in the NVIDIA Visual Profiler (NVVP).
If the marker strings follow the pattern `"layer:your_string_here"`, e.g. `"layer:conv1"` or `"layer:encoder_layer_3_self_attention"`, then `pyprof` displays the strings `conv1` and `encoder_layer_3_self_attention` next to the associated kernels in the output of `prof.py` when it is run with the `-c layer` option.
NVTX range markers can be nested; if users follow the above string pattern, the output of `prof.py` shows all the markers associated with a kernel.
The file `resnet.py` (a simplified version of the torchvision model) shows how to add (nested) NVTX markers whose information can greatly aid the understanding and analysis of a network.
Note that the pattern `"layer:your_string_here"` was chosen to aid information extraction by `pyprof`; the tool works seamlessly even if other markers, or no markers at all, are present.
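For example, the following minimal sketch (illustrative layer names, CUDA device assumed) wraps a single convolution in two nested `"layer:"` ranges; with the `-c layer` option, `prof.py` lists both labels next to the kernels launched inside the inner range:

```python
import torch
import torch.cuda.nvtx as nvtx

conv = torch.nn.Conv2d(3, 8, kernel_size=3).cuda()
x = torch.randn(1, 3, 32, 32).cuda()

nvtx.range_push("layer:block_1")   # outer marker
nvtx.range_push("layer:conv1")     # inner marker
y = conv(x)                        # kernels launched here carry both labels
nvtx.range_pop()
nvtx.range_pop()
```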
### To run
```sh
nvprof -fo resnet.sql --profile-from-start off python resnet.py
parse.py resnet.sql > resnet.dict
prof.py --csv -c idx,layer,dir,mod,op,kernel,params,sil resnet.dict
```
The file `resnet.sql` can also be opened with NVVP as usual.
#!/usr/bin/env python3
"""
An example showing use of nested NVTX markers.
"""
import torch
import torch.nn as nn
import torch.cuda.profiler as profiler
import torch.cuda.nvtx as nvtx
from apex import pyprof
pyprof.nvtx.init()
def conv3x3(in_planes, out_planes, stride=1, groups=1, dilation=1):
"""3x3 convolution with padding"""
return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
padding=dilation, groups=groups, bias=False, dilation=dilation)
def conv1x1(in_planes, out_planes, stride=1):
"""1x1 convolution"""
return nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False)
class Bottleneck(nn.Module):
expansion = 4
count = 1
def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1,
base_width=64, dilation=1, norm_layer=None):
super(Bottleneck, self).__init__()
if norm_layer is None:
norm_layer = nn.BatchNorm2d
width = int(planes * (base_width / 64.)) * groups
# Both self.conv2 and self.downsample layers downsample the input when stride != 1
self.conv1 = conv1x1(inplanes, width)
self.bn1 = norm_layer(width)
self.conv2 = conv3x3(width, width, stride, groups, dilation)
self.bn2 = norm_layer(width)
self.conv3 = conv1x1(width, planes * self.expansion)
self.bn3 = norm_layer(planes * self.expansion)
self.relu = nn.ReLU(inplace=True)
self.downsample = downsample
self.stride = stride
self.id = Bottleneck.count
Bottleneck.count += 1
def forward(self, x):
identity = x
nvtx.range_push("layer:Bottleneck_{}".format(self.id))
nvtx.range_push("layer:Conv1")
out = self.conv1(x)
nvtx.range_pop()
nvtx.range_push("layer:BN1")
out = self.bn1(out)
nvtx.range_pop()
nvtx.range_push("layer:ReLU")
out = self.relu(out)
nvtx.range_pop()
nvtx.range_push("layer:Conv2")
out = self.conv2(out)
nvtx.range_pop()
nvtx.range_push("layer:BN2")
out = self.bn2(out)
nvtx.range_pop()
nvtx.range_push("layer:ReLU")
out = self.relu(out)
nvtx.range_pop()
nvtx.range_push("layer:Conv3")
out = self.conv3(out)
nvtx.range_pop()
nvtx.range_push("layer:BN3")
out = self.bn3(out)
nvtx.range_pop()
if self.downsample is not None:
nvtx.range_push("layer:Downsample")
identity = self.downsample(x)
nvtx.range_pop()
nvtx.range_push("layer:Residual")
out += identity
nvtx.range_pop()
nvtx.range_push("layer:ReLU")
out = self.relu(out)
nvtx.range_pop()
nvtx.range_pop()
return out
class ResNet(nn.Module):
def __init__(self, block, layers, num_classes=1000,
groups=1, width_per_group=64, norm_layer=None):
super(ResNet, self).__init__()
if norm_layer is None:
norm_layer = nn.BatchNorm2d
self._norm_layer = norm_layer
self.inplanes = 64
self.dilation = 1
self.groups = groups
self.base_width = width_per_group
self.conv1 = nn.Conv2d(3, self.inplanes, kernel_size=7, stride=2, padding=3, bias=False)
self.bn1 = norm_layer(self.inplanes)
self.relu = nn.ReLU(inplace=True)
self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
self.layer1 = self._make_layer(block, 64, layers[0])
self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
self.layer4 = self._make_layer(block, 512, layers[3], stride=2)
self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
self.fc = nn.Linear(512 * block.expansion, num_classes)
for m in self.modules():
if isinstance(m, nn.Conv2d):
nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)):
nn.init.constant_(m.weight, 1)
nn.init.constant_(m.bias, 0)
def _make_layer(self, block, planes, blocks, stride=1, dilate=False):
norm_layer = self._norm_layer
downsample = None
previous_dilation = self.dilation
if dilate:
self.dilation *= stride
stride = 1
if stride != 1 or self.inplanes != planes * block.expansion:
downsample = nn.Sequential(
conv1x1(self.inplanes, planes * block.expansion, stride),
norm_layer(planes * block.expansion),
)
layers = []
layers.append(block(self.inplanes, planes, stride, downsample, self.groups,
self.base_width, previous_dilation, norm_layer))
self.inplanes = planes * block.expansion
for _ in range(1, blocks):
layers.append(block(self.inplanes, planes, groups=self.groups,
base_width=self.base_width, dilation=self.dilation,
norm_layer=norm_layer))
return nn.Sequential(*layers)
def forward(self, x):
nvtx.range_push("layer:conv1_x")
x = self.conv1(x)
x = self.bn1(x)
x = self.relu(x)
x = self.maxpool(x)
nvtx.range_pop()
nvtx.range_push("layer:conv2_x")
x = self.layer1(x)
nvtx.range_pop()
nvtx.range_push("layer:conv3_x")
x = self.layer2(x)
nvtx.range_pop()
nvtx.range_push("layer:conv4_x")
x = self.layer3(x)
nvtx.range_pop()
nvtx.range_push("layer:conv5_x")
x = self.layer4(x)
nvtx.range_pop()
x = self.avgpool(x)
x = torch.flatten(x, 1)
nvtx.range_push("layer:FC")
x = self.fc(x)
nvtx.range_pop()
return x
def resnet50():
return ResNet(Bottleneck, [3, 4, 6, 3])
#Create model
net = resnet50().cuda().half()
net.train()
#Create optimizer
criterion = nn.CrossEntropyLoss().cuda()
optimizer = torch.optim.SGD(net.parameters(), lr = 0.01, momentum=0.9)
#Create synthetic input and label
x = torch.rand(32, 3, 224, 224).cuda().half()
target = torch.empty(32, dtype=torch.long).random_(1000).cuda()
with torch.autograd.profiler.emit_nvtx():
profiler.start()
output = net(x)
loss = criterion(output, target)
optimizer.zero_grad()
loss.backward()
optimizer.step()
profiler.stop()
#!/bin/bash
set -e
SCRIPT=`realpath $0`
SCRIPTPATH=`dirname $SCRIPT`
PYPROF="$SCRIPTPATH/../.."
parse="python $PYPROF/parse/parse.py"
prof="python $PYPROF/prof/prof.py"
for f in *.py
do
base=`basename $f .py`
sql=$base.sql
dict=$base.dict
#NVprof
echo "nvprof -fo --profile-from-start off $sql python $f"
nvprof -fo $sql --profile-from-start off python $f
#Parse
echo $parse $sql
$parse $sql > $dict
#Prof
echo $prof $dict
#$prof -w 130 $dict
$prof --csv -c idx,layer,dir,mod,op,kernel,params,sil $dict
\rm $sql $dict
done
from .nvmarker import init
from .nvmarker import add_wrapper as wrap
"""
This file intercepts (monkey patches) the following functions and adds NVTX markers.
torch.*
torch.Tensor.*
torch.nn.functional.*
torch.nn.*.forward
The NVTX markers (one or more) contain the following information
call trace (a list of file_name:line_number)
extra_repr() from torch.nn modules
module/class name
function name
inputs (args and kwargs)
scalar: name, type and value
tensor: name, shape and datatype
numpy: name, shape and datatype
list/tuple: a sequence of scalars or tensors or numpy arrays
"""
import torch
import torch.cuda.nvtx as nvtx
import numpy
import inspect as ins
import traceback
import math
def isfunc(mod, f):
assert hasattr(mod, f)
attr = getattr(mod, f)
#Ignore functions like _add
if (len(f) >= 2):
if f[0] == "_" and f[1] != "_":
return False
#Ignore functions from this list
ignore = ['__all__', '__array__', '__array_priority__', '__array_wrap__', '__bool__', '__builtins__', '__cached__', '__class__', '__deepcopy__', '__delattr__', '__delitem__', '__dict__', '__dir__', '__doc__', '__file__', '__format__', '__getattribute__', '__getitem__', '__hash__', '__index__', '__init__', '__init_subclass__', '__iter__', '__len__', '__loader__', '__module__', '__name__', '__new__', '__nonzero__', '__package__', '__path__', '__reduce__', '__reduce_ex__', '__repr__', '__reversed__', '__setattr__', '__setitem__', '__setstate__', '__sizeof__', '__spec__', '__str__', '__subclasshook__', '__version__', '__weakref__']
#Add functions to this list if they cause recursion
ignore += ['size', 'tolist', 'dim', 'is_storage', 'item']
if f in ignore:
return False
return ins.ismethod(attr) or ins.isfunction(attr) or ins.ismethoddescriptor(attr) or ins.isbuiltin(attr)
def traceMarker(stack):
d = {}
cadena = []
for i in range(len(stack)-1):
fi = stack[i]
t = "{}:{}".format(fi.filename, fi.lineno)
cadena.append(t)
d['traceMarker'] = cadena
return str(d)
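# For illustration (hypothetical paths): traceMarker() returns the str() of a dict like
#   {'traceMarker': ['/workspace/model.py:42', '/workspace/train.py:88']}
# i.e. one "filename:lineno" entry per stack frame, excluding the last frame.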
def modMarker(mod, fn_name, args):
"""
Returns the stringified extra_repr() of a module.
"""
assert(fn_name == 'forward')
assert(len(args) > 0)
d = {}
d['mod'] = mod.__name__
d['strRepr'] = args[0].extra_repr()
return str(d)
def add_wrapper(mod, fn_name):
assert isfunc(mod, fn_name)
# Get a pointer to the original function
func = getattr(mod, fn_name)
# Check if the mod has a string representation
# and is not a Script or Traced module (used by JIT)
s = hasattr(mod, "extra_repr") and (type(mod) is not torch.jit.ScriptModule) and (type(mod) is not torch.jit.TopLevelTracedModule)
def wrapper_func(*args, **kwargs):
# Extract the stacktrace
stack = traceback.extract_stack()
# Push trace marker
nvtx.range_push(traceMarker(stack))
# Push module marker
if s:
m = modMarker(mod, fn_name, args)
nvtx.range_push(m)
# Create and push argument marker
cadena = argMarker(mod, fn_name, args, kwargs)
nvtx.range_push(cadena)
# Call the original function
result = func(*args, **kwargs)
# Pop argument marker
nvtx.range_pop()
# Pop module marker
if s:
nvtx.range_pop()
# Pop trace marker
nvtx.range_pop()
return result
setattr(mod, fn_name, wrapper_func)
def argMarker(mod, op, args, kwargs):
#For this function args is a tuple and kwargs is a dict
def tensor(arg, name=""):
a = {}
a['name'] = name
a['type'] = "tensor"
a['shape'] = tuple(arg.size())
a['dtype'] = str(arg.dtype).split(".")[-1]
cadena['args'].append(a)
def ndarray(arg, name=""):
a = {}
a['name'] = name
a['type'] = "ndarray"
a['shape'] = arg.shape
a['dtype'] = str(arg.dtype).split(".")[-1]
cadena['args'].append(a)
def seq(arg, name=""):
assert issequence(arg)
a = {}
a['name'] = name
if isinstance(arg, list):
a['type'] = "list"
a['value'] = arg
else:
a['type'] = "tuple"
# The arg could be torch.Size, which is a subclass of tuple
# Therefore, explicitly convert to tuple
a['value'] = tuple(arg)
cadena['args'].append(a)
def scalar(arg, name=""):
a = {}
a['name'] = name
a['type'] = type(arg).__name__
#handle the case when the argument is +/- inf or nan
if arg == float('inf'):
a['value'] = "inf"
elif arg == float('-inf'):
a['value'] = "-inf"
elif isinstance(arg, float) and math.isnan(arg):
a['value'] = "nan"
else:
a['value'] = arg
cadena['args'].append(a)
def isscalar(arg):
return (type(arg) is int) or (type(arg) is float) or (type(arg) is bool) or (arg is None) or (type(arg) is str)
def issequence(arg):
return isinstance(arg, list) or isinstance(arg, tuple)
def foo(args, name):
#args should be an iterable sequence e.g. list or tuple
for arg in args:
if isinstance(arg, torch.Tensor):
if arg.dim() == 0:
scalar(arg.item(), name)
else:
tensor(arg, name)
elif isinstance(arg, numpy.ndarray):
ndarray(arg, name)
elif (isscalar(arg)):
scalar(arg, name)
elif issequence(arg):
if (len(arg) == 0) or isscalar(arg[0]): #An empty sequence or a sequence of scalars
seq(arg, name)
else: # A sequence of tensors or numpy arrays
foo(arg, name)
'''
else:
print("The following arg is none of Tensor, numpy array, scalar but a %s" % (str(type(arg))))
print("Mod: %s" % str(mod.__name__))
print("Op: %s" % str(op))
print(dir(arg))
'''
cadena = {}
cadena['mod'] = mod.__name__
cadena['op'] = op
cadena['args'] = []
foo(args, "")
for k,v in kwargs.items():
foo((v,), k)
return str(cadena)
def patchClass(cls):
for f in dir(cls):
if isfunc(cls, f):
add_wrapper(cls, f)
def init():
string = "\n\nPyprof has been moved to its own dedicated repository and will " + \
"soon be removed from Apex. Please visit\n" + \
"https://github.com/NVIDIA/PyProf\n" + \
"for the latest version.\n\n"
# print regardless of warning state
print(string)
print("Initializing NVTX monkey patches")
for cls in [torch, torch.Tensor, torch.nn.functional,]:
patchClass(cls)
for cls in [torch.nn.RNN, torch.nn.RNNCell, torch.nn.LSTM, torch.nn.LSTMCell, torch.nn.GRU, torch.nn.GRUCell]:
if isfunc(cls, 'forward'):
add_wrapper(cls, 'forward')
print("Done with NVTX monkey patching")
import warnings
try:
from .parse import main
except ImportError as e:
warnings.warn("Did you make sure to install PyProf dependencies by using the --pyprof flag during Apex installation?)")
raise e
if __name__ == '__main__':
main()
import sys, sqlite3
class DB(object):
"""
This class provides functions for DB operations
with exception handling.
"""
def __init__(self, dbFile):
try:
conn = sqlite3.connect(dbFile)
conn.row_factory = sqlite3.Row
c = conn.cursor()
except:
print("Error opening {}".format(dbFile))
sys.exit(1)
self.conn = conn
self.c = c
def select(self, cmd):
try:
self.c.execute(cmd)
#rows = self.c.fetchall()
rows = [dict(row) for row in self.c.fetchall()]
except sqlite3.Error as e:
print(e)
sys.exit(1)
except:
print("Uncaught error in SQLite access while executing {}".format(cmd))
sys.exit(1)
#print(rows)
return rows
def insert(self, cmd, data):
try:
self.c.execute(cmd, data)
except sqlite3.Error as e:
print(e)
sys.exit(1)
except:
print("Uncaught error in SQLite access while executing {}".format(cmd))
sys.exit(1)
def execute(self, cmd):
try:
self.c.execute(cmd)
except sqlite3.Error as e:
print(e)
sys.exit(1)
except:
print("Uncaught error in SQLite access while executing {}".format(cmd))
sys.exit(1)
def commit(self):
self.conn.commit()
def close(self):
self.c.close()
self.conn.close()
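# Typical usage, sketched for illustration (file name is hypothetical):
#   db = DB("net.sql")
#   rows = db.select("select value from StringTable limit 5")   # list of dicts
#   db.close()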
import cxxfilt, struct, binascii
#Helper functions
def demangle(name):
"""
Demangle a C++ string
"""
return cxxfilt.demangle(name)
def encode_object_id(pid, tid):
"""
Given process id (pid) and thread id (tid), return the object id.
object id = pid (little endian 4 bytes) + tid (little endian 8 bytes)
"""
objId = struct.pack('<i', pid) + struct.pack('<q',tid)
objId = binascii.hexlify(objId).decode('ascii').upper()
return objId
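# For illustration: encode_object_id(pid=1234, tid=5678) returns
# "D20400002E16000000000000", i.e. the 4-byte little-endian pid followed by the
# 8-byte little-endian tid, hex-encoded and upper-cased.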
def getShortName(name):
"""
Returns a shorter kernel name
"""
sname = name.split("<")[0] \
.replace("void ", "") \
.replace("at::","") \
.replace("cuda::", "") \
.replace("native::","") \
.replace("(anonymous namespace)::", "")
sname = sname.split("(")[0]
return sname
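# For illustration, a demangled kernel name such as
#   "void at::native::vectorized_elementwise_kernel<4, ...>(...)"
# is shortened to "vectorized_elementwise_kernel".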
class Kernel(object):
"""
This class stores information about a kernel.
"""
kernels = []
profStart = 0
def __init__(self):
self.kNameId = None
self.kShortName = None
self.kLongName = None
self.kStartTime = None #GPU start time
self.kEndTime = None #GPU end time
self.kDuration = None
self.device = None
self.stream = None
self.grid = ()
self.block = ()
self.corrId = None
self.rStartTime = None #CPU start time
self.rEndTime = None #CPU end time
self.rDuration = None
self.tid = None
self.pid = None
self.objId = None
self.timeOffset = None
self.layerMarkers = []
self.traceMarkers = []
self.reprMarkers = []
self.pyprofMarkers = []
self.seqMarkers = []
self.otherMarkers = []
self.altMarkers = []
self.seqId = []
self.altSeqId = []
self.layer = []
self.subSeqId = None
self.dir = None
self.mod = []
self.op = []
def setKernelInfo(self, info):
self.kNameId = info['name']
self.corrId = int(info['correlationId'])
start = int(info['start'])
end = int(info['end'])
assert end > start, "This assertion can fail for very large profiles. It usually fails when start = end = 0."
self.kStartTime = start
self.kEndTime = end
self.kDuration = end - start
assert (start > Kernel.profStart)
self.device = int(info['deviceId'])
self.stream = int(info['streamId'])
self.grid = (info['gridX'], info['gridY'], info['gridZ'])
self.block = (info['blockX'], info['blockY'], info['blockZ'])
self.timeOffset = Kernel.profStart
def setKernelName(self, name):
cadena = demangle(name)
self.kLongName = cadena
self.kShortName = getShortName(cadena)
def setRunTimeInfo(self, info):
start, end, pid, tid = info
self.rStartTime = start
self.rEndTime = end
self.rDuration = end - start
self.pid = pid
self.tid = tid
self.objId = encode_object_id(pid, tid)
def setMarkerInfo(self, info):
self.layerMarkers, self.traceMarkers, self.reprMarkers, self.pyprofMarkers, self.seqMarkers, self.otherMarkers, self.altMarkers, self.seqId, self.altSeqId, self.layer = info
self.subSeqId = 0
def setDirection(self):
"""
Set direction (fprop, bprop) based on PyTorch sequence markers.
It is a heuristic and not a foolproof method.
"""
if any("Backward, seq = " in x for x in self.seqMarkers) or \
any("backward, seq = " in x for x in self.seqMarkers) or \
any("Backward0, seq = " in x for x in self.seqMarkers):
self.dir = "bprop"
else:
self.dir = "fprop"
def setOp(self):
"""
Detect and set the class/module (mod) and operation (op)
of the kernel e.g. torch.nn.functional / linear, torch / sigmoid.
The lookup sequence we use is
NVTX markers inserted by pyprof
NVTX markers inserted by PyTorch in bprop
NVTX markers inserted by PyTorch in fprop
It is a heuristic and not a foolproof method.
"""
def sanitize(name):
name = name.replace("torch","") \
.replace("autograd","") \
.replace("_backward","") \
.replace("::","") \
.replace("jit","") \
.replace("(anonymous namespace)","")
head, sep, tail = name.partition("Backward")
return head
#Check pyprof markers
for m in self.pyprofMarkers:
assert ("mod" in m) and ("op" in m) and ("args" in m)
t = eval(m)
self.op.append(t['op'])
self.mod.append(t['mod'])
if len(self.op):
return
#Check bprop kernel markers
for m in self.seqMarkers:
if ("backward, seq = " in m) or ("Backward, seq = " in m):
op = m.split(",")[0]
op = sanitize(op)
self.op.append(op)
self.mod.append('na')
if len(self.op):
return
#Check markers with "seq = "
for m in self.seqMarkers:
if ", seq = " in m:
op = m.split(",")[0]
self.op.append(op)
self.mod.append('na')
if len(self.op):
return
#If nothing else
if len(self.otherMarkers):
self.op.append(self.otherMarkers[0])
self.mod.append('na')
def print(self):
"""
Print kernel information. This is used by prof.py.
"""
a = lambda: None
a.kShortName = self.kShortName
a.kDuration = self.kDuration
#a.layerMarkers = self.layerMarkers
a.layer = self.layer
a.trace = self.traceMarkers
a.reprMarkers = self.reprMarkers
a.marker = self.pyprofMarkers
a.seqMarker = self.seqMarkers
a.seqId = self.seqId
a.subSeqId = self.subSeqId
a.altSeqId = self.altSeqId
a.dir = self.dir
a.mod = self.mod
a.op = self.op
a.tid = self.tid
a.device = self.device
a.stream = self.stream
a.grid = self.grid
a.block = self.block
a.kLongName = self.kLongName
print(a.__dict__)
import sys
class NVVP(object):
"""
This class gets kernel information from the SQL (nvvp) database.
"""
driverT = "CUPTI_ACTIVITY_KIND_DRIVER"
runtimeT = "CUPTI_ACTIVITY_KIND_RUNTIME"
kernelT = "CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL"
markerT = "CUPTI_ACTIVITY_KIND_MARKER"
stringT = "StringTable"
def __init__(self, db):
self.db = db
self.markerId = 0
def getProfileStart(self):
"""
Get the profile start time
"""
profStart = sys.maxsize
for table in [self.driverT, self.runtimeT, self.kernelT, self.markerT]:
colname = "timestamp" if table is self.markerT else "start"
cmd = "select {} from {} ORDER BY {} ASC LIMIT 1".format(colname, table, colname)
result = self.db.select(cmd)
assert(len(result) <= 1)
if (len(result) == 1):
assert(colname in result[0])
t = result[0][colname]
if (t < profStart):
profStart = t
assert(profStart < sys.maxsize)
return profStart
def getString(self, id_):
"""
Get the string associated with an id.
"""
cmd = "select value from {} where _id_ = {}".format(self.stringT, id_)
result = self.db.select(cmd)
assert (len(result) == 1)
return result[0]['value']
def createMarkerTable(self):
"""
Create a temporary table and index it to speed up repeated SQL queries.
The table is an INNER JOIN of CUPTI_ACTIVITY_KIND_MARKER with itself.
"""
cmd = 'CREATE TEMPORARY TABLE marker AS SELECT \
a._id_ as id, \
a.timestamp AS startTime, \
b.timestamp AS endTime, \
HEX(a.objectId) AS objectId, \
a.name AS name \
FROM {} AS a INNER JOIN {} AS b ON \
a.id = b.id and \
a.flags = 2 and b.flags = 4'.format(self.markerT, self.markerT)
self.db.execute(cmd)
self.db.execute('CREATE INDEX start_index ON marker (startTime)')
self.db.execute('CREATE INDEX end_index ON marker (endTime)')
self.db.execute('CREATE INDEX id_index ON marker (id)')
def getCPUInfo(self, corrId):
"""
Given the correlation id, get CPU start, end, thread id, process id.
The information can be in the runtime table or the driver table.
"""
#First look in the runtime table
cmd = "select start,end,processId,threadId from {} where correlationId={}".format(self.runtimeT, corrId);
result = self.db.select(cmd)
assert (len(result) <= 1)
if (len(result) == 0):
#Look in the driver table
cmd = "select start,end,processId,threadId from {} where correlationId={}".format(self.driverT, corrId);
result = self.db.select(cmd)
assert (len(result) == 1)
info = result[0]
start = info['start']
end = info['end']
pid = info['processId']
tid = info['threadId']
tid = tid & 0xffffffff #convert to unsigned
assert (end > start)
return [start, end, pid, tid]
def getKernelInfo(self):
"""
Get GPU kernel info
"""
cmd = "select name,correlationId,start,end,deviceId,streamId,gridX,gridY,gridZ,blockX,blockY,blockZ from {}".format(self.kernelT)
result = self.db.select(cmd)
return result
def getMarkerInfo(self, objId, startTime, endTime):
"""
This function first finds all NVTX markers encapsulating
a runtime / driver kernel launch.
It then splits the markers into many lists.
layerMarkers : User added NVTX markers
traceMarkers : Call trace markers (inserted by pyprof)
reprMarkers : Markers containing the extra_repr() of a module (inserted by pyprof)
pyprofMarkers: Markers containing args and kwargs (tensor shape, datatype etc.)
seqMarkers : Markers containing PyTorch internal sequence markers (inserted by PyTorch)
altSeqMarkers: Markers inserted by PyTorch between two kernel launches. Needs better explanation.
otherMarkers : Markers not in either of the above categories.
We extract seqId from the seq and altSeq markers. The seqId is used in bprop.
We also extract information from the layerMarkers.
"""
layerMarkers = []
traceMarkers = []
reprMarkers = []
pyprofMarkers = []
seqMarkers = []
otherMarkers = []
altSeqMarkers = []
bprop = False
#Helper functions
def delete(objId, sTime):
"""
Delete rows from the temporary SQL table which are no longer required.
This speeds up future queries.
"""
margin = 0
cmd = 'DELETE FROM marker WHERE objectId = "{}" AND endTime < {}'.format(objId, sTime - margin)
#cmd = 'DELETE FROM marker WHERE endTime < {}'.format(sTime - margin)
self.db.execute(cmd)
def getLayerName(mlist):
"""
Get layer names from layer marker list.
"""
layers = []
assert(type(mlist) == list)
for m in mlist:
assert("layer:" in m)
l = m.split(":")[1]
layers.append(l)
return layers
def getSeqId(mlist):
"""
Get sequence ids from seq / alt seq marker list.
"""
ids = []
assert(type(mlist) == list)
for m in mlist:
assert(", seq = " in m)
seq = int(m.split("=")[1])
ids.append(seq)
#Remove duplicates
ids = list(set(ids))
ids.sort()
return ids
def seqcompare(elem):
"""
Sorting function for sequence markers
"""
assert (", seq = " in elem)
#sort by sequence id and then the string
l = elem.split(" = ")
return l[1] + l[0]
def prune(mlist):
"""
Remove markers that have the same seqId and similar strings.
This function works on a sorted sequence.
"""
assert (type(mlist) == list)
assert (len(mlist))
a = mlist[0:1]
for i in range(1,len(mlist)):
m = mlist[i]
pm = mlist[i-1]
name,seq = m.split(",")
pname,pseq = pm.split(",")
similar = (name in pname) or (pname in name)
if (seq == pseq) and similar:
continue
else:
a.append(m)
return a
def filterTrace(mlist):
"""
Filter trace markers to remove certain file names.
"""
assert (type(mlist) == list)
if len(mlist) == 0:
return mlist
mlist = mlist[-1] #The last stack trace will be a superset.
mlist = eval(mlist)
mlist = mlist['traceMarker']
assert (type(mlist) == list)
mlist = list(filter(lambda x : "/torch/nn/modules/" not in x, mlist))
mlist = list(filter(lambda x : "/torch/nn/functional.py" not in x, mlist))
mlist = list(filter(lambda x : "/torch/tensor.py" not in x, mlist))
mlist = list(filter(lambda x : "/torch/autograd/__init__.py" not in x, mlist))
mlist = list(filter(lambda x : "/torch/_jit_internal.py" not in x, mlist))
mlist = list(filter(lambda x : "/pyprof/nvtx/nvmarker.py" not in x, mlist))
mlist = list(filter(lambda x : "/apex/optimizers/" not in x, mlist))
mlist = list(filter(lambda x : "/torch/_utils.py" not in x, mlist))
mlist = list(filter(lambda x : "/torch/optim/" not in x, mlist))
return mlist
#Find all encapsulating markers
cmd = 'SELECT id,name from marker where \
objectId = "{}" and \
startTime < {} and \
endTime > {} \
ORDER BY startTime ASC'.format(objId, startTime, endTime)
result = self.db.select(cmd)
#Bin markers into different lists
for r in result:
m = self.getString(r['name'])
#Hack: If it's a known gradient checkpointing marker, ignore it.
if m.find("CheckpointFunctionBackward") >= 0:
continue
if ("_backward, seq =" in m) or ("Backward, seq =" in m) or ("Backward0, seq =" in m):
bprop = True
if ("mod" in m) and ("op" in m) and ("args" in m) and ("type" in m):
pyprofMarkers.append(m)
elif ("layer:" in m):
layerMarkers.append(m)
elif ("traceMarker" in m):
traceMarkers.append(m)
elif ("strRepr" in m):
reprMarkers.append(m)
elif (", seq = " in m):
seqMarkers.append(m)
else:
otherMarkers.append(m)
#Remove duplicates, sort and prune seqMarkers
if (len(seqMarkers)):
seqMarkers = list(set(seqMarkers))
seqMarkers.sort(key=seqcompare)
seqMarkers = prune(seqMarkers)
#Remove duplicates from otherMarkers
otherMarkers = list(set(otherMarkers))
#Get markers with seq id (inserted by PyTorch) from the previous kernel to the present kernel
#Only for fprop kernels
if (len(result) and not bprop):
loId = self.markerId
hiId = result[-1]['id']
self.markerId = hiId
#Get markers between loId and hiId
cmd = 'SELECT id,name from marker where objectId = "{}" and id > {} and id < {} ORDER BY startTime ASC'.format(objId, loId, hiId)
result1 = self.db.select(cmd)
for r in result1:
m = self.getString(r['name'])
#Get only markers with seq id
if (", seq=" in m):
altSeqMarkers.append(m)
#Remove duplicates, sort and prune altSeqMarkers
if (len(altSeqMarkers)):
altSeqMarkers = list(set(altSeqMarkers))
altSeqMarkers.sort(key=seqcompare)
altSeqMarkers = prune(altSeqMarkers)
delete(objId, startTime)
return layerMarkers, filterTrace(traceMarkers), reprMarkers, pyprofMarkers, seqMarkers, otherMarkers, altSeqMarkers, getSeqId(seqMarkers), getSeqId(altSeqMarkers), getLayerName(layerMarkers)
#!/usr/bin/env python3
"""
Parse the SQL db and print a dictionary for every kernel.
"""
import sys
import argparse
from tqdm import tqdm
from .db import DB
from .kernel import Kernel
from .nvvp import NVVP
def parseArgs():
parser = argparse.ArgumentParser(prog=sys.argv[0], description="Parse SQL (nvvp) db.")
parser.add_argument("file",
type=str,
default=None,
help="SQL db (nvvp) file.")
args = parser.parse_args()
return args
def main():
args = parseArgs()
db = DB(args.file)
nvvp = NVVP(db)
kInfo = nvvp.getKernelInfo()
if len(kInfo) == 0:
print("Found 0 kernels. Exiting.", file=sys.stderr)
db.close()
sys.exit(0)
else:
print("Found {} kernels. Getting info for each kernel.".format(len(kInfo)), file=sys.stderr)
nvvp.createMarkerTable()
prevSeqId = -1
prevSubSeqId = -1
prevOp = "na"
Kernel.profStart = nvvp.getProfileStart()
for i in tqdm(range(len(kInfo)), ascii=True):
info = kInfo[i]
k = Kernel()
#Set kernel info
k.setKernelInfo(info)
#Get, set kernel name
name = nvvp.getString(k.kNameId)
k.setKernelName(name)
#Get runtime info
info = nvvp.getCPUInfo(k.corrId)
k.setRunTimeInfo(info)
#Get and set marker and seqid info
info = nvvp.getMarkerInfo(k.objId, k.rStartTime, k.rEndTime)
k.setMarkerInfo(info)
#If the seqId contains both 0 and non zero integers, remove 0.
if any(seq != 0 for seq in k.seqId) and (0 in k.seqId):
k.seqId.remove(0)
#Set direction (it uses seq id)
k.setDirection()
#Set op
k.setOp()
#The following code is based on heuristics.
#TODO: Refactor.
#Assign subSeqId, adjust seqId and altSeqId
#seqId can be 0.
#A kernel can have multiple seqIds both in fprop and bprop.
#In bprop, seqIds might not decrease monotonically. I have observed a few blips.
if len(k.seqId):
assert (k.dir in ["fprop", "bprop"])
if (k.dir == "fprop"):
#Check if there is a sequence id larger than the previous
inc = (k.seqId[-1] > prevSeqId)
if inc:
currSeqId = [x for x in k.seqId if x > prevSeqId][0]
else:
currSeqId = prevSeqId
else:
currSeqId = k.seqId[0]
#if ((currSeqId == prevSeqId) and (k.op == prevOp)):
if ((currSeqId == prevSeqId) and (k.op == prevOp)) or ((k.op[0] == "forward") and (k.op == prevOp) and (k.mod[0] in ["LSTMCell", "GRUCell", "RNNCell"])):
#The second condition traps cases where PyTorch does not use cuDNN for an LSTMCell.
k.subSeqId = prevSubSeqId + 1
prevSeqId = currSeqId
prevSubSeqId = k.subSeqId
prevOp = k.op
#Keep currSeqId in k.seqId, move everything else to k.altSeqId
for s in list(k.seqId): #iterate over a copy; the list is mutated below
if s != currSeqId:
k.seqId.remove(s)
k.altSeqId.append(s)
for s in list(k.altSeqId): #iterate over a copy; the list is mutated below
if s == currSeqId:
k.altSeqId.remove(s)
k.altSeqId = list(set(k.altSeqId))
if (len(k.altSeqId)):
(k.altSeqId).sort()
k.print()
db.close()
if __name__ == '__main__':
main()
import warnings
try:
from .prof import main
except ImportError as e:
warnings.warn("Did you make sure to install PyProf dependencies by using the --pyprof flag during Apex installation?")
raise e
if __name__ == '__main__':
main()
from collections import OrderedDict
from .utility import Utility
from .base import OperatorLayerBase
class Activation(OperatorLayerBase):
"""
This class handles the various activation functions.
"""
ops = ["celu", "elu", "elu_", "hardshrink", "hardtanh", "hardtanh_", "leaky_relu", "leaky_relu_", "logsigmoid", "prelu", "relu", "relu_", "relu6", "rrelu", "rrelu_", "selu", "sigmoid", "softplus", "softshrink", "softsign", "tanh", "tanhshrink", "threshold", "threshold_"]
def __init__(self, d):
marker = eval(d.argMarker[0])
mod = marker['mod']
op = marker['op']
args = marker['args']
self.marker = marker
self.mod_ = mod
self.op_ = op
self.args = args
assert (mod in ["torch.nn.functional", "torch", "Tensor"])
#Filter out named parameters
args = list(filter(lambda x : x['name'] == '', args))
assert (len(args) >= 1)
arg = args[0]
assert (arg['type'] == "tensor")
self.i = arg
self.dir = d.dir
def params(self):
p = OrderedDict([('T', self.i['shape']),('type', self.i['dtype'])])
return p
def flops(self):
direction = self.dir
tensor = self.i['shape']
t = self.i['dtype']
# TODO: revise
elems = Utility.numElems(tensor)
return elems
def bytes(self):
direction = self.dir
tensor = self.i['shape']
t = self.i['dtype']
elems = Utility.numElems(tensor)
elems = elems * (2 if direction == "fprop" else 3)
return elems * Utility.typeToBytes(t)
def tc(self):
return "-"
def op(self):
return self.op_
def mod(self):
return self.mod_
from abc import ABC, abstractmethod
class OperatorLayerBase(ABC):
"""
Base class for all layers and operators.
Every derived class should have the following functions.
"""
@abstractmethod
def tc(self):
"""
Tensor core usage by the kernel.
Return "1" (yes), "0" (no, but possible), "-" (not applicable)
"""
pass
@abstractmethod
def params(self):
"""
Kernel parameters to be printed.
"""
pass
@abstractmethod
def flops(self):
"""
Note that 1 FMA = 2 flops.
"""
pass
@abstractmethod
def bytes(self):
pass
@abstractmethod
def mod(self):
"""
Name of the module/class e.g. torch.nn.functional.
"""
pass
@abstractmethod
def op(self):
"""
Name of the operator e.g. sigmoid.
"""
pass