Unverified commit 96850dfa authored by Jithun Nair, committed by GitHub

Merge pull request #80 from ROCmSoftwarePlatform/IFU-master-2022-07-29

IFU-master-2022-07-29
parents 87fc4125 cc5f83b5
#!/usr/bin/env python3
import torch
import torch.cuda.profiler as profiler
from apex import pyprof
#Initialize pyprof
pyprof.nvtx.init()
class Foo(torch.autograd.Function):
@staticmethod
def forward(ctx, in1, in2):
out = in1 + in2 #This could be a custom C/C++ function.
return out
@staticmethod
def backward(ctx, grad):
in1_grad = grad #This could be a custom C/C++ function.
in2_grad = grad #This could be a custom C/C++ function.
return in1_grad, in2_grad
#Hook the forward and backward functions to pyprof
pyprof.nvtx.wrap(Foo, 'forward')
pyprof.nvtx.wrap(Foo, 'backward')
foo = Foo.apply
x = torch.ones(4,4).cuda()
y = torch.ones(4,4).cuda()
with torch.autograd.profiler.emit_nvtx():
profiler.start()
z = foo(x,y)
profiler.stop()
#!/usr/bin/env python3
import torch
import torch.cuda.profiler as profiler
from apex import pyprof
pyprof.nvtx.init()
class Foo(torch.nn.Module):
def __init__(self, size):
super(Foo, self).__init__()
self.n = torch.nn.Parameter(torch.ones(size))
self.m = torch.nn.Parameter(torch.ones(size))
def forward(self, input):
return self.n*input + self.m
#Hook the forward function to pyprof
pyprof.nvtx.wrap(Foo, 'forward')
foo = Foo(4)
foo.cuda()
x = torch.ones(4).cuda()
with torch.autograd.profiler.emit_nvtx():
profiler.start()
z = foo(x)
profiler.stop()
#!/bin/bash
set -e
SCRIPT=`realpath $0`
SCRIPTPATH=`dirname $SCRIPT`
PYPROF="$SCRIPTPATH/../.."
parse="python $PYPROF/parse/parse.py"
prof="python $PYPROF/prof/prof.py"
for f in *.py
do
base=`basename $f .py`
sql=$base.sql
dict=$base.dict
#NVprof
echo "nvprof -fo $sql python $f"
nvprof -fo $sql python $f
#Parse
echo $parse $sql
$parse $sql > $dict
#Prof
echo $prof $dict
$prof -w 130 $dict
\rm $sql $dict
done
#!/usr/bin/env python3
"""
Example to run pyprof with imagenet models.
"""
import sys
import torch
import torch.nn as nn
import torchvision.models as models
import torch.cuda.profiler as profiler
import argparse
from apex import pyprof
from apex.optimizers import FusedAdam
def parseArgs():
parser = argparse.ArgumentParser(prog=sys.argv[0], description="Run popular imagenet models.")
parser.add_argument("-m",
type=str,
default="resnet50",
choices=["alexnet", "densenet121", "densenet161", "densenet169", "densenet201", "googlenet", "mnasnet0_5", "mnasnet0_75", "mnasnet1_0", "mnasnet1_3", "mobilenet_v2", "resnet18", "resnet34", "resnet50", "resnet101", "resnet152", "resnext50_32x4d", "resnext101_32x8d", "wide_resnet50_2", "wide_resnet101_2", "shufflenet_v2_x0_5", "shufflenet_v2_x1_0", "shufflenet_v2_x1_5", "shufflenet_v2_x2_0", "squeezenet1_0", "squeezenet1_1", "vgg11", "vgg11_bn", "vgg13", "vgg13_bn", "vgg16", "vgg16_bn", "vgg19", "vgg19_bn", "inception_v3"],
help="Model.")
parser.add_argument("-b",
type=int,
default=32,
help="Batch size.")
parser.add_argument("-o",
type=str,
default="adam",
choices=["adam", "sgd"],
help="Optimizer.")
args = parser.parse_args()
return args
d = {
"alexnet": {'H': 224, 'W': 224, 'opts': {}},
"densenet121": {'H': 224, 'W': 224, 'opts': {}},
"densenet161": {'H': 224, 'W': 224, 'opts': {}},
"densenet169": {'H': 224, 'W': 224, 'opts': {}},
"densenet201": {'H': 224, 'W': 224, 'opts': {}},
"googlenet": {'H': 224, 'W': 224, 'opts': {'aux_logits': False}},
"mnasnet0_5": {'H': 224, 'W': 224, 'opts': {}},
"mnasnet0_75": {'H': 224, 'W': 224, 'opts': {}},
"mnasnet1_0": {'H': 224, 'W': 224, 'opts': {}},
"mnasnet1_3": {'H': 224, 'W': 224, 'opts': {}},
"mobilenet_v2": {'H': 224, 'W': 224, 'opts': {}},
"resnet18": {'H': 224, 'W': 224, 'opts': {}},
"resnet34": {'H': 224, 'W': 224, 'opts': {}},
"resnet50": {'H': 224, 'W': 224, 'opts': {}},
"resnet101": {'H': 224, 'W': 224, 'opts': {}},
"resnet152": {'H': 224, 'W': 224, 'opts': {}},
"resnext50_32x4d": {'H': 224, 'W': 224, 'opts': {}},
"resnext101_32x8d": {'H': 224, 'W': 224, 'opts': {}},
"wide_resnet50_2": {'H': 224, 'W': 224, 'opts': {}},
"wide_resnet101_2": {'H': 224, 'W': 224, 'opts': {}},
"shufflenet_v2_x0_5": {'H': 224, 'W': 224, 'opts': {}},
"shufflenet_v2_x1_0": {'H': 224, 'W': 224, 'opts': {}},
"shufflenet_v2_x1_5": {'H': 224, 'W': 224, 'opts': {}},
"shufflenet_v2_x2_0": {'H': 224, 'W': 224, 'opts': {}},
"squeezenet1_0": {'H': 224, 'W': 224, 'opts': {}},
"squeezenet1_1": {'H': 224, 'W': 224, 'opts': {}},
"vgg11": {'H': 224, 'W': 224, 'opts': {}},
"vgg11_bn": {'H': 224, 'W': 224, 'opts': {}},
"vgg13": {'H': 224, 'W': 224, 'opts': {}},
"vgg13_bn": {'H': 224, 'W': 224, 'opts': {}},
"vgg16": {'H': 224, 'W': 224, 'opts': {}},
"vgg16_bn": {'H': 224, 'W': 224, 'opts': {}},
"vgg19": {'H': 224, 'W': 224, 'opts': {}},
"vgg19_bn": {'H': 224, 'W': 224, 'opts': {}},
"inception_v3": {'H': 299, 'W': 299, 'opts': {'aux_logits': False}},
}
def main():
args = parseArgs()
pyprof.nvtx.init()
# pyprof.nvtx.wrap(fused_adam_cuda, 'adam')
N = args.b
C = 3
H = d[args.m]['H']
W = d[args.m]['W']
opts = d[args.m]['opts']
classes = 1000
net = getattr(models, args.m)
net = net(**opts).cuda().half()
net.train()
x = torch.rand(N, C, H, W).cuda().half()
target = torch.empty(N, dtype=torch.long).random_(classes).cuda()
criterion = nn.CrossEntropyLoss().cuda()
if (args.o == "sgd"):
optimizer = torch.optim.SGD(net.parameters(), lr = 0.01, momentum=0.9)
elif (args.o == "adam"):
optimizer = FusedAdam(net.parameters())
else:
assert False
#Warm up without profiler
for i in range(2):
output = net(x)
loss = criterion(output, target)
optimizer.zero_grad()
loss.backward()
optimizer.step()
with torch.autograd.profiler.emit_nvtx():
profiler.start()
output = net(x)
loss = criterion(output, target)
optimizer.zero_grad()
loss.backward()
optimizer.step()
profiler.stop()
if __name__ == "__main__":
main()
#!/bin/bash
set -e
SCRIPT=`realpath $0`
SCRIPTPATH=`dirname $SCRIPT`
PYPROF="$SCRIPTPATH/../.."
parse="python -m apex.pyprof.parse"
prof="python -m apex.pyprof.prof"
for net in "resnet50"
do
for optim in adam sgd
do
for batch in 32 64
do
base="torchvision".$net.$optim.$batch
sql=$base.sql
dict=$base.dict
#NVprof
echo "nvprof -fo $sql --profile-from-start off python imagenet.py -m ${net} -o $optim -b $batch"
nvprof -fo $sql --profile-from-start off python imagenet.py -m ${net} -o $optim -b $batch
#Parse
echo $parse $sql
$parse $sql > $dict
#Prof
echo $prof $dict
$prof -w 130 $dict
# \rm $sql $dict
done
done
done
*As of this writing, these examples do not work
because of changes being proposed in PyTorch.*
There are two ways to use PyTorch JIT:
- Scripting
- Tracing

In addition, we can JIT a
- Standalone function
- Class / class method

This directory has an example for each of the four cases.
Intercepting (monkey patching) JITted code has a few extra steps,
which are explained through comments.
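For orientation, here is a condensed sketch of those extra steps for the scripted-function case (subject to the same caveat above); the four complete, runnable examples follow below.

```python
import torch
from apex import pyprof

# JIT the function first (scripting shown here).
@torch.jit.script
def foo(x, y):
    return torch.sigmoid(x) + y

# The extra steps: initialize pyprof only after the JIT step, give the
# JITted object a name, and then hook up its forward method.
pyprof.nvtx.init()
foo.__name__ = "foo"
pyprof.nvtx.wrap(foo, 'forward')
```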
#!/usr/bin/env python3
import torch
import torch.cuda.profiler as profiler
from apex import pyprof
#The following creates an object "foo" of type ScriptModule
#The new object has a function called "forward"
@torch.jit.script
def foo(x, y):
return torch.sigmoid(x) + y
#Initialize pyprof after the JIT step
pyprof.nvtx.init()
#Assign a name to the object "foo"
foo.__name__ = "foo"
#Hook up the forward function to pyprof
pyprof.nvtx.wrap(foo, 'forward')
x = torch.zeros(4,4).cuda()
y = torch.ones(4,4).cuda()
with torch.autograd.profiler.emit_nvtx():
profiler.start()
z = foo(x, y)
profiler.stop()
print(z)
#!/usr/bin/env python3
import torch
import torch.cuda.profiler as profiler
from apex import pyprof
class Foo(torch.jit.ScriptModule):
def __init__(self, size):
super(Foo, self).__init__()
self.n = torch.nn.Parameter(torch.ones(size))
self.m = torch.nn.Parameter(torch.ones(size))
@torch.jit.script_method
def forward(self, input):
return self.n*input + self.m
#Initialize pyprof after the JIT step
pyprof.nvtx.init()
#Hook up the forward function to pyprof
pyprof.nvtx.wrap(Foo, 'forward')
foo = Foo(4)
foo.cuda()
x = torch.ones(4).cuda()
with torch.autograd.profiler.emit_nvtx():
profiler.start()
z = foo(x)
profiler.stop()
print(z)
#!/usr/bin/env python3
import torch
import torch.cuda.profiler as profiler
from apex import pyprof
def foo(x, y):
return torch.sigmoid(x) + y
x = torch.zeros(4,4).cuda()
y = torch.ones(4,4).cuda()
#JIT the function using tracing
#This returns an object of type ScriptModule with a forward method.
traced_foo = torch.jit.trace(foo, (x,y))
#Initialize pyprof after the JIT step
pyprof.nvtx.init()
#Assign a name to the object "traced_foo"
traced_foo.__dict__['__name__'] = "foo"
#Hook up the forward function to pyprof
pyprof.nvtx.wrap(traced_foo, 'forward')
with torch.autograd.profiler.emit_nvtx():
profiler.start()
z = traced_foo(x, y)
profiler.stop()
print(z)
#!/usr/bin/env python3
import torch
import torch.cuda.profiler as profiler
from apex import pyprof
class Foo(torch.nn.Module):
def __init__(self, size):
super(Foo, self).__init__()
self.n = torch.nn.Parameter(torch.ones(size))
self.m = torch.nn.Parameter(torch.ones(size))
def forward(self, input):
return self.n*input + self.m
foo = Foo(4)
foo.cuda()
x = torch.ones(4).cuda()
#JIT the class using tracing
traced_foo = torch.jit.trace(foo, x)
#Initialize pyprof after the JIT step
pyprof.nvtx.init()
#Assign a name to the object "traced_foo"
traced_foo.__dict__['__name__'] = "foo"
#Hook up the forward function to pyprof
pyprof.nvtx.wrap(traced_foo, 'forward')
with torch.autograd.profiler.emit_nvtx():
profiler.start()
z = traced_foo(x)
profiler.stop()
print(z)
#!/bin/bash
set -e
SCRIPT=`realpath $0`
SCRIPTPATH=`dirname $SCRIPT`
PYPROF="$SCRIPTPATH/../.."
parse="python $PYPROF/parse/parse.py"
prof="python $PYPROF/prof/prof.py"
for f in *.py
do
base=`basename $f .py`
sql=$base.sql
dict=$base.dict
#NVprof
echo "nvprof -fo $sql python $f"
nvprof -fo $sql python $f
#Parse
echo $parse $sql
$parse $sql > $dict
#Prof
echo $prof $dict
$prof -w 130 $dict
\rm $sql $dict
done
#!/usr/bin/env python3
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.cuda.profiler as profiler
import torch.optim as optim
from apex import pyprof
pyprof.nvtx.init()
class LeNet5(nn.Module):
def __init__(self):
super(LeNet5, self).__init__()
# 1 input image channel, 6 output channels, 5x5 square convolution
# kernel
self.conv1 = nn.Conv2d(1, 6, 5)
self.conv2 = nn.Conv2d(6, 16, 5)
# an affine operation: y = Wx + b
self.fc1 = nn.Linear(16 * 5 * 5, 120)
self.fc2 = nn.Linear(120, 84)
self.fc3 = nn.Linear(84, 10)
def forward(self, x):
# Max pooling over a (2, 2) window
x = F.max_pool2d(F.relu(self.conv1(x)), (2, 2))
# If the size is a square you can only specify a single number
x = F.max_pool2d(F.relu(self.conv2(x)), 2)
x = x.view(-1, self.num_flat_features(x))
x = F.relu(self.fc1(x))
x = F.relu(self.fc2(x))
x = self.fc3(x)
return x
def num_flat_features(self, x):
size = x.size()[1:] # all dimensions except the batch dimension
num_features = 1
for s in size:
num_features *= s
return num_features
with torch.autograd.profiler.emit_nvtx():
net = LeNet5().cuda()
input = torch.randn(1, 1, 32, 32).cuda()
out = net(input)
target = torch.randn(10) # a dummy target, for example
target = target.view(1, -1).cuda() # make it the same shape as output
criterion = nn.MSELoss()
# create your optimizer
optimizer = optim.SGD(net.parameters(), lr=0.01)
# in your training loop:
optimizer.zero_grad() # zero the gradient buffers
profiler.start()
output = net(input)
loss = criterion(output, target)
loss.backward()
optimizer.step() # Does the update
profiler.stop()
#!/usr/bin/env python3
"""
This file checks all Python operators.
"""
import sys
import torch
import torch.cuda.profiler as profiler
import operator
import inspect
#Import and initialize pyprof
from apex import pyprof
pyprof.nvtx.init()
X = 1024
Y = 1024
fa = torch.rand(X, Y).cuda()
fb = torch.rand(X, Y).cuda()
fc = torch.rand(X, Y).cuda()
ia = torch.randint(0, 100, (X, Y)).cuda()
ib = torch.randint(0, 100, (X, Y)).cuda()
sa = torch.ones(1,1).cuda()
sb = torch.ones(1,1).cuda()
ba = fa.byte()
unaryOps = ["abs", "__abs__", "neg", "__neg__",]
invertOps = ["inv", "invert", "__inv__", "__invert__",] #implemented only for byte tensors
#pos, __pos__ is not implemented for tensors
binaryOps = []
binaryOps += [ "lt", "__lt__", "le", "__le__", "eq", "__eq__", "ne", "__ne__", "ge", "__ge__", "gt", "__gt__" ]
binaryOps += [ "add", "__add__", "sub", "__sub__", "mul", "__mul__", "floordiv", "__floordiv__", "truediv", "__truediv__", "pow", "__pow__", "mod", "__mod__"]
binaryOps += [ "and_", "__and__", "or_", "__or__", "xor", "__xor__", "lshift", "__lshift__", "rshift", "__rshift__"]
inplaceOps = []
inplaceOps += ["iadd", "__iadd__", "isub", "__isub__", "imul", "__imul__", "ifloordiv", "__ifloordiv__", "itruediv", "__itruediv__", "imod", "__imod__",]
#ipow, __ipow__ is not implemented in pytorch
inplaceOps += [ "iand", "__iand__", "ior", "__ior__", "ixor", "__ixor__", "ilshift", "__ilshift__", "irshift", "__irshift__",]
matmulOps = [ "matmul", "__matmul__" ]
inplacematmulOps = [ "imatmul", "__imatmul__" ]
reverseIntBinaryOps = ["__radd__", "__rsub__", "__rmul__", "__rfloordiv__", "__rpow__",]
reverseFloatBinaryOps = ["__radd__", "__rsub__", "__rmul__", "__rdiv__", "__rtruediv__", "__rfloordiv__", "__rpow__",]
'''
TODO
.concat(a, b)
.__concat__(a, b)
.contains(a, b)
.__contains__(a, b)
.countOf(a, b)
.delitem(a, b)
.__delitem__(a, b)
.getitem(a, b)
.__getitem__(a, b)
.indexOf(a, b)
.setitem(a, b, c)
.__setitem__(a, b, c)
.length_hint(obj, default=0)
.iconcat(a, b)
.__iconcat__(a, b)
.index(a)
.__index__(a)
'''
#Context manager
with torch.autograd.profiler.emit_nvtx():
#Start profiler
profiler.start()
for op in unaryOps:
assert hasattr(operator, op)
f = getattr(operator, op)
assert inspect.isbuiltin(f)
c = f(ia)
for op in invertOps:
assert hasattr(operator, op)
f = getattr(operator, op)
assert inspect.isbuiltin(f)
c = f(ba)
for op in binaryOps:
assert hasattr(operator, op)
f = getattr(operator, op)
assert inspect.isbuiltin(f)
c = f(ia, ib)
c = f(ia, 2)
for op in inplaceOps:
assert hasattr(operator, op)
f = getattr(operator, op)
assert inspect.isbuiltin(f)
ia = f(ia, ib)
ia = f(ia, 2)
for op in matmulOps:
assert hasattr(operator, op)
f = getattr(operator, op)
assert inspect.isbuiltin(f)
c = f(fa, fb)
for op in inplacematmulOps:
assert hasattr(operator, op)
f = getattr(operator, op)
assert inspect.isbuiltin(f)
fa = f(fa, fb)
for op in reverseIntBinaryOps:
assert hasattr(torch.Tensor, op)
f = getattr(torch.Tensor, op)
ia = f(ia, ib)
for op in reverseFloatBinaryOps:
assert hasattr(torch.Tensor, op)
f = getattr(torch.Tensor, op)
fa = f(fa, fb)
'''
#c = fa[3]
#c = fa[3][3]
#c = torch.min(fa, 3)
c = torch.sum(fa)
c = torch.max(fa)
c = -fa
#fc[2][2] = fa[2][2]
c = a_scalar and b_scalar
c = a_scalar or b_scalar
c = not a_scalar
c = a is b
c = a is not b
'''
#Stop profiler
profiler.stop()
#!/usr/bin/env python3
"""
This simple file provides an example of how to
- import the pyprof library and initialize it
- use the emit_nvtx context manager
- start and stop the profiler
Only kernels within profiler.start and profiler.stop calls are profiled.
To profile
$ nvprof -f -o simple.sql --profile-from-start off ./simple.py
"""
import sys
import torch
import torch.cuda.profiler as profiler
#Import and initialize pyprof
from apex import pyprof
pyprof.nvtx.init()
a = torch.randn(5, 5).cuda()
b = torch.randn(5, 5).cuda()
#Context manager
with torch.autograd.profiler.emit_nvtx():
#Start profiler
profiler.start()
c = a + b
c = torch.mul(a,b)
c = torch.matmul(a,b)
c = torch.argmax(a, dim=1)
c = torch.nn.functional.pad(a, (1,1))
#Stop profiler
profiler.stop()
NVIDIA NVTX range markers (https://docs.nvidia.com/gameworks/content/gameworkslibrary/nvtx/nvidia_tools_extension_library_nvtx.htm)
are a useful tool for capturing and observing events and code ranges.
Using the PyTorch APIs, e.g. `torch.cuda.nvtx.range_push("xxx")` and `torch.cuda.nvtx.range_pop()`, users can easily add their own NVTX range markers. These markers can then be observed in the NVIDIA Visual Profiler (NVVP).
If the inserted marker strings follow the pattern `"layer:your_string_here"`, e.g. `"layer:conv1"` or `"layer:encoder_layer_3_self_attention"`, then `pyprof` will display the strings `conv1` and `encoder_layer_3_self_attention` next to the associated kernels in the output of `prof.py` when it is used with the `-c layer` option.
NVTX range markers can be nested, and if users follow the above string pattern, the output of `prof.py` will show all the markers associated with a kernel.
The file `resnet.py` (a simplified version of the torchvision model) shows how users can add (nested) NVTX markers with information that can greatly aid in understanding and analyzing networks.
Note that the pattern `"layer:your_string_here"` was chosen to aid information extraction by `pyprof`. The tool works seamlessly even if there are other markers or no markers at all.
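As an illustration, here is a minimal sketch of nested `"layer:..."` markers around a sub-layer; the module and layer names are placeholders, not part of `resnet.py`:

```python
import torch.nn as nn
import torch.cuda.nvtx as nvtx

class Block(nn.Module):  # placeholder module, for illustration only
    def __init__(self):
        super(Block, self).__init__()
        self.conv1 = nn.Conv2d(3, 8, kernel_size=3, padding=1)

    def forward(self, x):
        nvtx.range_push("layer:Block_1")   # outer marker for the whole block
        nvtx.range_push("layer:Conv1")     # inner marker for one sub-layer
        out = self.conv1(x)                # kernels launched here carry both labels
        nvtx.range_pop()                   # pop "layer:Conv1"
        nvtx.range_pop()                   # pop "layer:Block_1"
        return out
```

With `prof.py -c layer`, kernels launched inside `self.conv1` should then appear with both the `Block_1` and `Conv1` labels.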
### To run
```sh
nvprof -fo resnet.sql --profile-from-start off python resnet.py
parse.py resnet.sql > resnet.dict
prof.py --csv -c idx,layer,dir,mod,op,kernel,params,sil resnet.dict
```
The file `resnet.sql` can also be opened with NVVP as usual.
#!/usr/bin/env python3
"""
An example showing use of nested NVTX markers.
"""
import torch
import torch.nn as nn
import torch.cuda.profiler as profiler
import torch.cuda.nvtx as nvtx
from apex import pyprof
pyprof.nvtx.init()
def conv3x3(in_planes, out_planes, stride=1, groups=1, dilation=1):
"""3x3 convolution with padding"""
return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
padding=dilation, groups=groups, bias=False, dilation=dilation)
def conv1x1(in_planes, out_planes, stride=1):
"""1x1 convolution"""
return nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False)
class Bottleneck(nn.Module):
expansion = 4
count = 1
def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1,
base_width=64, dilation=1, norm_layer=None):
super(Bottleneck, self).__init__()
if norm_layer is None:
norm_layer = nn.BatchNorm2d
width = int(planes * (base_width / 64.)) * groups
# Both self.conv2 and self.downsample layers downsample the input when stride != 1
self.conv1 = conv1x1(inplanes, width)
self.bn1 = norm_layer(width)
self.conv2 = conv3x3(width, width, stride, groups, dilation)
self.bn2 = norm_layer(width)
self.conv3 = conv1x1(width, planes * self.expansion)
self.bn3 = norm_layer(planes * self.expansion)
self.relu = nn.ReLU(inplace=True)
self.downsample = downsample
self.stride = stride
self.id = Bottleneck.count
Bottleneck.count += 1
def forward(self, x):
identity = x
nvtx.range_push("layer:Bottleneck_{}".format(self.id))
nvtx.range_push("layer:Conv1")
out = self.conv1(x)
nvtx.range_pop()
nvtx.range_push("layer:BN1")
out = self.bn1(out)
nvtx.range_pop()
nvtx.range_push("layer:ReLU")
out = self.relu(out)
nvtx.range_pop()
nvtx.range_push("layer:Conv2")
out = self.conv2(out)
nvtx.range_pop()
nvtx.range_push("layer:BN2")
out = self.bn2(out)
nvtx.range_pop()
nvtx.range_push("layer:ReLU")
out = self.relu(out)
nvtx.range_pop()
nvtx.range_push("layer:Conv3")
out = self.conv3(out)
nvtx.range_pop()
nvtx.range_push("layer:BN3")
out = self.bn3(out)
nvtx.range_pop()
if self.downsample is not None:
nvtx.range_push("layer:Downsample")
identity = self.downsample(x)
nvtx.range_pop()
nvtx.range_push("layer:Residual")
out += identity
nvtx.range_pop()
nvtx.range_push("layer:ReLU")
out = self.relu(out)
nvtx.range_pop()
nvtx.range_pop()
return out
class ResNet(nn.Module):
def __init__(self, block, layers, num_classes=1000,
groups=1, width_per_group=64, norm_layer=None):
super(ResNet, self).__init__()
if norm_layer is None:
norm_layer = nn.BatchNorm2d
self._norm_layer = norm_layer
self.inplanes = 64
self.dilation = 1
self.groups = groups
self.base_width = width_per_group
self.conv1 = nn.Conv2d(3, self.inplanes, kernel_size=7, stride=2, padding=3, bias=False)
self.bn1 = norm_layer(self.inplanes)
self.relu = nn.ReLU(inplace=True)
self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
self.layer1 = self._make_layer(block, 64, layers[0])
self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
self.layer4 = self._make_layer(block, 512, layers[3], stride=2)
self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
self.fc = nn.Linear(512 * block.expansion, num_classes)
for m in self.modules():
if isinstance(m, nn.Conv2d):
nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)):
nn.init.constant_(m.weight, 1)
nn.init.constant_(m.bias, 0)
def _make_layer(self, block, planes, blocks, stride=1, dilate=False):
norm_layer = self._norm_layer
downsample = None
previous_dilation = self.dilation
if dilate:
self.dilation *= stride
stride = 1
if stride != 1 or self.inplanes != planes * block.expansion:
downsample = nn.Sequential(
conv1x1(self.inplanes, planes * block.expansion, stride),
norm_layer(planes * block.expansion),
)
layers = []
layers.append(block(self.inplanes, planes, stride, downsample, self.groups,
self.base_width, previous_dilation, norm_layer))
self.inplanes = planes * block.expansion
for _ in range(1, blocks):
layers.append(block(self.inplanes, planes, groups=self.groups,
base_width=self.base_width, dilation=self.dilation,
norm_layer=norm_layer))
return nn.Sequential(*layers)
def forward(self, x):
nvtx.range_push("layer:conv1_x")
x = self.conv1(x)
x = self.bn1(x)
x = self.relu(x)
x = self.maxpool(x)
nvtx.range_pop()
nvtx.range_push("layer:conv2_x")
x = self.layer1(x)
nvtx.range_pop()
nvtx.range_push("layer:conv3_x")
x = self.layer2(x)
nvtx.range_pop()
nvtx.range_push("layer:conv4_x")
x = self.layer3(x)
nvtx.range_pop()
nvtx.range_push("layer:conv5_x")
x = self.layer4(x)
nvtx.range_pop()
x = self.avgpool(x)
x = torch.flatten(x, 1)
nvtx.range_push("layer:FC")
x = self.fc(x)
nvtx.range_pop()
return x
def resnet50():
return ResNet(Bottleneck, [3, 4, 6, 3])
#Create model
net = resnet50().cuda().half()
net.train()
#Create optimizer
criterion = nn.CrossEntropyLoss().cuda()
optimizer = torch.optim.SGD(net.parameters(), lr = 0.01, momentum=0.9)
#Create synthetic input and label
x = torch.rand(32, 3, 224, 224).cuda().half()
target = torch.empty(32, dtype=torch.long).random_(1000).cuda()
with torch.autograd.profiler.emit_nvtx():
profiler.start()
output = net(x)
loss = criterion(output, target)
optimizer.zero_grad()
loss.backward()
optimizer.step()
profiler.stop()
#!/bin/bash
set -e
SCRIPT=`realpath $0`
SCRIPTPATH=`dirname $SCRIPT`
PYPROF="$SCRIPTPATH/../.."
parse="python $PYPROF/parse/parse.py"
prof="python $PYPROF/prof/prof.py"
for f in *.py
do
base=`basename $f .py`
sql=$base.sql
dict=$base.dict
#NVprof
echo "nvprof -fo --profile-from-start off $sql python $f"
nvprof -fo $sql --profile-from-start off python $f
#Parse
echo $parse $sql
$parse $sql > $dict
#Prof
echo $prof $dict
#$prof -w 130 $dict
$prof --csv -c idx,layer,dir,mod,op,kernel,params,sil $dict
\rm $sql $dict
done
from .nvmarker import init
from .nvmarker import add_wrapper as wrap
"""
This file intercepts (monkey patches) the following functions and adds NVTX markers.
torch.*
torch.Tensor.*
torch.nn.functional.*
torch.nn.*.forward
The NVTX markers (one or more) contain the following information
call trace (a list of file_name:line_number)
extra_repr() from torch.nn modules
module/class name
function name
inputs (args and kwargs)
scalar: name, type and value
tensor: name, shape and datatype
numpy: name, shape and datatype
list/tuple: a sequence of scalars or tensors or numpy arrays
"""
import torch
import torch.cuda.nvtx as nvtx
import numpy
import inspect as ins
import traceback
import math
def isfunc(mod, f):
assert hasattr(mod, f)
attr = getattr(mod, f)
#Ignore functions like _add
if (len(f) >= 2):
if f[0] == "_" and f[1] != "_":
return False
#Ignore functions from this list
ignore = ['__all__', '__array__', '__array_priority__', '__array_wrap__', '__bool__', '__builtins__', '__cached__', '__class__', '__deepcopy__', '__delattr__', '__delitem__', '__dict__', '__dir__', '__doc__', '__file__', '__format__', '__getattribute__', '__getitem__', '__hash__', '__index__', '__init__', '__init_subclass__', '__iter__', '__len__', '__loader__', '__module__', '__name__', '__new__', '__nonzero__', '__package__', '__path__', '__reduce__', '__reduce_ex__', '__repr__', '__reversed__', '__setattr__', '__setitem__', '__setstate__', '__sizeof__', '__spec__', '__str__', '__subclasshook__', '__version__', '__weakref__']
#Add functions to this list if they cause recursion
ignore += ['size', 'tolist', 'dim', 'is_storage', 'item']
if f in ignore:
return False
return ins.ismethod(attr) or ins.isfunction(attr) or ins.ismethoddescriptor(attr) or ins.isbuiltin(attr)
def traceMarker(stack):
d = {}
cadena = []
for i in range(len(stack)-1):
fi = stack[i]
t = "{}:{}".format(fi.filename, fi.lineno)
cadena.append(t)
d['traceMarker'] = cadena
return str(d)
def modMarker(mod, fn_name, args):
"""
Returns the stringified extra_repr() of a module.
"""
assert(fn_name == 'forward')
assert(len(args) > 0)
d = {}
d['mod'] = mod.__name__
d['strRepr'] = args[0].extra_repr()
return str(d)
def add_wrapper(mod, fn_name):
assert isfunc(mod, fn_name)
# Get a pointer to the original function
func = getattr(mod, fn_name)
# Check if the mod has a string representation
# and is not a Script or Traced module (used by JIT)
s = hasattr(mod, "extra_repr") and (type(mod) is not torch.jit.ScriptModule) and (type(mod) is not torch.jit.TopLevelTracedModule)
def wrapper_func(*args, **kwargs):
# Extract the stacktrace
stack = traceback.extract_stack()
# Push trace marker
nvtx.range_push(traceMarker(stack))
# Push module marker
if s:
m = modMarker(mod, fn_name, args)
nvtx.range_push(m)
# Create and push argument marker
cadena = argMarker(mod, fn_name, args, kwargs)
nvtx.range_push(cadena)
# Call the original function
result = func(*args, **kwargs)
# Pop argument marker
nvtx.range_pop()
# Pop module marker
if s:
nvtx.range_pop()
# Pop trace marker
nvtx.range_pop()
return result
setattr(mod, fn_name, wrapper_func)
def argMarker(mod, op, args, kwargs):
#For this function args is a tuple and kwargs is a dict
def tensor(arg, name=""):
a = {}
a['name'] = name
a['type'] = "tensor"
a['shape'] = tuple(arg.size())
a['dtype'] = str(arg.dtype).split(".")[-1]
cadena['args'].append(a)
def ndarray(arg, name=""):
a = {}
a['name'] = name
a['type'] = "ndarray"
a['shape'] = arg.shape
a['dtype'] = str(arg.dtype).split(".")[-1]
cadena['args'].append(a)
def seq(arg, name=""):
assert issequence(arg)
a = {}
a['name'] = name
if isinstance(arg, list):
a['type'] = "list"
a['value'] = arg
else:
a['type'] = "tuple"
# The arg could be torch.Size, which is a subclass of tuple
# Therefore, explicitly convert to tuple
a['value'] = tuple(arg)
cadena['args'].append(a)
def scalar(arg, name=""):
a = {}
a['name'] = name
a['type'] = type(arg).__name__
#handle the case when the argument is +/- inf or nan
if arg == float('inf'):
a['value'] = "inf"
elif arg == float('-inf'):
a['value'] = "-inf"
elif isinstance(arg, float) and math.isnan(arg):
a['value'] = "nan"
else:
a['value'] = arg
cadena['args'].append(a)
def isscalar(arg):
return (type(arg) is int) or (type(arg) is float) or (type(arg) is bool) or (arg is None) or (type(arg) is str)
def issequence(arg):
return isinstance(arg, list) or isinstance(arg, tuple)
def foo(args, name):
#args should be an iterable sequence e.g. list or tuple
for arg in args:
if isinstance(arg, torch.Tensor):
if arg.dim() == 0:
scalar(arg.item(), name)
else:
tensor(arg, name)
elif isinstance(arg, numpy.ndarray):
ndarray(arg, name)
elif (isscalar(arg)):
scalar(arg, name)
elif issequence(arg):
if (len(arg) == 0) or isscalar(arg[0]): #An empty sequence or a sequence of scalars
seq(arg, name)
else: # A sequence of tensors or numpy arrays
foo(arg, name)
'''
else:
print("The following arg is none of Tensor, numpy array, scalar but a %s" % (str(type(arg))))
print("Mod: %s" % str(mod.__name__))
print("Op: %s" % str(op))
print(dir(arg))
'''
cadena = {}
cadena['mod'] = mod.__name__
cadena['op'] = op
cadena['args'] = []
foo(args, "")
for k,v in kwargs.items():
foo((v,), k)
return str(cadena)
def patchClass(cls):
for f in dir(cls):
if isfunc(cls, f):
add_wrapper(cls, f)
def init():
string = "\n\nPyprof has been moved to its own dedicated repository and will " + \
"soon be removed from Apex. Please visit\n" + \
"https://github.com/NVIDIA/PyProf\n" + \
"for the latest version.\n\n"
# print regardless of warning state
print(string)
print("Initializing NVTX monkey patches")
for cls in [torch, torch.Tensor, torch.nn.functional,]:
patchClass(cls)
for cls in [torch.nn.RNN, torch.nn.RNNCell, torch.nn.LSTM, torch.nn.LSTMCell, torch.nn.GRU, torch.nn.GRUCell]:
if isfunc(cls, 'forward'):
add_wrapper(cls, 'forward')
print("Done with NVTX monkey patching")