Unverified commit 96850dfa authored by Jithun Nair, committed by GitHub

Merge pull request #80 from ROCmSoftwarePlatform/IFU-master-2022-07-29

IFU-master-2022-07-29
parents 87fc4125 cc5f83b5
import warnings
try:
from .parse import main
except ImportError as e:
warnings.warn("Did you make sure to install PyProf dependencies by using the --pyprof flag during Apex installation?)")
raise e
if __name__ == '__main__':
main()
import sys, sqlite3
class DB(object):
"""
This class provides functions for DB operations
with exception handling.
"""
def __init__(self, dbFile):
try:
conn = sqlite3.connect(dbFile)
conn.row_factory = sqlite3.Row
c = conn.cursor()
except:
print("Error opening {}".format(dbFile))
sys.exit(1)
self.conn = conn
self.c = c
def select(self, cmd):
try:
self.c.execute(cmd)
#rows = self.c.fetchall()
rows = [dict(row) for row in self.c.fetchall()]
except sqlite3.Error as e:
print(e)
sys.exit(1)
except:
print("Uncaught error in SQLite access while executing {}".format(cmd))
sys.exit(1)
#print(rows)
return rows
def insert(self, cmd, data):
try:
self.c.execute(cmd, data)
except sqlite3.Error as e:
print(e)
sys.exit(1)
except:
print("Uncaught error in SQLite access while executing {}".format(cmd))
sys.exit(1)
def execute(self, cmd):
try:
self.c.execute(cmd)
except sqlite3.Error as e:
print(e)
sys.exit(1)
except:
print("Uncaught error in SQLite access while executing {}".format(cmd))
sys.exit(1)
def commit(self):
self.conn.commit()
def close(self):
self.c.close()
self.conn.close()
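#Illustrative usage sketch (not part of the original module). It exercises the DB
#wrapper above against a throwaway in-memory database; the table name and values
#are made up for the example.
if __name__ == '__main__':
    db = DB(":memory:")
    db.execute("CREATE TABLE kernels (name TEXT, dur INTEGER)")
    db.insert("INSERT INTO kernels VALUES (?,?)", ("volta_sgemm_128x64_nn", 4096))
    print(db.select("SELECT name, dur FROM kernels")) #[{'name': 'volta_sgemm_128x64_nn', 'dur': 4096}]
    db.commit()
    db.close()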
import cxxfilt, struct, binascii
#Helper functions
def demangle(name):
"""
Demangle a C++ string
"""
return cxxfilt.demangle(name)
def encode_object_id(pid, tid):
"""
Given process id (pid) and thread id (tid), return the object id.
object id = pid (little endian 4 bytes) + tid (little endian 8 bytes)
"""
objId = struct.pack('<i', pid) + struct.pack('<q',tid)
objId = binascii.hexlify(objId).decode('ascii').upper()
return objId
def getShortName(name):
"""
Returns a shorter kernel name
"""
sname = name.split("<")[0] \
.replace("void ", "") \
.replace("at::","") \
.replace("cuda::", "") \
.replace("native::","") \
.replace("(anonymous namespace)::", "")
sname = sname.split("(")[0]
return sname
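#Illustrative examples (not part of the original module); the kernel name and
#pid/tid pair are made up:
#  getShortName("void at::native::vectorized_elementwise_kernel<4, float>(int)")
#    -> 'vectorized_elementwise_kernel'
#  encode_object_id(1234, 5678)
#    -> 'D20400002E16000000000000' (4-byte pid + 8-byte tid, little endian, hex)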
class Kernel(object):
"""
This class stores information about a kernel.
"""
kernels = []
profStart = 0
def __init__(self):
self.kNameId = None
self.kShortName = None
self.kLongName = None
self.kStartTime = None #GPU start time
self.kEndTime = None #GPU end time
self.kDuration = None
self.device = None
self.stream = None
self.grid = ()
self.block = ()
self.corrId = None
self.rStartTime = None #CPU start time
self.rEndTime = None #CPU end time
self.rDuration = None
self.tid = None
self.pid = None
self.objId = None
self.timeOffset = None
self.layerMarkers = []
self.traceMarkers = []
self.reprMarkers = []
self.pyprofMarkers = []
self.seqMarkers = []
self.otherMarkers = []
self.altMarkers = []
self.seqId = []
self.altSeqId = []
self.layer = []
self.subSeqId = None
self.dir = None
self.mod = []
self.op = []
def setKernelInfo(self, info):
self.kNameId = info['name']
self.corrId = int(info['correlationId'])
start = int(info['start'])
end = int(info['end'])
assert end > start, "This assertion can fail for very large profiles. It usually fails when start = end = 0."
self.kStartTime = start
self.kEndTime = end
self.kDuration = end - start
assert (start > Kernel.profStart)
self.device = int(info['deviceId'])
self.stream = int(info['streamId'])
self.grid = (info['gridX'], info['gridY'], info['gridZ'])
self.block = (info['blockX'], info['blockY'], info['blockZ'])
self.timeOffset = Kernel.profStart
def setKernelName(self, name):
cadena = demangle(name)
self.kLongName = cadena
self.kShortName = getShortName(cadena)
def setRunTimeInfo(self, info):
start, end, pid, tid = info
self.rStartTime = start
self.rEndTime = end
self.rDuration = end - start
self.pid = pid
self.tid = tid
self.objId = encode_object_id(pid, tid)
def setMarkerInfo(self, info):
self.layerMarkers, self.traceMarkers, self.reprMarkers, self.pyprofMarkers, self.seqMarkers, self.otherMarkers, self.altMarkers, self.seqId, self.altSeqId, self.layer = info
self.subSeqId = 0
def setDirection(self):
"""
Set direction (fprop, bprop) based on PyTorch sequence markers.
It is a heuristic and not a foolproof method.
"""
if any("Backward, seq = " in x for x in self.seqMarkers) or \
any("backward, seq = " in x for x in self.seqMarkers) or \
any("Backward0, seq = " in x for x in self.seqMarkers):
self.dir = "bprop"
else:
self.dir = "fprop"
def setOp(self):
"""
Detect and set the class/module (mod) and operation (op)
of the kernel e.g. torch.nn.functional / linear, torch / sigmoid.
The lookup sequence we use is
NVTX markers inserted by pyprof
NVTX markers inserted by PyTorch in bprop
NVTX markers inserted by PyTorch in fprop
It is a heuristic and not a foolproof method.
"""
def sanitize(name):
name = name.replace("torch","") \
.replace("autograd","") \
.replace("_backward","") \
.replace("::","") \
.replace("jit","") \
.replace("(anonymous namespace)","")
head, sep, tail = name.partition("Backward")
return head
#Check pyprof markers
for m in self.pyprofMarkers:
assert ("mod" in m) and ("op" in m) and ("args" in m)
t = eval(m)
self.op.append(t['op'])
self.mod.append(t['mod'])
if len(self.op):
return
#Check bprop kernel markers
for m in self.seqMarkers:
if ("backward, seq = " in m) or ("Backward, seq = " in m):
op = m.split(",")[0]
op = sanitize(op)
self.op.append(op)
self.mod.append('na')
if len(self.op):
return
#Check markers with "seq = "
for m in self.seqMarkers:
if ", seq = " in m:
op = m.split(",")[0]
self.op.append(op)
self.mod.append('na')
if len(self.op):
return
#If nothing else
if len(self.otherMarkers):
self.op.append(self.otherMarkers[0])
self.mod.append('na')
def print(self):
"""
Print kernel information. This is used by prof.py.
"""
a = lambda: None
a.kShortName = self.kShortName
a.kDuration = self.kDuration
#a.layerMarkers = self.layerMarkers
a.layer = self.layer
a.trace = self.traceMarkers
a.reprMarkers = self.reprMarkers
a.marker = self.pyprofMarkers
a.seqMarker = self.seqMarkers
a.seqId = self.seqId
a.subSeqId = self.subSeqId
a.altSeqId = self.altSeqId
a.dir = self.dir
a.mod = self.mod
a.op = self.op
a.tid = self.tid
a.device = self.device
a.stream = self.stream
a.grid = self.grid
a.block = self.block
a.kLongName = self.kLongName
print(a.__dict__)
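#Illustrative sketch (not part of the original module): how the setDirection()
#heuristic classifies a kernel from hand-written PyTorch sequence markers.
if __name__ == '__main__':
    k = Kernel()
    k.seqMarkers = ["MmBackward, seq = 42"]
    k.setDirection()
    print(k.dir) #-> bprop
    k.seqMarkers = ["mm, seq = 42"]
    k.setDirection()
    print(k.dir) #-> fprop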
import sys
class NVVP(object):
"""
This class gets kernel information from the SQL (nvvp) database.
"""
driverT = "CUPTI_ACTIVITY_KIND_DRIVER"
runtimeT = "CUPTI_ACTIVITY_KIND_RUNTIME"
kernelT = "CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL"
markerT = "CUPTI_ACTIVITY_KIND_MARKER"
stringT = "StringTable"
def __init__(self, db):
self.db = db
self.markerId = 0
def getProfileStart(self):
"""
Get the profile start time
"""
profStart = sys.maxsize
for table in [self.driverT, self.runtimeT, self.kernelT, self.markerT]:
colname = "timestamp" if table is self.markerT else "start"
cmd = "select {} from {} ORDER BY {} ASC LIMIT 1".format(colname, table, colname)
result = self.db.select(cmd)
assert(len(result) <= 1)
if (len(result) == 1):
assert(colname in result[0])
t = result[0][colname]
if (t < profStart):
profStart = t
assert(profStart < sys.maxsize)
return profStart
def getString(self, id_):
"""
Get the string associated with an id.
"""
cmd = "select value from {} where _id_ = {}".format(self.stringT, id_)
result = self.db.select(cmd)
assert (len(result) == 1)
return result[0]['value']
def createMarkerTable(self):
"""
Create a temporary table and index it to speed up repeated SQL queries.
The table is an INNER JOIN of CUPTI_ACTIVITY_KIND_MARKER with itself.
"""
cmd = 'CREATE TEMPORARY TABLE marker AS SELECT \
a._id_ as id, \
a.timestamp AS startTime, \
b.timestamp AS endTime, \
HEX(a.objectId) AS objectId, \
a.name AS name \
FROM {} AS a INNER JOIN {} AS b ON \
a.id = b.id and \
a.flags = 2 and b.flags = 4'.format(self.markerT, self.markerT)
self.db.execute(cmd)
self.db.execute('CREATE INDEX start_index ON marker (startTime)')
self.db.execute('CREATE INDEX end_index ON marker (endTime)')
self.db.execute('CREATE INDEX id_index ON marker (id)')
def getCPUInfo(self, corrId):
"""
Given the correlation id, get CPU start, end, thread id, process id.
The information can be in the runtime table or the driver table.
"""
#First look in the runtime table
cmd = "select start,end,processId,threadId from {} where correlationId={}".format(self.runtimeT, corrId);
result = self.db.select(cmd)
assert (len(result) <= 1)
if (len(result) == 0):
#Look in the driver table
cmd = "select start,end,processId,threadId from {} where correlationId={}".format(self.driverT, corrId);
result = self.db.select(cmd)
assert (len(result) == 1)
info = result[0]
start = info['start']
end = info['end']
pid = info['processId']
tid = info['threadId']
tid = tid & 0xffffffff #convert to unsigned
assert (end > start)
return [start, end, pid, tid]
def getKernelInfo(self):
"""
Get GPU kernel info
"""
cmd = "select name,correlationId,start,end,deviceId,streamId,gridX,gridY,gridZ,blockX,blockY,blockZ from {}".format(self.kernelT)
result = self.db.select(cmd)
return result
def getMarkerInfo(self, objId, startTime, endTime):
"""
This function first finds all NVTX markers encapsulating
a runtime / driver kernel launch.
It then splits the markers into many lists.
layerMarkers : User added NVTX markers
traceMarkers : Call trace markers (inserted by pyprof)
reprMarkers : Markers containing the extra_repr() of a module (inserted by pyprof)
pyprofMarkers: Markers containing args and kwargs (tensor shape, datatype etc.)
seqMarkers : Markers containing PyTorch internal sequence markers (inserted by PyTorch)
altSeqMarkers: Markers inserted by PyTorch between two kernel launches. Needs better explanation.
otherMarkers : Markers not in either of the above categories.
We extract seqId from the seq and altSeq markers. The seqId is used in bprop.
We also extract information from the layerMarkers.
"""
layerMarkers = []
traceMarkers = []
reprMarkers = []
pyprofMarkers = []
seqMarkers = []
otherMarkers = []
altSeqMarkers = []
bprop = False
#Helper functions
def delete(objId, sTime):
"""
Delete rows from the temporary SQL table which are no longer required.
This speeds up future queries.
"""
margin = 0
cmd = 'DELETE FROM marker WHERE objectId = "{}" AND endTime < {}'.format(objId, sTime - margin)
#cmd = 'DELETE FROM marker WHERE endTime < {}'.format(sTime - margin)
self.db.execute(cmd)
def getLayerName(mlist):
"""
Get layer names from layer marker list.
"""
layers = []
assert(type(mlist) == list)
for m in mlist:
assert("layer:" in m)
l = m.split(":")[1]
layers.append(l)
return layers
def getSeqId(mlist):
"""
Get sequence ids from seq / alt seq marker list.
"""
ids = []
assert(type(mlist) == list)
for m in mlist:
assert(", seq = " in m)
seq = int(m.split("=")[1])
ids.append(seq)
#Remove duplicates
ids = list(set(ids))
ids.sort()
return ids
def seqcompare(elem):
"""
Sorting function for sequence markers
"""
assert (", seq = " in elem)
#sort by sequence id and then the string
l = elem.split(" = ")
return l[1] + l[0]
def prune(mlist):
"""
Remove markers with the same seqId and if the strings are similar.
This function works on a sorted sequence.
"""
assert (type(mlist) == list)
assert (len(mlist))
a = mlist[0:1]
for i in range(1,len(mlist)):
m = mlist[i]
pm = mlist[i-1]
name,seq = m.split(",")
pname,pseq = pm.split(",")
similar = (name in pname) or (pname in name)
if (seq == pseq) and similar:
continue
else:
a.append(m)
return a
def filterTrace(mlist):
"""
Filter trace markers to remove certain file names.
"""
assert (type(mlist) == list)
if len(mlist) == 0:
return mlist
mlist = mlist[-1] #The last stack trace will be a superset.
mlist = eval(mlist)
mlist = mlist['traceMarker']
assert (type(mlist) == list)
mlist = list(filter(lambda x : "/torch/nn/modules/" not in x, mlist))
mlist = list(filter(lambda x : "/torch/nn/functional.py" not in x, mlist))
mlist = list(filter(lambda x : "/torch/tensor.py" not in x, mlist))
mlist = list(filter(lambda x : "/torch/autograd/__init__.py" not in x, mlist))
mlist = list(filter(lambda x : "/torch/_jit_internal.py" not in x, mlist))
mlist = list(filter(lambda x : "/pyprof/nvtx/nvmarker.py" not in x, mlist))
mlist = list(filter(lambda x : "/apex/optimizers/" not in x, mlist))
mlist = list(filter(lambda x : "/torch/_utils.py" not in x, mlist))
mlist = list(filter(lambda x : "/torch/optim/" not in x, mlist))
return mlist
#Find all encapsulating markers
cmd = 'SELECT id,name from marker where \
objectId = "{}" and \
startTime < {} and \
endTime > {} \
ORDER BY startTime ASC'.format(objId, startTime, endTime)
result = self.db.select(cmd)
#Bin markers into different lists
for r in result:
m = self.getString(r['name'])
#Hack: If it's a known gradient checkpointing marker, ignore it.
if m.find("CheckpointFunctionBackward") >= 0:
continue
if ("_backward, seq =" in m) or ("Backward, seq =" in m) or ("Backward0, seq =" in m):
bprop = True
if ("mod" in m) and ("op" in m) and ("args" in m) and ("type" in m):
pyprofMarkers.append(m)
elif ("layer:" in m):
layerMarkers.append(m)
elif ("traceMarker" in m):
traceMarkers.append(m)
elif ("strRepr" in m):
reprMarkers.append(m)
elif (", seq = " in m):
seqMarkers.append(m)
else:
otherMarkers.append(m)
#Remove duplicates, sort and prune seqMarkers
if (len(seqMarkers)):
seqMarkers = list(set(seqMarkers))
seqMarkers.sort(key=seqcompare)
seqMarkers = prune(seqMarkers)
#Remove duplicates from otherMarkers
otherMarkers = list(set(otherMarkers))
#Get markers with seq id (inserted by PyTorch) from the previous kernel to the present kernel
#Only for fprop kernels
if (len(result) and not bprop):
loId = self.markerId
hiId = result[-1]['id']
self.markerId = hiId
#Get markers between loId and hiId
cmd = 'SELECT id,name from marker where objectId = "{}" and id > {} and id < {} ORDER BY startTime ASC'.format(objId, loId, hiId)
result1 = self.db.select(cmd)
for r in result1:
m = self.getString(r['name'])
#Get only markers with seq id
if (", seq=" in m):
altSeqMarkers.append(m)
#Remove duplicates, sort and prune altSeqMarkers
if (len(altSeqMarkers)):
altSeqMarkers = list(set(altSeqMarkers))
altSeqMarkers.sort(key=seqcompare)
altSeqMarkers = prune(altSeqMarkers)
delete(objId, startTime)
return layerMarkers, filterTrace(traceMarkers), reprMarkers, pyprofMarkers, seqMarkers, otherMarkers, altSeqMarkers, getSeqId(seqMarkers), getSeqId(altSeqMarkers), getLayerName(layerMarkers)
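#Worked example (illustrative): getSeqId() above parses the integer after "=" in a
#marker string, so ["AddmmBackward, seq = 17", "TBackward, seq = 17"] collapses to
#the single sequence id [17] after duplicate removal and sorting.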
#!/usr/bin/env python3
"""
Parse the SQL db and print a dictionary for every kernel.
"""
import sys
import argparse
from tqdm import tqdm
from .db import DB
from .kernel import Kernel
from .nvvp import NVVP
def parseArgs():
parser = argparse.ArgumentParser(prog=sys.argv[0], description="Parse SQL (nvvp) db.")
parser.add_argument("file",
type=str,
default=None,
help="SQL db (nvvp) file.")
args = parser.parse_args()
return args
def main():
args = parseArgs()
db = DB(args.file)
nvvp = NVVP(db)
kInfo = nvvp.getKernelInfo()
if len(kInfo) == 0:
print("Found 0 kernels. Exiting.", file=sys.stderr)
db.close()
sys.exit(0)
else:
print("Found {} kernels. Getting info for each kernel.".format(len(kInfo)), file=sys.stderr)
nvvp.createMarkerTable()
prevSeqId = -1
prevSubSeqId = -1
prevOp = "na"
Kernel.profStart = nvvp.getProfileStart()
for i in tqdm(range(len(kInfo)), ascii=True):
info = kInfo[i]
k = Kernel()
#Set kernel info
k.setKernelInfo(info)
#Get, set kernel name
name = nvvp.getString(k.kNameId)
k.setKernelName(name)
#Get runtime info
info = nvvp.getCPUInfo(k.corrId)
k.setRunTimeInfo(info)
#Get and set marker and seqid info
info = nvvp.getMarkerInfo(k.objId, k.rStartTime, k.rEndTime)
k.setMarkerInfo(info)
#If the seqId contains both 0 and non zero integers, remove 0.
if any(seq != 0 for seq in k.seqId) and (0 in k.seqId):
k.seqId.remove(0)
#Set direction (it uses seq id)
k.setDirection()
#Set op
k.setOp()
#The following code is based on heuristics.
#TODO: Refactor.
#Assign subSeqId, adjust seqId and altSeqId
#seqId can be 0.
#A kernel can have multiple seqIds both in fprop and bprop.
#In bprop, seqIds might not decrease monotonically. I have observed a few blips.
if len(k.seqId):
assert (k.dir in ["fprop", "bprop"])
if (k.dir == "fprop"):
#Check if there is a sequence id larger than the previous
inc = (k.seqId[-1] > prevSeqId)
if inc:
currSeqId = [x for x in k.seqId if x > prevSeqId][0]
else:
currSeqId = prevSeqId
else:
currSeqId = k.seqId[0]
#if ((currSeqId == prevSeqId) and (k.op == prevOp)):
if ((currSeqId == prevSeqId) and (k.op == prevOp)) or ((k.op[0] == "forward") and (k.op == prevOp) and (k.mod[0] in ["LSTMCell", "GRUCell", "RNNCell"])):
#The second condition is to trap cases when pytorch does not use cudnn for a LSTMCell.
k.subSeqId = prevSubSeqId + 1
prevSeqId = currSeqId
prevSubSeqId = k.subSeqId
prevOp = k.op
#Keep currSeqId in k.seqId, move everything else to k.altSeqId
for s in list(k.seqId): #iterate over a copy; removing from k.seqId while iterating would skip elements
if s != currSeqId:
k.seqId.remove(s)
k.altSeqId.append(s)
for s in list(k.altSeqId): #iterate over a copy; we remove from k.altSeqId below
if s == currSeqId:
k.altSeqId.remove(s)
k.altSeqId = list(set(k.altSeqId))
if (len(k.altSeqId)):
(k.altSeqId).sort()
k.print()
db.close()
if __name__ == '__main__':
main()
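#Typical invocation (illustrative; file names are placeholders):
#  python -m apex.pyprof.parse net.sql > net.dict
#The emitted per-kernel dictionaries are then consumed by the prof step below.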
import warnings
try:
from .prof import main
except ImportError as e:
warnings.warn("Did you make sure to install PyProf dependencies by using the --pyprof flag during Apex installation?")
raise e
if __name__ == '__main__':
main()
from collections import OrderedDict
from .utility import Utility
from .base import OperatorLayerBase
class Activation(OperatorLayerBase):
"""
This class handles the various activation functions.
"""
ops = ["celu", "elu", "elu_", "hardshrink", "hardtanh", "hardtanh_", "leaky_relu", "leaky_relu_", "logsigmoid", "prelu", "relu", "relu_", "relu6", "rrelu", "rrelu_", "selu", "sigmoid", "softplus", "softshrink", "softsign", "tanh", "tanhshrink", "threshold", "threshold_"]
def __init__(self, d):
marker = eval(d.argMarker[0])
mod = marker['mod']
op = marker['op']
args = marker['args']
self.marker = marker
self.mod_ = mod
self.op_ = op
self.args = args
assert (mod in ["torch.nn.functional", "torch", "Tensor"])
#Filter out named parameters
args = list(filter(lambda x : x['name'] == '', args))
assert (len(args) >= 1)
arg = args[0]
assert (arg['type'] == "tensor")
self.i = arg
self.dir = d.dir
def params(self):
p = OrderedDict([('T', self.i['shape']),('type', self.i['dtype'])])
return p
def flops(self):
direction = self.dir
tensor = self.i['shape']
t = self.i['dtype']
# TODO: revise
elems = Utility.numElems(tensor)
return elems
def bytes(self):
direction = self.dir
tensor = self.i['shape']
t = self.i['dtype']
elems = Utility.numElems(tensor)
elems = elems * (2 if direction == "fprop" else 3)
return elems * Utility.typeToBytes(t)
def tc(self):
return "-"
def op(self):
return self.op_
def mod(self):
return self.mod_
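#Worked example (illustrative): a fprop relu on a (32, 1024) fp16 tensor has
#numElems = 32768, so flops() = 32768 and
#bytes() = 32768 elems * 2 accesses (read + write) * 2 B/elem = 131072 B.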
from abc import ABC, abstractmethod
class OperatorLayerBase(ABC):
"""
Base class for all layers and operators.
Every derived class should have the following functions.
"""
@abstractmethod
def tc(self):
"""
Tensor core usage by the kernel.
Return "1" (yes), "0" (no, but possible), "-" (not applicable)
"""
pass
@abstractmethod
def params(self):
"""
Kernel parameters to be printed.
"""
pass
@abstractmethod
def flops(self):
"""
Note that 1 FMA = 2 flops.
"""
pass
@abstractmethod
def bytes(self):
pass
@abstractmethod
def mod(self):
"""
Name of the module/class e.g. torch.nn.functional.
"""
pass
@abstractmethod
def op(self):
"""
Name of the operator e.g. sigmoid.
"""
pass
from collections import OrderedDict
from .utility import Utility
from .base import OperatorLayerBase
import numpy as np
TC_GEMMS = ["884gemm", "1688gemm"]
class Addmm(OperatorLayerBase):
def __init__(self, d):
marker = eval(d.argMarker[0])
mod = marker['mod']
op = marker['op']
args = marker['args']
self.marker = marker
self.mod_ = mod
self.op_ = op
self.args = args
assert (mod in ["torch", "Tensor",])
assert (op in ["addmm", "addmm_",])
#Get alpha and beta
alpha = 1
beta = 1
if any(x['name'] == 'alpha' for x in args):
alpha = list(filter(lambda x : x['name'] == "alpha", args))[0]
alpha = alpha['value']
if any(x['name'] == 'beta' for x in args):
beta = list(filter(lambda x : x['name'] == "beta", args))[0]
beta = beta['value']
self.alpha = alpha
self.beta = beta
#Filter out named parameters
args = list(filter(lambda x : x['name'] == '', args))
assert (len(args) == 3)
C,A,B = args
m,k1 = A['shape']
k2,n = B['shape']
assert (k1 == k2)
t1 = A['dtype']
t2 = B['dtype']
t3 = C['dtype']
assert(t1 == t2 == t3)
self.A = A
self.B = B
self.C = C
self.m = m
self.n = n
self.k = k1
self.type = t1
self.name = d.name
return
def tc(self):
for s in TC_GEMMS:
if s in self.name:
return 1
return 0
def bytes(self):
m, n, k = self.m, self.n, self.k
return Utility.typeToBytes(self.type) * (m*n + m*k + n*k)
def flops(self):
return self.m * self.n * self.k * 2
def op(self):
return self.op_
def mod(self):
return self.mod_
def params(self):
p = OrderedDict([('M',self.n),('N',self.m),('K',self.k),('type',self.type)])
return p
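#Worked example (illustrative): an fp16 addmm with m=1024, n=512, k=2048 (as set
#in __init__ above) gives flops() = m*n*k*2 = 2,147,483,648 and
#bytes() = 2 B * (m*n + m*k + n*k) = 2 * 3,670,016 = 7,340,032 B.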
class Bmm(OperatorLayerBase):
def __init__(self, d):
marker = eval(d.argMarker[0])
mod = marker['mod']
op = marker['op']
args = marker['args']
self.marker = marker
self.mod_ = mod
self.op_ = op
self.args = args
assert (mod == "torch") and (op == "bmm")
#Filter out named params (kwargs)
args = list(filter(lambda x : x['name'] == "", args))
assert (len(args) == 2)
A,B = args
b1,m,k1 = A['shape']
b2,k2,n = B['shape']
assert (b1 == b2)
assert (k1 == k2)
t1 = A['dtype']
t2 = B['dtype']
assert(t1 == t2)
self.A = A
self.B = B
self.b = b1
self.m = m
self.n = n
self.k = k1
self.type = t1
self.name = d.name
def tc(self):
for s in TC_GEMMS:
if s in self.name:
return 1
return 0
def params(self):
#p = OrderedDict([('A', A['shape']), ('B', B['shape']), ('type', t1)])
p = OrderedDict([('B',self.b), ('M',self.n),('N',self.m),('K',self.k),('type',self.type)])
return p
def flops(self):
return self.b * self.m * self.n * self.k * 2
def bytes(self):
b, m, n, k = self.b, self.m, self.n, self.k
return Utility.typeToBytes(self.type) * b * (m*n + m*k + n*k)
def op(self):
return self.op_
def mod(self):
return self.mod_
class Matmul(OperatorLayerBase):
NON_GEMM = ["kernelPointwiseApply2", "reduce_1Block_kernel", "elementwise_kernel"]
NON_TC = NON_GEMM + ["dot_kernel"]
def __init__(self, d):
marker = eval(d.argMarker[0])
mod = marker['mod']
op = marker['op']
args = marker['args']
self.marker = marker
self.mod_ = mod
self.op_ = op
self.args = args
self.name = d.name
self.sub = d.sub
assert ((mod == "torch") and (op == "matmul")) or ((mod == "Tensor") and (op == "__matmul__"))
assert (len(args) == 2)
assert any([x in d.name for x in Matmul.NON_TC + ["gemm", "gemv"]])
A,B = args
t1 = A['dtype']
t2 = B['dtype']
assert(t1 == t2)
A = A['shape']
B = B['shape']
self.A = A
self.B = B
self.type = t1
# batch, MNK
if (len(A) == 1) and (len(B) == 1):
#dot product
assert (A[0] == B[0])
self.b = (1,)
self.m = 1
self.n = 1
self.k = A[0]
elif (len(A) == 2) and (len(B) == 2):
#gemm
m,k1 = A
k2,n = B
assert(k1 == k2)
self.b = (1,)
self.m = m
self.n = n
self.k = k1
elif (len(A) == 1) and (len(B) == 2):
#vector matrix
k1 = A[0]
k2,n = B
assert(k1 == k2)
self.b = (1,)
self.m = 1
self.n = n
self.k = k1
elif (len(A) == 2) and (len(B) == 1):
#gemv
m,k1 = A
k2 = B[0]
assert (k1 == k2)
self.b = (1,)
self.m = m
self.n = 1
self.k = k1
elif (len(A) == 1) and (len(B) > 2):
assert (A[0] == B[-2])
self.b = B[0:-2]
self.m = 1
self.n = B[-1]
self.k = B[-2]
elif (len(B) == 1) and (len(A) > 2):
assert (B[0] == A[-1])
self.b = A[0:-2]
self.m = A[-2]
self.n = 1
self.k = A[-1]
else:
assert (len(A) >= 2)
assert (len(B) >= 2)
assert (A[-1] == B[-2])
self.m = A[-2]
self.n = B[-1]
self.k = A[-1]
aa = np.empty(A[0:-2])
bb = np.empty(B[0:-2])
self.b = np.broadcast(aa, bb).shape
def params(self):
return OrderedDict([('A', self.A), ('B', self.B), ('type', self.type)])
def tc(self):
if self.name in Matmul.NON_TC:
return "-"
else:
for s in TC_GEMMS:
if s in self.name:
return 1
return 0
def bytes(self):
# TODO: check bytes for non-GEMM cases
if self.name in Matmul.NON_GEMM:
return 2 * Utility.typeToBytes(self.type) * Utility.numElems(self.A) #could be B as well
else:
m, n, k = self.m, self.n, self.k
return Utility.typeToBytes(self.type) * (m*n + m*k + n*k)
def flops(self):
# TODO: calculate actual FLOPs. At least we're not saying it's GEMM FLOPs for now.
if self.name in Matmul.NON_GEMM:
return 0
else:
return Utility.numElems(self.b) * self.m * self.n * self.k * 2
def op(self):
return self.op_
def mod(self):
return self.mod_
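#Worked example (illustrative): for torch.matmul with A of shape (8, 1, 128, 64)
#and B of shape (1, 4, 64, 256), the code above sets m=128, n=256, k=64 and
#broadcasts the leading dims to b=(8, 4), so for a GEMM-backed kernel
#flops() = 32 * 128*256*64 * 2 = 134,217,728.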
class Mm(OperatorLayerBase):
def __init__(self, d):
marker = eval(d.argMarker[0])
mod = marker['mod']
op = marker['op']
args = marker['args']
self.marker = marker
self.mod_ = mod
self.op_ = op
self.args = args
assert (mod == "torch") and (op == "mm")
assert (len(args) == 2)
A,B = args
m,k1 = A['shape']
k2,n = B['shape']
assert (k1 == k2)
t1 = A['dtype']
t2 = B['dtype']
assert(t1 == t2)
self.A = A
self.B = B
self.m = m
self.n = n
self.k = k1
self.type = t1
self.name = d.name
return
def params(self):
p = OrderedDict([('M',self.n),('N',self.m),('K',self.k),('type',self.type)])
return p
def tc(self):
for s in TC_GEMMS:
if s in self.name:
return 1
return 0
def bytes(self):
m, n, k = self.m, self.n, self.k
return Utility.typeToBytes(self.type) * (m*n + m*k + n*k)
def flops(self):
return self.m * self.n * self.k * 2
def op(self):
return self.op_
def mod(self):
return self.mod_
from collections import OrderedDict
from .utility import Utility
from .base import OperatorLayerBase
class Conv(OperatorLayerBase):
"""
# N = batch size
# C,H,W = input channels, height, width
# K,P,Q = output channels, height, width
# R,S = filter height, width
# g = groups
"""
#todo: refine winograd and FFT
convAuxList = ["nchwToNhwc", "nhwcToNchw", "OffsetsKernel",]
winoAuxList = ["generateWinogradTilesKernel", "winogradWgradData", "winogradWgradOutput", "winogradWgradDelta"]
fftAuxList = ["compute_gemm_pointers", "flip_filter", "fft2d_r2c_", "fft2d_c2r_", "fft1d_r2c", "fft1d_c2r"]
miscAuxList = ["scaleTensor_kernel",]
convList = ["_s884cudnn_", "_s1688cudnn_", "_scudnn_", "2d_grouped_direct_kernel", "cudnn::detail::implicit_convolve_sgemm", "cudnn::detail::dgrad2d_alg1_1", "cudnn::detail::wgrad_alg0_engine", "cudnn::detail::dgrad_engine", "dgrad_1x1_stride_2x2", "spatialDepthwiseConvolutionUpdateOutput"]
winoList = ["winograd3x3Kernel", "_sgemm_"]
fftList = ["fermiPlusCgemmLDS128_batched", "_gcgemm_",]
miscList = []
def __init__(self, d):
marker = eval(d.argMarker[0])
mod = marker['mod']
op = marker['op']
args = marker['args']
self.marker = marker
self.mod_ = mod
self.op_ = op
self.args = args
self.dir = d.dir
self.name = d.name
self.sub = d.sub
assert (mod == "torch.nn.functional")
assert (op in ["conv1d", "conv2d"])
length = len(args)
assert (length >= 2) and (length <= 7)
i,w = args[0], args[1]
assert (i['type'] == "tensor")
assert (w['type'] == "tensor")
#ignore bias
if (length >= 4) and (args[3]['name'] == ""):
s = args[3]
elif any(x['name'] == 'stride' for x in args):
s = list(filter(lambda x : x['name'] == 'stride', args))[0]
else:
s = {'name': 'stride', 'type': 'int', 'value': 1}
if (length >= 5) and (args[4]['name'] == ""):
p = args[4]
elif any(x['name'] == 'padding' for x in args):
p = list(filter(lambda x : x['name'] == 'padding', args))[0]
else:
p = {'name': 'padding', 'type': 'int', 'value': 0}
if (length >= 6) and (args[5]['name'] == ""):
d = args[5]
elif any(x['name'] == 'dilation' for x in args):
d = list(filter(lambda x : x['name'] == 'dilation', args))[0]
else:
d = {'name': 'dilation', 'type': 'int', 'value': 1}
if (length == 7) and (args[6]['name'] == ""):
g = args[6]
elif any(x['name'] == 'groups' for x in args):
g = list(filter(lambda x : x['name'] == 'groups', args))[0]
else:
g = {'name': 'groups', 'type': 'int', 'value': 1}
if op == "conv1d":
assert (len(i['shape']) == 3)
assert (len(w['shape']) == 3)
assert (i['dtype'] == w['dtype'])
N, C1, W = i['shape']
K, C2, S = w['shape']
assert (C1 == C2)
p = p['value'] if Utility.isscalar(p['type']) else p['value'][0]
s = s['value'] if Utility.isscalar(s['type']) else s['value'][0]
d = d['value'] if Utility.isscalar(d['type']) else d['value'][0]
g = g['value']
assert (g == 1)
H = 1
R = 1
P = 1 + (H - (((R-1))+1))
Q = 1 + (W + 2*p - (((S-1)*d)+1))/s
P = int(P)
Q = int(Q)
if (H == 1):
assert (P == 1)
if (W == 1):
assert (Q == 1)
self.N = N
self.C = C1
self.H = H
self.W = W
self.K = K
self.P = P
self.Q = Q
self.R = R
self.S = S
self.ph = 0
self.pw = p
self.U = 1
self.V = s
self.dh = 1
self.dw = d
self.g = g
self.type = i['dtype']
elif op == "conv2d":
assert (len(i['shape']) == 4)
assert (len(w['shape']) == 4)
assert (i['dtype'] == w['dtype'])
N, C1, H, W = i['shape']
K, C2, R, S = w['shape']
if Utility.isscalar(p['type']):
ph = pw = p['value']
else:
assert (p['type'] == "tuple")
ph, pw = p['value']
if Utility.isscalar(s['type']):
sh = sw = s['value']
else:
assert (s['type'] == "tuple")
sh, sw = s['value']
if Utility.isscalar(d['type']):
dh = dw = d['value']
else:
assert (d['type'] == "tuple")
dh, dw = d['value']
g = g['value']
assert (g >= 1)
assert (C1 == C2*g)
P = 1 + (H + 2*ph - (((R-1)*dh)+1))/sh
Q = 1 + (W + 2*pw - (((S-1)*dw)+1))/sw
P = int(P)
Q = int(Q)
if (H == 1):
assert (P == 1)
if (W == 1):
assert (Q == 1)
self.N = N
self.C = C1
self.H = H
self.W = W
self.K = K
self.P = P
self.Q = Q
self.R = R
self.S = S
self.ph = ph
self.pw = pw
self.U = sh
self.V = sw
self.dh = dh
self.dw = dw
self.g = g
self.type = i['dtype']
else:
assert False
def params(self):
p = OrderedDict([('N',self.N), ('C',self.C), ('H',self.H), ('W',self.W), ('K',self.K), ('P',self.P), ('Q',self.Q), ('R',self.R), ('S',self.S), ('ph',self.ph), ('pw',self.pw), ('U',self.U), ('V',self.V), ('dh',self.dh), ('dw',self.dw), ('g',self.g), ('type',self.type)])
return p
def conv_bytes_flops(self, N, C, H, W, K, P, Q, R, S, g, t):
f = 2*N*K*P*Q*C*R*S/g #for fprop
elems = N*C*H*W + K*C*R*S/g + N*K*P*Q
b = elems * Utility.typeToBytes(t)
return b,f
def bytes_flops(self):
N,C,H,W,K,P,Q,R,S,ph,pw,U,V,dh,dw,g,t = self.params().values()
if any(x in self.name for x in Conv.convAuxList+Conv.winoAuxList+Conv.fftAuxList+Conv.miscAuxList):
bytes, flops = [0, 0]
elif any(x in self.name for x in Conv.convList+Conv.winoList+Conv.fftList+Conv.miscList):
if g == 1:
bytes, flops = self.conv_bytes_flops(N,C,H,W,K,P,Q,R,S,g,t)
else:
if "2d_grouped_direct_kernel" in self.name: #only 1 kernel is called
bytes, flops = self.conv_bytes_flops(N,C,H,W,K,P,Q,R,S,g,t)
elif "spatialDepthwiseConvolutionUpdateOutput" in self.name: #one kernel for separable conv
bytes, flops = self.conv_bytes_flops(N,C,H,W,K,P,Q,R,S,g,t)
else: #a kernel per group is called
bytes, flops = self.conv_bytes_flops(N,C/g,H,W,K/g,P,Q,R,S,1,t)
elif ("calc_bias_diff" in self.name): #bias gradient
elems = N*K*P*Q
flops = elems
bytes = 2 * elems * Utility.typeToBytes(t)
#params = OrderedDict([('N',N), ('K',K), ('P',P), ('Q',Q), ('type', t)])
else:
bytes, flops = [0, 0]
return bytes, flops
def bytes(self):
b,_ = self.bytes_flops()
return b
def flops(self):
_,f = self.bytes_flops()
return f
def tc(self):
for s in ["884cudnn", "1688cudnn"]:
if s in self.name:
return 1
return "-"
def op(self):
return self.op_
def mod(self):
return self.mod_
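#Worked example (illustrative): a conv2d with N=32, C=64, H=W=56, K=128, R=S=3,
#padding 1, stride 1, dilation 1, groups 1 gives P = Q = 1 + (56 + 2 - 3)/1 = 56.
#conv_bytes_flops() then reports flops = 2*N*K*P*Q*C*R*S = 14,797,504,512 and, for
#fp16, bytes = (N*C*H*W + K*C*R*S + N*K*P*Q) * 2 B = 19,341,312 * 2 = 38,682,624 B.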
from collections import OrderedDict
from .utility import Utility
from .base import OperatorLayerBase
class Convert(OperatorLayerBase):
"""
Class to handle convert operations.
"""
ops = ["byte", "char", "double", "float", "half", "int", "long", "short", "to"]
def __init__(self, d):
marker = eval(d.argMarker[0])
mod = marker['mod']
op = marker['op']
args = marker['args']
self.marker = marker
self.mod_ = mod
self.op_ = op
self.args = args
assert (mod == "Tensor")
assert (op in Convert.ops)
assert (len(args) == 1)
#The argument could be a tensor or scalar
t = args[0]
if t['type'] == "tensor":
shape = t['shape']
stype = t['dtype']
else:
shape = (1,)
stype = t['type']
if self.op_ == "to":
op = stype
self.shape = shape
self.stype = stype
self.dtype = op
def params(self):
p = OrderedDict([('T', self.shape), ('stype', self.stype), ('dtype', self.dtype)])
return p
def op(self):
return self.op_
def mod(self):
return self.mod_
def tc(self):
return "-"
def elems(self):
return Utility.numElems(self.shape)
def flops(self):
return 0
def bytes(self):
b = self.elems() * (Utility.typeToBytes(self.stype) + Utility.typeToBytes(self.dtype))
return b
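#Worked example (illustrative): converting a (1024, 1024) tensor from a 4 B/elem
#type to a 2 B/elem type reads the source and writes the destination, so
#bytes() = 1,048,576 * (4 + 2) = 6,291,456 B, while flops() stays 0.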
from .utility import Utility
class Data(object):
"""
Class to store all the data for every kernel e.g. name, bytes, flops, device, stream etc.
"""
def __init__(self, kernel):
#Available from NVprof
self.tid = kernel['tid']
self.device = kernel['device']
self.stream = kernel['stream']
self.grid = str(kernel['grid']).replace(" ","").replace("(","").replace(")","")
self.block = str(kernel['block']).replace(" ","").replace("(","").replace(")","")
self.name = kernel['kShortName'].replace(" ","_")
self.lName = kernel['kLongName']
self.sil = kernel['kDuration'] #units ns
self.index = None
#Markers
self.argMarker = kernel['marker']
self.modMarker = kernel['reprMarkers']
self.seqMarker = kernel['seqMarker']
self.layer = kernel['layer']
self.trace = kernel['trace']
self.seqId = kernel['seqId']
self.altSeqId = kernel['altSeqId']
self.dir = kernel['dir']
self.sub = kernel['subSeqId']
self.mod = "na"
self.op = "na"
self.params = {"na":"na"}
self.tc = "na"
self.flops = 0
self.bytes = 0
def setParams(self, params):
#Remove space from params
qaz = ""
for key,value in params.items():
if "type" not in key:
qaz += "{}={},".format(key,value)
else:
if type(value) is str:
qaz += "{},".format(Utility.typeToString(value))
else:
qaz += "{}".format(value)
self.params = qaz.replace(" ", "")
from collections import OrderedDict
from .utility import Utility
from .base import OperatorLayerBase
class Dropout(OperatorLayerBase):
def __init__(self, d):
marker = eval(d.argMarker[0])
mod = marker['mod']
op = marker['op']
args = marker['args']
self.marker = marker
self.mod_ = mod
self.op_ = op
self.args = args
assert (mod == "torch.nn.functional")
assert (op == "dropout")
#assert (len(args) == 1)
self.shape = args[0]['shape']
self.type = args[0]['dtype']
self.dir = d.dir
return
def params(self):
p = OrderedDict([('T', self.shape), ('type', self.type)])
return p
def op(self):
return self.op_
def mod(self):
return self.mod_
def tc(self):
return "-"
def elems(self):
return Utility.numElems(self.shape)
def bytes(self):
#Ignoring the cost of writing and reading the mask
return Utility.typeToBytes(self.type) * self.elems() * 2
def flops(self):
# Note: This is approximate and depends on the RNG
return 5*self.elems()
from collections import OrderedDict
from .utility import Utility
from .base import OperatorLayerBase
class Embedding(OperatorLayerBase):
def __init__(self, d):
marker = eval(d.argMarker[0])
mod = marker['mod']
op = marker['op']
args = marker['args']
self.marker = marker
self.mod_ = mod
self.op_ = op
self.args = args
assert (mod == "torch.nn.functional")
assert (op == "embedding")
self.ishape = args[0]['shape']
self.itype = args[0]['dtype']
self.eshape = args[1]['shape']
self.etype = args[1]['dtype']
assert (len(self.eshape) == 2)
self.dir = d.dir
self.sub = d.sub
return
def params(self):
p = OrderedDict([('I', self.ishape), ('itype', self.itype), ('E', self.eshape), ('etype', self.etype)])
return p
def op(self):
return self.op_
def mod(self):
return self.mod_
def tc(self):
return "-"
def bytes(self):
ishape = self.ishape
itype = self.itype
eshape = self.eshape
etype = self.etype
ielems = Utility.numElems(ishape)
b = 0
if self.dir == "fprop":
#indices
b += ielems * Utility.typeToBytes(itype)
#read and write the embedding matrix
b += ielems * eshape[1] * 2 * Utility.typeToBytes(etype)
else:
#3 times the size of the incoming gradient
b = ielems * eshape[1] * 3 * Utility.typeToBytes(etype)
if self.sub > 0:
b = 0
return b
def flops(self):
# Note: not implemented yet
return 0
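#Worked example (illustrative): an fprop embedding lookup with int64 indices of
#shape (32, 128) into a (50000, 1024) float16 table moves 4096 * 8 B of indices
#plus 4096 * 1024 * 2 (read + write) * 2 B of embedding rows, so
#bytes() = 32,768 + 16,777,216 = 16,809,984 B.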
from collections import OrderedDict
from .utility import Utility
import numpy as np
from .base import OperatorLayerBase
class Cat(OperatorLayerBase):
def __init__(self, d):
marker = eval(d.argMarker[0])
mod = marker['mod']
op = marker['op']
args = marker['args']
self.marker = marker
self.mod_ = mod
self.op_ = op
self.args = args
assert (mod == "torch")
assert (op == "cat")
assert (len(args) >= 2)
t = args[0]['dtype']
shapes = []
for arg in args:
if arg['type'] == "tensor":
assert (arg['dtype'] == t)
shapes.append(arg['shape'])
self.type = t
self.shapes = shapes
def params(self):
p = OrderedDict([('T', self.shapes), ('type', self.type)])
return p
def flops(self):
return 0
def tc(self):
return "-"
def op(self):
return self.op_
def mod(self):
return self.mod_
def bytes(self):
b = 0
for s in self.shapes:
b += Utility.numElems(s)
return 2 * b * Utility.typeToBytes(self.type)
class Reshape(OperatorLayerBase):
def __init__(self, d):
marker = eval(d.argMarker[0])
mod = marker['mod']
op = marker['op']
args = marker['args']
self.marker = marker
self.mod_ = mod
self.op_ = op
self.args = args
assert (mod == "Tensor")
assert (op == "reshape")
#Temporarily commenting three lines
#assert (len(args) == 2)
#t,s = args
#assert s['type'] == "tuple"
t = args[0]
assert t['type'] == "tensor"
self.type = t['dtype']
self.shape = t['shape']
def params(self):
p = OrderedDict([('T', self.shape), ('type', self.type)])
return p
def flops(self):
return 0
def tc(self):
return "-"
def op(self):
return self.op_
def mod(self):
return self.mod_
def bytes(self):
return 0
class Gather(OperatorLayerBase):
def __init__(self, d):
marker = eval(d.argMarker[0])
mod = marker['mod']
op = marker['op']
args = marker['args']
self.marker = marker
self.mod_ = mod
self.op_ = op
self.args = args
assert (mod == "Tensor") or (mod == "torch")
assert (op == "gather")
#Filter out the "out" parameter
args = list(filter(lambda x : x['name'] != 'out', args))
assert (len(args) == 3)
#Get input
if (args[0]['name'] == ""):
arg = args[0]
else:
arg = list(filter(lambda x : x['name'] == "input", args))[0]
assert (arg['type'] == "tensor")
self.shape = arg['shape']
self.type = arg['dtype']
def params(self):
p = OrderedDict([('T', self.shape),('type', self.type)])
return p
def flops(self):
return 0
def tc(self):
return "-"
def op(self):
return self.op_
def mod(self):
return self.mod_
def bytes(self):
return 2 * Utility.numElems(self.shape) * Utility.typeToBytes(self.type)
class MaskedScatter(OperatorLayerBase):
def __init__(self, d):
marker = eval(d.argMarker[0])
mod = marker['mod']
op = marker['op']
args = marker['args']
self.marker = marker
self.mod_ = mod
self.op_ = op
self.args = args
assert (mod == "Tensor")
assert (op == "masked_scatter_")
assert (len(args) == 3)
dst, mask, src = args
assert (dst['type'] == mask['type'] == src['type'] == "tensor")
assert (mask['dtype'] == "uint8")
assert (dst['dtype'] == src['dtype'])
assert (dst['shape'] == mask['shape'])
self.shape = dst['shape']
self.type = dst['dtype']
self.seqId = d.seqId
def params(self):
p = OrderedDict([('T', self.shape),('type', self.type)])
return p
def flops(self):
return 0
def tc(self):
return "-"
def op(self):
return self.op_
def mod(self):
return self.mod_
def bytes(self):
elems = Utility.numElems(self.shape)
#src and dst
b = 2 * elems * Utility.typeToBytes(self.type)
#mask (uint8)
b += elems
if (self.seqId > 0):
b = 0
return b
class Nonzero(OperatorLayerBase):
def __init__(self, d):
marker = eval(d.argMarker[0])
mod = marker['mod']
op = marker['op']
args = marker['args']
self.marker = marker
self.mod_ = mod
self.op_ = op
self.args = args
assert (mod in ["torch", "Tensor"])
assert (op == "nonzero")
assert (len(args) == 1)
arg = args[0]
self.shape = arg['shape']
self.type = arg['dtype']
self.seqId = d.seqId
def params(self):
p = OrderedDict([('T', self.shape),('type', self.type)])
return p
def flops(self):
return 0
def tc(self):
return "-"
def op(self):
return self.op_
def mod(self):
return self.mod_
def bytes(self):
elems = Utility.numElems(self.shape)
dim = len(self.shape)
#input tensor
b = elems * Utility.typeToBytes(self.type)
#in the worst case, the output is a (elems x dim) tensor of type "long"
b += elems * dim * Utility.typeToBytes("int64")
if self.seqId > 0:
return 0
else:
return b
class IndexSelect(OperatorLayerBase):
def __init__(self, d):
marker = eval(d.argMarker[0])
mod = marker['mod']
op = marker['op']
args = marker['args']
self.marker = marker
self.mod_ = mod
self.op_ = op
self.args = args
assert (mod == "Tensor") or (mod == "torch")
assert (op == "index_select")
#Filter out the "out" parameter
args = list(filter(lambda x : x['name'] != 'out', args))
assert (len(args) == 3)
#Get input, dim and index
if (args[0]['name'] == ""):
t = args[0]
else:
t = list(filter(lambda x : x['name'] == "input", args))[0]
if (args[1]['name'] == ""):
d = args[1]
else:
d = list(filter(lambda x : x['name'] == "dim", args))[0]
if (args[2]['name'] == ""):
i = args[2]
else:
i = list(filter(lambda x : x['name'] == "index", args))[0]
assert (t['type'] == i['type'] == "tensor")
assert (d['type'] == "int")
assert (i['dtype'] == "int64")
assert (len(i['shape']) == 1)
shape = t['shape']
dim = d['value']
indices = i['shape'][0]
assert (dim < len(shape))
self.shape = shape
self.dim = dim
self.indices = indices
self.type = t['dtype']
def params(self):
p = OrderedDict([('T', self.shape),('D', self.dim),('I', self.indices),('type', self.type)])
return p
def tc(self):
return "-"
def op(self):
return self.op_
def mod(self):
return self.mod_
def flops(self):
return 0
def bytes(self):
#determine the shape of the output tensor
shape = list(self.shape)
shape[self.dim] = self.indices
b = 0
#time to read the input and write the output
elems = Utility.numElems(shape)
b += 2 * elems * Utility.typeToBytes(self.type)
#time to read the indices
b += self.indices * Utility.typeToBytes("int64")
return b
class MaskedSelect(OperatorLayerBase):
def __init__(self, d):
marker = eval(d.argMarker[0])
mod = marker['mod']
op = marker['op']
args = marker['args']
self.marker = marker
self.mod_ = mod
self.op_ = op
self.args = args
self.sub = d.sub
assert (mod == "Tensor") or (mod == "torch")
assert (op == "masked_select")
#Filter out the "out" parameter
args = list(filter(lambda x : x['name'] != 'out', args))
assert (len(args) == 2)
#Get input and mask
if (args[0]['name'] == ""):
t = args[0]
else:
t = list(filter(lambda x : x['name'] == "input", args))[0]
if (args[1]['name'] == ""):
m = args[1]
else:
m = list(filter(lambda x : x['name'] == "mask", args))[0]
assert (m['dtype'] == "uint8")
tensor = t['shape']
mask = m['shape']
#check for broadcast condition
if (tensor != mask):
array1 = np.empty(list(tensor))
array2 = np.empty(list(mask))
try:
out = np.broadcast(array1, array2).shape
except:
assert False
self.tshape = tensor
self.mshape = mask
self.type = t['dtype']
def params(self):
p = OrderedDict([('T', self.tshape),('M', self.mshape),('type', self.type)])
return p
def tc(self):
return "-"
def op(self):
return self.op_
def mod(self):
return self.mod_
def bytes(self):
tensor = self.tshape
mask = self.mshape
t = self.type
#in the worst case, #output elements = #input elements
b = 2 * Utility.numElems(tensor) * Utility.typeToBytes(t)
#mask tensor (assuming uint8)
b += Utility.numElems(mask)
return b
def flops(self):
return 0
from collections import OrderedDict
from .utility import Utility
from .base import OperatorLayerBase
class Linear(OperatorLayerBase):
'''
Notes:
If the bias occurs before the GEMM, then it's 1 write (bias expansion).
If the bias occurs after, then it's 1 read and 1 write.
The bias in bprop is a reduction and hence 1 read.
'''
gemmKernels = ["gemm", "gemv", "dot_kernel", "splitKreduce_kernel", "reduce_1Block_kernel"]
biasKernels = ["kernelReduceContigDim", "kernelReduceNoncontigDim_shared", "elementwise_kernel", "reduce_kernel"]
def setXWBMNK(self, args):
x = None
w = None
b = None
if (len(args) == 2):
x,w = args
elif (len(args) == 3):
x,w,b = args
assert (x['type'] == w['type'] == "tensor")
if (b['type'] == "tensor"):
assert(len(b['shape']) == 1)
elif (b['type'] == "NoneType"):
assert b['value'] is None
b = None
else:
assert False
else:
assert False
assert(len(w['shape']) == 2)
k1 = x['shape'][-1]
n,k2 = w['shape']
assert(k1 == k2)
if b is not None:
assert(b['shape'][0] == n)
t1 = x['dtype']
t2 = w['dtype']
assert(t1 == t2)
# X, W, B
self.x = x['shape']
self.w = w['shape']
self.b = b['shape'] if b is not None else None
self.type = t1
# M, N, K
#n = Utility.numElems(x[0:-1])
n = self.x[0:-1]
k = self.x[-1]
m,k1 = self.w
assert (k == k1)
self.m = m
self.n = n
self.k = k
def tc(self):
if self.op() == "linear":
return 1 if "884gemm" in self.name else 0
else:
return "-"
def __init__(self, d):
self.name = d.name
self.dir = d.dir
self.sub = d.sub
marker = eval(d.argMarker[0])
mod = marker['mod']
op = marker['op']
args = marker['args']
assert (mod == "torch.nn.functional")
assert (op == "linear")
self.setXWBMNK(args)
if any(x in d.name for x in Linear.gemmKernels):
self.op_ = "linear"
else:
assert (d.name in Linear.biasKernels)
self.op_ = "bias"
'''
elif (("kernelPointwiseApply2" in d.name) or ("kernelReduceContigDim" in d.name) or ("kernelReduceNoncontigDim_shared" in d.name)):
#bias expansion was before the gemm
self.op_ = "bias"
elif ("elementwise_kernel" in d.name):
#Bias addition happens later with a broadcast tensor
self.op_ = "bias"
assert (len(d.argMarker) == 2)
marker = eval(d.argMarker[1])
mod = marker['mod']
op = marker['op']
args = marker['args']
assert (mod == "Tensor")
assert (op == "__iadd__")
assert (len(args) == 2)
mn = args[0]['shape']
b = args[1]['shape']
assert (len(b) == 1)
assert (mn == (self.n + (self.m,)))
assert (b == self.b)
else:
assert False
'''
def params(self):
#p = OrderedDict([('X', self.x), ('W', self.w), ('B', self.b), ('type', self.type)])
m, n, k, x, w, t = self.m, self.n, self.k, self.x, self.w, self.type
if len(n) == 1:
n = n[0]
if self.op_ == "linear":
if self.dir == "fprop":
p = OrderedDict([('M', m), ('N', n), ('K', k), ('type', t)])
elif self.dir == "bprop":
if self.sub == 0: #dgrad (most likely)
p = OrderedDict([('M', k), ('N', n), ('K', m), ('type', t)])
elif self.sub == 1: #wgrad (most likely)
p = OrderedDict([('M', k), ('N', m), ('K', n), ('type', t)])
else:
#This happens when there are additional kernels for reduction
p = OrderedDict([('X', x), ('W', w), ('type', t)])
else:
assert False
elif self.op_ == "bias":
p = OrderedDict([('M', m), ('N', n), ('type', t)])
else:
assert False
return p
def op(self):
return self.op_
def bytesFlops(self):
m = self.m
n = Utility.numElems(self.n)
k = self.k
if self.op_ == "linear":
if self.dir == "fprop":
f = m * n * k * 2
b = (m*n + m*k + n*k) * Utility.typeToBytes(self.type)
elif self.dir == "bprop":
if self.sub == 0: #dgrad (most likely)
f = m * n * k * 2
b = (m*n + m*k + n*k) * Utility.typeToBytes(self.type)
elif self.sub == 1: #wgrad (most likely)
f = m * n * k * 2
b = (m*n + m*k + n*k) * Utility.typeToBytes(self.type)
else:
#This happens when there are additional kernels for reduction
f = 0
b = 0
else:
assert False
elif self.op_ == "bias":
f = m * n
b = 2 * m * n * Utility.typeToBytes(self.type)
else:
assert False
return b,f
def bytes(self):
b, f = self.bytesFlops()
return b
def flops(self):
b, f = self.bytesFlops()
return f
def mod(self):
return self.mod_
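#Worked example (illustrative): for F.linear with input X of shape (32, 512) and
#weight W of shape (1024, 512), setXWBMNK() yields m=1024, n=(32,), k=512. The
#fprop GEMM is then reported as M=1024, N=32, K=512 with flops = 2*M*N*K =
#33,554,432; dgrad (sub 0) and wgrad (sub 1) swap the roles of M, N and K.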
from collections import OrderedDict
from .utility import Utility
from .base import OperatorLayerBase
#TODO: Add support for additional loss functions.
class MSELoss(OperatorLayerBase):
def __init__(self, d):
marker = eval(d.argMarker[0])
mod = marker['mod']
op = marker['op']
args = marker['args']
self.marker = marker
self.mod_ = mod
self.op_ = op
self.args = args
assert (mod == "torch.nn.functional")
assert (op == "mse_loss")
assert (len(args) == 3)
#Get input, target and reduction
if (args[0]['name'] == ""):
x = args[0]
else:
x = list(filter(lambda x : x['name'] == "input", args))[0]
if (args[1]['name'] == ""):
y = args[1]
else:
y = list(filter(lambda x : x['name'] == "target", args))[0]
if (args[2]['name'] == ""):
r = args[2]
else:
r = list(filter(lambda x : x['name'] == "reduction", args))[0]
assert (x['type'] == y['type'] == "tensor")
assert (x['shape'] == y['shape'])
assert (x['dtype'] == y['dtype'])
assert (r['type'] == "str")
assert (r['value'] in ["none", "mean", "sum"])
self.shape = x['shape']
self.type = x['dtype']
self.red = r['value']
self.dir = d.dir
def params(self):
p = OrderedDict([('T', self.shape), ('type', self.type), ('red', self.red)])
return p
def elems(self):
red = self.red
e = Utility.numElems(self.shape)
if self.dir == "fprop":
if red == "none":
e *= 3
else:
e *= 2
else:
if red == "none":
e *= 4
else:
e *= 3
return e
def bytes(self):
return self.elems() * Utility.typeToBytes(self.type)
def flops(self):
return self.elems() * 2 + 1
def tc(self):
return "-"
def op(self):
return self.op_
def mod(self):
return self.mod_
from collections import OrderedDict
from .utility import Utility
from .base import OperatorLayerBase
class Foo(OperatorLayerBase):
"""
An object of Foo is instantiated when we detect an unsupported operator.
"""
def __init__(self, d):
marker = eval(d.argMarker[0])
mod = marker['mod']
op = marker['op']
args = marker['args']
self.marker = marker
self.mod_ = mod
self.op_ = op
self.args = args
shapes = []
types = []
for arg in args:
if arg['type'] == "tensor":
shapes.append(arg['shape'])
types.append(arg['dtype'])
self.shape = shapes
self.type = types
def params(self):
p = OrderedDict([('T', self.shape), ('type', self.type)])
return p
def tc(self):
return "-"
def op(self):
return self.op_
def mod(self):
return self.mod_
def flops(self):
return 0
def bytes(self):
return 0
class Copy(OperatorLayerBase):
def __init__(self, d):
marker = eval(d.argMarker[0])
mod = marker['mod']
op = marker['op']
args = marker['args']
self.marker = marker
self.mod_ = mod
self.op_ = op
self.args = args
assert (mod == "Tensor")
assert (op == "copy_")
assert (len(args) == 2)
dst, src = args
assert (src['type'] == dst['type'])
assert (src['shape'] == dst['shape'])
self.shape = src['shape']
self.stype = src['dtype']
self.dtype = dst['dtype']
def params(self):
#The data type might be different
p = OrderedDict([('T', self.shape), ('stype', self.stype), ('dtype', self.dtype)])
return p
def tc(self):
return "-"
def op(self):
return self.op_
def mod(self):
return self.mod_
def flops(self):
return 0
def elems(self):
return Utility.numElems(self.shape)
def bytes(self):
return self.elems() * (Utility.typeToBytes(self.stype) + Utility.typeToBytes(self.dtype))
class Clone(OperatorLayerBase):
def __init__(self, d):
marker = eval(d.argMarker[0])
mod = marker['mod']
op = marker['op']
args = marker['args']
self.marker = marker
self.mod_ = mod
self.op_ = op
self.args = args
assert (mod == "Tensor")
assert (op == "clone")
assert (len(args) == 1)
t = args[0]
self.shape = t['shape']
self.type = t['dtype']
def params(self):
p = OrderedDict([('T', self.shape), ('type', self.type)])
return p
def flops(self):
return 0
def tc(self):
return "-"
def op(self):
return self.op_
def mod(self):
return self.mod_
def elems(self):
return Utility.numElems(self.shape)
def bytes(self):
return 2 * self.elems() * Utility.typeToBytes(self.type)
class Contiguous(OperatorLayerBase):
def __init__(self, d):
marker = eval(d.argMarker[0])
mod = marker['mod']
op = marker['op']
args = marker['args']
self.marker = marker
self.mod_ = mod
self.op_ = op
self.args = args
assert (mod == "Tensor")
assert (op == "contiguous")
assert (len(args) == 1)
t = args[0]
self.shape = t['shape']
self.type = t['dtype']
def params(self):
p = OrderedDict([('T', self.shape), ('type', self.type)])
return p
def flops(self):
return 0
def bytes(self):
return 2 * Utility.numElems(self.shape) * Utility.typeToBytes(self.type)
def tc(self):
return "-"
def op(self):
return self.op_
def mod(self):
return self.mod_
class Any(OperatorLayerBase):
def __init__(self, d):
marker = eval(d.argMarker[0])
mod = marker['mod']
op = marker['op']
args = marker['args']
self.marker = marker
self.mod_ = mod
self.op_ = op
self.args = args
assert (mod == "Tensor")
assert (op == "any")
assert (len(args) == 1) #could be 2 as well, the second argument is a bool
t = args[0]
self.shape = t['shape']
self.type = t['dtype']
self.sub = d.sub
return
def params(self):
p = OrderedDict([('T', self.shape), ('type', self.type)])
return p
def op(self):
return self.op_
def mod(self):
return self.mod_
def tc(self):
return "-"
def flops(self):
return 0
def bytes(self):
return Utility.numElems(self.shape) * Utility.typeToBytes(self.type)
from collections import OrderedDict
from .utility import Utility
from .base import OperatorLayerBase
class BatchNorm(OperatorLayerBase):
def __init__(self, d):
marker = eval(d.argMarker[0])
mod = marker['mod']
op = marker['op']
args = marker['args']
self.marker = marker
self.mod_ = mod
self.op_ = op
self.args = args
assert (op == "batch_norm")
assert (len(args) == 8)
i = args[0]
assert (i['type'] == "tensor")
self.shape = i['shape']
self.type = i['dtype']
self.dir = d.dir
def params(self):
p = OrderedDict([('T', self.shape), ('type', self.type)])
return p
def tc(self):
return "-"
def op(self):
return self.op_
def mod(self):
return self.mod_
def elems(self):
return Utility.numElems(self.shape)
def flops(self):
# Variance algo-dependent, but this is a reasonable value.
return self.elems() * 8
def bytes(self):
e = self.elems()
if self.dir == "fprop":
e *= 4
else:
e *= 5
return e * Utility.typeToBytes(self.type)
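#Worked example (illustrative): batch_norm on a (32, 64, 56, 56) fp16 tensor has
#6,422,528 elements, so flops() = 8 * elems = 51,380,224 and, in fprop,
#bytes() = 4 * elems * 2 B = 51,380,224 B (5 passes instead of 4 in bprop).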