Unverified commit 8a7a3325, authored by Masaki Kozuki, committed by GitHub

Remove `pyprof` and `reparameterization` (#1404)

* remove pyprof

* remove reparameterization

* remove pyprof test

* clean up
from collections import OrderedDict
from .utility import Utility
from .base import OperatorLayerBase
import numpy as np
TC_GEMMS = ["884gemm", "1688gemm"]
class Addmm(OperatorLayerBase):
def __init__(self, d):
marker = eval(d.argMarker[0])
mod = marker['mod']
op = marker['op']
args = marker['args']
self.marker = marker
self.mod_ = mod
self.op_ = op
self.args = args
assert (mod in ["torch", "Tensor",])
assert (op in ["addmm", "addmm_",])
#Get alpha and beta
alpha = 1
beta = 1
if any(x['name'] == 'alpha' for x in args):
alpha = list(filter(lambda x : x['name'] == "alpha", args))[0]
alpha = alpha['value']
if any(x['name'] == 'beta' for x in args):
beta = list(filter(lambda x : x['name'] == "beta", args))[0]
beta = beta['value']
self.alpha = alpha
self.beta = beta
#Filter out named parameters
args = list(filter(lambda x : x['name'] == '', args))
assert (len(args) == 3)
C,A,B = args
m,k1 = A['shape']
k2,n = B['shape']
assert (k1 == k2)
t1 = A['dtype']
t2 = B['dtype']
t3 = C['dtype']
assert(t1 == t2 == t3)
self.A = A
self.B = B
self.C = C
self.m = m
self.n = n
self.k = k1
self.type = t1
self.name = d.name
return
def tc(self):
for s in TC_GEMMS:
if s in self.name:
return 1
return 0
def bytes(self):
m, n, k = self.m, self.n, self.k
return Utility.typeToBytes(self.type) * (m*n + m*k + n*k)
def flops(self):
return self.m * self.n * self.k * 2
def op(self):
return self.op_
def mod(self):
return self.mod_
def params(self):
p = OrderedDict([('M',self.n),('N',self.m),('K',self.k),('type',self.type)])
return p
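# Illustrative only (not from the original file): with the model above, a
# hypothetical FP16 addmm with M = N = K = 1024 (2 bytes per element) costs
#   flops = 2 * 1024**3      = 2,147,483,648
#   bytes = 2 * 3 * 1024**2  = 6,291,456
# i.e. roughly 341 FLOPs per byte moved, which is why large GEMMs are compute
# bound and prime Tensor Core ("884gemm"/"1688gemm") candidates.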
class Bmm(OperatorLayerBase):
def __init__(self, d):
marker = eval(d.argMarker[0])
mod = marker['mod']
op = marker['op']
args = marker['args']
self.marker = marker
self.mod_ = mod
self.op_ = op
self.args = args
assert (mod == "torch") and (op == "bmm")
#Filter out named params (kwargs)
args = list(filter(lambda x : x['name'] == "", args))
assert (len(args) == 2)
A,B = args
b1,m,k1 = A['shape']
b2,k2,n = B['shape']
assert (b1 == b2)
assert (k1 == k2)
t1 = A['dtype']
t2 = B['dtype']
assert(t1 == t2)
self.A = A
self.B = B
self.b = b1
self.m = m
self.n = n
self.k = k1
self.type = t1
self.name = d.name
def tc(self):
for s in TC_GEMMS:
if s in self.name:
return 1
return 0
def params(self):
#p = OrderedDict([('A', A['shape']), ('B', B['shape']), ('type', t1)])
p = OrderedDict([('B',self.b), ('M',self.n),('N',self.m),('K',self.k),('type',self.type)])
return p
def flops(self):
return self.b * self.m * self.n * self.k * 2
def bytes(self):
b, m, n, k = self.b, self.m, self.n, self.k
return Utility.typeToBytes(self.type) * b * (m*n + m*k + n*k)
def op(self):
return self.op_
def mod(self):
return self.mod_
class Matmul(OperatorLayerBase):
NON_GEMM = ["kernelPointwiseApply2", "reduce_1Block_kernel", "elementwise_kernel"]
NON_TC = NON_GEMM + ["dot_kernel"]
def __init__(self, d):
marker = eval(d.argMarker[0])
mod = marker['mod']
op = marker['op']
args = marker['args']
self.marker = marker
self.mod_ = mod
self.op_ = op
self.args = args
self.name = d.name
self.sub = d.sub
assert ((mod == "torch") and (op == "matmul")) or ((mod == "Tensor") and (op == "__matmul__"))
assert (len(args) == 2)
assert any([x in d.name for x in Matmul.NON_TC + ["gemm", "gemv"]])
A,B = args
t1 = A['dtype']
t2 = B['dtype']
assert(t1 == t2)
A = A['shape']
B = B['shape']
self.A = A
self.B = B
self.type = t1
# batch, MNK
if (len(A) == 1) and (len(B) == 1):
#dot product
assert (A[0] == B[0])
self.b = (1,)
self.m = 1
self.n = 1
self.k = A[0]
elif (len(A) == 2) and (len(B) == 2):
#gemm
m,k1 = A
k2,n = B
assert(k1 == k2)
self.b = (1,)
self.m = m
self.n = n
self.k = k1
elif (len(A) == 1) and (len(B) == 2):
#vector matrix
k1 = A[0]
k2,n = B
assert(k1 == k2)
self.b = (1,)
self.m = 1
self.n = n
self.k = k1
elif (len(A) == 2) and (len(B) == 1):
#gemv
m,k1 = A
k2 = B[0]
assert (k1 == k2)
self.b = (1,)
self.m = m
self.n = 1
self.k = k1
elif (len(A) == 1) and (len(B) > 2):
assert (A[0] == B[-2])
self.b = B[0:-2]
self.m = 1
self.n = B[-1]
self.k = B[-2]
elif (len(B) == 1) and (len(A) > 2):
assert (B[0] == A[-1])
self.b = A[0:-2]
self.m = A[-2]
self.n = 1
self.k = A[-1]
else:
assert (len(A) >= 2)
assert (len(B) >= 2)
assert (A[-1] == B[-2])
self.m = A[-2]
self.n = B[-1]
self.k = A[-1]
aa = np.empty(A[0:-2])
bb = np.empty(B[0:-2])
self.b = np.broadcast(aa, bb).shape
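# Illustrative only: for a hypothetical batched matmul with A of shape
# (8, 1, 32, 64) and B of shape (4, 64, 16), the final branch above checks
# A[-1] == B[-2] == 64 and broadcasts the leading dims (8, 1) and (4,) to
# b = (8, 4), so m = 32, n = 16, k = 64 and
# flops() = numElems((8, 4)) * 32 * 16 * 64 * 2 = 2,097,152.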
def params(self):
return OrderedDict([('A', self.A), ('B', self.B), ('type', self.type)])
def tc(self):
if self.name in Matmul.NON_TC:
return "-"
else:
for s in TC_GEMMS:
if s in self.name:
return 1
return 0
def bytes(self):
# TODO: check bytes for non-GEMM cases
if self.name in Matmul.NON_GEMM:
return 2 * Utility.typeToBytes(self.type) * Utility.numElems(self.A) #could be B as well
else:
m, n, k = self.m, self.n, self.k
return Utility.typeToBytes(self.type) * (m*n + m*k + n*k)
def flops(self):
# TODO: calculate actual FLOPs. At least we're not saying it's GEMM FLOPs for now.
if self.name in Matmul.NON_GEMM:
return 0
else:
return Utility.numElems(self.b) * self.m * self.n * self.k * 2
def op(self):
return self.op_
def mod(self):
return self.mod_
class Mm(OperatorLayerBase):
def __init__(self, d):
marker = eval(d.argMarker[0])
mod = marker['mod']
op = marker['op']
args = marker['args']
self.marker = marker
self.mod_ = mod
self.op_ = op
self.args = args
assert (mod == "torch") and (op == "mm")
assert (len(args) == 2)
A,B = args
m,k1 = A['shape']
k2,n = B['shape']
assert (k1 == k2)
t1 = A['dtype']
t2 = B['dtype']
assert(t1 == t2)
self.A = A
self.B = B
self.m = m
self.n = n
self.k = k1
self.type = t1
self.name = d.name
return
def params(self):
p = OrderedDict([('M',self.n),('N',self.m),('K',self.k),('type',self.type)])
return p
def tc(self):
for s in TC_GEMMS:
if s in self.name:
return 1
return 0
def bytes(self):
m, n, k = self.m, self.n, self.k
return Utility.typeToBytes(self.type) * (m*n + m*k + n*k)
def flops(self):
return self.m * self.n * self.k * 2
def op(self):
return self.op_
def mod(self):
return self.mod_
from collections import OrderedDict
from .utility import Utility
from .base import OperatorLayerBase
class Conv(OperatorLayerBase):
"""
# N = batch size
# C,H,W = input channels, height, width
# K,P,Q = output channels, height, width
# R,S = filter height, width
# g = groups
"""
#todo: refine winograd and FFT
convAuxList = ["nchwToNhwc", "nhwcToNchw", "OffsetsKernel",]
winoAuxList = ["generateWinogradTilesKernel", "winogradWgradData", "winogradWgradOutput", "winogradWgradDelta"]
fftAuxList = ["compute_gemm_pointers", "flip_filter", "fft2d_r2c_", "fft2d_c2r_", "fft1d_r2c", "fft1d_c2r"]
miscAuxList = ["scaleTensor_kernel",]
convList = ["_s884cudnn_", "_s1688cudnn_", "_scudnn_", "2d_grouped_direct_kernel", "cudnn::detail::implicit_convolve_sgemm", "cudnn::detail::dgrad2d_alg1_1", "cudnn::detail::wgrad_alg0_engine", "cudnn::detail::dgrad_engine", "dgrad_1x1_stride_2x2", "spatialDepthwiseConvolutionUpdateOutput"]
winoList = ["winograd3x3Kernel", "_sgemm_"]
fftList = ["fermiPlusCgemmLDS128_batched", "_gcgemm_",]
miscList = []
def __init__(self, d):
marker = eval(d.argMarker[0])
mod = marker['mod']
op = marker['op']
args = marker['args']
self.marker = marker
self.mod_ = mod
self.op_ = op
self.args = args
self.dir = d.dir
self.name = d.name
self.sub = d.sub
assert (mod == "torch.nn.functional")
assert (op in ["conv1d", "conv2d"])
length = len(args)
assert (length >= 2) and (length <= 7)
i,w = args[0], args[1]
assert (i['type'] == "tensor")
assert (w['type'] == "tensor")
#ignore bias
if (length >= 4) and (args[3]['name'] == ""):
s = args[3]
elif any(x['name'] == 'stride' for x in args):
s = list(filter(lambda x : x['name'] == 'stride', args))[0]
else:
s = {'name': 'stride', 'type': 'int', 'value': 1}
if (length >= 5) and (args[4]['name'] == ""):
p = args[4]
elif any(x['name'] == 'padding' for x in args):
p = list(filter(lambda x : x['name'] == 'padding', args))[0]
else:
p = {'name': 'padding', 'type': 'int', 'value': 0}
if (length >= 6) and (args[5]['name'] == ""):
d = args[5]
elif any(x['name'] == 'dilation' for x in args):
d = list(filter(lambda x : x['name'] == 'dilation', args))[0]
else:
d = {'name': 'dilation', 'type': 'int', 'value': 1}
if (length == 7) and (args[6]['name'] == ""):
g = args[6]
elif any(x['name'] == 'groups' for x in args):
g = list(filter(lambda x : x['name'] == 'groups', args))[0]
else:
g = {'name': 'groups', 'type': 'int', 'value': 1}
if op == "conv1d":
assert (len(i['shape']) == 3)
assert (len(w['shape']) == 3)
assert (i['dtype'] == w['dtype'])
N, C1, W = i['shape']
K, C2, S = w['shape']
assert (C1 == C2)
p = p['value'] if Utility.isscalar(p['type']) else p['value'][0]
s = s['value'] if Utility.isscalar(s['type']) else s['value'][0]
d = d['value'] if Utility.isscalar(d['type']) else d['value'][0]
g = g['value']
assert (g == 1)
H = 1
R = 1
P = 1 + (H - (((R-1))+1))
Q = 1 + (W + 2*p - (((S-1)*d)+1))/s
P = int(P)
Q = int(Q)
if (H == 1):
assert (P == 1)
if (W == 1):
assert (Q == 1)
self.N = N
self.C = C1
self.H = H
self.W = W
self.K = K
self.P = P
self.Q = Q
self.R = R
self.S = S
self.ph = 0
self.pw = p
self.U = 1
self.V = s
self.dh = 1
self.dw = d
self.g = g
self.type = i['dtype']
elif op == "conv2d":
assert (len(i['shape']) == 4)
assert (len(w['shape']) == 4)
assert (i['dtype'] == w['dtype'])
N, C1, H, W = i['shape']
K, C2, R, S = w['shape']
if Utility.isscalar(p['type']):
ph = pw = p['value']
else:
assert (p['type'] == "tuple")
ph, pw = p['value']
if Utility.isscalar(s['type']):
sh = sw = s['value']
else:
assert (s['type'] == "tuple")
sh, sw = s['value']
if Utility.isscalar(d['type']):
dh = dw = d['value']
else:
assert (d['type'] == "tuple")
dh, dw = d['value']
g = g['value']
assert (g >= 1)
assert (C1 == C2*g)
P = 1 + (H + 2*ph - (((R-1)*dh)+1))/sh
Q = 1 + (W + 2*pw - (((S-1)*dw)+1))/sw
P = int(P)
Q = int(Q)
if (H == 1):
assert (P == 1)
if (W == 1):
assert (Q == 1)
self.N = N
self.C = C1
self.H = H
self.W = W
self.K = K
self.P = P
self.Q = Q
self.R = R
self.S = S
self.ph = ph
self.pw = pw
self.U = sh
self.V = sw
self.dh = dh
self.dw = dw
self.g = g
self.type = i['dtype']
else:
assert False
def params(self):
p = OrderedDict([('N',self.N), ('C',self.C), ('H',self.H), ('W',self.W), ('K',self.K), ('P',self.P), ('Q',self.Q), ('R',self.R), ('S',self.S), ('ph',self.ph), ('pw',self.pw), ('U',self.U), ('V',self.V), ('dh',self.dh), ('dw',self.dw), ('g',self.g), ('type',self.type)])
return p
def conv_bytes_flops(self, N, C, H, W, K, P, Q, R, S, g, t):
f = 2*N*K*P*Q*C*R*S/g #for fprop
elems = N*C*H*W + K*C*R*S/g + N*K*P*Q
b = elems * Utility.typeToBytes(t)
return b,f
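# Illustrative only: with the fprop model above, a hypothetical 3x3 convolution
# with N=32, C=K=64, H=W=P=Q=56, R=S=3, g=1 costs
#   f = 2*32*64*56*56*64*3*3 / 1 ~= 7.4e9 FLOPs
# and streams N*C*H*W + K*C*R*S + N*K*P*Q ~= 12.9e6 elements.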
def bytes_flops(self):
N,C,H,W,K,P,Q,R,S,ph,pw,U,V,dh,dw,g,t = self.params().values()
if any(x in self.name for x in Conv.convAuxList+Conv.winoAuxList+Conv.fftAuxList+Conv.miscAuxList):
bytes, flops = [0, 0]
elif any(x in self.name for x in Conv.convList+Conv.winoList+Conv.fftList+Conv.miscList):
if g == 1:
bytes, flops = self.conv_bytes_flops(N,C,H,W,K,P,Q,R,S,g,t)
else:
if "2d_grouped_direct_kernel" in self.name: #only 1 kernel is called
bytes, flops = self.conv_bytes_flops(N,C,H,W,K,P,Q,R,S,g,t)
elif "spatialDepthwiseConvolutionUpdateOutput" in self.name: #one kernel for separable conv
bytes, flops = self.conv_bytes_flops(N,C,H,W,K,P,Q,R,S,g,t)
else: #a kernel per group is called
bytes, flops = self.conv_bytes_flops(N,C/g,H,W,K/g,P,Q,R,S,1,t)
elif ("calc_bias_diff" in self.name): #bias gradient
elems = N*K*P*Q
flops = elems
bytes = 2 * elems * Utility.typeToBytes(t)
#params = OrderedDict([('N',N), ('K',K), ('P',P), ('Q',Q), ('type', t)])
else:
bytes, flops = [0, 0]
return bytes, flops
def bytes(self):
b,_ = self.bytes_flops()
return b
def flops(self):
_,f = self.bytes_flops()
return f
def tc(self):
for s in ["884cudnn", "1688cudnn"]:
if s in self.name:
return 1
return "-"
def op(self):
return self.op_
def mod(self):
return self.mod_
from collections import OrderedDict
from .utility import Utility
from .base import OperatorLayerBase
class Convert(OperatorLayerBase):
"""
Class to handle convert operations.
"""
ops = ["byte", "char", "double", "float", "half", "int", "long", "short", "to"]
def __init__(self, d):
marker = eval(d.argMarker[0])
mod = marker['mod']
op = marker['op']
args = marker['args']
self.marker = marker
self.mod_ = mod
self.op_ = op
self.args = args
assert (mod == "Tensor")
assert (op in Convert.ops)
assert (len(args) == 1)
#The argument could be a tensor or scalar
t = args[0]
if t['type'] == "tensor":
shape = t['shape']
stype = t['dtype']
else:
shape = (1,)
stype = t['type']
if self.op_ == "to":
op = stype
self.shape = shape
self.stype = stype
self.dtype = op
def params(self):
p = OrderedDict([('T', self.shape), ('stype', self.stype), ('dtype', self.dtype)])
return p
def op(self):
return self.op_
def mod(self):
return self.mod_
def tc(self):
return "-"
def elems(self):
return Utility.numElems(self.shape)
def flops(self):
return 0
def bytes(self):
b = self.elems() * (Utility.typeToBytes(self.stype) + Utility.typeToBytes(self.dtype))
return b
from .utility import Utility
class Data(object):
"""
Class to store all the data for every kernel e.g. name, bytes, flops, device, stream etc.
"""
def __init__(self, kernel):
#Available from NVprof
self.tid = kernel['tid']
self.device = kernel['device']
self.stream = kernel['stream']
self.grid = str(kernel['grid']).replace(" ","").replace("(","").replace(")","")
self.block = str(kernel['block']).replace(" ","").replace("(","").replace(")","")
self.name = kernel['kShortName'].replace(" ","_")
self.lName = kernel['kLongName']
self.sil = kernel['kDuration'] #units ns
self.index = None
#Markers
self.argMarker = kernel['marker']
self.modMarker = kernel['reprMarkers']
self.seqMarker = kernel['seqMarker']
self.layer = kernel['layer']
self.trace = kernel['trace']
self.seqId = kernel['seqId']
self.altSeqId = kernel['altSeqId']
self.dir = kernel['dir']
self.sub = kernel['subSeqId']
self.mod = "na"
self.op = "na"
self.params = {"na":"na"}
self.tc = "na"
self.flops = 0
self.bytes = 0
def setParams(self, params):
#Remove space from params
qaz = ""
for key,value in params.items():
if "type" not in key:
qaz += "{}={},".format(key,value)
else:
if type(value) is str:
qaz += "{},".format(Utility.typeToString(value))
else:
qaz += "{}".format(value)
self.params = qaz.replace(" ", "")
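# Illustrative only (not part of the original file): a minimal kernel record of
# the shape Data() consumes. The real records are produced by parse.py; the
# field names below simply mirror the lookups in __init__, and the values are
# hypothetical.
_EXAMPLE_KERNEL = {
    'tid': 0, 'device': 0, 'stream': 7,
    'grid': (1, 1, 1), 'block': (128, 1, 1),
    'kShortName': 'volta_fp16_s884gemm', 'kLongName': 'volta_fp16_s884gemm',
    'kDuration': 12345,  # ns
    'marker': ["{'mod': 'torch', 'op': 'mm', 'args': []}"],
    'reprMarkers': [], 'seqMarker': [], 'layer': [], 'trace': [],
    'seqId': [1], 'altSeqId': [], 'dir': 'fprop', 'subSeqId': 0,
}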
from collections import OrderedDict
from .utility import Utility
from .base import OperatorLayerBase
class Dropout(OperatorLayerBase):
def __init__(self, d):
marker = eval(d.argMarker[0])
mod = marker['mod']
op = marker['op']
args = marker['args']
self.marker = marker
self.mod_ = mod
self.op_ = op
self.args = args
assert (mod == "torch.nn.functional")
assert (op == "dropout")
#assert (len(args) == 1)
self.shape = args[0]['shape']
self.type = args[0]['dtype']
self.dir = d.dir
return
def params(self):
p = OrderedDict([('T', self.shape), ('type', self.type)])
return p
def op(self):
return self.op_
def mod(self):
return self.mod_
def tc(self):
return "-"
def elems(self):
return Utility.numElems(self.shape)
def bytes(self):
#Ignoring the cost of writing and reading the mask
return Utility.typeToBytes(self.type) * self.elems() * 2
def flops(self):
# Note: This is approximate and depends on the RNG
return 5*self.elems()
from collections import OrderedDict
from .utility import Utility
from .base import OperatorLayerBase
class Embedding(OperatorLayerBase):
def __init__(self, d):
marker = eval(d.argMarker[0])
mod = marker['mod']
op = marker['op']
args = marker['args']
self.marker = marker
self.mod_ = mod
self.op_ = op
self.args = args
assert (mod == "torch.nn.functional")
assert (op == "embedding")
self.ishape = args[0]['shape']
self.itype = args[0]['dtype']
self.eshape = args[1]['shape']
self.etype = args[1]['dtype']
assert (len(self.eshape) == 2)
self.dir = d.dir
self.sub = d.sub
return
def params(self):
p = OrderedDict([('I', self.ishape), ('itype', self.itype), ('E', self.eshape), ('etype', self.etype)])
return p
def op(self):
return self.op_
def mod(self):
return self.mod_
def tc(self):
return "-"
def bytes(self):
ishape = self.ishape
itype = self.itype
eshape = self.eshape
etype = self.etype
ielems = Utility.numElems(ishape)
b = 0
if self.dir == "fprop":
#indices
b += ielems * Utility.typeToBytes(itype)
#read and write the embedding matrix
b += ielems * eshape[1] * 2 * Utility.typeToBytes(etype)
else:
#3 times the size of the incoming gradient
b = ielems * eshape[1] * 3 * Utility.typeToBytes(etype)
if self.sub > 0:
b = 0
return b
def flops(self):
# Note: not implemented yet
return 0
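# Illustrative only: for a hypothetical fprop lookup of 4096 int64 indices into
# a float32 embedding matrix of shape (50000, 1024), bytes() above counts
#   4096*8 (read the indices) + 4096*1024*2*4 (read and write the gathered rows)
#   = 32,768 + 33,554,432 bytes.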
from collections import OrderedDict
from .utility import Utility
import numpy as np
from .base import OperatorLayerBase
class Cat(OperatorLayerBase):
def __init__(self, d):
marker = eval(d.argMarker[0])
mod = marker['mod']
op = marker['op']
args = marker['args']
self.marker = marker
self.mod_ = mod
self.op_ = op
self.args = args
assert (mod == "torch")
assert (op == "cat")
assert (len(args) >= 2)
t = args[0]['dtype']
shapes = []
for arg in args:
if arg['type'] == "tensor":
assert (arg['dtype'] == t)
shapes.append(arg['shape'])
self.type = t
self.shapes = shapes
def params(self):
p = OrderedDict([('T', self.shapes), ('type', self.type)])
return p
def flops(self):
return 0
def tc(self):
return "-"
def op(self):
return self.op_
def mod(self):
return self.mod_
def bytes(self):
b = 0
for s in self.shapes:
b += Utility.numElems(s)
return 2 * b * Utility.typeToBytes(self.type)
class Reshape(OperatorLayerBase):
def __init__(self, d):
marker = eval(d.argMarker[0])
mod = marker['mod']
op = marker['op']
args = marker['args']
self.marker = marker
self.mod_ = mod
self.op_ = op
self.args = args
assert (mod == "Tensor")
assert (op == "reshape")
#Temporarily commenting three lines
#assert (len(args) == 2)
#t,s = args
#assert s['type'] == "tuple"
t = args[0]
assert t['type'] == "tensor"
self.type = t['dtype']
self.shape = t['shape']
def params(self):
p = OrderedDict([('T', self.shape), ('type', self.type)])
return p
def flops(self):
return 0
def tc(self):
return "-"
def op(self):
return self.op_
def mod(self):
return self.mod_
def bytes(self):
return 0
class Gather(OperatorLayerBase):
def __init__(self, d):
marker = eval(d.argMarker[0])
mod = marker['mod']
op = marker['op']
args = marker['args']
self.marker = marker
self.mod_ = mod
self.op_ = op
self.args = args
assert (mod == "Tensor") or (mod == "torch")
assert (op == "gather")
#Filter out the "out" parameter
args = list(filter(lambda x : x['name'] != 'out', args))
assert (len(args) == 3)
#Get input
if (args[0]['name'] == ""):
arg = args[0]
else:
arg = list(filter(lambda x : x['name'] == "input", args))[0]
assert (arg['type'] == "tensor")
self.shape = arg['shape']
self.type = arg['dtype']
def params(self):
p = OrderedDict([('T', self.shape),('type', self.type)])
return p
def flops(self):
return 0
def tc(self):
return "-"
def op(self):
return self.op_
def mod(self):
return self.mod_
def bytes(self):
return 2 * Utility.numElems(self.shape) * Utility.typeToBytes(self.type)
class MaskedScatter(OperatorLayerBase):
def __init__(self, d):
marker = eval(d.argMarker[0])
mod = marker['mod']
op = marker['op']
args = marker['args']
self.marker = marker
self.mod_ = mod
self.op_ = op
self.args = args
assert (mod == "Tensor")
assert (op == "masked_scatter_")
assert (len(args) == 3)
dst, mask, src = args
assert (dst['type'] == mask['type'] == src['type'] == "tensor")
assert (mask['dtype'] == "uint8")
assert (dst['dtype'] == src['dtype'])
assert (dst['shape'] == mask['shape'])
self.shape = dst['shape']
self.type = dst['dtype']
self.seqId = d.seqId
def params(self):
p = OrderedDict([('T', self.shape),('type', self.type)])
return p
def flops(self):
return 0
def tc(self):
return "-"
def op(self):
return self.op_
def mod(self):
return self.mod_
def bytes(self):
elems = Utility.numElems(self.shape)
#src and dst
b = 2 * elems * Utility.typeToBytes(self.type)
#mask (uint8)
b += elems
if (self.seqId > 0):
b = 0
return b
class Nonzero(OperatorLayerBase):
def __init__(self, d):
marker = eval(d.argMarker[0])
mod = marker['mod']
op = marker['op']
args = marker['args']
self.marker = marker
self.mod_ = mod
self.op_ = op
self.args = args
assert (mod in ["torch", "Tensor"])
assert (op == "nonzero")
assert (len(args) == 1)
arg = args[0]
self.shape = arg['shape']
self.type = arg['dtype']
self.seqId = d.seqId
def params(self):
p = OrderedDict([('T', self.shape),('type', self.type)])
return p
def flops(self):
return 0
def tc(self):
return "-"
def op(self):
return self.op_
def mod(self):
return self.mod_
def bytes(self):
elems = Utility.numElems(self.shape)
dim = len(self.shape)
#input tensor
b = elems * Utility.typeToBytes(self.type)
#in the worst case, the output is a (elems x dim) tensor of type "long"
b += elems * dim * Utility.typeToBytes("int64")
if self.seqId > 0:
return 0
else:
return b
class IndexSelect(OperatorLayerBase):
def __init__(self, d):
marker = eval(d.argMarker[0])
mod = marker['mod']
op = marker['op']
args = marker['args']
self.marker = marker
self.mod_ = mod
self.op_ = op
self.args = args
assert (mod == "Tensor") or (mod == "torch")
assert (op == "index_select")
#Filter out the "out" parameter
args = list(filter(lambda x : x['name'] != 'out', args))
assert (len(args) == 3)
#Get input, dim and index
if (args[0]['name'] == ""):
t = args[0]
else:
t = list(filter(lambda x : x['name'] == "input", args))[0]
if (args[1]['name'] == ""):
d = args[1]
else:
d = list(filter(lambda x : x['name'] == "dim", args))[0]
if (args[2]['name'] == ""):
i = args[2]
else:
i = list(filter(lambda x : x['name'] == "index", args))[0]
assert (t['type'] == i['type'] == "tensor")
assert (d['type'] == "int")
assert (i['dtype'] == "int64")
assert (len(i['shape']) == 1)
shape = t['shape']
dim = d['value']
indices = i['shape'][0]
assert (dim < len(shape))
self.shape = shape
self.dim = dim
self.indices = indices
self.type = t['dtype']
def params(self):
p = OrderedDict([('T', self.shape),('D', self.dim),('I', self.indices),('type', self.type)])
return p
def tc(self):
return "-"
def op(self):
return self.op_
def mod(self):
return self.mod_
def flops(self):
return 0
def bytes(self):
#determine the shape of the output tensor
shape = list(self.shape)
shape[self.dim] = self.indices
b = 0
#time to read the input and write the output
elems = Utility.numElems(shape)
b += 2 * elems * Utility.typeToBytes(self.type)
#time to read the indices
b += self.indices * Utility.typeToBytes("int64")
return b
class MaskedSelect(OperatorLayerBase):
def __init__(self, d):
marker = eval(d.argMarker[0])
mod = marker['mod']
op = marker['op']
args = marker['args']
self.marker = marker
self.mod_ = mod
self.op_ = op
self.args = args
self.sub = d.sub
assert (mod == "Tensor") or (mod == "torch")
assert (op == "masked_select")
#Filter out the "out" parameter
args = list(filter(lambda x : x['name'] != 'out', args))
assert (len(args) == 2)
#Get input and mask
if (args[0]['name'] == ""):
t = args[0]
else:
t = list(filter(lambda x : x['name'] == "input", args))[0]
if (args[1]['name'] == ""):
m = args[1]
else:
m = list(filter(lambda x : x['name'] == "mask", args))[0]
assert (m['dtype'] == "uint8")
tensor = t['shape']
mask = m['shape']
#check for broadcast condition
if (tensor != mask):
array1 = np.empty(list(tensor))
array2 = np.empty(list(mask))
try:
out = np.broadcast(array1, array2).shape
except:
assert False
self.tshape = tensor
self.mshape = mask
self.type = t['dtype']
def params(self):
p = OrderedDict([('T', self.tshape),('M', self.mshape),('type', self.type)])
return p
def tc(self):
return "-"
def op(self):
return self.op_
def mod(self):
return self.mod_
def bytes(self):
tensor = self.tshape
mask = self.mshape
t = self.type
#in the worst case, #output elements = #input elements
b = 2 * Utility.numElems(tensor) * Utility.typeToBytes(t)
#mask tensor (assuming uint8)
b += Utility.numElems(mask)
return b
def flops(self):
return 0
from collections import OrderedDict
from .utility import Utility
from .base import OperatorLayerBase
class Linear(OperatorLayerBase):
'''
Notes:
If the bias occurs before the GEMM, then its 1 write (bias expansion).
If the bias occurs after, then its 1 read and 1 write.
bias in bprop is a reduction and hence is 1 read.
'''
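# Illustrative only: for a hypothetical linear layer with input X of shape
# (batch=64, in_features=1024) and weight W of shape (out=4096, in=1024),
# setXWBMNK() below yields m=4096, n=(64,), k=1024, so params() reports
#   fprop         : M=4096, N=64,   K=1024
#   bprop, sub==0 : M=1024, N=64,   K=4096  (dgrad, most likely)
#   bprop, sub==1 : M=1024, N=4096, K=64    (wgrad, most likely)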
gemmKernels = ["gemm", "gemv", "dot_kernel", "splitKreduce_kernel", "reduce_1Block_kernel"]
biasKernels = ["kernelReduceContigDim", "kernelReduceNoncontigDim_shared", "elementwise_kernel", "reduce_kernel"]
def setXWBMNK(self, args):
x = None
w = None
b = None
if (len(args) == 2):
x,w = args
elif (len(args) == 3):
x,w,b = args
assert (x['type'] == w['type'] == "tensor")
if (b['type'] == "tensor"):
assert(len(b['shape']) == 1)
elif (b['type'] == "NoneType"):
assert b['value'] is None
b = None
else:
assert False
else:
assert False
assert(len(w['shape']) == 2)
k1 = x['shape'][-1]
n,k2 = w['shape']
assert(k1 == k2)
if b is not None:
assert(b['shape'][0] == n)
t1 = x['dtype']
t2 = w['dtype']
assert(t1 == t2)
# X, W, B
self.x = x['shape']
self.w = w['shape']
self.b = b['shape'] if b is not None else None
self.type = t1
# M, N, K
#n = Utility.numElems(x[0:-1])
n = self.x[0:-1]
k = self.x[-1]
m,k1 = self.w
assert (k == k1)
self.m = m
self.n = n
self.k = k
def tc(self):
if self.op() == "linear":
return 1 if "884gemm" in self.name else 0
else:
return "-"
def __init__(self, d):
self.name = d.name
self.dir = d.dir
self.sub = d.sub
marker = eval(d.argMarker[0])
mod = marker['mod']
op = marker['op']
args = marker['args']
self.mod_ = mod
assert (mod == "torch.nn.functional")
assert (op == "linear")
self.setXWBMNK(args)
if any(x in d.name for x in Linear.gemmKernels):
self.op_ = "linear"
else:
assert (d.name in Linear.biasKernels)
self.op_ = "bias"
'''
elif (("kernelPointwiseApply2" in d.name) or ("kernelReduceContigDim" in d.name) or ("kernelReduceNoncontigDim_shared" in d.name)):
#bias expansion was before the gemm
self.op_ = "bias"
elif ("elementwise_kernel" in d.name):
#Bias addition happens later with a broadcast tensor
self.op_ = "bias"
assert (len(d.argMarker) == 2)
marker = eval(d.argMarker[1])
mod = marker['mod']
op = marker['op']
args = marker['args']
assert (mod == "Tensor")
assert (op == "__iadd__")
assert (len(args) == 2)
mn = args[0]['shape']
b = args[1]['shape']
assert (len(b) == 1)
assert (mn == (self.n + (self.m,)))
assert (b == self.b)
else:
assert False
'''
def params(self):
#p = OrderedDict([('X', self.x), ('W', self.w), ('B', self.b), ('type', self.type)])
m, n, k, x, w, t = self.m, self.n, self.k, self.x, self.w, self.type
if len(n) == 1:
n = n[0]
if self.op_ == "linear":
if self.dir == "fprop":
p = OrderedDict([('M', m), ('N', n), ('K', k), ('type', t)])
elif self.dir == "bprop":
if self.sub == 0: #dgrad (most likely)
p = OrderedDict([('M', k), ('N', n), ('K', m), ('type', t)])
elif self.sub == 1: #wgrad (most likely)
p = OrderedDict([('M', k), ('N', m), ('K', n), ('type', t)])
else:
#This happens when there are additional kernels for reduction
p = OrderedDict([('X', x), ('W', w), ('type', t)])
else:
assert False
elif self.op_ == "bias":
p = OrderedDict([('M', m), ('N', n), ('type', t)])
else:
assert False
return p
def op(self):
return self.op_
def bytesFlops(self):
m = self.m
n = Utility.numElems(self.n)
k = self.k
if self.op_ == "linear":
if self.dir == "fprop":
f = m * n * k * 2
b = (m*n + m*k + n*k) * Utility.typeToBytes(self.type)
elif self.dir == "bprop":
if self.sub == 0: #dgrad (most likely)
f = m * n * k * 2
b = (m*n + m*k + n*k) * Utility.typeToBytes(self.type)
elif self.sub == 1: #wgrad (most likely)
f = m * n * k * 2
b = (m*n + m*k + n*k) * Utility.typeToBytes(self.type)
else:
#This happens when there are additional kernels for reduction
f = 0
b = 0
else:
assert False
elif self.op_ == "bias":
f = m * n
b = 2 * m * n * Utility.typeToBytes(self.type)
else:
assert False
return b,f
def bytes(self):
b, f = self.bytesFlops()
return b
def flops(self):
b, f = self.bytesFlops()
return f
def mod(self):
return self.mod_
from collections import OrderedDict
from .utility import Utility
from .base import OperatorLayerBase
#TODO: Add support for additional loss functions.
class MSELoss(OperatorLayerBase):
def __init__(self, d):
marker = eval(d.argMarker[0])
mod = marker['mod']
op = marker['op']
args = marker['args']
self.marker = marker
self.mod_ = mod
self.op_ = op
self.args = args
assert (mod == "torch.nn.functional")
assert (op == "mse_loss")
assert (len(args) == 3)
#Get input, target and reduction
if (args[0]['name'] == ""):
x = args[0]
else:
x = list(filter(lambda x : x['name'] == "input", args))[0]
if (args[1]['name'] == ""):
y = args[1]
else:
y = list(filter(lambda x : x['name'] == "target", args))[0]
if (args[2]['name'] == ""):
r = args[2]
else:
r = list(filter(lambda x : x['name'] == "reduction", args))[0]
assert (x['type'] == y['type'] == "tensor")
assert (x['shape'] == y['shape'])
assert (x['dtype'] == y['dtype'])
assert (r['type'] == "str")
assert (r['value'] in ["none", "mean", "sum"])
self.shape = x['shape']
self.type = x['dtype']
self.red = r['value']
self.dir = d.dir
def params(self):
p = OrderedDict([('T', self.shape), ('type', self.type), ('red', self.red)])
return p
def elems(self):
red = self.red
e = Utility.numElems(self.shape)
if self.dir == "fprop":
if red == "none":
e *= 3
else:
e *= 2
else:
if red == "none":
e *= 4
else:
e *= 3
return e
def bytes(self):
return self.elems() * Utility.typeToBytes(self.type)
def flops(self):
return self.elems() * 2 + 1
def tc(self):
return "-"
def op(self):
return self.op_
def mod(self):
return self.mod_
from collections import OrderedDict
from .utility import Utility
from .base import OperatorLayerBase
class Foo(OperatorLayerBase):
"""
An object of Foo is instantiated when we detect an unsupported operator.
"""
def __init__(self, d):
marker = eval(d.argMarker[0])
mod = marker['mod']
op = marker['op']
args = marker['args']
self.marker = marker
self.mod_ = mod
self.op_ = op
self.args = args
shapes = []
types = []
for arg in args:
if arg['type'] == "tensor":
shapes.append(arg['shape'])
types.append(arg['dtype'])
self.shape = shapes
self.type = types
def params(self):
p = OrderedDict([('T', self.shape), ('type', self.type)])
return p
def tc(self):
return "-"
def op(self):
return self.op_
def mod(self):
return self.mod_
def flops(self):
return 0
def bytes(self):
return 0
class Copy(OperatorLayerBase):
def __init__(self, d):
marker = eval(d.argMarker[0])
mod = marker['mod']
op = marker['op']
args = marker['args']
self.marker = marker
self.mod_ = mod
self.op_ = op
self.args = args
assert (mod == "Tensor")
assert (op == "copy_")
assert (len(args) == 2)
dst, src = args
assert (src['type'] == dst['type'])
assert (src['shape'] == dst['shape'])
self.shape = src['shape']
self.stype = src['dtype']
self.dtype = dst['dtype']
def params(self):
#The data type might be different
p = OrderedDict([('T', self.shape), ('stype', self.stype), ('dtype', self.dtype)])
return p
def tc(self):
return "-"
def op(self):
return self.op_
def mod(self):
return self.mod_
def flops(self):
return 0
def elems(self):
return Utility.numElems(self.shape)
def bytes(self):
return self.elems() * (Utility.typeToBytes(self.stype) + Utility.typeToBytes(self.dtype))
class Clone(OperatorLayerBase):
def __init__(self, d):
marker = eval(d.argMarker[0])
mod = marker['mod']
op = marker['op']
args = marker['args']
self.marker = marker
self.mod_ = mod
self.op_ = op
self.args = args
assert (mod == "Tensor")
assert (op == "clone")
assert (len(args) == 1)
t = args[0]
self.shape = t['shape']
self.type = t['dtype']
def params(self):
p = OrderedDict([('T', self.shape), ('type', self.type)])
return p
def flops(self):
return 0
def tc(self):
return "-"
def op(self):
return self.op_
def mod(self):
return self.mod_
def elems(self):
return Utility.numElems(self.shape)
def bytes(self):
return 2 * self.elems() * Utility.typeToBytes(self.type)
class Contiguous(OperatorLayerBase):
def __init__(self, d):
marker = eval(d.argMarker[0])
mod = marker['mod']
op = marker['op']
args = marker['args']
self.marker = marker
self.mod_ = mod
self.op_ = op
self.args = args
assert (mod == "Tensor")
assert (op == "contiguous")
assert (len(args) == 1)
t = args[0]
self.shape = t['shape']
self.type = t['dtype']
def params(self):
p = OrderedDict([('T', self.shape), ('type', self.type)])
return p
def flops(self):
return 0
def bytes(self):
return 2 * Utility.numElems(self.shape) * Utility.typeToBytes(self.type)
def tc(self):
return "-"
def op(self):
return self.op_
def mod(self):
return self.mod_
class Any(OperatorLayerBase):
def __init__(self, d):
marker = eval(d.argMarker[0])
mod = marker['mod']
op = marker['op']
args = marker['args']
self.marker = marker
self.mod_ = mod
self.op_ = op
self.args = args
assert (mod == "Tensor")
assert (op == "any")
assert (len(args) == 1) #could be 2 as well, the second argument is a bool
t = args[0]
self.shape = t['shape']
self.type = t['dtype']
self.sub = d.sub
return
def params(self):
p = OrderedDict([('T', self.shape), ('type', self.type)])
return p
def op(self):
return self.op_
def mod(self):
return self.mod_
def tc(self):
return "-"
def flops(self):
return 0
def bytes(self):
return Utility.numElems(self.shape) * Utility.typeToBytes(self.type)
from collections import OrderedDict
from .utility import Utility
from .base import OperatorLayerBase
class BatchNorm(OperatorLayerBase):
def __init__(self, d):
marker = eval(d.argMarker[0])
mod = marker['mod']
op = marker['op']
args = marker['args']
self.marker = marker
self.mod_ = mod
self.op_ = op
self.args = args
assert (op == "batch_norm")
assert (len(args) == 8)
i = args[0]
assert (i['type'] == "tensor")
self.shape = i['shape']
self.type = i['dtype']
self.dir = d.dir
def params(self):
p = OrderedDict([('T', self.shape), ('type', self.type)])
return p
def tc(self):
return "-"
def op(self):
return self.op_
def mod(self):
return self.mod_
def elems(self):
return Utility.numElems(self.shape)
def flops(self):
# Variance algo-dependent, but this is a reasonable value.
return self.elems() * 8
def bytes(self):
e = self.elems()
if self.dir == "fprop":
e *= 4
else:
e *= 5
return e * Utility.typeToBytes(self.type)
from collections import OrderedDict
from .utility import Utility
from .base import OperatorLayerBase
#TODO: Add support for other optimizers.
class Adam(OperatorLayerBase):
def __init__(self, d):
marker = eval(d.argMarker[0])
mod = marker['mod']
op = marker['op']
args = marker['args']
self.marker = marker
self.mod_ = mod
self.op_ = op
self.args = args
assert(op == "adam")
assert (len(args) == 12) or (len(args) == 14)
w, hw, m, v, g = args[0:5]
assert (w['shape'] == m['shape'] == v['shape'] == g['shape'])
assert (hw['shape'] == w['shape']) or (hw['shape'] == (0,)) #hw could be null
assert (w['type'] == m['type'] == v['type'] == g['type'] == hw['type'] == "tensor")
assert (w['dtype'] == m['dtype'] == v['dtype'] == "float32")
self.w = w
self.g = g
def params(self):
p = OrderedDict([('T',self.w['shape']), ('wtype',self.w['dtype']), ('gtype',self.g['dtype'])])
return p
def flops(self):
return 0
def bytes(self):
wshape = self.w['shape']
wtype = self.w['dtype']
gtype = self.g['dtype']
b = 0
elems = Utility.numElems(wshape)
#Get time to stream read/write w, m, v
b += 6 * elems * Utility.typeToBytes(wtype)
#Get time to read "g"
b += elems * Utility.typeToBytes(gtype)
if wtype != gtype: #mixed precision
#Get time to write "hw"
b += elems * Utility.typeToBytes(gtype)
return b
def tc(self):
return "-"
def op(self):
return self.op_
def mod(self):
return self.mod_
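# Illustrative only: for a hypothetical FP32 parameter tensor of 1,000,000
# elements with gtype == wtype (so the extra "hw" write does not apply),
# bytes() above counts 6 * 1e6 * 4 for streaming w, m, v (one read and one
# write each) plus 1e6 * 4 for reading g, i.e. 28,000,000 bytes in total.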
import errno, os, sys
class Output():
"""
This class handles printing of a columned output and a CSV.
"""
# The table below is organized as
# user_option: [output_header, attribute_in_Data_class, type, min_width_in_columned_output]
table = {
"idx": ["Idx", "index", int, 7],
"seq": ["SeqId", "seqId", str, 7],
"altseq": ["AltSeqId", "altSeqId", str, 7],
"tid": ["TId", "tid", int, 12],
"layer": ["Layer", "layer", str, 10],
"trace": ["Trace", "trace", str, 25],
"dir": ["Direction", "dir", str, 5],
"sub": ["Sub", "sub", int, 3],
"mod": ["Module", "mod", str, 15],
"op": ["Op", "op", str, 15],
"kernel": ["Kernel", "name", str, 0],
"params": ["Params", "params", str, 0],
"sil": ["Sil(ns)", "sil", int, 10],
"tc": ["TC", "tc", str, 2],
"device": ["Device", "device", int, 3],
"stream": ["Stream", "stream", int, 3],
"grid": ["Grid", "grid", str, 12],
"block": ["Block", "block", str, 12],
"flops": ["FLOPs", "flops", int, 12],
"bytes": ["Bytes", "bytes", int, 12]
}
def __init__(self, args):
self.cols = args.c
self.csv = args.csv
self.col = True if (args.w > 0) else False
self.width = args.w
w = 0
for col in self.cols:
assert col in Output.table.keys()
w += Output.table[col][3]
if ((self.col) and (w > self.width)):
print("Minimum width required to print {} = {}. Exiting.".format(",".join(self.cols), w))
sys.exit(1)
remainder = self.width - w
if ("kernel" in self.cols) and ("params" in self.cols):
Output.table["kernel"][3] = int(remainder/2)
Output.table["params"][3] = int(remainder/2)
elif ("kernel" in self.cols):
Output.table["kernel"][3] = remainder
elif ("params" in self.cols):
Output.table["params"][3] = remainder
#header format
cadena = ""
for col in self.cols:
_,_,t,w = Output.table[col]
cadena += "%-{}.{}s ".format(w,w)
self.hFormat = cadena
#data format
cadena = ""
for col in self.cols:
_,_,t,w = Output.table[col]
if (t == str):
cadena += "%-{}.{}s ".format(w,w)
elif (t == int):
cadena += "%{}d ".format(w)
self.dFormat = cadena
def foo(self, cadena, pformat):
if self.csv:
cadena = ",".join(map(lambda x : '"' + str(x) + '"', cadena))
elif self.col:
cadena = pformat % cadena
else:
cadena = " ".join(map(str,cadena))
try:
print(cadena)
except IOError as e:
#gracefully handle pipes
if e.errno == errno.EPIPE:
# Python flushes standard streams on exit; redirect remaining output
# to devnull to avoid another BrokenPipeError at shutdown
devnull = os.open(os.devnull, os.O_WRONLY)
os.dup2(devnull, sys.stdout.fileno())
sys.exit(0)
else:
sys.exit(-1)
def header(self):
cadena = ()
for col in self.cols:
h = Output.table[col][0]
cadena = cadena + (h,)
self.foo(cadena, self.hFormat)
def data(self, a):
if a.dir == "":
direc = "na"
else:
direc = a.dir
if a.op == "":
op = "na"
else:
op = a.op
if a.mod == "":
mod = "na"
else:
mod = a.mod
cadena = ()
for col in self.cols:
attr = Output.table[col][1]
val = getattr(a, attr)
if col == "layer":
assert(type(val) == list)
val = ":".join(val)
val = "-" if val == "" else val
if col == "trace":
assert(type(val) == list)
if self.col and len(val):
val = val[-1]
val = val.split("/")[-1]
else:
val = ",".join(val)
val = "-" if val == "" else val
if col in ["seq", "altseq"]:
assert(type(val) == list)
val = ",".join(map(str,val))
val = "-" if val == "" else val
cadena = cadena + (val,)
self.foo(cadena, self.dFormat)
import numpy as np
from collections import OrderedDict
from .utility import Utility
from .base import OperatorLayerBase
class Pointwise(OperatorLayerBase):
ops = []
ops += ["__abs__", "__neg__", "__invert__"]
ops += ["__add__", "__sub__", "__mul__", "__floordiv__", "__truediv__", "__pow__", "__mod__"]
ops += ["__radd__", "__rsub__", "__rmul__", "__rdiv__", "__rtruediv__", "__rfloordiv__", "__rpow__"]
ops += ["__iadd__", "__isub__", "__imul__", "__itruediv__",]
ops += ["__lt__", "__gt__", "__ge__", "__le__", "__eq__", "__ne__",]
ops += ["lt", "lt_", "gt", "gt_", "ge", "ge_", "le", "le_", "eq", "eq_", "ne", "ne_",]
ops += ["__and__", "__or__", "__xor__", "__lshift__", "__rshift__"]
ops += ["__iand__", "__ior__", "__ixor__", "__ilshift__", "__irshift__"]
ops += ["abs", "abs_", "neg", "neg_"]
ops += ["add", "add_", "div", "div_", "mul", "mul_", "reciprocal", "reciprocal_", "remainder", "remainder_", "sub", "sub_",]
ops += ["addcdiv", "addcdiv_", "addcmul", "addcmul_"]
ops += ["exp", "exp_", "exp1m", "exp1m_", "log", "log_", "log10", "log10_", "log1p", "log1p_", "log2", "log2_", "pow", "pow_", "rsqrt", "rsqrt_", "sqrt", "sqrt_",]
ops += ["ceil", "ceil_", "clamp", "clamp_", "floor", "floor_", "fmod", "fmod_", "frac", "frac_", "round", "round_", "sign", "sign_", "trunc", "trunc_"]
ops += ["acos", "acos_", "asin", "asin_", "atan", "atan_", "atan2", "atan2_", "cos", "cos_", "cosh", "cosh_", "sin", "sin_", "sinh", "sinh_", "tan", "tan_", "sigmoid", "sigmoid_", "tanh", "tanh_"]
ops += ["digamma", "erf", "erf_", "erfc", "erfc_", "erfinv", "erfinv_", "lerp", "lerp_", "mvlgamma",]
@staticmethod
def foo(d):
return d['name'],d['type'],d['shape'],d['dtype']
def __init__(self, d):
marker = eval(d.argMarker[0])
mod = marker['mod']
op = marker['op']
args = marker['args']
self.marker = marker
self.mod_ = mod
self.op_ = op
self.args = args
self.dir = d.dir
assert (d.dir in ["fprop", "bprop"])
assert (op in Pointwise.ops)
#Filter out all named parameters (kwargs).
#This might require revisiting in future.
args = list(filter(lambda x : x['name'] == "", args))
#Filter out non tensors
args = list(filter(lambda x : x['type'] == "tensor", args))
if (len(args) == 0):
self.shape = [(1,)]
self.type = "float32" #FIX
elif (len(args) == 1):
in0 = args[0]
_,t0,s0,dt0 = Pointwise.foo(in0)
assert (t0 == "tensor")
self.shape = [s0,]
self.type = dt0
elif (len(args) == 2):
in0,in1 = args
_,t0,s0,dt0 = Pointwise.foo(in0)
_,t1,s1,dt1 = Pointwise.foo(in1)
assert (t0 == t1 == "tensor")
assert (dt0 == dt1)
self.shape = [s0,s1]
self.type = dt0
elif (len(args) == 3):
in0,in1,in2 = args
_,t0,s0,dt0 = Pointwise.foo(in0)
_,t1,s1,dt1 = Pointwise.foo(in1)
_,t2,s2,dt2 = Pointwise.foo(in2)
assert (t0 == t1 == t2 == "tensor")
assert (dt0 == dt1 == dt2)
self.shape = [s0,s1,s2]
self.type = dt0
else:
assert False
return
def params(self):
p = OrderedDict([('T',self.shape), ('type', self.type)])
return p
def tc(self):
return "-"
def op(self):
return self.op_
def mod(self):
return self.mod_
def elems(self):
tensor = self.shape
t = self.type
if (len(tensor) == 1):
elems = 2 * Utility.numElems(tensor[0])
elif (len(tensor) == 2):
if (tensor[0] == tensor[1]): # same shape
elems = Utility.numElems(tensor[0])
if self.dir == "fprop":
elems *= 3
else:
if (self.op_ in ["add", "__add__", "sub", "__sub__", "__isub__"]):
elems *= 2
elif (self.op_ in ["__mul__", "__rmul__", "div", "__truediv__"]):
elems *= 3
else:
assert False
else: #check for broadcast conditions
array1 = np.empty(list(tensor[0]))
array2 = np.empty(list(tensor[1]))
try:
out = np.broadcast(array1, array2).shape
except:
assert False
elems = Utility.numElems(tensor[0])
elems += Utility.numElems(tensor[1])
elems += Utility.numElems(out)
#TODO bprop
elif (len(tensor) == 3):
if (tensor[0] == tensor[1] == tensor[2]): #same shape
elems = Utility.numElems(tensor[0])
elems *= 4
else:
assert False
else:
assert False
return elems
def bytes(self):
return self.elems() * Utility.typeToBytes(self.type)
def flops(self):
# Note: some cases may still be missing.
f = 0
if self.op_ in ["__abs__", "__neg__", "__add__", "__sub__", "__mul__",
"__radd__", "__rmul__", "__iadd__", "__isub__", "__imul__", "__itruediv__",
"abs", "abs_", "neg", "neg_", "add", "add_", "div", "div_", "mul", "mul_",
"sub", "sub_", "exp", "exp_", "sign", "sign_", "trunc", "trunc_",
"sin", "sin_", "cos", "cos_", "sinh", "sinh_", "cosh", "cosh_",
"sqrt", "sqrt_", "rsqrt", "rsqrt_", "__lt__", "__gt__", "__ge__", "__le__",
"__eq__", "__ne__", "lt", "lt_", "gt", "gt_", "ge", "ge_", "le", "le_",
"eq", "eq_", "ne", "ne_", "ceil", "ceil_", "clamp", "clamp_", "floor", "floor_",
"round", "sign", "sign_", "trunc", "trunc_"]:
# We're counting only one operand, not two (2 operands, 1 op)
f = self.elems() / 2
elif self.op_ in ["fmod", "fmod_"]:
f = self.elems()
elif self.op_ in ["tanh", "tanh_", "sigmoid", "sigmoid_", "log", "log_", "log2",
"log2_", "log10", "log10_"]:
f = self.elems() * 2
elif self.op_ in ["asin", "asin_", "acos", "acos_", "atan", "atan_"]:
# no intrinsic, hence slow execution
# surprisingly, asin/acos and atan were all the same (via nvprof measurement)
f = self.elems() * 10
return f
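# Illustrative only: for a hypothetical in-place add x.add_(y) on two float32
# tensors of shape (1024, 1024) in fprop, elems() above counts
#   3 * 1024*1024 = 3,145,728 element accesses (read x, read y, write result),
# so bytes() = 4 * 3,145,728 = 12,582,912 and flops() = elems()/2 = 1,572,864.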
from collections import OrderedDict
from .utility import Utility
# Work in progress.
#poolFuncs = ["max_pool2d_with_indices_forward", "max_pool2d_with_indices"]
class MaxPool2d(object):
@staticmethod
def parse(marker):
def convert2Tuple(arg):
assert (arg['type'] in ["int", "tuple"])
if arg['type'] == "int":
return (arg['value'], arg['value'])
else:
return arg['value']
mod = marker['mod']
op = marker['op']
args = marker['args']
assert (mod == "torch.nn.functional")
assert (op == "max_pool2d")
assert (len(args) >= 2)
#input
assert (args[0]['name'] == "")
inp = args[0]
assert (inp['type'] == "tensor")
i = inp['shape']
t = inp['dtype']
assert (len(i) == 4) #nchw tensor
#kernel
if (args[1]['name'] == ""):
k = args[1]
else:
k = list(filter(lambda x : x['name'] == "kernel_size", args))[0]
k = convert2Tuple(k)
#stride
s = k #default value
if ((len(args) >= 3) and (args[2]['name'] == "")):
s = args[2]
s = convert2Tuple(s)
elif any(x['name'] == "stride" for x in args):
s = list(filter(lambda x : x['name'] == "stride", args))[0]
s = convert2Tuple(s)
#padding
p = (0,0)
if ((len(args) >= 4) and (args[3]['name'] == "")):
p = args[3]
p = convert2Tuple(p)
elif any(x['name'] == "padding" for x in args):
p = list(filter(lambda x : x['name'] == "padding", args))[0]
p = convert2Tuple(p)
params = OrderedDict([('T', i), ('K', k), ('s',s), ('p',p), ('type', t)])
return params
#!/usr/bin/env python3
"""
This script reads the output (Python dictionary) created by parse.py.
For every kernel (line) in the input it determines
module / class name e.g. torch.nn.functional
operator name e.g. linear
kernel parameters e.g. GEMM M, N, K, datatype
bytes
flops
tensor core usage
direction (fprop, bprop)
and other things. Please see the tool usage.
"""
from .usage import parseArgs
from .output import Output
from .utility import Utility
from .pointwise import Pointwise
from .convert import Convert
from .blas import *
from .embedding import Embedding
from .reduction import *
from .dropout import Dropout
from .softmax import *
#from pooling import * # work in progress
from .linear import Linear
from .optim import Adam
from .misc import *
from .conv import Conv
from .activation import Activation
from .index_slice_join_mutate import Cat, Reshape, MaskedScatter, Gather, Nonzero, IndexSelect, MaskedSelect
from .recurrentCell import RNNCell
from .normalization import BatchNorm
from .randomSample import RandPerm
from .loss import MSELoss
from .data import Data
def findFpropKernel(seq):
#Find the last fprop kernel with the same seqId
#First look at seqId and then at altSeqId
for idx in reversed(range(len(kernels))):
k = kernels[idx]
if (seq in k['seqId']) and (k['dir'] == "fprop"):
return idx
for idx in reversed(range(len(kernels))):
k = kernels[idx]
if (seq in k['altSeqId']) and (k['dir'] == "fprop"):
return idx
return -1
#print("Error: seqId {} not found.".format(seq), file=sys.stderr)
#assert False
def foo(mod, op, d):
if (op[0] == "linear"):
xx = Linear(d)
# rnncell, lstmcell, grucell
elif (mod[0] in["LSTMCell", "GRUCell"]) and (op[0] == "forward"):
xx = RNNCell(d)
elif op[0] in ["conv1d", "conv2d",]:
xx = Conv(d)
elif (op[0] in Pointwise.ops):
xx = Pointwise(d)
elif (op[0] in Convert.ops):
xx = Convert(d)
elif op[0] in ["__matmul__", "matmul"]:
xx = Matmul(d)
elif op[0] == "embedding":
xx = Embedding(d)
#reduction
elif op[0] == "sum":
xx = Sum(d)
elif op[0] == "mean":
xx = Mean(d)
elif op[0] == "norm":
xx = Norm(d)
elif op[0] == "dropout":
xx = Dropout(d)
#Index, Slice, Join, Mutate
elif (op[0] == "cat"):
xx = Cat(d)
elif (op[0] == "reshape"):
xx = Reshape(d)
elif (op[0] == "masked_scatter_"):
xx = MaskedScatter(d)
elif (op[0] == "gather"):
xx = Gather(d)
elif (op[0] == "nonzero"):
xx = Nonzero(d)
elif (op[0] == "index_select"):
xx = IndexSelect(d)
elif (op[0] == "masked_select"):
xx = MaskedSelect(d)
#blas
elif op[0] in ["addmm", "addmm_"]:
xx = Addmm(d)
elif op[0] == "mm":
xx = Mm(d)
elif op[0] == "bmm":
xx = Bmm(d)
#softmax
elif op[0] == "softmax":
xx = Softmax(d)
elif op[0] == "log_softmax":
xx = LogSoftmax(d)
#loss
elif op[0] == "mse_loss":
xx = MSELoss(d)
#optimizers
elif op[0] == "adam":
xx = Adam(d)
#normalization
elif op[0] == "batch_norm":
xx = BatchNorm(d)
#random
elif op[0] == "randperm":
xx = RandPerm(d)
#misc
elif op[0] == "copy_":
xx = Copy(d)
elif op[0] == "clone":
xx = Clone(d)
elif op[0] == "contiguous":
xx = Contiguous(d)
elif op[0] == "any":
xx = Any(d)
elif (op[0] in Activation.ops):
xx = Activation(d)
elif op[0] == "to":
xx = Convert(d)
else:
xx = Foo(d)
return xx
def main():
#Read cmd line arguments
cmdArgs = parseArgs()
output = Output(cmdArgs)
output.header()
idx = -1
#Read in all the kernel info
for line in cmdArgs.file:
idx += 1
kernel = eval(line)
assert(kernel)
kernels.append(kernel)
k = kernel
d = Data(k)
mod = k['mod']
op = k['op']
flops = 0
params = {"na":"na"}
tc = "na"
bytes = 0
if (d.dir == "bprop"):
d.seqMarker = k['seqMarker']
seq = k['seqId']
#Use only the first seqId (a kernel may carry several)
seq = k['seqId'][:1]
assert (len(seq) == 1), seq
#assert (seq[0] != 0)
assert (len(d.seqMarker) > 0)
#If there is no useful marker associated, use the
#sequence number to find the kernel from fprop
if len(d.argMarker) == 0:
index = findFpropKernel(seq[0])
if index >= 0:
d.argMarker = kernels[index]['marker']
d.modMarker = kernels[index]['reprMarkers']
mod = kernels[index]['mod']
op = kernels[index]['op']
d.layer = kernels[index]['layer']
d.trace = kernels[index]['trace']
# Check if marker has our annotations
if len(d.argMarker) and Utility.hasNVTX(d.argMarker[0]):
xx = foo(mod, op, d)
bytes = xx.bytes()
flops = xx.flops()
op = xx.op()
params = xx.params()
tc = xx.tc()
if type(op) is list:
if len(op):
op = op[0]
else:
op = ""
if type(mod) is list:
if len(mod):
mod = mod[0]
else:
mod = ""
d.index = idx+1
# The following 8 come from operator class functions.
d.setParams(params)
d.tc = tc
d.flops = flops
d.bytes = bytes
d.mod = mod
d.op = op
output.data(d)
kernels = []
if __name__ == '__main__':
main()
from collections import OrderedDict
from .utility import Utility
from .base import OperatorLayerBase
class RandPerm(OperatorLayerBase):
def __init__(self, d):
marker = eval(d.argMarker[0])
mod = marker['mod']
op = marker['op']
args = marker['args']
self.marker = marker
self.mod_ = mod
self.op_ = op
self.args = args
assert (mod == "torch")
assert (op == "randperm")
assert (len(args) == 1)
n = args[0]
assert n['type'] == "int"
self.n = n['value']
def params(self):
p = OrderedDict([('N', self.n)])
return p
def tc(self):
return "-"
def op(self):
return self.op_
def mod(self):
return self.mod_
def bytes(self):
return self.n * Utility.typeToBytes("int64")
def flops(self):
# Depends on RNG but this is probably a reasonable assumption.
return self.n * 3
from collections import OrderedDict
from .utility import Utility
from .base import OperatorLayerBase
class Softmax(OperatorLayerBase):
def __init__(self, d):
marker = eval(d.argMarker[0])
mod = marker['mod']
op = marker['op']
args = marker['args']
self.marker = marker
self.mod_ = mod
self.op_ = op
self.args = args
assert (mod == "torch.nn.functional")
assert (op == "softmax")
#Filter out named parameters
args = list(filter(lambda x : x['name'] == '', args))
assert (len(args) <= 2)
self.shape = args[0]['shape']
self.type = args[0]['dtype']
self.dir = d.dir
return
def op(self):
return self.op_
def mod(self):
return self.mod_
def tc(self):
return "-"
def params(self):
p = OrderedDict([('T', self.shape), ('type', self.type)])
return p
def elems(self):
return Utility.numElems(self.shape)
def flops(self):
# Note: exp, sum-reduce, divide
#flops = elems * 3
return 0
def bytes(self):
b = self.elems() * Utility.typeToBytes(self.type)
b *= 3 if self.dir == "fprop" else 5 #verify
return b
class LogSoftmax(OperatorLayerBase):
def __init__(self, d):
marker = eval(d.argMarker[0])
mod = marker['mod']
op = marker['op']
args = marker['args']
self.marker = marker
self.mod_ = mod
self.op_ = op
self.args = args
assert (mod == "torch.nn.functional")
assert (op == "log_softmax")
#Filter out named parameters
args = list(filter(lambda x : x['name'] == '', args))
assert (len(args) <= 2)
#Get input
if (args[0]['name'] == ""):
i = args[0]
else:
i = list(filter(lambda x : x['name'] == "input", args))[0]
t = i['dtype']
self.shape = i['shape']
self.type = i['dtype']
self.dir = d.dir
return
def op(self):
return self.op_
def mod(self):
return self.mod_
def tc(self):
return "-"
def params(self):
p = OrderedDict([('T', self.shape), ('type', self.type)])
return p
def elems(self):
return Utility.numElems(self.shape)
def flops(self):
# Note: exp, sum-reduce, divide, log
#flops = elems * 4
return 0
def bytes(self):
b = self.elems() * Utility.typeToBytes(self.type)
b *= 3 if self.dir == "fprop" else 5 #verify
return b