Unverified commit 96850dfa authored by Jithun Nair, committed by GitHub

Merge pull request #80 from ROCmSoftwarePlatform/IFU-master-2022-07-29

IFU-master-2022-07-29
parents 87fc4125 cc5f83b5
import warnings
try:
from .parse import main
except ImportError as e:
warnings.warn("Did you make sure to install PyProf dependencies by using the --pyprof flag during Apex installation?)")
raise e
if __name__ == '__main__':
main()
import sys, sqlite3
class DB(object):
"""
This class provides functions for DB operations
with exception handling.
"""
def __init__(self, dbFile):
try:
conn = sqlite3.connect(dbFile)
conn.row_factory = sqlite3.Row
c = conn.cursor()
except:
print("Error opening {}".format(dbFile))
sys.exit(1)
self.conn = conn
self.c = c
def select(self, cmd):
try:
self.c.execute(cmd)
#rows = self.c.fetchall()
rows = [dict(row) for row in self.c.fetchall()]
except sqlite3.Error as e:
print(e)
sys.exit(1)
except:
print("Uncaught error in SQLite access while executing {}".format(cmd))
sys.exit(1)
#print(rows)
return rows
def insert(self, cmd, data):
try:
self.c.execute(cmd, data)
except sqlite3.Error as e:
print(e)
sys.exit(1)
except:
print("Uncaught error in SQLite access while executing {}".format(cmd))
sys.exit(1)
def execute(self, cmd):
try:
self.c.execute(cmd)
except sqlite3.Error as e:
print(e)
sys.exit(1)
except:
print("Uncaught error in SQLite access while executing {}".format(cmd))
sys.exit(1)
def commit(self):
self.conn.commit()
def close(self):
self.c.close()
self.conn.close()
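#Illustrative usage sketch (not part of the original module). It exercises the DB
#wrapper above against a throwaway in-memory database; the table name and values
#are made up for the example.
if __name__ == '__main__':
    db = DB(":memory:")
    db.execute("CREATE TABLE kernels (name TEXT, dur INTEGER)")
    db.insert("INSERT INTO kernels VALUES (?,?)", ("volta_sgemm_128x64_nn", 4096))
    print(db.select("SELECT name, dur FROM kernels")) #[{'name': 'volta_sgemm_128x64_nn', 'dur': 4096}]
    db.commit()
    db.close()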
import cxxfilt, struct, binascii
#Helper functions
def demangle(name):
"""
Demangle a C++ string
"""
return cxxfilt.demangle(name)
def encode_object_id(pid, tid):
"""
Given process id (pid) and thread id (tid), return the object id.
object id = pid (little endian 4 bytes) + tid (little endian 8 bytes)
"""
objId = struct.pack('<i', pid) + struct.pack('<q',tid)
objId = binascii.hexlify(objId).decode('ascii').upper()
return objId
def getShortName(name):
"""
Returns a shorter kernel name
"""
sname = name.split("<")[0] \
.replace("void ", "") \
.replace("at::","") \
.replace("cuda::", "") \
.replace("native::","") \
.replace("(anonymous namespace)::", "")
sname = sname.split("(")[0]
return sname
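#Illustrative examples (not part of the original module); the kernel name and
#pid/tid pair are made up:
#  getShortName("void at::native::vectorized_elementwise_kernel<4, float>(int)")
#    -> 'vectorized_elementwise_kernel'
#  encode_object_id(1234, 5678)
#    -> 'D20400002E16000000000000' (4-byte pid + 8-byte tid, little endian, hex)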
class Kernel(object):
"""
This class stores information about a kernel.
"""
kernels = []
profStart = 0
def __init__(self):
self.kNameId = None
self.kShortName = None
self.kLongName = None
self.kStartTime = None #GPU start time
self.kEndTime = None #GPU end time
self.kDuration = None
self.device = None
self.stream = None
self.grid = ()
self.block = ()
self.corrId = None
self.rStartTime = None #CPU start time
self.rEndTime = None #CPU end time
self.rDuration = None
self.tid = None
self.pid = None
self.objId = None
self.timeOffset = None
self.layerMarkers = []
self.traceMarkers = []
self.reprMarkers = []
self.pyprofMarkers = []
self.seqMarkers = []
self.otherMarkers = []
self.altMarkers = []
self.seqId = []
self.altSeqId = []
self.layer = []
self.subSeqId = None
self.dir = None
self.mod = []
self.op = []
def setKernelInfo(self, info):
self.kNameId = info['name']
self.corrId = int(info['correlationId'])
start = int(info['start'])
end = int(info['end'])
assert end > start, "This assertion can fail for very large profiles. It usually fails when start = end = 0."
self.kStartTime = start
self.kEndTime = end
self.kDuration = end - start
assert (start > Kernel.profStart)
self.device = int(info['deviceId'])
self.stream = int(info['streamId'])
self.grid = (info['gridX'], info['gridY'], info['gridZ'])
self.block = (info['blockX'], info['blockY'], info['blockZ'])
self.timeOffset = Kernel.profStart
def setKernelName(self, name):
cadena = demangle(name)
self.kLongName = cadena
self.kShortName = getShortName(cadena)
def setRunTimeInfo(self, info):
start, end, pid, tid = info
self.rStartTime = start
self.rEndTime = end
self.rDuration = end - start
self.pid = pid
self.tid = tid
self.objId = encode_object_id(pid, tid)
def setMarkerInfo(self, info):
self.layerMarkers, self.traceMarkers, self.reprMarkers, self.pyprofMarkers, self.seqMarkers, self.otherMarkers, self.altMarkers, self.seqId, self.altSeqId, self.layer = info
self.subSeqId = 0
def setDirection(self):
"""
Set direction (fprop, bprop) based on PyTorch sequence markers.
It is a heuristic and not a foolproof method.
"""
if any("Backward, seq = " in x for x in self.seqMarkers) or \
any("backward, seq = " in x for x in self.seqMarkers) or \
any("Backward0, seq = " in x for x in self.seqMarkers):
self.dir = "bprop"
else:
self.dir = "fprop"
def setOp(self):
"""
Detect and set the class/module (mod) and operation (op)
of the kernel e.g. torch.nn.functional / linear, torch / sigmoid.
The lookup sequence we use is
NVTX markers inserted by pyprof
NVTX markers inserted by PyTorch in bprop
NVTX markers inserted by PyTorch in fprop
It is a heuristic and not a foolproof method.
"""
def sanitize(name):
name = name.replace("torch","") \
.replace("autograd","") \
.replace("_backward","") \
.replace("::","") \
.replace("jit","") \
.replace("(anonymous namespace)","")
head, sep, tail = name.partition("Backward")
return head
#Check pyprof markers
for m in self.pyprofMarkers:
assert ("mod" in m) and ("op" in m) and ("args" in m)
t = eval(m)
self.op.append(t['op'])
self.mod.append(t['mod'])
if len(self.op):
return
#Check bprop kernel markers
for m in self.seqMarkers:
if ("backward, seq = " in m) or ("Backward, seq = " in m):
op = m.split(",")[0]
op = sanitize(op)
self.op.append(op)
self.mod.append('na')
if len(self.op):
return
#Check markers with "seq = "
for m in self.seqMarkers:
if ", seq = " in m:
op = m.split(",")[0]
self.op.append(op)
self.mod.append('na')
if len(self.op):
return
#If nothing else
if len(self.otherMarkers):
self.op.append(self.otherMarkers[0])
self.mod.append('na')
def print(self):
"""
Print kernel information. This is used by prof.py.
"""
a = lambda: None
a.kShortName = self.kShortName
a.kDuration = self.kDuration
#a.layerMarkers = self.layerMarkers
a.layer = self.layer
a.trace = self.traceMarkers
a.reprMarkers = self.reprMarkers
a.marker = self.pyprofMarkers
a.seqMarker = self.seqMarkers
a.seqId = self.seqId
a.subSeqId = self.subSeqId
a.altSeqId = self.altSeqId
a.dir = self.dir
a.mod = self.mod
a.op = self.op
a.tid = self.tid
a.device = self.device
a.stream = self.stream
a.grid = self.grid
a.block = self.block
a.kLongName = self.kLongName
print(a.__dict__)
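#Illustrative sketch (not part of the original module): how the setDirection()
#heuristic classifies a kernel from hand-written PyTorch sequence markers.
if __name__ == '__main__':
    k = Kernel()
    k.seqMarkers = ["MmBackward, seq = 42"]
    k.setDirection()
    print(k.dir) #-> bprop
    k.seqMarkers = ["mm, seq = 42"]
    k.setDirection()
    print(k.dir) #-> fprop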
import sys
class NVVP(object):
"""
This class gets kernel information from the SQL (nvvp) database.
"""
driverT = "CUPTI_ACTIVITY_KIND_DRIVER"
runtimeT = "CUPTI_ACTIVITY_KIND_RUNTIME"
kernelT = "CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL"
markerT = "CUPTI_ACTIVITY_KIND_MARKER"
stringT = "StringTable"
def __init__(self, db):
self.db = db
self.markerId = 0
def getProfileStart(self):
"""
Get the profile start time
"""
profStart = sys.maxsize
for table in [self.driverT, self.runtimeT, self.kernelT, self.markerT]:
colname = "timestamp" if table is self.markerT else "start"
cmd = "select {} from {} ORDER BY {} ASC LIMIT 1".format(colname, table, colname)
result = self.db.select(cmd)
assert(len(result) <= 1)
if (len(result) == 1):
assert(colname in result[0])
t = result[0][colname]
if (t < profStart):
profStart = t
assert(profStart < sys.maxsize)
return profStart
def getString(self, id_):
"""
Get the string associated with an id.
"""
cmd = "select value from {} where _id_ = {}".format(self.stringT, id_)
result = self.db.select(cmd)
assert (len(result) == 1)
return result[0]['value']
def createMarkerTable(self):
"""
Create a temporary table and index it to speed up repeated SQL queries.
The table is an INNER JOIN of CUPTI_ACTIVITY_KIND_MARKER with itself.
"""
cmd = 'CREATE TEMPORARY TABLE marker AS SELECT \
a._id_ as id, \
a.timestamp AS startTime, \
b.timestamp AS endTime, \
HEX(a.objectId) AS objectId, \
a.name AS name \
FROM {} AS a INNER JOIN {} AS b ON \
a.id = b.id and \
a.flags = 2 and b.flags = 4'.format(self.markerT, self.markerT)
self.db.execute(cmd)
self.db.execute('CREATE INDEX start_index ON marker (startTime)')
self.db.execute('CREATE INDEX end_index ON marker (endTime)')
self.db.execute('CREATE INDEX id_index ON marker (id)')
def getCPUInfo(self, corrId):
"""
Given the correlation id, get CPU start, end, thread id, process id.
The information can be in the runtime table or the driver table.
"""
#First look in the runtime table
cmd = "select start,end,processId,threadId from {} where correlationId={}".format(self.runtimeT, corrId);
result = self.db.select(cmd)
assert (len(result) <= 1)
if (len(result) == 0):
#Look in the driver table
cmd = "select start,end,processId,threadId from {} where correlationId={}".format(self.driverT, corrId);
result = self.db.select(cmd)
assert (len(result) == 1)
info = result[0]
start = info['start']
end = info['end']
pid = info['processId']
tid = info['threadId']
tid = tid & 0xffffffff #convert to unsigned
assert (end > start)
return [start, end, pid, tid]
def getKernelInfo(self):
"""
Get GPU kernel info
"""
cmd = "select name,correlationId,start,end,deviceId,streamId,gridX,gridY,gridZ,blockX,blockY,blockZ from {}".format(self.kernelT)
result = self.db.select(cmd)
return result
def getMarkerInfo(self, objId, startTime, endTime):
"""
This function first finds all NVTX markers encapsulating
a runtime / driver kernel launch.
It then splits the markers into many lists.
layerMarkers : User added NVTX markers
traceMarkers : Call trace markers (inserted by pyprof)
reprMarkers : Markers containing the extra_repr() of a module (inserted by pyprof)
pyprofMarkers: Markers containing args and kwargs (tensor shape, datatype etc.)
seqMarkers : Markers containing PyTorch internal sequence markers (inserted by PyTorch)
altSeqMarkers: Markers inserted by PyTorch between two kernel launches. Needs better explanation.
otherMarkers : Markers not in either of the above categories.
We extract seqId from the seq and altSeq markers. The seqId is used in bprop.
We also extract information from the layerMarkers.
"""
layerMarkers = []
traceMarkers = []
reprMarkers = []
pyprofMarkers = []
seqMarkers = []
otherMarkers = []
altSeqMarkers = []
bprop = False
#Helper functions
def delete(objId, sTime):
"""
Delete rows from the temporary SQL table which are no longer required.
This speeds up future queries.
"""
margin = 0
cmd = 'DELETE FROM marker WHERE objectId = "{}" AND endTime < {}'.format(objId, sTime - margin)
#cmd = 'DELETE FROM marker WHERE endTime < {}'.format(sTime - margin)
self.db.execute(cmd)
def getLayerName(mlist):
"""
Get layer names from layer marker list.
"""
layers = []
assert(type(mlist) == list)
for m in mlist:
assert("layer:" in m)
l = m.split(":")[1]
layers.append(l)
return layers
def getSeqId(mlist):
"""
Get sequence ids from seq / alt seq marker list.
"""
ids = []
assert(type(mlist) == list)
for m in mlist:
assert(", seq = " in m)
seq = int(m.split("=")[1])
ids.append(seq)
#Remove duplicates
ids = list(set(ids))
ids.sort()
return ids
def seqcompare(elem):
"""
Sorting function for sequence markers
"""
assert (", seq = " in elem)
#sort by sequence id and then the string
l = elem.split(" = ")
return l[1] + l[0]
def prune(mlist):
"""
Remove markers with the same seqId and if the strings are similar.
This function works on a sorted sequence.
"""
assert (type(mlist) == list)
assert (len(mlist))
a = mlist[0:1]
for i in range(1,len(mlist)):
m = mlist[i]
pm = mlist[i-1]
name,seq = m.split(",")
pname,pseq = pm.split(",")
similar = (name in pname) or (pname in name)
if (seq == pseq) and similar:
continue
else:
a.append(m)
return a
def filterTrace(mlist):
"""
Filter trace markers to remove certain file names.
"""
assert (type(mlist) == list)
if len(mlist) == 0:
return mlist
mlist = mlist[-1] #The last stack trace will be a superset.
mlist = eval(mlist)
mlist = mlist['traceMarker']
assert (type(mlist) == list)
mlist = list(filter(lambda x : "/torch/nn/modules/" not in x, mlist))
mlist = list(filter(lambda x : "/torch/nn/functional.py" not in x, mlist))
mlist = list(filter(lambda x : "/torch/tensor.py" not in x, mlist))
mlist = list(filter(lambda x : "/torch/autograd/__init__.py" not in x, mlist))
mlist = list(filter(lambda x : "/torch/_jit_internal.py" not in x, mlist))
mlist = list(filter(lambda x : "/pyprof/nvtx/nvmarker.py" not in x, mlist))
mlist = list(filter(lambda x : "/apex/optimizers/" not in x, mlist))
mlist = list(filter(lambda x : "/torch/_utils.py" not in x, mlist))
mlist = list(filter(lambda x : "/torch/optim/" not in x, mlist))
return mlist
#Find all encapsulating markers
cmd = 'SELECT id,name from marker where \
objectId = "{}" and \
startTime < {} and \
endTime > {} \
ORDER BY startTime ASC'.format(objId, startTime, endTime)
result = self.db.select(cmd)
#Bin markers into different lists
for r in result:
m = self.getString(r['name'])
#Hack: If it's a known gradient checkpointing marker, ignore it.
if m.find("CheckpointFunctionBackward") >= 0:
continue
if ("_backward, seq =" in m) or ("Backward, seq =" in m) or ("Backward0, seq =" in m):
bprop = True
if ("mod" in m) and ("op" in m) and ("args" in m) and ("type" in m):
pyprofMarkers.append(m)
elif ("layer:" in m):
layerMarkers.append(m)
elif ("traceMarker" in m):
traceMarkers.append(m)
elif ("strRepr" in m):
reprMarkers.append(m)
elif (", seq = " in m):
seqMarkers.append(m)
else:
otherMarkers.append(m)
#Remove duplicates, sort and prune seqMarkers
if (len(seqMarkers)):
seqMarkers = list(set(seqMarkers))
seqMarkers.sort(key=seqcompare)
seqMarkers = prune(seqMarkers)
#Remove duplicates from otherMarkers
otherMarkers = list(set(otherMarkers))
#Get markers with seq id (inserted by PyTorch) from the previous kernel to the present kernel
#Only for fprop kernels
if (len(result) and not bprop):
loId = self.markerId
hiId = result[-1]['id']
self.markerId = hiId
#Get markers between loId and hiId
cmd = 'SELECT id,name from marker where objectId = "{}" and id > {} and id < {} ORDER BY startTime ASC'.format(objId, loId, hiId)
result1 = self.db.select(cmd)
for r in result1:
m = self.getString(r['name'])
#Get only markers with seq id
if (", seq=" in m):
altSeqMarkers.append(m)
#Remove duplicates, sort and prune altSeqMarkers
if (len(altSeqMarkers)):
altSeqMarkers = list(set(altSeqMarkers))
altSeqMarkers.sort(key=seqcompare)
altSeqMarkers = prune(altSeqMarkers)
delete(objId, startTime)
return layerMarkers, filterTrace(traceMarkers), reprMarkers, pyprofMarkers, seqMarkers, otherMarkers, altSeqMarkers, getSeqId(seqMarkers), getSeqId(altSeqMarkers), getLayerName(layerMarkers)
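#Worked example (illustrative): getSeqId() above parses the integer after "=" in a
#marker string, so ["AddmmBackward, seq = 17", "TBackward, seq = 17"] collapses to
#the single sequence id [17] after duplicate removal and sorting.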
#!/usr/bin/env python3
"""
Parse the SQL db and print a dictionary for every kernel.
"""
import sys
import argparse
from tqdm import tqdm
from .db import DB
from .kernel import Kernel
from .nvvp import NVVP
def parseArgs():
parser = argparse.ArgumentParser(prog=sys.argv[0], description="Parse SQL (nvvp) db.")
parser.add_argument("file",
type=str,
default=None,
help="SQL db (nvvp) file.")
args = parser.parse_args()
return args
def main():
args = parseArgs()
db = DB(args.file)
nvvp = NVVP(db)
kInfo = nvvp.getKernelInfo()
if len(kInfo) == 0:
print("Found 0 kernels. Exiting.", file=sys.stderr)
db.close()
sys.exit(0)
else:
print("Found {} kernels. Getting info for each kernel.".format(len(kInfo)), file=sys.stderr)
nvvp.createMarkerTable()
prevSeqId = -1
prevSubSeqId = -1
prevOp = "na"
Kernel.profStart = nvvp.getProfileStart()
for i in tqdm(range(len(kInfo)), ascii=True):
info = kInfo[i]
k = Kernel()
#Set kernel info
k.setKernelInfo(info)
#Get, set kernel name
name = nvvp.getString(k.kNameId)
k.setKernelName(name)
#Get runtime info
info = nvvp.getCPUInfo(k.corrId)
k.setRunTimeInfo(info)
#Get and set marker and seqid info
info = nvvp.getMarkerInfo(k.objId, k.rStartTime, k.rEndTime)
k.setMarkerInfo(info)
#If the seqId contains both 0 and non zero integers, remove 0.
if any(seq != 0 for seq in k.seqId) and (0 in k.seqId):
k.seqId.remove(0)
#Set direction (it uses seq id)
k.setDirection()
#Set op
k.setOp()
#The following code is based on heuristics.
#TODO: Refactor.
#Assign subSeqId, adjust seqId and altSeqId
#seqId can be 0.
#A kernel can have multiple seqIds both in fprop and bprop.
#In bprop, seqIds might not decrease monotonically. I have observed a few blips.
if len(k.seqId):
assert (k.dir in ["fprop", "bprop"])
if (k.dir == "fprop"):
#Check if there is a sequence id larger than the previous
inc = (k.seqId[-1] > prevSeqId)
if inc:
currSeqId = [x for x in k.seqId if x > prevSeqId][0]
else:
currSeqId = prevSeqId
else:
currSeqId = k.seqId[0]
#if ((currSeqId == prevSeqId) and (k.op == prevOp)):
if ((currSeqId == prevSeqId) and (k.op == prevOp)) or ((k.op[0] == "forward") and (k.op == prevOp) and (k.mod[0] in ["LSTMCell", "GRUCell", "RNNCell"])):
#The second condition is to trap cases when pytorch does not use cudnn for a LSTMCell.
k.subSeqId = prevSubSeqId + 1
prevSeqId = currSeqId
prevSubSeqId = k.subSeqId
prevOp = k.op
#Keep currSeqId in k.seqId, move everything else to k.altSeqId
for s in list(k.seqId): #iterate over a copy; removing from k.seqId while iterating would skip elements
if s != currSeqId:
k.seqId.remove(s)
k.altSeqId.append(s)
for s in list(k.altSeqId): #iterate over a copy; we remove from k.altSeqId below
if s == currSeqId:
k.altSeqId.remove(s)
k.altSeqId = list(set(k.altSeqId))
if (len(k.altSeqId)):
(k.altSeqId).sort()
k.print()
db.close()
if __name__ == '__main__':
main()
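#Typical invocation (illustrative; file names are placeholders):
#  python -m apex.pyprof.parse net.sql > net.dict
#The emitted per-kernel dictionaries are then consumed by the prof step below.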
import warnings
try:
from .prof import main
except ImportError as e:
warnings.warn("Did you make sure to install PyProf dependencies by using the --pyprof flag during Apex installation?")
raise e
if __name__ == '__main__':
main()
from collections import OrderedDict
from .utility import Utility
from .base import OperatorLayerBase
class Activation(OperatorLayerBase):
"""
This class handles the various activation functions.
"""
ops = ["celu", "elu", "elu_", "hardshrink", "hardtanh", "hardtanh_", "leaky_relu", "leaky_relu_", "logsigmoid", "prelu", "relu", "relu_", "relu6", "rrelu", "rrelu_", "selu", "sigmoid", "softplus", "softshrink", "softsign", "tanh", "tanhshrink", "threshold", "threshold_"]
def __init__(self, d):
marker = eval(d.argMarker[0])
mod = marker['mod']
op = marker['op']
args = marker['args']
self.marker = marker
self.mod_ = mod
self.op_ = op
self.args = args
assert (mod in ["torch.nn.functional", "torch", "Tensor"])
#Filter out named parameters
args = list(filter(lambda x : x['name'] == '', args))
assert (len(args) >= 1)
arg = args[0]
assert (arg['type'] == "tensor")
self.i = arg
self.dir = d.dir
def params(self):
p = OrderedDict([('T', self.i['shape']),('type', self.i['dtype'])])
return p
def flops(self):
direction = self.dir
tensor = self.i['shape']
t = self.i['dtype']
# TODO: revise
elems = Utility.numElems(tensor)
return elems
def bytes(self):
direction = self.dir
tensor = self.i['shape']
t = self.i['dtype']
elems = Utility.numElems(tensor)
elems = elems * (2 if direction == "fprop" else 3)
return elems * Utility.typeToBytes(t)
def tc(self):
return "-"
def op(self):
return self.op_
def mod(self):
return self.mod_
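#Worked example (illustrative): a fprop relu on a (32, 1024) fp16 tensor has
#numElems = 32768, so flops() = 32768 and
#bytes() = 32768 elems * 2 accesses (read + write) * 2 B/elem = 131072 B.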
from abc import ABC, abstractmethod
class OperatorLayerBase(ABC):
"""
Base class for all layers and operators.
Every derived class should have the following functions.
"""
@abstractmethod
def tc(self):
"""
Tensor core usage by the kernel.
Return "1" (yes), "0" (no, but possible), "-" (not applicable)
"""
pass
@abstractmethod
def params(self):
"""
Kernel parameters to be printed.
"""
pass
@abstractmethod
def flops(self):
"""
Note that 1 FMA = 2 flops.
"""
pass
@abstractmethod
def bytes(self):
pass
@abstractmethod
def mod(self):
"""
Name of the module/class e.g. torch.nn.functional.
"""
pass
@abstractmethod
def op(self):
"""
Name of the operator e.g. sigmoid.
"""
pass
from collections import OrderedDict
from .utility import Utility
from .base import OperatorLayerBase
import numpy as np
TC_GEMMS = ["884gemm", "1688gemm"]
class Addmm(OperatorLayerBase):
def __init__(self, d):
marker = eval(d.argMarker[0])
mod = marker['mod']
op = marker['op']
args = marker['args']
self.marker = marker
self.mod_ = mod
self.op_ = op
self.args = args
assert (mod in ["torch", "Tensor",])
assert (op in ["addmm", "addmm_",])
#Get alpha and beta
alpha = 1
beta = 1
if any(x['name'] == 'alpha' for x in args):
alpha = list(filter(lambda x : x['name'] == "alpha", args))[0]
alpha = alpha['value']
if any(x['name'] == 'beta' for x in args):
beta = list(filter(lambda x : x['name'] == "beta", args))[0]
beta = beta['value']
self.alpha = alpha
self.beta = beta
#Filter out named parameters
args = list(filter(lambda x : x['name'] == '', args))
assert (len(args) == 3)
C,A,B = args
m,k1 = A['shape']
k2,n = B['shape']
assert (k1 == k2)
t1 = A['dtype']
t2 = B['dtype']
t3 = C['dtype']
assert(t1 == t2 == t3)
self.A = A
self.B = B
self.C = C
self.m = m
self.n = n
self.k = k1
self.type = t1
self.name = d.name
return
def tc(self):
for s in TC_GEMMS:
if s in self.name:
return 1
return 0
def bytes(self):
m, n, k = self.m, self.n, self.k
return Utility.typeToBytes(self.type) * (m*n + m*k + n*k)
def flops(self):
return self.m * self.n * self.k * 2
def op(self):
return self.op_
def mod(self):
return self.mod_
def params(self):
p = OrderedDict([('M',self.n),('N',self.m),('K',self.k),('type',self.type)])
return p
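#Worked example (illustrative): an fp16 addmm with m=1024, n=512, k=2048 (as set
#in __init__ above) gives flops() = m*n*k*2 = 2,147,483,648 and
#bytes() = 2 B * (m*n + m*k + n*k) = 2 * 3,670,016 = 7,340,032 B.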
class Bmm(OperatorLayerBase):
def __init__(self, d):
marker = eval(d.argMarker[0])
mod = marker['mod']
op = marker['op']
args = marker['args']
self.marker = marker
self.mod_ = mod
self.op_ = op
self.args = args
assert (mod == "torch") and (op == "bmm")
#Filter out named params (kwargs)
args = list(filter(lambda x : x['name'] == "", args))
assert (len(args) == 2)
A,B = args
b1,m,k1 = A['shape']
b2,k2,n = B['shape']
assert (b1 == b2)
assert (k1 == k2)
t1 = A['dtype']
t2 = B['dtype']
assert(t1 == t2)
self.A = A
self.B = B
self.b = b1
self.m = m
self.n = n
self.k = k1
self.type = t1
self.name = d.name
def tc(self):
for s in TC_GEMMS:
if s in self.name:
return 1
return 0
def params(self):
#p = OrderedDict([('A', A['shape']), ('B', B['shape']), ('type', t1)])
p = OrderedDict([('B',self.b), ('M',self.n),('N',self.m),('K',self.k),('type',self.type)])
return p
def flops(self):
return self.b * self.m * self.n * self.k * 2
def bytes(self):
b, m, n, k = self.b, self.m, self.n, self.k
return Utility.typeToBytes(self.type) * b * (m*n + m*k + n*k)
def op(self):
return self.op_
def mod(self):
return self.mod_
class Matmul(OperatorLayerBase):
NON_GEMM = ["kernelPointwiseApply2", "reduce_1Block_kernel", "elementwise_kernel"]
NON_TC = NON_GEMM + ["dot_kernel"]
def __init__(self, d):
marker = eval(d.argMarker[0])
mod = marker['mod']
op = marker['op']
args = marker['args']
self.marker = marker
self.mod_ = mod
self.op_ = op
self.args = args
self.name = d.name
self.sub = d.sub
assert ((mod == "torch") and (op == "matmul")) or ((mod == "Tensor") and (op == "__matmul__"))
assert (len(args) == 2)
assert any([x in d.name for x in Matmul.NON_TC + ["gemm", "gemv"]])
A,B = args
t1 = A['dtype']
t2 = B['dtype']
assert(t1 == t2)
A = A['shape']
B = B['shape']
self.A = A
self.B = B
self.type = t1
# batch, MNK
if (len(A) == 1) and (len(B) == 1):
#dot product
assert (A[0] == B[0])
self.b = (1,)
self.m = 1
self.n = 1
self.k = A[0]
elif (len(A) == 2) and (len(B) == 2):
#gemm
m,k1 = A
k2,n = B
assert(k1 == k2)
self.b = (1,)
self.m = m
self.n = n
self.k = k1
elif (len(A) == 1) and (len(B) == 2):
#vector matrix
k1 = A[0]
k2,n = B
assert(k1 == k2)
self.b = (1,)
self.m = 1
self.n = n
self.k = k1
elif (len(A) == 2) and (len(B) == 1):
#gemv
m,k1 = A
k2 = B[0]
assert (k1 == k2)
self.b = (1,)
self.m = m
self.n = 1
self.k = k1
elif (len(A) == 1) and (len(B) > 2):
assert (A[0] == B[-2])
self.b = B[0:-2]
self.m = 1
self.n = B[-1]
self.k = B[-2]
elif (len(B) == 1) and (len(A) > 2):
assert (B[0] == A[-1])
self.b = A[0:-2]
self.m = A[-2]
self.n = 1
self.k = A[-1]
else:
assert (len(A) >= 2)
assert (len(B) >= 2)
assert (A[-1] == B[-2])
self.m = A[-2]
self.n = B[-1]
self.k = A[-1]
aa = np.empty(A[0:-2])
bb = np.empty(B[0:-2])
self.b = np.broadcast(aa, bb).shape
def params(self):
return OrderedDict([('A', self.A), ('B', self.B), ('type', self.type)])
def tc(self):
if self.name in Matmul.NON_TC:
return "-"
else:
for s in TC_GEMMS:
if s in self.name:
return 1
return 0
def bytes(self):
# TODO: check bytes for non-GEMM cases
if self.name in Matmul.NON_GEMM:
return 2 * Utility.typeToBytes(self.type) * Utility.numElems(self.A) #could be B as well
else:
m, n, k = self.m, self.n, self.k
return Utility.typeToBytes(self.type) * (m*n + m*k + n*k)
def flops(self):
# TODO: calculate actual FLOPs. At least we're not saying it's GEMM FLOPs for now.
if self.name in Matmul.NON_GEMM:
return 0
else:
return Utility.numElems(self.b) * self.m * self.n * self.k * 2
def op(self):
return self.op_
def mod(self):
return self.mod_
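#Worked example (illustrative): for torch.matmul with A of shape (8, 1, 128, 64)
#and B of shape (1, 4, 64, 256), the code above sets m=128, n=256, k=64 and
#broadcasts the leading dims to b=(8, 4), so for a GEMM-backed kernel
#flops() = 32 * 128*256*64 * 2 = 134,217,728.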
class Mm(OperatorLayerBase):
def __init__(self, d):
marker = eval(d.argMarker[0])
mod = marker['mod']
op = marker['op']
args = marker['args']
self.marker = marker
self.mod_ = mod
self.op_ = op
self.args = args
assert (mod == "torch") and (op == "mm")
assert (len(args) == 2)
A,B = args
m,k1 = A['shape']
k2,n = B['shape']
assert (k1 == k2)
t1 = A['dtype']
t2 = B['dtype']
assert(t1 == t2)
self.A = A
self.B = B
self.m = m
self.n = n
self.k = k1
self.type = t1
self.name = d.name
return
def params(self):
p = OrderedDict([('M',self.n),('N',self.m),('K',self.k),('type',self.type)])
return p
def tc(self):
for s in TC_GEMMS:
if s in self.name:
return 1
return 0
def bytes(self):
m, n, k = self.m, self.n, self.k
return Utility.typeToBytes(self.type) * (m*n + m*k + n*k)
def flops(self):
return self.m * self.n * self.k * 2
def op(self):
return self.op_
def mod(self):
return self.mod_
from collections import OrderedDict
from .utility import Utility
from .base import OperatorLayerBase
class Conv(OperatorLayerBase):
"""
# N = batch size
# C,H,W = input channels, height, width
# K,P,Q = output channels, height, width
# R,S = filter height, width
# g = groups
"""
#todo: refine winograd and FFT
convAuxList = ["nchwToNhwc", "nhwcToNchw", "OffsetsKernel",]
winoAuxList = ["generateWinogradTilesKernel", "winogradWgradData", "winogradWgradOutput", "winogradWgradDelta"]
fftAuxList = ["compute_gemm_pointers", "flip_filter", "fft2d_r2c_", "fft2d_c2r_", "fft1d_r2c", "fft1d_c2r"]
miscAuxList = ["scaleTensor_kernel",]
convList = ["_s884cudnn_", "_s1688cudnn_", "_scudnn_", "2d_grouped_direct_kernel", "cudnn::detail::implicit_convolve_sgemm", "cudnn::detail::dgrad2d_alg1_1", "cudnn::detail::wgrad_alg0_engine", "cudnn::detail::dgrad_engine", "dgrad_1x1_stride_2x2", "spatialDepthwiseConvolutionUpdateOutput"]
winoList = ["winograd3x3Kernel", "_sgemm_"]
fftList = ["fermiPlusCgemmLDS128_batched", "_gcgemm_",]
miscList = []
def __init__(self, d):
marker = eval(d.argMarker[0])
mod = marker['mod']
op = marker['op']
args = marker['args']
self.marker = marker
self.mod_ = mod
self.op_ = op
self.args = args
self.dir = d.dir
self.name = d.name
self.sub = d.sub
assert (mod == "torch.nn.functional")
assert (op in ["conv1d", "conv2d"])
length = len(args)
assert (length >= 2) and (length <= 7)
i,w = args[0], args[1]
assert (i['type'] == "tensor")
assert (w['type'] == "tensor")
#ignore bias
if (length >= 4) and (args[3]['name'] == ""):
s = args[3]
elif any(x['name'] == 'stride' for x in args):
s = list(filter(lambda x : x['name'] == 'stride', args))[0]
else:
s = {'name': 'stride', 'type': 'int', 'value': 1}
if (length >= 5) and (args[4]['name'] == ""):
p = args[4]
elif any(x['name'] == 'padding' for x in args):
p = list(filter(lambda x : x['name'] == 'padding', args))[0]
else:
p = {'name': 'padding', 'type': 'int', 'value': 0}
if (length >= 6) and (args[5]['name'] == ""):
d = args[5]
elif any(x['name'] == 'dilation' for x in args):
d = list(filter(lambda x : x['name'] == 'dilation', args))[0]
else:
d = {'name': 'dilation', 'type': 'int', 'value': 1}
if (length == 7) and (args[6]['name'] == ""):
g = args[6]
elif any(x['name'] == 'groups' for x in args):
g = list(filter(lambda x : x['name'] == 'groups', args))[0]
else:
g = {'name': 'groups', 'type': 'int', 'value': 1}
if op == "conv1d":
assert (len(i['shape']) == 3)
assert (len(w['shape']) == 3)
assert (i['dtype'] == w['dtype'])
N, C1, W = i['shape']
K, C2, S = w['shape']
assert (C1 == C2)
p = p['value'] if Utility.isscalar(p['type']) else p['value'][0]
s = s['value'] if Utility.isscalar(s['type']) else s['value'][0]
d = d['value'] if Utility.isscalar(d['type']) else d['value'][0]
g = g['value']
assert (g == 1)
H = 1
R = 1
P = 1 + (H - (((R-1))+1))
Q = 1 + (W + 2*p - (((S-1)*d)+1))/s
P = int(P)
Q = int(Q)
if (H == 1):
assert (P == 1)
if (W == 1):
assert (Q == 1)
self.N = N
self.C = C1
self.H = H
self.W = W
self.K = K
self.P = P
self.Q = Q
self.R = R
self.S = S
self.ph = 0
self.pw = p
self.U = 1
self.V = s
self.dh = 1
self.dw = d
self.g = g
self.type = i['dtype']
elif op == "conv2d":
assert (len(i['shape']) == 4)
assert (len(w['shape']) == 4)
assert (i['dtype'] == w['dtype'])
N, C1, H, W = i['shape']
K, C2, R, S = w['shape']
if Utility.isscalar(p['type']):
ph = pw = p['value']
else:
assert (p['type'] == "tuple")
ph, pw = p['value']
if Utility.isscalar(s['type']):
sh = sw = s['value']
else:
assert (s['type'] == "tuple")
sh, sw = s['value']
if Utility.isscalar(d['type']):
dh = dw = d['value']
else:
assert (d['type'] == "tuple")
dh, dw = d['value']
g = g['value']
assert (g >= 1)
assert (C1 == C2*g)
P = 1 + (H + 2*ph - (((R-1)*dh)+1))/sh
Q = 1 + (W + 2*pw - (((S-1)*dw)+1))/sw
P = int(P)
Q = int(Q)
if (H == 1):
assert (P == 1)
if (W == 1):
assert (Q == 1)
self.N = N
self.C = C1
self.H = H
self.W = W
self.K = K
self.P = P
self.Q = Q
self.R = R
self.S = S
self.ph = ph
self.pw = pw
self.U = sh
self.V = sw
self.dh = dh
self.dw = dw
self.g = g
self.type = i['dtype']
else:
assert False
def params(self):
p = OrderedDict([('N',self.N), ('C',self.C), ('H',self.H), ('W',self.W), ('K',self.K), ('P',self.P), ('Q',self.Q), ('R',self.R), ('S',self.S), ('ph',self.ph), ('pw',self.pw), ('U',self.U), ('V',self.V), ('dh',self.dh), ('dw',self.dw), ('g',self.g), ('type',self.type)])
return p
def conv_bytes_flops(self, N, C, H, W, K, P, Q, R, S, g, t):
f = 2*N*K*P*Q*C*R*S/g #for fprop
elems = N*C*H*W + K*C*R*S/g + N*K*P*Q
b = elems * Utility.typeToBytes(t)
return b,f
def bytes_flops(self):
N,C,H,W,K,P,Q,R,S,ph,pw,U,V,dh,dw,g,t = self.params().values()
if any(x in self.name for x in Conv.convAuxList+Conv.winoAuxList+Conv.fftAuxList+Conv.miscAuxList):
bytes, flops = [0, 0]
elif any(x in self.name for x in Conv.convList+Conv.winoList+Conv.fftList+Conv.miscList):
if g == 1:
bytes, flops = self.conv_bytes_flops(N,C,H,W,K,P,Q,R,S,g,t)
else:
if "2d_grouped_direct_kernel" in self.name: #only 1 kernel is called
bytes, flops = self.conv_bytes_flops(N,C,H,W,K,P,Q,R,S,g,t)
elif "spatialDepthwiseConvolutionUpdateOutput" in self.name: #one kernel for separable conv
bytes, flops = self.conv_bytes_flops(N,C,H,W,K,P,Q,R,S,g,t)
else: #a kernel per group is called
bytes, flops = self.conv_bytes_flops(N,C/g,H,W,K/g,P,Q,R,S,1,t)
elif ("calc_bias_diff" in self.name): #bias gradient
elems = N*K*P*Q
flops = elems
bytes = 2 * elems * Utility.typeToBytes(t)
#params = OrderedDict([('N',N), ('K',K), ('P',P), ('Q',Q), ('type', t)])
else:
bytes, flops = [0, 0]
return bytes, flops
def bytes(self):
b,_ = self.bytes_flops()
return b
def flops(self):
_,f = self.bytes_flops()
return f
def tc(self):
for s in ["884cudnn", "1688cudnn"]:
if s in self.name:
return 1
return "-"
def op(self):
return self.op_
def mod(self):
return self.mod_
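#Worked example (illustrative): a conv2d with N=32, C=64, H=W=56, K=128, R=S=3,
#padding 1, stride 1, dilation 1, groups 1 gives P = Q = 1 + (56 + 2 - 3)/1 = 56.
#conv_bytes_flops() then reports flops = 2*N*K*P*Q*C*R*S = 14,797,504,512 and, for
#fp16, bytes = (N*C*H*W + K*C*R*S + N*K*P*Q) * 2 B = 19,341,312 * 2 = 38,682,624 B.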
from collections import OrderedDict
from .utility import Utility
from .base import OperatorLayerBase
class Convert(OperatorLayerBase):
"""
Class to handle convert operations.
"""
ops = ["byte", "char", "double", "float", "half", "int", "long", "short", "to"]
def __init__(self, d):
marker = eval(d.argMarker[0])
mod = marker['mod']
op = marker['op']
args = marker['args']
self.marker = marker
self.mod_ = mod
self.op_ = op
self.args = args
assert (mod == "Tensor")
assert (op in Convert.ops)
assert (len(args) == 1)
#The argument could be a tensor or scalar
t = args[0]
if t['type'] == "tensor":
shape = t['shape']
stype = t['dtype']
else:
shape = (1,)
stype = t['type']
if self.op_ == "to":
op = stype
self.shape = shape
self.stype = stype
self.dtype = op
def params(self):
p = OrderedDict([('T', self.shape), ('stype', self.stype), ('dtype', self.dtype)])
return p
def op(self):
return self.op_
def mod(self):
return self.mod_
def tc(self):
return "-"
def elems(self):
return Utility.numElems(self.shape)
def flops(self):
return 0
def bytes(self):
b = self.elems() * (Utility.typeToBytes(self.stype) + Utility.typeToBytes(self.dtype))
return b
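#Worked example (illustrative): converting a (1024, 1024) tensor from a 4 B/elem
#type to a 2 B/elem type reads the source and writes the destination, so
#bytes() = 1,048,576 * (4 + 2) = 6,291,456 B, while flops() stays 0.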
from .utility import Utility
class Data(object):
"""
Class to store all the data for every kernel e.g. name, bytes, flops, device, stream etc.
"""
def __init__(self, kernel):
#Available from NVprof
self.tid = kernel['tid']
self.device = kernel['device']
self.stream = kernel['stream']
self.grid = str(kernel['grid']).replace(" ","").replace("(","").replace(")","")
self.block = str(kernel['block']).replace(" ","").replace("(","").replace(")","")
self.name = kernel['kShortName'].replace(" ","_")
self.lName = kernel['kLongName']
self.sil = kernel['kDuration'] #units ns
self.index = None
#Markers
self.argMarker = kernel['marker']
self.modMarker = kernel['reprMarkers']
self.seqMarker = kernel['seqMarker']
self.layer = kernel['layer']
self.trace = kernel['trace']
self.seqId = kernel['seqId']
self.altSeqId = kernel['altSeqId']
self.dir = kernel['dir']
self.sub = kernel['subSeqId']
self.mod = "na"
self.op = "na"
self.params = {"na":"na"}
self.tc = "na"
self.flops = 0
self.bytes = 0
def setParams(self, params):
#Remove space from params
qaz = ""
for key,value in params.items():
if "type" not in key:
qaz += "{}={},".format(key,value)
else:
if type(value) is str:
qaz += "{},".format(Utility.typeToString(value))
else:
qaz += "{}".format(value)
self.params = qaz.replace(" ", "")
from collections import OrderedDict
from .utility import Utility
from .base import OperatorLayerBase
class Dropout(OperatorLayerBase):
def __init__(self, d):
marker = eval(d.argMarker[0])
mod = marker['mod']
op = marker['op']
args = marker['args']
self.marker = marker
self.mod_ = mod
self.op_ = op
self.args = args
assert (mod == "torch.nn.functional")
assert (op == "dropout")
#assert (len(args) == 1)
self.shape = args[0]['shape']
self.type = args[0]['dtype']
self.dir = d.dir
return
def params(self):
p = OrderedDict([('T', self.shape), ('type', self.type)])
return p
def op(self):
return self.op_
def mod(self):
return self.mod_
def tc(self):
return "-"
def elems(self):
return Utility.numElems(self.shape)
def bytes(self):
#Ignoring the cost of writing and reading the mask
return Utility.typeToBytes(self.type) * self.elems() * 2
def flops(self):
# Note: This is approximate and depends on the RNG
return 5*self.elems()
from collections import OrderedDict
from .utility import Utility
from .base import OperatorLayerBase
class Embedding(OperatorLayerBase):
def __init__(self, d):
marker = eval(d.argMarker[0])
mod = marker['mod']
op = marker['op']
args = marker['args']
self.marker = marker
self.mod_ = mod
self.op_ = op
self.args = args
assert (mod == "torch.nn.functional")
assert (op == "embedding")
self.ishape = args[0]['shape']
self.itype = args[0]['dtype']
self.eshape = args[1]['shape']
self.etype = args[1]['dtype']
assert (len(self.eshape) == 2)
self.dir = d.dir
self.sub = d.sub
return
def params(self):
p = OrderedDict([('I', self.ishape), ('itype', self.itype), ('E', self.eshape), ('etype', self.etype)])
return p
def op(self):
return self.op_
def mod(self):
return self.mod_
def tc(self):
return "-"
def bytes(self):
ishape = self.ishape
itype = self.itype
eshape = self.eshape
etype = self.etype
ielems = Utility.numElems(ishape)
b = 0
if self.dir == "fprop":
#indices
b += ielems * Utility.typeToBytes(itype)
#read and write the embedding matrix
b += ielems * eshape[1] * 2 * Utility.typeToBytes(etype)
else:
#3 times the size of the incoming gradient
b = ielems * eshape[1] * 3 * Utility.typeToBytes(etype)
if self.sub > 0:
b = 0
return b
def flops(self):
# Note: not implemented yet
return 0
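#Worked example (illustrative): an fprop embedding lookup with int64 indices of
#shape (32, 128) into a (50000, 1024) float16 table moves 4096 * 8 B of indices
#plus 4096 * 1024 * 2 (read + write) * 2 B of embedding rows, so
#bytes() = 32,768 + 16,777,216 = 16,809,984 B.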
from collections import OrderedDict
from .utility import Utility
import numpy as np
from .base import OperatorLayerBase
class Cat(OperatorLayerBase):
def __init__(self, d):
marker = eval(d.argMarker[0])
mod = marker['mod']
op = marker['op']
args = marker['args']
self.marker = marker
self.mod_ = mod
self.op_ = op
self.args = args
assert (mod == "torch")
assert (op == "cat")
assert (len(args) >= 2)
t = args[0]['dtype']
shapes = []
for arg in args:
if arg['type'] == "tensor":
assert (arg['dtype'] == t)
shapes.append(arg['shape'])
self.type = t
self.shapes = shapes
def params(self):
p = OrderedDict([('T', self.shapes), ('type', self.type)])
return p
def flops(self):
return 0
def tc(self):
return "-"
def op(self):
return self.op_
def mod(self):
return self.mod_
def bytes(self):
b = 0
for s in self.shapes:
b += Utility.numElems(s)
return 2 * b * Utility.typeToBytes(self.type)
class Reshape(OperatorLayerBase):
def __init__(self, d):
marker = eval(d.argMarker[0])
mod = marker['mod']
op = marker['op']
args = marker['args']
self.marker = marker
self.mod_ = mod
self.op_ = op
self.args = args
assert (mod == "Tensor")
assert (op == "reshape")
#Temporarily commenting three lines
#assert (len(args) == 2)
#t,s = args
#assert s['type'] == "tuple"
t = args[0]
assert t['type'] == "tensor"
self.type = t['dtype']
self.shape = t['shape']
def params(self):
p = OrderedDict([('T', self.shape), ('type', self.type)])
return p
def flops(self):
return 0
def tc(self):
return "-"
def op(self):
return self.op_
def mod(self):
return self.mod_
def bytes(self):
return 0
class Gather(OperatorLayerBase):
def __init__(self, d):
marker = eval(d.argMarker[0])
mod = marker['mod']
op = marker['op']
args = marker['args']
self.marker = marker
self.mod_ = mod
self.op_ = op
self.args = args
assert (mod == "Tensor") or (mod == "torch")
assert (op == "gather")
#Filter out the "out" parameter
args = list(filter(lambda x : x['name'] != 'out', args))
assert (len(args) == 3)
#Get input
if (args[0]['name'] == ""):
arg = args[0]
else:
arg = list(filter(lambda x : x['name'] == "input", args))[0]
assert (arg['type'] == "tensor")
self.shape = arg['shape']
self.type = arg['dtype']
def params(self):
p = OrderedDict([('T', self.shape),('type', self.type)])
return p
def flops(self):
return 0
def tc(self):
return "-"
def op(self):
return self.op_
def mod(self):
return self.mod_
def bytes(self):
return 2 * Utility.numElems(self.shape) * Utility.typeToBytes(self.type)
class MaskedScatter(OperatorLayerBase):
def __init__(self, d):
marker = eval(d.argMarker[0])
mod = marker['mod']
op = marker['op']
args = marker['args']
self.marker = marker
self.mod_ = mod
self.op_ = op
self.args = args
assert (mod == "Tensor")
assert (op == "masked_scatter_")
assert (len(args) == 3)
dst, mask, src = args
assert (dst['type'] == mask['type'] == src['type'] == "tensor")
assert (mask['dtype'] == "uint8")
assert (dst['dtype'] == src['dtype'])
assert (dst['shape'] == mask['shape'])
self.shape = dst['shape']
self.type = dst['dtype']
self.seqId = d.seqId
def params(self):
p = OrderedDict([('T', self.shape),('type', self.type)])
return p
def flops(self):
return 0
def tc(self):
return "-"
def op(self):
return self.op_
def mod(self):
return self.mod_
def bytes(self):
elems = Utility.numElems(self.shape)
#src and dst
b = 2 * elems * Utility.typeToBytes(self.type)
#mask (uint8)
b += elems
if (self.seqId > 0):
b = 0
return b
class Nonzero(OperatorLayerBase):
def __init__(self, d):
marker = eval(d.argMarker[0])
mod = marker['mod']
op = marker['op']
args = marker['args']
self.marker = marker
self.mod_ = mod
self.op_ = op
self.args = args
assert (mod in ["torch", "Tensor"])
assert (op == "nonzero")
assert (len(args) == 1)
arg = args[0]
self.shape = arg['shape']
self.type = arg['dtype']
self.seqId = d.seqId
def params(self):
p = OrderedDict([('T', self.shape),('type', self.type)])
return p
def flops(self):
return 0
def tc(self):
return "-"
def op(self):
return self.op_
def mod(self):
return self.mod_
def bytes(self):
elems = Utility.numElems(self.shape)
dim = len(self.shape)
#input tensor
b = elems * Utility.typeToBytes(self.type)
#in the worst case, the output is a (elems x dim) tensor of type "long"
b += elems * dim * Utility.typeToBytes("int64")
if self.seqId > 0:
return 0
else:
return b
class IndexSelect(OperatorLayerBase):
def __init__(self, d):
marker = eval(d.argMarker[0])
mod = marker['mod']
op = marker['op']
args = marker['args']
self.marker = marker
self.mod_ = mod
self.op_ = op
self.args = args
assert (mod == "Tensor") or (mod == "torch")
assert (op == "index_select")
#Filter out the "out" parameter
args = list(filter(lambda x : x['name'] != 'out', args))
assert (len(args) == 3)
#Get input, dim and index
if (args[0]['name'] == ""):
t = args[0]
else:
t = list(filter(lambda x : x['name'] == "input", args))[0]
if (args[1]['name'] == ""):
d = args[1]
else:
d = list(filter(lambda x : x['name'] == "dim", args))[0]
if (args[2]['name'] == ""):
i = args[2]
else:
i = list(filter(lambda x : x['name'] == "index", args))[0]
assert (t['type'] == i['type'] == "tensor")
assert (d['type'] == "int")
assert (i['dtype'] == "int64")
assert (len(i['shape']) == 1)
shape = t['shape']
dim = d['value']
indices = i['shape'][0]
assert (dim < len(shape))
self.shape = shape
self.dim = dim
self.indices = indices
self.type = t['dtype']
def params(self):
p = OrderedDict([('T', self.shape),('D', self.dim),('I', self.indices),('type', self.type)])
return p
def tc(self):
return "-"
def op(self):
return self.op_
def mod(self):
return self.mod_
def flops(self):
return 0
def bytes(self):
#determine the shape of the output tensor
shape = list(self.shape)
shape[self.dim] = self.indices
b = 0
#time to read the input and write the output
elems = Utility.numElems(shape)
b += 2 * elems * Utility.typeToBytes(self.type)
#time to read the indices
b += self.indices * Utility.typeToBytes("int64")
return b
class MaskedSelect(OperatorLayerBase):
def __init__(self, d):
marker = eval(d.argMarker[0])
mod = marker['mod']
op = marker['op']
args = marker['args']
self.marker = marker
self.mod_ = mod
self.op_ = op
self.args = args
self.sub = d.sub
assert (mod == "Tensor") or (mod == "torch")
assert (op == "masked_select")
#Filter out the "out" parameter
args = list(filter(lambda x : x['name'] != 'out', args))
assert (len(args) == 2)
#Get input and mask
if (args[0]['name'] == ""):
t = args[0]
else:
t = list(filter(lambda x : x['name'] == "input", args))[0]
if (args[1]['name'] == ""):
m = args[1]
else:
m = list(filter(lambda x : x['name'] == "mask", args))[0]
assert (m['dtype'] == "uint8")
tensor = t['shape']
mask = m['shape']
#check for broadcast condition
if (tensor != mask):
array1 = np.empty(list(tensor))
array2 = np.empty(list(mask))
try:
out = np.broadcast(array1, array2).shape
except:
assert False
self.tshape = tensor
self.mshape = mask
self.type = t['dtype']
def params(self):
p = OrderedDict([('T', self.tshape),('M', self.mshape),('type', self.type)])
return p
def tc(self):
return "-"
def op(self):
return self.op_
def mod(self):
return self.mod_
def bytes(self):
tensor = self.tshape
mask = self.mshape
t = self.type
#in the worst case, #output elements = #input elements
b = 2 * Utility.numElems(tensor) * Utility.typeToBytes(t)
#mask tensor (assuming uint8)
b += Utility.numElems(mask)
return b
def flops(self):
return 0
from collections import OrderedDict
from .utility import Utility
from .base import OperatorLayerBase
class Linear(OperatorLayerBase):
'''
Notes:
If the bias occurs before the GEMM, then it's 1 write (bias expansion).
If the bias occurs after, then it's 1 read and 1 write.
The bias in bprop is a reduction and hence 1 read.
'''
gemmKernels = ["gemm", "gemv", "dot_kernel", "splitKreduce_kernel", "reduce_1Block_kernel"]
biasKernels = ["kernelReduceContigDim", "kernelReduceNoncontigDim_shared", "elementwise_kernel", "reduce_kernel"]
def setXWBMNK(self, args):
x = None
w = None
b = None
if (len(args) == 2):
x,w = args
elif (len(args) == 3):
x,w,b = args
assert (x['type'] == w['type'] == "tensor")
if (b['type'] == "tensor"):
assert(len(b['shape']) == 1)
elif (b['type'] == "NoneType"):
assert b['value'] is None
b = None
else:
assert False
else:
assert False
assert(len(w['shape']) == 2)
k1 = x['shape'][-1]
n,k2 = w['shape']
assert(k1 == k2)
if b is not None:
assert(b['shape'][0] == n)
t1 = x['dtype']
t2 = w['dtype']
assert(t1 == t2)
# X, W, B
self.x = x['shape']
self.w = w['shape']
self.b = b['shape'] if b is not None else None
self.type = t1
# M, N, K
#n = Utility.numElems(x[0:-1])
n = self.x[0:-1]
k = self.x[-1]
m,k1 = self.w
assert (k == k1)
self.m = m
self.n = n
self.k = k
def tc(self):
if self.op() == "linear":
return 1 if "884gemm" in self.name else 0
else:
return "-"
def __init__(self, d):
self.name = d.name
self.dir = d.dir
self.sub = d.sub
marker = eval(d.argMarker[0])
mod = marker['mod']
op = marker['op']
args = marker['args']
assert (mod == "torch.nn.functional")
assert (op == "linear")
self.setXWBMNK(args)
if any(x in d.name for x in Linear.gemmKernels):
self.op_ = "linear"
else:
assert (d.name in Linear.biasKernels)
self.op_ = "bias"
'''
elif (("kernelPointwiseApply2" in d.name) or ("kernelReduceContigDim" in d.name) or ("kernelReduceNoncontigDim_shared" in d.name)):
#bias expansion was before the gemm
self.op_ = "bias"
elif ("elementwise_kernel" in d.name):
#Bias addition happens later with a broadcast tensor
self.op_ = "bias"
assert (len(d.argMarker) == 2)
marker = eval(d.argMarker[1])
mod = marker['mod']
op = marker['op']
args = marker['args']
assert (mod == "Tensor")
assert (op == "__iadd__")
assert (len(args) == 2)
mn = args[0]['shape']
b = args[1]['shape']
assert (len(b) == 1)
assert (mn == (self.n + (self.m,)))
assert (b == self.b)
else:
assert False
'''
def params(self):
#p = OrderedDict([('X', self.x), ('W', self.w), ('B', self.b), ('type', self.type)])
m, n, k, x, w, t = self.m, self.n, self.k, self.x, self.w, self.type
if len(n) == 1:
n = n[0]
if self.op_ == "linear":
if self.dir == "fprop":
p = OrderedDict([('M', m), ('N', n), ('K', k), ('type', t)])
elif self.dir == "bprop":
if self.sub == 0: #dgrad (most likely)
p = OrderedDict([('M', k), ('N', n), ('K', m), ('type', t)])
elif self.sub == 1: #wgrad (most likely)
p = OrderedDict([('M', k), ('N', m), ('K', n), ('type', t)])
else:
#This happens when there are additional kernels for reduction
p = OrderedDict([('X', x), ('W', w), ('type', t)])
else:
assert False
elif self.op_ == "bias":
p = OrderedDict([('M', m), ('N', n), ('type', t)])
else:
assert False
return p
def op(self):
return self.op_
def bytesFlops(self):
m = self.m
n = Utility.numElems(self.n)
k = self.k
if self.op_ == "linear":
if self.dir == "fprop":
f = m * n * k * 2
b = (m*n + m*k + n*k) * Utility.typeToBytes(self.type)
elif self.dir == "bprop":
if self.sub == 0: #dgrad (most likely)
f = m * n * k * 2
b = (m*n + m*k + n*k) * Utility.typeToBytes(self.type)
elif self.sub == 1: #wgrad (most likely)
f = m * n * k * 2
b = (m*n + m*k + n*k) * Utility.typeToBytes(self.type)
else:
#This happens when there are additional kernels for reduction
f = 0
b = 0
else:
assert False
elif self.op_ == "bias":
f = m * n
b = 2 * m * n * Utility.typeToBytes(self.type)
else:
assert False
return b,f
def bytes(self):
b, f = self.bytesFlops()
return b
def flops(self):
b, f = self.bytesFlops()
return f
def mod(self):
return self.mod_
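#Worked example (illustrative): for F.linear with input X of shape (32, 512) and
#weight W of shape (1024, 512), setXWBMNK() yields m=1024, n=(32,), k=512. The
#fprop GEMM is then reported as M=1024, N=32, K=512 with flops = 2*M*N*K =
#33,554,432; dgrad (sub 0) and wgrad (sub 1) swap the roles of M, N and K.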
from collections import OrderedDict
from .utility import Utility
from .base import OperatorLayerBase
#TODO: Add support for additional loss functions.
class MSELoss(OperatorLayerBase):
def __init__(self, d):
marker = eval(d.argMarker[0])
mod = marker['mod']
op = marker['op']
args = marker['args']
self.marker = marker
self.mod_ = mod
self.op_ = op
self.args = args
assert (mod == "torch.nn.functional")
assert (op == "mse_loss")
assert (len(args) == 3)
#Get input, target and reduction
if (args[0]['name'] == ""):
x = args[0]
else:
x = list(filter(lambda x : x['name'] == "input", args))[0]
if (args[1]['name'] == ""):
y = args[1]
else:
y = list(filter(lambda x : x['name'] == "target", args))[0]
if (args[2]['name'] == ""):
r = args[2]
else:
r = list(filter(lambda x : x['name'] == "reduction", args))[0]
assert (x['type'] == y['type'] == "tensor")
assert (x['shape'] == y['shape'])
assert (x['dtype'] == y['dtype'])
assert (r['type'] == "str")
assert (r['value'] in ["none", "mean", "sum"])
self.shape = x['shape']
self.type = x['dtype']
self.red = r['value']
self.dir = d.dir
def params(self):
p = OrderedDict([('T', self.shape), ('type', self.type), ('red', self.red)])
return p
def elems(self):
red = self.red
e = Utility.numElems(self.shape)
if self.dir == "fprop":
if red == "none":
e *= 3
else:
e *= 2
else:
if red == "none":
e *= 4
else:
e *= 3
return e
def bytes(self):
return self.elems() * Utility.typeToBytes(self.type)
def flops(self):
return self.elems() * 2 + 1
def tc(self):
return "-"
def op(self):
return self.op_
def mod(self):
return self.mod_
from collections import OrderedDict
from .utility import Utility
from .base import OperatorLayerBase
class Foo(OperatorLayerBase):
"""
An object of Foo is instantiated when we detect an unsupported operator.
"""
def __init__(self, d):
marker = eval(d.argMarker[0])
mod = marker['mod']
op = marker['op']
args = marker['args']
self.marker = marker
self.mod_ = mod
self.op_ = op
self.args = args
shapes = []
types = []
for arg in args:
if arg['type'] == "tensor":
shapes.append(arg['shape'])
types.append(arg['dtype'])
self.shape = shapes
self.type = types
def params(self):
p = OrderedDict([('T', self.shape), ('type', self.type)])
return p
def tc(self):
return "-"
def op(self):
return self.op_
def mod(self):
return self.mod_
def flops(self):
return 0
def bytes(self):
return 0
class Copy(OperatorLayerBase):
def __init__(self, d):
marker = eval(d.argMarker[0])
mod = marker['mod']
op = marker['op']
args = marker['args']
self.marker = marker
self.mod_ = mod
self.op_ = op
self.args = args
assert (mod == "Tensor")
assert (op == "copy_")
assert (len(args) == 2)
dst, src = args
assert (src['type'] == dst['type'])
assert (src['shape'] == dst['shape'])
self.shape = src['shape']
self.stype = src['dtype']
self.dtype = dst['dtype']
def params(self):
#The data type might be different
p = OrderedDict([('T', self.shape), ('stype', self.stype), ('dtype', self.dtype)])
return p
def tc(self):
return "-"
def op(self):
return self.op_
def mod(self):
return self.mod_
def flops(self):
return 0
def elems(self):
return Utility.numElems(self.shape)
def bytes(self):
return self.elems() * (Utility.typeToBytes(self.stype) + Utility.typeToBytes(self.dtype))
class Clone(OperatorLayerBase):
def __init__(self, d):
marker = eval(d.argMarker[0])
mod = marker['mod']
op = marker['op']
args = marker['args']
self.marker = marker
self.mod_ = mod
self.op_ = op
self.args = args
assert (mod == "Tensor")
assert (op == "clone")
assert (len(args) == 1)
t = args[0]
self.shape = t['shape']
self.type = t['dtype']
def params(self):
p = OrderedDict([('T', self.shape), ('type', self.type)])
return p
def flops(self):
return 0
def tc(self):
return "-"
def op(self):
return self.op_
def mod(self):
return self.mod_
def elems(self):
return Utility.numElems(self.shape)
def bytes(self):
return 2 * self.elems() * Utility.typeToBytes(self.type)
class Contiguous(OperatorLayerBase):
def __init__(self, d):
marker = eval(d.argMarker[0])
mod = marker['mod']
op = marker['op']
args = marker['args']
self.marker = marker
self.mod_ = mod
self.op_ = op
self.args = args
assert (mod == "Tensor")
assert (op == "contiguous")
assert (len(args) == 1)
t = args[0]
self.shape = t['shape']
self.type = t['dtype']
def params(self):
p = OrderedDict([('T', self.shape), ('type', self.type)])
return p
def flops(self):
return 0
def bytes(self):
return 2 * Utility.numElems(self.shape) * Utility.typeToBytes(self.type)
def tc(self):
return "-"
def op(self):
return self.op_
def mod(self):
return self.mod_
class Any(OperatorLayerBase):
def __init__(self, d):
marker = eval(d.argMarker[0])
mod = marker['mod']
op = marker['op']
args = marker['args']
self.marker = marker
self.mod_ = mod
self.op_ = op
self.args = args
assert (mod == "Tensor")
assert (op == "any")
assert (len(args) == 1) #could be 2 as well, the second argument is a bool
t = args[0]
self.shape = t['shape']
self.type = t['dtype']
self.sub = d.sub
return
def params(self):
p = OrderedDict([('T', self.shape), ('type', self.type)])
return p
def op(self):
return self.op_
def mod(self):
return self.mod_
def tc(self):
return "-"
def flops(self):
return 0
def bytes(self):
return Utility.numElems(self.shape) * Utility.typeToBytes(self.type)
from collections import OrderedDict
from .utility import Utility
from .base import OperatorLayerBase
class BatchNorm(OperatorLayerBase):
def __init__(self, d):
marker = eval(d.argMarker[0])
mod = marker['mod']
op = marker['op']
args = marker['args']
self.marker = marker
self.mod_ = mod
self.op_ = op
self.args = args
assert (op == "batch_norm")
assert (len(args) == 8)
i = args[0]
assert (i['type'] == "tensor")
self.shape = i['shape']
self.type = i['dtype']
self.dir = d.dir
def params(self):
p = OrderedDict([('T', self.shape), ('type', self.type)])
return p
def tc(self):
return "-"
def op(self):
return self.op_
def mod(self):
return self.mod_
def elems(self):
return Utility.numElems(self.shape)
def flops(self):
# Variance algo-dependent, but this is a reasonable value.
return self.elems() * 8
def bytes(self):
e = self.elems()
if self.dir == "fprop":
e *= 4
else:
e *= 5
return e * Utility.typeToBytes(self.type)
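#Worked example (illustrative): batch_norm on a (32, 64, 56, 56) fp16 tensor has
#6,422,528 elements, so flops() = 8 * elems = 51,380,224 and, in fprop,
#bytes() = 4 * elems * 2 B = 51,380,224 B (5 passes instead of 4 in bprop).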