Unverified Commit 8a7a3325 authored by Masaki Kozuki, committed by GitHub

Remove `pyprof` and `reparameterization` (#1404)

* remove pyprof

* remove reparameterization

* remove pyprof test

* clean up
parent cd499737
#!/usr/bin/env python3
import torch
import torch.cuda.profiler as profiler
from apex import pyprof
class Foo(torch.nn.Module):
def __init__(self, size):
super(Foo, self).__init__()
self.n = torch.nn.Parameter(torch.ones(size))
self.m = torch.nn.Parameter(torch.ones(size))
def forward(self, input):
return self.n*input + self.m
foo = Foo(4)
foo.cuda()
x = torch.ones(4).cuda()
#JIT the class using tracing
traced_foo = torch.jit.trace(foo, x)
#Initialize pyprof after the JIT step
pyprof.nvtx.init()
#Assign a name to the object "traced_foo"
traced_foo.__dict__['__name__'] = "foo"
#Hook up the forward function to pyprof
pyprof.nvtx.wrap(traced_foo, 'forward')
with torch.autograd.profiler.emit_nvtx():
profiler.start()
z = traced_foo(x)
profiler.stop()
print(z)
#!/bin/bash
set -e
SCRIPT=`realpath $0`
SCRIPTPATH=`dirname $SCRIPT`
PYPROF="$SCRIPTPATH/../.."
parse="python $PYPROF/parse/parse.py"
prof="python $PYPROF/prof/prof.py"
for f in *.py
do
base=`basename $f .py`
sql=$base.sql
dict=$base.dict
#NVprof
echo "nvprof -fo $sql python $f"
nvprof -fo $sql python $f
#Parse
echo $parse $sql
$parse $sql > $dict
#Prof
echo $prof $dict
$prof -w 130 $dict
\rm $sql $dict
done
#!/usr/bin/env python3
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.cuda.profiler as profiler
import torch.optim as optim
from apex import pyprof
pyprof.nvtx.init()
class LeNet5(nn.Module):
def __init__(self):
super(LeNet5, self).__init__()
# 1 input image channel, 6 output channels, 5x5 square convolution
# kernel
self.conv1 = nn.Conv2d(1, 6, 5)
self.conv2 = nn.Conv2d(6, 16, 5)
# an affine operation: y = Wx + b
self.fc1 = nn.Linear(16 * 5 * 5, 120)
self.fc2 = nn.Linear(120, 84)
self.fc3 = nn.Linear(84, 10)
def forward(self, x):
# Max pooling over a (2, 2) window
x = F.max_pool2d(F.relu(self.conv1(x)), (2, 2))
# If the size is a square you can only specify a single number
x = F.max_pool2d(F.relu(self.conv2(x)), 2)
x = x.view(-1, self.num_flat_features(x))
x = F.relu(self.fc1(x))
x = F.relu(self.fc2(x))
x = self.fc3(x)
return x
def num_flat_features(self, x):
size = x.size()[1:] # all dimensions except the batch dimension
num_features = 1
for s in size:
num_features *= s
return num_features
with torch.autograd.profiler.emit_nvtx():
net = LeNet5().cuda()
input = torch.randn(1, 1, 32, 32).cuda()
out = net(input)
target = torch.randn(10) # a dummy target, for example
target = target.view(1, -1).cuda() # make it the same shape as output
criterion = nn.MSELoss()
# create your optimizer
optimizer = optim.SGD(net.parameters(), lr=0.01)
# in your training loop:
optimizer.zero_grad() # zero the gradient buffers
profiler.start()
output = net(input)
loss = criterion(output, target)
loss.backward()
optimizer.step() # Does the update
profiler.stop()
#!/usr/bin/env python3
"""
This file exercises the Python operators (via the `operator` module) on tensors.
"""
import sys
import torch
import torch.cuda.profiler as profiler
import operator
import inspect
#Import and initialize pyprof
from apex import pyprof
pyprof.nvtx.init()
X = 1024
Y = 1024
fa = torch.rand(X, Y).cuda()
fb = torch.rand(X, Y).cuda()
fc = torch.rand(X, Y).cuda()
ia = torch.randint(0, 100, (X, Y)).cuda()
ib = torch.randint(0, 100, (X, Y)).cuda()
sa = torch.ones(1,1).cuda()
sb = torch.ones(1,1).cuda()
ba = fa.byte()
unaryOps = ["abs", "__abs__", "neg", "__neg__",]
invertOps = ["inv", "invert", "__inv__", "__invert__",] #implemented only for byte tensors
#pos, __pos__ is not implemented for tensors
binaryOps = []
binaryOps += [ "lt", "__lt__", "le", "__le__", "eq", "__eq__", "ne", "__ne__", "ge", "__ge__", "gt", "__gt__" ]
binaryOps += [ "add", "__add__", "sub", "__sub__", "mul", "__mul__", "floordiv", "__floordiv__", "truediv", "__truediv__", "pow", "__pow__", "mod", "__mod__"]
binaryOps += [ "and_", "__and__", "or_", "__or__", "xor", "__xor__", "lshift", "__lshift__", "rshift", "__rshift__"]
inplaceOps = []
inplaceOps += ["iadd", "__iadd__", "isub", "__isub__", "imul", "__imul__", "ifloordiv", "__ifloordiv__", "itruediv", "__itruediv__", "imod", "__imod__",]
#ipow, __ipow__ is not implemented in pytorch
inplaceOps += [ "iand", "__iand__", "ior", "__ior__", "ixor", "__ixor__", "ilshift", "__ilshift__", "irshift", "__irshift__",]
matmulOps = [ "matmul", "__matmul__" ]
inplacematmulOps = [ "imatmul", "__imatmul__" ]
reverseIntBinaryOps = ["__radd__", "__rsub__", "__rmul__", "__rfloordiv__", "__rpow__",]
reverseFloatBinaryOps = ["__radd__", "__rsub__", "__rmul__", "__rdiv__", "__rtruediv__", "__rfloordiv__", "__rpow__",]
'''
TODO
.concat(a, b)
.__concat__(a, b)
.contains(a, b)
.__contains__(a, b)
.countOf(a, b)
.delitem(a, b)
.__delitem__(a, b)
.getitem(a, b)
.__getitem__(a, b)
.indexOf(a, b)
.setitem(a, b, c)
.__setitem__(a, b, c)
.length_hint(obj, default=0)
.iconcat(a, b)
.__iconcat__(a, b)
.index(a)
.__index__(a)
'''
#Context manager
with torch.autograd.profiler.emit_nvtx():
#Start profiler
profiler.start()
for op in unaryOps:
assert hasattr(operator, op)
f = getattr(operator, op)
assert inspect.isbuiltin(f)
c = f(ia)
for op in invertOps:
assert hasattr(operator, op)
f = getattr(operator, op)
assert inspect.isbuiltin(f)
c = f(ba)
for op in binaryOps:
assert hasattr(operator, op)
f = getattr(operator, op)
assert inspect.isbuiltin(f)
c = f(ia, ib)
c = f(ia, 2)
for op in inplaceOps:
assert hasattr(operator, op)
f = getattr(operator, op)
assert inspect.isbuiltin(f)
ia = f(ia, ib)
ia = f(ia, 2)
for op in matmulOps:
assert hasattr(operator, op)
f = getattr(operator, op)
assert inspect.isbuiltin(f)
c = f(fa, fb)
for op in inplacematmulOps:
assert hasattr(operator, op)
f = getattr(operator, op)
assert inspect.isbuiltin(f)
fa = f(fa, fb)
for op in reverseIntBinaryOps:
assert hasattr(torch.Tensor, op)
f = getattr(torch.Tensor, op)
ia = f(ia, ib)
for op in reverseFloatBinaryOps:
assert hasattr(torch.Tensor, op)
f = getattr(torch.Tensor, op)
fa = f(fa, fb)
'''
#c = fa[3]
#c = fa[3][3]
#c = torch.min(fa, 3)
c = torch.sum(fa)
c = torch.max(fa)
c = -fa
#fc[2][2] = fa[2][2]
c = a_scalar and b_scalar
c = a_scalar or b_scalar
c = not a_scalar
c = a is b
c = a is not b
'''
#Stop profiler
profiler.stop()
#!/usr/bin/env python3
"""
This simple file provides an example of how to
- import the pyprof library and initialize it
- use the emit_nvtx context manager
- start and stop the profiler
Only kernels within profiler.start and profiler.stop calls are profiled.
To profile
$ nvprof -f -o simple.sql --profile-from-start off ./simple.py
"""
import sys
import torch
import torch.cuda.profiler as profiler
#Import and initialize pyprof
from apex import pyprof
pyprof.nvtx.init()
a = torch.randn(5, 5).cuda()
b = torch.randn(5, 5).cuda()
#Context manager
with torch.autograd.profiler.emit_nvtx():
#Start profiler
profiler.start()
c = a + b
c = torch.mul(a,b)
c = torch.matmul(a,b)
c = torch.argmax(a, dim=1)
c = torch.nn.functional.pad(a, (1,1))
#Stop profiler
profiler.stop()
NVIDIA NVTX range markers (https://docs.nvidia.com/gameworks/content/gameworkslibrary/nvtx/nvidia_tools_extension_library_nvtx.htm) are a useful tool for capturing and observing events and code ranges.
Using the PyTorch APIs `torch.cuda.nvtx.range_push("xxx")` and `torch.cuda.nvtx.range_pop()`, users can easily add their own NVTX range markers, which can then be observed in the NVIDIA Visual Profiler (NVVP).
If the marker strings follow the pattern `"layer:your_string_here"`, e.g. `"layer:conv1"` or `"layer:encoder_layer_3_self_attention"`, then `pyprof` displays the strings `conv1` and `encoder_layer_3_self_attention` next to the associated kernels in the output of `prof.py` when it is run with the `-c layer` option.
NVTX range markers can be nested; if users follow the above string pattern, the output of `prof.py` shows all the markers associated with a kernel.
The file `resnet.py` (a simplified version of the torchvision model) shows how to add (nested) NVTX markers whose information can greatly aid the understanding and analysis of a network.
Note that the pattern `"layer:your_string_here"` was chosen to aid information extraction by `pyprof`; the tool works seamlessly even if other markers, or no markers at all, are present.
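For example, the following minimal sketch (illustrative layer names, CUDA device assumed) wraps a single convolution in two nested `"layer:"` ranges; with the `-c layer` option, `prof.py` lists both labels next to the kernels launched inside the inner range:

```python
import torch
import torch.cuda.nvtx as nvtx

conv = torch.nn.Conv2d(3, 8, kernel_size=3).cuda()
x = torch.randn(1, 3, 32, 32).cuda()

nvtx.range_push("layer:block_1")   # outer marker
nvtx.range_push("layer:conv1")     # inner marker
y = conv(x)                        # kernels launched here carry both labels
nvtx.range_pop()
nvtx.range_pop()
```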
### To run
```sh
nvprof -fo resnet.sql --profile-from-start off python resnet.py
parse.py resnet.sql > resnet.dict
prof.py --csv -c idx,layer,dir,mod,op,kernel,params,sil resnet.dict
```
The file `resnet.sql` can also be opened with NVVP as usual.
#!/usr/bin/env python3
"""
An example showing use of nested NVTX markers.
"""
import torch
import torch.nn as nn
import torch.cuda.profiler as profiler
import torch.cuda.nvtx as nvtx
from apex import pyprof
pyprof.nvtx.init()
def conv3x3(in_planes, out_planes, stride=1, groups=1, dilation=1):
"""3x3 convolution with padding"""
return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
padding=dilation, groups=groups, bias=False, dilation=dilation)
def conv1x1(in_planes, out_planes, stride=1):
"""1x1 convolution"""
return nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False)
class Bottleneck(nn.Module):
expansion = 4
count = 1
def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1,
base_width=64, dilation=1, norm_layer=None):
super(Bottleneck, self).__init__()
if norm_layer is None:
norm_layer = nn.BatchNorm2d
width = int(planes * (base_width / 64.)) * groups
# Both self.conv2 and self.downsample layers downsample the input when stride != 1
self.conv1 = conv1x1(inplanes, width)
self.bn1 = norm_layer(width)
self.conv2 = conv3x3(width, width, stride, groups, dilation)
self.bn2 = norm_layer(width)
self.conv3 = conv1x1(width, planes * self.expansion)
self.bn3 = norm_layer(planes * self.expansion)
self.relu = nn.ReLU(inplace=True)
self.downsample = downsample
self.stride = stride
self.id = Bottleneck.count
Bottleneck.count += 1
def forward(self, x):
identity = x
nvtx.range_push("layer:Bottleneck_{}".format(self.id))
nvtx.range_push("layer:Conv1")
out = self.conv1(x)
nvtx.range_pop()
nvtx.range_push("layer:BN1")
out = self.bn1(out)
nvtx.range_pop()
nvtx.range_push("layer:ReLU")
out = self.relu(out)
nvtx.range_pop()
nvtx.range_push("layer:Conv2")
out = self.conv2(out)
nvtx.range_pop()
nvtx.range_push("layer:BN2")
out = self.bn2(out)
nvtx.range_pop()
nvtx.range_push("layer:ReLU")
out = self.relu(out)
nvtx.range_pop()
nvtx.range_push("layer:Conv3")
out = self.conv3(out)
nvtx.range_pop()
nvtx.range_push("layer:BN3")
out = self.bn3(out)
nvtx.range_pop()
if self.downsample is not None:
nvtx.range_push("layer:Downsample")
identity = self.downsample(x)
nvtx.range_pop()
nvtx.range_push("layer:Residual")
out += identity
nvtx.range_pop()
nvtx.range_push("layer:ReLU")
out = self.relu(out)
nvtx.range_pop()
nvtx.range_pop()
return out
class ResNet(nn.Module):
def __init__(self, block, layers, num_classes=1000,
groups=1, width_per_group=64, norm_layer=None):
super(ResNet, self).__init__()
if norm_layer is None:
norm_layer = nn.BatchNorm2d
self._norm_layer = norm_layer
self.inplanes = 64
self.dilation = 1
self.groups = groups
self.base_width = width_per_group
self.conv1 = nn.Conv2d(3, self.inplanes, kernel_size=7, stride=2, padding=3, bias=False)
self.bn1 = norm_layer(self.inplanes)
self.relu = nn.ReLU(inplace=True)
self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
self.layer1 = self._make_layer(block, 64, layers[0])
self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
self.layer4 = self._make_layer(block, 512, layers[3], stride=2)
self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
self.fc = nn.Linear(512 * block.expansion, num_classes)
for m in self.modules():
if isinstance(m, nn.Conv2d):
nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)):
nn.init.constant_(m.weight, 1)
nn.init.constant_(m.bias, 0)
def _make_layer(self, block, planes, blocks, stride=1, dilate=False):
norm_layer = self._norm_layer
downsample = None
previous_dilation = self.dilation
if dilate:
self.dilation *= stride
stride = 1
if stride != 1 or self.inplanes != planes * block.expansion:
downsample = nn.Sequential(
conv1x1(self.inplanes, planes * block.expansion, stride),
norm_layer(planes * block.expansion),
)
layers = []
layers.append(block(self.inplanes, planes, stride, downsample, self.groups,
self.base_width, previous_dilation, norm_layer))
self.inplanes = planes * block.expansion
for _ in range(1, blocks):
layers.append(block(self.inplanes, planes, groups=self.groups,
base_width=self.base_width, dilation=self.dilation,
norm_layer=norm_layer))
return nn.Sequential(*layers)
def forward(self, x):
nvtx.range_push("layer:conv1_x")
x = self.conv1(x)
x = self.bn1(x)
x = self.relu(x)
x = self.maxpool(x)
nvtx.range_pop()
nvtx.range_push("layer:conv2_x")
x = self.layer1(x)
nvtx.range_pop()
nvtx.range_push("layer:conv3_x")
x = self.layer2(x)
nvtx.range_pop()
nvtx.range_push("layer:conv4_x")
x = self.layer3(x)
nvtx.range_pop()
nvtx.range_push("layer:conv5_x")
x = self.layer4(x)
nvtx.range_pop()
x = self.avgpool(x)
x = torch.flatten(x, 1)
nvtx.range_push("layer:FC")
x = self.fc(x)
nvtx.range_pop()
return x
def resnet50():
return ResNet(Bottleneck, [3, 4, 6, 3])
#Create model
net = resnet50().cuda().half()
net.train()
#Create optimizer
criterion = nn.CrossEntropyLoss().cuda()
optimizer = torch.optim.SGD(net.parameters(), lr = 0.01, momentum=0.9)
#Create synthetic input and label
x = torch.rand(32, 3, 224, 224).cuda().half()
target = torch.empty(32, dtype=torch.long).random_(1000).cuda()
with torch.autograd.profiler.emit_nvtx():
profiler.start()
output = net(x)
loss = criterion(output, target)
optimizer.zero_grad()
loss.backward()
optimizer.step()
profiler.stop()
#!/bin/bash
set -e
SCRIPT=`realpath $0`
SCRIPTPATH=`dirname $SCRIPT`
PYPROF="$SCRIPTPATH/../.."
parse="python $PYPROF/parse/parse.py"
prof="python $PYPROF/prof/prof.py"
for f in *.py
do
base=`basename $f .py`
sql=$base.sql
dict=$base.dict
#NVprof
echo "nvprof -fo --profile-from-start off $sql python $f"
nvprof -fo $sql --profile-from-start off python $f
#Parse
echo $parse $sql
$parse $sql > $dict
#Prof
echo $prof $dict
#$prof -w 130 $dict
$prof --csv -c idx,layer,dir,mod,op,kernel,params,sil $dict
\rm $sql $dict
done
from .nvmarker import init
from .nvmarker import add_wrapper as wrap
"""
This file intercepts (monkey patches) the following functions and adds NVTX markers.
torch.*
torch.Tensor.*
torch.nn.functional.*
torch.nn.*.forward
The NVTX markers (one or more) contain the following information
call trace (a list of file_name:line_number)
extra_repr() from torch.nn modules
module/class name
function name
inputs (args and kwargs)
scalar: name, type and value
tensor: name, shape and datatype
numpy: name, shape and datatype
list/tuple: a sequence of scalars or tensors or numpy arrays
"""
import torch
import torch.cuda.nvtx as nvtx
import numpy
import inspect as ins
import traceback
import math
def isfunc(mod, f):
assert hasattr(mod, f)
attr = getattr(mod, f)
#Ignore functions like _add
if (len(f) >= 2):
if f[0] == "_" and f[1] != "_":
return False
#Ignore functions from this list
ignore = ['__all__', '__array__', '__array_priority__', '__array_wrap__', '__bool__', '__builtins__', '__cached__', '__class__', '__deepcopy__', '__delattr__', '__delitem__', '__dict__', '__dir__', '__doc__', '__file__', '__format__', '__getattribute__', '__getitem__', '__hash__', '__index__', '__init__', '__init_subclass__', '__iter__', '__len__', '__loader__', '__module__', '__name__', '__new__', '__nonzero__', '__package__', '__path__', '__reduce__', '__reduce_ex__', '__repr__', '__reversed__', '__setattr__', '__setitem__', '__setstate__', '__sizeof__', '__spec__', '__str__', '__subclasshook__', '__version__', '__weakref__']
#Add functions to this list if they cause recursion
ignore += ['size', 'tolist', 'dim', 'is_storage', 'item']
if f in ignore:
return False
return ins.ismethod(attr) or ins.isfunction(attr) or ins.ismethoddescriptor(attr) or ins.isbuiltin(attr)
def traceMarker(stack):
d = {}
cadena = []
for i in range(len(stack)-1):
fi = stack[i]
t = "{}:{}".format(fi.filename, fi.lineno)
cadena.append(t)
d['traceMarker'] = cadena
return str(d)
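# For illustration (hypothetical paths): traceMarker() returns the str() of a dict like
#   {'traceMarker': ['/workspace/model.py:42', '/workspace/train.py:88']}
# i.e. one "filename:lineno" entry per stack frame, excluding the last frame.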
def modMarker(mod, fn_name, args):
"""
Returns the stringified extra_repr() of a module.
"""
assert(fn_name == 'forward')
assert(len(args) > 0)
d = {}
d['mod'] = mod.__name__
d['strRepr'] = args[0].extra_repr()
return str(d)
def add_wrapper(mod, fn_name):
assert isfunc(mod, fn_name)
# Get a pointer to the original function
func = getattr(mod, fn_name)
# Check if the mod has a string representation
# and is not a Script or Traced module (used by JIT)
s = hasattr(mod, "extra_repr") and (type(mod) is not torch.jit.ScriptModule) and (type(mod) is not torch.jit.TopLevelTracedModule)
def wrapper_func(*args, **kwargs):
# Extract the stacktrace
stack = traceback.extract_stack()
# Push trace marker
nvtx.range_push(traceMarker(stack))
# Push module marker
if s:
m = modMarker(mod, fn_name, args)
nvtx.range_push(m)
# Create and push argument marker
cadena = argMarker(mod, fn_name, args, kwargs)
nvtx.range_push(cadena)
# Call the original function
result = func(*args, **kwargs)
# Pop argument marker
nvtx.range_pop()
# Pop module marker
if s:
nvtx.range_pop()
# Pop trace marker
nvtx.range_pop()
return result
setattr(mod, fn_name, wrapper_func)
def argMarker(mod, op, args, kwargs):
#For this function args is a tuple and kwargs is a dict
def tensor(arg, name=""):
a = {}
a['name'] = name
a['type'] = "tensor"
a['shape'] = tuple(arg.size())
a['dtype'] = str(arg.dtype).split(".")[-1]
cadena['args'].append(a)
def ndarray(arg, name=""):
a = {}
a['name'] = name
a['type'] = "ndarray"
a['shape'] = arg.shape
a['dtype'] = str(arg.dtype).split(".")[-1]
cadena['args'].append(a)
def seq(arg, name=""):
assert issequence(arg)
a = {}
a['name'] = name
if isinstance(arg, list):
a['type'] = "list"
a['value'] = arg
else:
a['type'] = "tuple"
# The arg could be torch.Size, which is a subclass of tuple
# Therefore, explicitly convert to tuple
a['value'] = tuple(arg)
cadena['args'].append(a)
def scalar(arg, name=""):
a = {}
a['name'] = name
a['type'] = type(arg).__name__
#handle the case when the argument is +/- inf or nan
if arg == float('inf'):
a['value'] = "inf"
elif arg == float('-inf'):
a['value'] = "-inf"
elif isinstance(arg, float) and math.isnan(arg):
a['value'] = "nan"
else:
a['value'] = arg
cadena['args'].append(a)
def isscalar(arg):
return (type(arg) is int) or (type(arg) is float) or (type(arg) is bool) or (arg is None) or (type(arg) is str)
def issequence(arg):
return isinstance(arg, list) or isinstance(arg, tuple)
def foo(args, name):
#args should be an iterable sequence e.g. list or tuple
for arg in args:
if isinstance(arg, torch.Tensor):
if arg.dim() == 0:
scalar(arg.item(), name)
else:
tensor(arg, name)
elif isinstance(arg, numpy.ndarray):
ndarray(arg, name)
elif (isscalar(arg)):
scalar(arg, name)
elif issequence(arg):
if (len(arg) == 0) or isscalar(arg[0]): #An empty sequence or a sequence of scalars
seq(arg, name)
else: # A sequence of tensors or numpy arrays
foo(arg, name)
'''
else:
print("The following arg is none of Tensor, numpy array, scalar but a %s" % (str(type(arg))))
print("Mod: %s" % str(mod.__name__))
print("Op: %s" % str(op))
print(dir(arg))
'''
cadena = {}
cadena['mod'] = mod.__name__
cadena['op'] = op
cadena['args'] = []
foo(args, "")
for k,v in kwargs.items():
foo((v,), k)
return str(cadena)
def patchClass(cls):
for f in dir(cls):
if isfunc(cls, f):
add_wrapper(cls, f)
def init():
string = "\n\nPyprof has been moved to its own dedicated repository and will " + \
"soon be removed from Apex. Please visit\n" + \
"https://github.com/NVIDIA/PyProf\n" + \
"for the latest version.\n\n"
# print regardless of warning state
print(string)
print("Initializing NVTX monkey patches")
for cls in [torch, torch.Tensor, torch.nn.functional,]:
patchClass(cls)
for cls in [torch.nn.RNN, torch.nn.RNNCell, torch.nn.LSTM, torch.nn.LSTMCell, torch.nn.GRU, torch.nn.GRUCell]:
if isfunc(cls, 'forward'):
add_wrapper(cls, 'forward')
print("Done with NVTX monkey patching")
import warnings
try:
from .parse import main
except ImportError as e:
warnings.warn("Did you make sure to install PyProf dependencies by using the --pyprof flag during Apex installation?)")
raise e
if __name__ == '__main__':
main()
import sys, sqlite3
class DB(object):
"""
This class provides functions for DB operations
with exception handling.
"""
def __init__(self, dbFile):
try:
conn = sqlite3.connect(dbFile)
conn.row_factory = sqlite3.Row
c = conn.cursor()
except:
print("Error opening {}".format(dbFile))
sys.exit(1)
self.conn = conn
self.c = c
def select(self, cmd):
try:
self.c.execute(cmd)
#rows = self.c.fetchall()
rows = [dict(row) for row in self.c.fetchall()]
except sqlite3.Error as e:
print(e)
sys.exit(1)
except:
print("Uncaught error in SQLite access while executing {}".format(cmd))
sys.exit(1)
#print(rows)
return rows
def insert(self, cmd, data):
try:
self.c.execute(cmd, data)
except sqlite3.Error as e:
print(e)
sys.exit(1)
except:
print("Uncaught error in SQLite access while executing {}".format(cmd))
sys.exit(1)
def execute(self, cmd):
try:
self.c.execute(cmd)
except sqlite3.Error as e:
print(e)
sys.exit(1)
except:
print("Uncaught error in SQLite access while executing {}".format(cmd))
sys.exit(1)
def commit(self):
self.conn.commit()
def close(self):
self.c.close()
self.conn.close()
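# Typical usage, sketched for illustration (file name is hypothetical):
#   db = DB("net.sql")
#   rows = db.select("select value from StringTable limit 5")   # list of dicts
#   db.close()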
import cxxfilt, struct, binascii
#Helper functions
def demangle(name):
"""
Demangle a C++ string
"""
return cxxfilt.demangle(name)
def encode_object_id(pid, tid):
"""
Given process id (pid) and thread id (tid), return the object id.
object id = pid (little endian 4 bytes) + tid (little endian 8 bytes)
"""
objId = struct.pack('<i', pid) + struct.pack('<q',tid)
objId = binascii.hexlify(objId).decode('ascii').upper()
return objId
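# For illustration: encode_object_id(pid=1234, tid=5678) returns
# "D20400002E16000000000000", i.e. the 4-byte little-endian pid followed by the
# 8-byte little-endian tid, hex-encoded and upper-cased.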
def getShortName(name):
"""
Returns a shorter kernel name
"""
sname = name.split("<")[0] \
.replace("void ", "") \
.replace("at::","") \
.replace("cuda::", "") \
.replace("native::","") \
.replace("(anonymous namespace)::", "")
sname = sname.split("(")[0]
return sname
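# For illustration, a demangled kernel name such as
#   "void at::native::vectorized_elementwise_kernel<4, ...>(...)"
# is shortened to "vectorized_elementwise_kernel".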
class Kernel(object):
"""
This class stores information about a kernel.
"""
kernels = []
profStart = 0
def __init__(self):
self.kNameId = None
self.kShortName = None
self.kLongName = None
self.kStartTime = None #GPU start time
self.kEndTime = None #GPU end time
self.kDuration = None
self.device = None
self.stream = None
self.grid = ()
self.block = ()
self.corrId = None
self.rStartTime = None #CPU start time
self.rEndTime = None #CPU end time
self.rDuration = None
self.tid = None
self.pid = None
self.objId = None
self.timeOffset = None
self.layerMarkers = []
self.traceMarkers = []
self.reprMarkers = []
self.pyprofMarkers = []
self.seqMarkers = []
self.otherMarkers = []
self.altMarkers = []
self.seqId = []
self.altSeqId = []
self.layer = []
self.subSeqId = None
self.dir = None
self.mod = []
self.op = []
def setKernelInfo(self, info):
self.kNameId = info['name']
self.corrId = int(info['correlationId'])
start = int(info['start'])
end = int(info['end'])
assert end > start, "This assertion can fail for very large profiles. It usually fails when start = end = 0."
self.kStartTime = start
self.kEndTime = end
self.kDuration = end - start
assert (start > Kernel.profStart)
self.device = int(info['deviceId'])
self.stream = int(info['streamId'])
self.grid = (info['gridX'], info['gridY'], info['gridZ'])
self.block = (info['blockX'], info['blockY'], info['blockZ'])
self.timeOffset = Kernel.profStart
def setKernelName(self, name):
cadena = demangle(name)
self.kLongName = cadena
self.kShortName = getShortName(cadena)
def setRunTimeInfo(self, info):
start, end, pid, tid = info
self.rStartTime = start
self.rEndTime = end
self.rDuration = end - start
self.pid = pid
self.tid = tid
self.objId = encode_object_id(pid, tid)
def setMarkerInfo(self, info):
self.layerMarkers, self.traceMarkers, self.reprMarkers, self.pyprofMarkers, self.seqMarkers, self.otherMarkers, self.altMarkers, self.seqId, self.altSeqId, self.layer = info
self.subSeqId = 0
def setDirection(self):
"""
Set direction (fprop, bprop) based on PyTorch sequence markers.
It is a heuristic and not a foolproof method.
"""
if any("Backward, seq = " in x for x in self.seqMarkers) or \
any("backward, seq = " in x for x in self.seqMarkers) or \
any("Backward0, seq = " in x for x in self.seqMarkers):
self.dir = "bprop"
else:
self.dir = "fprop"
def setOp(self):
"""
Detect and set the class/module (mod) and operation (op)
of the kernel e.g. torch.nn.functional / linear, torch / sigmoid.
The lookup sequence we use is
NVTX markers inserted by pyprof
NVTX markers inserted by PyTorch in bprop
NVTX markers inserted by PyTorch in fprop
It is a heuristic and not a foolproof method.
"""
def sanitize(name):
name = name.replace("torch","") \
.replace("autograd","") \
.replace("_backward","") \
.replace("::","") \
.replace("jit","") \
.replace("(anonymous namespace)","")
head, sep, tail = name.partition("Backward")
return head
#Check pyprof markers
for m in self.pyprofMarkers:
assert ("mod" in m) and ("op" in m) and ("args" in m)
t = eval(m)
self.op.append(t['op'])
self.mod.append(t['mod'])
if len(self.op):
return
#Check bprop kernel markers
for m in self.seqMarkers:
if ("backward, seq = " in m) or ("Backward, seq = " in m):
op = m.split(",")[0]
op = sanitize(op)
self.op.append(op)
self.mod.append('na')
if len(self.op):
return
#Check markers with "seq = "
for m in self.seqMarkers:
if ", seq = " in m:
op = m.split(",")[0]
self.op.append(op)
self.mod.append('na')
if len(self.op):
return
#If nothing else
if len(self.otherMarkers):
self.op.append(self.otherMarkers[0])
self.mod.append('na')
def print(self):
"""
Print kernel information. This is used by prof.py.
"""
a = lambda: None
a.kShortName = self.kShortName
a.kDuration = self.kDuration
#a.layerMarkers = self.layerMarkers
a.layer = self.layer
a.trace = self.traceMarkers
a.reprMarkers = self.reprMarkers
a.marker = self.pyprofMarkers
a.seqMarker = self.seqMarkers
a.seqId = self.seqId
a.subSeqId = self.subSeqId
a.altSeqId = self.altSeqId
a.dir = self.dir
a.mod = self.mod
a.op = self.op
a.tid = self.tid
a.device = self.device
a.stream = self.stream
a.grid = self.grid
a.block = self.block
a.kLongName = self.kLongName
print(a.__dict__)
import sys
class NVVP(object):
"""
This class gets kernel information from the SQL (nvvp) database.
"""
driverT = "CUPTI_ACTIVITY_KIND_DRIVER"
runtimeT = "CUPTI_ACTIVITY_KIND_RUNTIME"
kernelT = "CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL"
markerT = "CUPTI_ACTIVITY_KIND_MARKER"
stringT = "StringTable"
def __init__(self, db):
self.db = db
self.markerId = 0
def getProfileStart(self):
"""
Get the profile start time
"""
profStart = sys.maxsize
for table in [self.driverT, self.runtimeT, self.kernelT, self.markerT]:
colname = "timestamp" if table is self.markerT else "start"
cmd = "select {} from {} ORDER BY {} ASC LIMIT 1".format(colname, table, colname)
result = self.db.select(cmd)
assert(len(result) <= 1)
if (len(result) == 1):
assert(colname in result[0])
t = result[0][colname]
if (t < profStart):
profStart = t
assert(profStart < sys.maxsize)
return profStart
def getString(self, id_):
"""
Get the string associated with an id.
"""
cmd = "select value from {} where _id_ = {}".format(self.stringT, id_)
result = self.db.select(cmd)
assert (len(result) == 1)
return result[0]['value']
def createMarkerTable(self):
"""
Create a temporary table and index it to speed up repeated SQL queries.
The table is an INNER JOIN of CUPTI_ACTIVITY_KIND_MARKER with itself.
"""
cmd = 'CREATE TEMPORARY TABLE marker AS SELECT \
a._id_ as id, \
a.timestamp AS startTime, \
b.timestamp AS endTime, \
HEX(a.objectId) AS objectId, \
a.name AS name \
FROM {} AS a INNER JOIN {} AS b ON \
a.id = b.id and \
a.flags = 2 and b.flags = 4'.format(self.markerT, self.markerT)
self.db.execute(cmd)
self.db.execute('CREATE INDEX start_index ON marker (startTime)')
self.db.execute('CREATE INDEX end_index ON marker (endTime)')
self.db.execute('CREATE INDEX id_index ON marker (id)')
def getCPUInfo(self, corrId):
"""
Given the correlation id, get CPU start, end, thread id, process id.
The information can be in the runtime table or the driver table.
"""
#First look in the runtime table
cmd = "select start,end,processId,threadId from {} where correlationId={}".format(self.runtimeT, corrId);
result = self.db.select(cmd)
assert (len(result) <= 1)
if (len(result) == 0):
#Look in the driver table
cmd = "select start,end,processId,threadId from {} where correlationId={}".format(self.driverT, corrId);
result = self.db.select(cmd)
assert (len(result) == 1)
info = result[0]
start = info['start']
end = info['end']
pid = info['processId']
tid = info['threadId']
tid = tid & 0xffffffff #convert to unsigned
assert (end > start)
return [start, end, pid, tid]
def getKernelInfo(self):
"""
Get GPU kernel info
"""
cmd = "select name,correlationId,start,end,deviceId,streamId,gridX,gridY,gridZ,blockX,blockY,blockZ from {}".format(self.kernelT)
result = self.db.select(cmd)
return result
def getMarkerInfo(self, objId, startTime, endTime):
"""
This function first finds all NVTX markers encapsulating
a runtime / driver kernel launch.
It then splits the markers into many lists.
layerMarkers : User added NVTX markers
traceMarkers : Call trace markers (inserted by pyprof)
reprMarkers : Markers containing the extra_repr() of a module (inserted by pyprof)
pyprofMarkers: Markers containing args and kwargs (tensor shape, datatype etc.)
seqMarkers : Markers containing PyTorch internal sequence markers (inserted by PyTorch)
altSeqMarkers: Markers inserted by PyTorch between two kernel launches. Needs better explanation.
otherMarkers : Markers not in either of the above categories.
We extract seqId from the seq and altSeq markers. The seqId is used in bprop.
We also extract information from the layerMarkers.
"""
layerMarkers = []
traceMarkers = []
reprMarkers = []
pyprofMarkers = []
seqMarkers = []
otherMarkers = []
altSeqMarkers = []
bprop = False
#Helper functions
def delete(objId, sTime):
"""
Delete rows from the temporary SQL table which are no longer required.
This speeds up future queries.
"""
margin = 0
cmd = 'DELETE FROM marker WHERE objectId = "{}" AND endTime < {}'.format(objId, sTime - margin)
#cmd = 'DELETE FROM marker WHERE endTime < {}'.format(sTime - margin)
self.db.execute(cmd)
def getLayerName(mlist):
"""
Get layer names from layer marker list.
"""
layers = []
assert(type(mlist) == list)
for m in mlist:
assert("layer:" in m)
l = m.split(":")[1]
layers.append(l)
return layers
def getSeqId(mlist):
"""
Get sequence ids from seq / alt seq marker list.
"""
ids = []
assert(type(mlist) == list)
for m in mlist:
assert(", seq = " in m)
seq = int(m.split("=")[1])
ids.append(seq)
#Remove duplicates
ids = list(set(ids))
ids.sort()
return ids
def seqcompare(elem):
"""
Sorting function for sequence markers
"""
assert (", seq = " in elem)
#sort by sequence id and then the string
l = elem.split(" = ")
return l[1] + l[0]
def prune(mlist):
"""
Remove markers that have the same seqId and similar strings.
This function works on a sorted sequence.
"""
assert (type(mlist) == list)
assert (len(mlist))
a = mlist[0:1]
for i in range(1,len(mlist)):
m = mlist[i]
pm = mlist[i-1]
name,seq = m.split(",")
pname,pseq = pm.split(",")
similar = (name in pname) or (pname in name)
if (seq == pseq) and similar:
continue
else:
a.append(m)
return a
def filterTrace(mlist):
"""
Filter trace markers to remove certain file names.
"""
assert (type(mlist) == list)
if len(mlist) == 0:
return mlist
mlist = mlist[-1] #The last stack trace will be a superset.
mlist = eval(mlist)
mlist = mlist['traceMarker']
assert (type(mlist) == list)
mlist = list(filter(lambda x : "/torch/nn/modules/" not in x, mlist))
mlist = list(filter(lambda x : "/torch/nn/functional.py" not in x, mlist))
mlist = list(filter(lambda x : "/torch/tensor.py" not in x, mlist))
mlist = list(filter(lambda x : "/torch/autograd/__init__.py" not in x, mlist))
mlist = list(filter(lambda x : "/torch/_jit_internal.py" not in x, mlist))
mlist = list(filter(lambda x : "/pyprof/nvtx/nvmarker.py" not in x, mlist))
mlist = list(filter(lambda x : "/apex/optimizers/" not in x, mlist))
mlist = list(filter(lambda x : "/torch/_utils.py" not in x, mlist))
mlist = list(filter(lambda x : "/torch/optim/" not in x, mlist))
return mlist
#Find all encapsulating markers
cmd = 'SELECT id,name from marker where \
objectId = "{}" and \
startTime < {} and \
endTime > {} \
ORDER BY startTime ASC'.format(objId, startTime, endTime)
result = self.db.select(cmd)
#Bin markers into different lists
for r in result:
m = self.getString(r['name'])
#Hack: If it's a known gradient checkpointing marker, ignore it.
if m.find("CheckpointFunctionBackward") >= 0:
continue
if ("_backward, seq =" in m) or ("Backward, seq =" in m) or ("Backward0, seq =" in m):
bprop = True
if ("mod" in m) and ("op" in m) and ("args" in m) and ("type" in m):
pyprofMarkers.append(m)
elif ("layer:" in m):
layerMarkers.append(m)
elif ("traceMarker" in m):
traceMarkers.append(m)
elif ("strRepr" in m):
reprMarkers.append(m)
elif (", seq = " in m):
seqMarkers.append(m)
else:
otherMarkers.append(m)
#Remove duplicates, sort and prune seqMarkers
if (len(seqMarkers)):
seqMarkers = list(set(seqMarkers))
seqMarkers.sort(key=seqcompare)
seqMarkers = prune(seqMarkers)
#Remove duplicates from otherMarkers
otherMarkers = list(set(otherMarkers))
#Get markers with seq id (inserted by PyTorch) from the previous kernel to the present kernel
#Only for fprop kernels
if (len(result) and not bprop):
loId = self.markerId
hiId = result[-1]['id']
self.markerId = hiId
#Get markers between loId and hiId
cmd = 'SELECT id,name from marker where objectId = "{}" and id > {} and id < {} ORDER BY startTime ASC'.format(objId, loId, hiId)
result1 = self.db.select(cmd)
for r in result1:
m = self.getString(r['name'])
#Get only markers with seq id
if (", seq=" in m):
altSeqMarkers.append(m)
#Remove duplicates, sort and prune altSeqMarkers
if (len(altSeqMarkers)):
altSeqMarkers = list(set(altSeqMarkers))
altSeqMarkers.sort(key=seqcompare)
altSeqMarkers = prune(altSeqMarkers)
delete(objId, startTime)
return layerMarkers, filterTrace(traceMarkers), reprMarkers, pyprofMarkers, seqMarkers, otherMarkers, altSeqMarkers, getSeqId(seqMarkers), getSeqId(altSeqMarkers), getLayerName(layerMarkers)
#!/usr/bin/env python3
"""
Parse the SQL db and print a dictionary for every kernel.
"""
import sys
import argparse
from tqdm import tqdm
from .db import DB
from .kernel import Kernel
from .nvvp import NVVP
def parseArgs():
parser = argparse.ArgumentParser(prog=sys.argv[0], description="Parse SQL (nvvp) db.")
parser.add_argument("file",
type=str,
default=None,
help="SQL db (nvvp) file.")
args = parser.parse_args()
return args
def main():
args = parseArgs()
db = DB(args.file)
nvvp = NVVP(db)
kInfo = nvvp.getKernelInfo()
if len(kInfo) == 0:
print("Found 0 kernels. Exiting.", file=sys.stderr)
db.close()
sys.exit(0)
else:
print("Found {} kernels. Getting info for each kernel.".format(len(kInfo)), file=sys.stderr)
nvvp.createMarkerTable()
prevSeqId = -1
prevSubSeqId = -1
prevOp = "na"
Kernel.profStart = nvvp.getProfileStart()
for i in tqdm(range(len(kInfo)), ascii=True):
info = kInfo[i]
k = Kernel()
#Set kernel info
k.setKernelInfo(info)
#Get, set kernel name
name = nvvp.getString(k.kNameId)
k.setKernelName(name)
#Get runtime info
info = nvvp.getCPUInfo(k.corrId)
k.setRunTimeInfo(info)
#Get and set marker and seqid info
info = nvvp.getMarkerInfo(k.objId, k.rStartTime, k.rEndTime)
k.setMarkerInfo(info)
#If the seqId contains both 0 and non zero integers, remove 0.
if any(seq != 0 for seq in k.seqId) and (0 in k.seqId):
k.seqId.remove(0)
#Set direction (it uses seq id)
k.setDirection()
#Set op
k.setOp()
#The following code is based on heuristics.
#TODO: Refactor.
#Assign subSeqId, adjust seqId and altSeqId
#seqId can be 0.
#A kernel can have multiple seqIds both in fprop and bprop.
#In bprop, seqIds might not decrease monotonically. I have observed a few blips.
if len(k.seqId):
assert (k.dir in ["fprop", "bprop"])
if (k.dir == "fprop"):
#Check if there is a sequence id larger than the previous
inc = (k.seqId[-1] > prevSeqId)
if inc:
currSeqId = [x for x in k.seqId if x > prevSeqId][0]
else:
currSeqId = prevSeqId
else:
currSeqId = k.seqId[0]
#if ((currSeqId == prevSeqId) and (k.op == prevOp)):
if ((currSeqId == prevSeqId) and (k.op == prevOp)) or ((k.op[0] == "forward") and (k.op == prevOp) and (k.mod[0] in ["LSTMCell", "GRUCell", "RNNCell"])):
#The second condition traps cases where PyTorch does not use cuDNN for an LSTMCell.
k.subSeqId = prevSubSeqId + 1
prevSeqId = currSeqId
prevSubSeqId = k.subSeqId
prevOp = k.op
#Keep currSeqId in k.seqId, move everything else to k.altSeqId
for s in list(k.seqId): #iterate over a copy; the list is mutated below
if s != currSeqId:
k.seqId.remove(s)
k.altSeqId.append(s)
for s in list(k.altSeqId): #iterate over a copy; the list is mutated below
if s == currSeqId:
k.altSeqId.remove(s)
k.altSeqId = list(set(k.altSeqId))
if (len(k.altSeqId)):
(k.altSeqId).sort()
k.print()
db.close()
if __name__ == '__main__':
main()
import warnings
try:
from .prof import main
except ImportError as e:
warnings.warn("Did you make sure to install PyProf dependencies by using the --pyprof flag during Apex installation?")
raise e
if __name__ == '__main__':
main()
from collections import OrderedDict
from .utility import Utility
from .base import OperatorLayerBase
class Activation(OperatorLayerBase):
"""
This class handles the various activation functions.
"""
ops = ["celu", "elu", "elu_", "hardshrink", "hardtanh", "hardtanh_", "leaky_relu", "leaky_relu_", "logsigmoid", "prelu", "relu", "relu_", "relu6", "rrelu", "rrelu_", "selu", "sigmoid", "softplus", "softshrink", "softsign", "tanh", "tanhshrink", "threshold", "threshold_"]
def __init__(self, d):
marker = eval(d.argMarker[0])
mod = marker['mod']
op = marker['op']
args = marker['args']
self.marker = marker
self.mod_ = mod
self.op_ = op
self.args = args
assert (mod in ["torch.nn.functional", "torch", "Tensor"])
#Filter out named parameters
args = list(filter(lambda x : x['name'] == '', args))
assert (len(args) >= 1)
arg = args[0]
assert (arg['type'] == "tensor")
self.i = arg
self.dir = d.dir
def params(self):
p = OrderedDict([('T', self.i['shape']),('type', self.i['dtype'])])
return p
def flops(self):
direction = self.dir
tensor = self.i['shape']
t = self.i['dtype']
# TODO: revise
elems = Utility.numElems(tensor)
return elems
def bytes(self):
direction = self.dir
tensor = self.i['shape']
t = self.i['dtype']
elems = Utility.numElems(tensor)
elems = elems * (2 if direction == "fprop" else 3)
return elems * Utility.typeToBytes(t)
def tc(self):
return "-"
def op(self):
return self.op_
def mod(self):
return self.mod_
from abc import ABC, abstractmethod
class OperatorLayerBase(ABC):
"""
Base class for all layers and operators.
Every derived class should have the following functions.
"""
@abstractmethod
def tc(self):
"""
Tensor core usage by the kernel.
Return "1" (yes), "0" (no, but possible), "-" (not applicable)
"""
pass
@abstractmethod
def params(self):
"""
Kernel parameters to be printed.
"""
pass
@abstractmethod
def flops(self):
"""
Note that 1 FMA = 2 flops.
"""
pass
@abstractmethod
def bytes(self):
pass
@abstractmethod
def mod(self):
"""
Name of the module/class e.g. torch.nn.functional.
"""
pass
@abstractmethod
def op(self):
"""
Name of the operator e.g. sigmoid.
"""
pass