Unverified commit 96850dfa authored by Jithun Nair, committed by GitHub

Merge pull request #80 from ROCmSoftwarePlatform/IFU-master-2022-07-29

IFU-master-2022-07-29
parents 87fc4125 cc5f83b5
#!/usr/bin/env python3
import torch
import torch.cuda.profiler as profiler
from apex import pyprof
#Initialize pyprof
pyprof.nvtx.init()
class Foo(torch.autograd.Function):
@staticmethod
def forward(ctx, in1, in2):
out = in1 + in2 #This could be a custom C/C++ function.
return out
@staticmethod
def backward(ctx, grad):
in1_grad = grad #This could be a custom C/C++ function.
in2_grad = grad #This could be a custom C/C++ function.
return in1_grad, in2_grad
#Hook the forward and backward functions to pyprof
pyprof.nvtx.wrap(Foo, 'forward')
pyprof.nvtx.wrap(Foo, 'backward')
foo = Foo.apply
x = torch.ones(4,4).cuda()
y = torch.ones(4,4).cuda()
with torch.autograd.profiler.emit_nvtx():
profiler.start()
z = foo(x,y)
profiler.stop()
#!/usr/bin/env python3
import torch
import torch.cuda.profiler as profiler
from apex import pyprof
pyprof.nvtx.init()
class Foo(torch.nn.Module):
def __init__(self, size):
super(Foo, self).__init__()
self.n = torch.nn.Parameter(torch.ones(size))
self.m = torch.nn.Parameter(torch.ones(size))
def forward(self, input):
return self.n*input + self.m
#Hook the forward function to pyprof
pyprof.nvtx.wrap(Foo, 'forward')
foo = Foo(4)
foo.cuda()
x = torch.ones(4).cuda()
with torch.autograd.profiler.emit_nvtx():
profiler.start()
z = foo(x)
profiler.stop()
#!/bin/bash
set -e
SCRIPT=`realpath $0`
SCRIPTPATH=`dirname $SCRIPT`
PYPROF="$SCRIPTPATH/../.."
parse="python $PYPROF/parse/parse.py"
prof="python $PYPROF/prof/prof.py"
for f in *.py
do
base=`basename $f .py`
sql=$base.sql
dict=$base.dict
#NVprof
echo "nvprof -fo $sql python $f"
nvprof -fo $sql python $f
#Parse
echo $parse $sql
$parse $sql > $dict
#Prof
echo $prof $dict
$prof -w 130 $dict
\rm $sql $dict
done
#!/usr/bin/env python3
"""
Example to run pyprof with imagenet models.
"""
import sys
import torch
import torch.nn as nn
import torchvision.models as models
import torch.cuda.profiler as profiler
import argparse
from apex import pyprof
from apex.optimizers import FusedAdam
def parseArgs():
parser = argparse.ArgumentParser(prog=sys.argv[0], description="Run popular imagenet models.")
parser.add_argument("-m",
type=str,
default="resnet50",
choices=["alexnet", "densenet121", "densenet161", "densenet169", "densenet201", "googlenet", "mnasnet0_5", "mnasnet0_75", "mnasnet1_0", "mnasnet1_3", "mobilenet_v2", "resnet18", "resnet34", "resnet50", "resnet101", "resnet152", "resnext50_32x4d", "resnext101_32x8d", "wide_resnet50_2", "wide_resnet101_2", "shufflenet_v2_x0_5", "shufflenet_v2_x1_0", "shufflenet_v2_x1_5", "shufflenet_v2_x2_0", "squeezenet1_0", "squeezenet1_1", "vgg11", "vgg11_bn", "vgg13", "vgg13_bn", "vgg16", "vgg16_bn", "vgg19", "vgg19_bn", "inception_v3"],
help="Model.")
parser.add_argument("-b",
type=int,
default=32,
help="Batch size.")
parser.add_argument("-o",
type=str,
default="adam",
choices=["adam", "sgd"],
help="Optimizer.")
args = parser.parse_args()
return args
d = {
"alexnet": {'H': 224, 'W': 224, 'opts': {}},
"densenet121": {'H': 224, 'W': 224, 'opts': {}},
"densenet161": {'H': 224, 'W': 224, 'opts': {}},
"densenet169": {'H': 224, 'W': 224, 'opts': {}},
"densenet201": {'H': 224, 'W': 224, 'opts': {}},
"googlenet": {'H': 224, 'W': 224, 'opts': {'aux_logits': False}},
"mnasnet0_5": {'H': 224, 'W': 224, 'opts': {}},
"mnasnet0_75": {'H': 224, 'W': 224, 'opts': {}},
"mnasnet1_0": {'H': 224, 'W': 224, 'opts': {}},
"mnasnet1_3": {'H': 224, 'W': 224, 'opts': {}},
"mobilenet_v2": {'H': 224, 'W': 224, 'opts': {}},
"resnet18": {'H': 224, 'W': 224, 'opts': {}},
"resnet34": {'H': 224, 'W': 224, 'opts': {}},
"resnet50": {'H': 224, 'W': 224, 'opts': {}},
"resnet101": {'H': 224, 'W': 224, 'opts': {}},
"resnet152": {'H': 224, 'W': 224, 'opts': {}},
"resnext50_32x4d": {'H': 224, 'W': 224, 'opts': {}},
"resnext101_32x8d": {'H': 224, 'W': 224, 'opts': {}},
"wide_resnet50_2": {'H': 224, 'W': 224, 'opts': {}},
"wide_resnet101_2": {'H': 224, 'W': 224, 'opts': {}},
"shufflenet_v2_x0_5": {'H': 224, 'W': 224, 'opts': {}},
"shufflenet_v2_x1_0": {'H': 224, 'W': 224, 'opts': {}},
"shufflenet_v2_x1_5": {'H': 224, 'W': 224, 'opts': {}},
"shufflenet_v2_x2_0": {'H': 224, 'W': 224, 'opts': {}},
"squeezenet1_0": {'H': 224, 'W': 224, 'opts': {}},
"squeezenet1_1": {'H': 224, 'W': 224, 'opts': {}},
"vgg11": {'H': 224, 'W': 224, 'opts': {}},
"vgg11_bn": {'H': 224, 'W': 224, 'opts': {}},
"vgg13": {'H': 224, 'W': 224, 'opts': {}},
"vgg13_bn": {'H': 224, 'W': 224, 'opts': {}},
"vgg16": {'H': 224, 'W': 224, 'opts': {}},
"vgg16_bn": {'H': 224, 'W': 224, 'opts': {}},
"vgg19": {'H': 224, 'W': 224, 'opts': {}},
"vgg19_bn": {'H': 224, 'W': 224, 'opts': {}},
"inception_v3": {'H': 299, 'W': 299, 'opts': {'aux_logits': False}},
}
def main():
args = parseArgs()
pyprof.nvtx.init()
# pyprof.nvtx.wrap(fused_adam_cuda, 'adam')
N = args.b
C = 3
H = d[args.m]['H']
W = d[args.m]['W']
opts = d[args.m]['opts']
classes = 1000
net = getattr(models, args.m)
net = net(**opts).cuda().half()
net.train()
x = torch.rand(N, C, H, W).cuda().half()
target = torch.empty(N, dtype=torch.long).random_(classes).cuda()
criterion = nn.CrossEntropyLoss().cuda()
if (args.o == "sgd"):
optimizer = torch.optim.SGD(net.parameters(), lr = 0.01, momentum=0.9)
elif (args.o == "adam"):
optimizer = FusedAdam(net.parameters())
else:
assert False
#Warm up without profiler
for i in range(2):
output = net(x)
loss = criterion(output, target)
optimizer.zero_grad()
loss.backward()
optimizer.step()
with torch.autograd.profiler.emit_nvtx():
profiler.start()
output = net(x)
loss = criterion(output, target)
optimizer.zero_grad()
loss.backward()
optimizer.step()
profiler.stop()
if __name__ == "__main__":
main()
#!/bin/bash
set -e
SCRIPT=`realpath $0`
SCRIPTPATH=`dirname $SCRIPT`
PYPROF="$SCRIPTPATH/../.."
parse="python -m apex.pyprof.parse"
prof="python -m apex.pyprof.prof"
for net in "resnet50"
do
for optim in adam sgd
do
for batch in 32 64
do
base="torchvision".$net.$optim.$batch
sql=$base.sql
dict=$base.dict
#NVprof
echo "nvprof -fo $sql --profile-from-start off python imagenet.py -m ${net} -o $optim -b $batch"
nvprof -fo $sql --profile-from-start off python imagenet.py -m ${net} -o $optim -b $batch
#Parse
echo $parse $sql
$parse $sql > $dict
#Prof
echo $prof $dict
$prof -w 130 $dict
# \rm $sql $dict
done
done
done
*As of this writing, these examples do not work
because of changes being proposed in PyTorch.*
There are two ways to use PyTorch JIT:
- Scripting
- Tracing

In addition, we can JIT a
- Standalone function
- Class / class method

This directory has an example for each of the four cases.
Intercepting (monkey patching) JITted code has a few extra steps,
which are explained through comments.
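For orientation, here is a condensed sketch of those extra steps for the scripted-function case (subject to the same caveat above); the four complete, runnable examples follow below.

```python
import torch
from apex import pyprof

# JIT the function first (scripting shown here).
@torch.jit.script
def foo(x, y):
    return torch.sigmoid(x) + y

# The extra steps: initialize pyprof only after the JIT step, give the
# JITted object a name, and then hook up its forward method.
pyprof.nvtx.init()
foo.__name__ = "foo"
pyprof.nvtx.wrap(foo, 'forward')
```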
#!/usr/bin/env python3
import torch
import torch.cuda.profiler as profiler
from apex import pyprof
#The following creates an object "foo" of type ScriptModule
#The new object has a function called "forward"
@torch.jit.script
def foo(x, y):
return torch.sigmoid(x) + y
#Initialize pyprof after the JIT step
pyprof.nvtx.init()
#Assign a name to the object "foo"
foo.__name__ = "foo"
#Hook up the forward function to pyprof
pyprof.nvtx.wrap(foo, 'forward')
x = torch.zeros(4,4).cuda()
y = torch.ones(4,4).cuda()
with torch.autograd.profiler.emit_nvtx():
profiler.start()
z = foo(x, y)
profiler.stop()
print(z)
#!/usr/bin/env python3
import torch
import torch.cuda.profiler as profiler
from apex import pyprof
class Foo(torch.jit.ScriptModule):
def __init__(self, size):
super(Foo, self).__init__()
self.n = torch.nn.Parameter(torch.ones(size))
self.m = torch.nn.Parameter(torch.ones(size))
@torch.jit.script_method
def forward(self, input):
return self.n*input + self.m
#Initialize pyprof after the JIT step
pyprof.nvtx.init()
#Hook up the forward function to pyprof
pyprof.nvtx.wrap(Foo, 'forward')
foo = Foo(4)
foo.cuda()
x = torch.ones(4).cuda()
with torch.autograd.profiler.emit_nvtx():
profiler.start()
z = foo(x)
profiler.stop()
print(z)
#!/usr/bin/env python3
import torch
import torch.cuda.profiler as profiler
from apex import pyprof
def foo(x, y):
return torch.sigmoid(x) + y
x = torch.zeros(4,4).cuda()
y = torch.ones(4,4).cuda()
#JIT the function using tracing
#This returns an object of type ScriptModule with a forward method.
traced_foo = torch.jit.trace(foo, (x,y))
#Initialize pyprof after the JIT step
pyprof.nvtx.init()
#Assign a name to the object "traced_foo"
traced_foo.__dict__['__name__'] = "foo"
#Hook up the forward function to pyprof
pyprof.nvtx.wrap(traced_foo, 'forward')
with torch.autograd.profiler.emit_nvtx():
profiler.start()
z = traced_foo(x, y)
profiler.stop()
print(z)
#!/usr/bin/env python3
import torch
import torch.cuda.profiler as profiler
from apex import pyprof
class Foo(torch.nn.Module):
def __init__(self, size):
super(Foo, self).__init__()
self.n = torch.nn.Parameter(torch.ones(size))
self.m = torch.nn.Parameter(torch.ones(size))
def forward(self, input):
return self.n*input + self.m
foo = Foo(4)
foo.cuda()
x = torch.ones(4).cuda()
#JIT the class using tracing
traced_foo = torch.jit.trace(foo, x)
#Initialize pyprof after the JIT step
pyprof.nvtx.init()
#Assign a name to the object "traced_foo"
traced_foo.__dict__['__name__'] = "foo"
#Hook up the forward function to pyprof
pyprof.nvtx.wrap(traced_foo, 'forward')
with torch.autograd.profiler.emit_nvtx():
profiler.start()
z = traced_foo(x)
profiler.stop()
print(z)
#!/bin/bash
set -e
SCRIPT=`realpath $0`
SCRIPTPATH=`dirname $SCRIPT`
PYPROF="$SCRIPTPATH/../.."
parse="python $PYPROF/parse/parse.py"
prof="python $PYPROF/prof/prof.py"
for f in *.py
do
base=`basename $f .py`
sql=$base.sql
dict=$base.dict
#NVprof
echo "nvprof -fo $sql python $f"
nvprof -fo $sql python $f
#Parse
echo $parse $sql
$parse $sql > $dict
#Prof
echo $prof $dict
$prof -w 130 $dict
\rm $sql $dict
done
#!/usr/bin/env python3
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.cuda.profiler as profiler
import torch.optim as optim
from apex import pyprof
pyprof.nvtx.init()
class LeNet5(nn.Module):
def __init__(self):
super(LeNet5, self).__init__()
# 1 input image channel, 6 output channels, 5x5 square convolution
# kernel
self.conv1 = nn.Conv2d(1, 6, 5)
self.conv2 = nn.Conv2d(6, 16, 5)
# an affine operation: y = Wx + b
self.fc1 = nn.Linear(16 * 5 * 5, 120)
self.fc2 = nn.Linear(120, 84)
self.fc3 = nn.Linear(84, 10)
def forward(self, x):
# Max pooling over a (2, 2) window
x = F.max_pool2d(F.relu(self.conv1(x)), (2, 2))
# If the size is a square you can only specify a single number
x = F.max_pool2d(F.relu(self.conv2(x)), 2)
x = x.view(-1, self.num_flat_features(x))
x = F.relu(self.fc1(x))
x = F.relu(self.fc2(x))
x = self.fc3(x)
return x
def num_flat_features(self, x):
size = x.size()[1:] # all dimensions except the batch dimension
num_features = 1
for s in size:
num_features *= s
return num_features
with torch.autograd.profiler.emit_nvtx():
net = LeNet5().cuda()
input = torch.randn(1, 1, 32, 32).cuda()
out = net(input)
target = torch.randn(10) # a dummy target, for example
target = target.view(1, -1).cuda() # make it the same shape as output
criterion = nn.MSELoss()
# create your optimizer
optimizer = optim.SGD(net.parameters(), lr=0.01)
# in your training loop:
optimizer.zero_grad() # zero the gradient buffers
profiler.start()
output = net(input)
loss = criterion(output, target)
loss.backward()
optimizer.step() # Does the update
profiler.stop()
#!/usr/bin/env python3
"""
This file checks all Python operators.
"""
import sys
import torch
import torch.cuda.profiler as profiler
import operator
import inspect
#Import and initialize pyprof
from apex import pyprof
pyprof.nvtx.init()
X = 1024
Y = 1024
fa = torch.rand(X, Y).cuda()
fb = torch.rand(X, Y).cuda()
fc = torch.rand(X, Y).cuda()
ia = torch.randint(0, 100, (X, Y)).cuda()
ib = torch.randint(0, 100, (X, Y)).cuda()
sa = torch.ones(1,1).cuda()
sb = torch.ones(1,1).cuda()
ba = fa.byte()
unaryOps = ["abs", "__abs__", "neg", "__neg__",]
invertOps = ["inv", "invert", "__inv__", "__invert__",] #implemented only for byte tensors
#pos, __pos__ is not implemented for tensors
binaryOps = []
binaryOps += [ "lt", "__lt__", "le", "__le__", "eq", "__eq__", "ne", "__ne__", "ge", "__ge__", "gt", "__gt__" ]
binaryOps += [ "add", "__add__", "sub", "__sub__", "mul", "__mul__", "floordiv", "__floordiv__", "truediv", "__truediv__", "pow", "__pow__", "mod", "__mod__"]
binaryOps += [ "and_", "__and__", "or_", "__or__", "xor", "__xor__", "lshift", "__lshift__", "rshift", "__rshift__"]
inplaceOps = []
inplaceOps += ["iadd", "__iadd__", "isub", "__isub__", "imul", "__imul__", "ifloordiv", "__ifloordiv__", "itruediv", "__itruediv__", "imod", "__imod__",]
#ipow, __ipow__ is not implemented in pytorch
inplaceOps += [ "iand", "__iand__", "ior", "__ior__", "ixor", "__ixor__", "ilshift", "__ilshift__", "irshift", "__irshift__",]
matmulOps = [ "matmul", "__matmul__" ]
inplacematmulOps = [ "imatmul", "__imatmul__" ]
reverseIntBinaryOps = ["__radd__", "__rsub__", "__rmul__", "__rfloordiv__", "__rpow__",]
reverseFloatBinaryOps = ["__radd__", "__rsub__", "__rmul__", "__rdiv__", "__rtruediv__", "__rfloordiv__", "__rpow__",]
'''
TODO
.concat(a, b)
.__concat__(a, b)
.contains(a, b)
.__contains__(a, b)
.countOf(a, b)
.delitem(a, b)
.__delitem__(a, b)
.getitem(a, b)
.__getitem__(a, b)
.indexOf(a, b)
.setitem(a, b, c)
.__setitem__(a, b, c)
.length_hint(obj, default=0)
.iconcat(a, b)
.__iconcat__(a, b)
.index(a)
.__index__(a)
'''
#Context manager
with torch.autograd.profiler.emit_nvtx():
#Start profiler
profiler.start()
for op in unaryOps:
assert hasattr(operator, op)
f = getattr(operator, op)
assert inspect.isbuiltin(f)
c = f(ia)
for op in invertOps:
assert hasattr(operator, op)
f = getattr(operator, op)
assert inspect.isbuiltin(f)
c = f(ba)
for op in binaryOps:
assert hasattr(operator, op)
f = getattr(operator, op)
assert inspect.isbuiltin(f)
c = f(ia, ib)
c = f(ia, 2)
for op in inplaceOps:
assert hasattr(operator, op)
f = getattr(operator, op)
assert inspect.isbuiltin(f)
ia = f(ia, ib)
ia = f(ia, 2)
for op in matmulOps:
assert hasattr(operator, op)
f = getattr(operator, op)
assert inspect.isbuiltin(f)
c = f(fa, fb)
for op in inplacematmulOps:
assert hasattr(operator, op)
f = getattr(operator, op)
assert inspect.isbuiltin(f)
fa = f(fa, fb)
for op in reverseIntBinaryOps:
assert hasattr(torch.Tensor, op)
f = getattr(torch.Tensor, op)
ia = f(ia, ib)
for op in reverseFloatBinaryOps:
assert hasattr(torch.Tensor, op)
f = getattr(torch.Tensor, op)
fa = f(fa, fb)
'''
#c = fa[3]
#c = fa[3][3]
#c = torch.min(fa, 3)
c = torch.sum(fa)
c = torch.max(fa)
c = -fa
#fc[2][2] = fa[2][2]
c = a_scalar and b_scalar
c = a_scalar or b_scalar
c = not a_scalar
c = a is b
c = a is not b
'''
#Stop profiler
profiler.stop()
#!/usr/bin/env python3
"""
This simple file provides an example of how to
- import the pyprof library and initialize it
- use the emit_nvtx context manager
- start and stop the profiler
Only kernels within profiler.start and profiler.stop calls are profiled.
To profile
$ nvprof -f -o simple.sql --profile-from-start off ./simple.py
"""
import sys
import torch
import torch.cuda.profiler as profiler
#Import and initialize pyprof
from apex import pyprof
pyprof.nvtx.init()
a = torch.randn(5, 5).cuda()
b = torch.randn(5, 5).cuda()
#Context manager
with torch.autograd.profiler.emit_nvtx():
#Start profiler
profiler.start()
c = a + b
c = torch.mul(a,b)
c = torch.matmul(a,b)
c = torch.argmax(a, dim=1)
c = torch.nn.functional.pad(a, (1,1))
#Stop profiler
profiler.stop()
NVIDIA NVTX range markers (https://docs.nvidia.com/gameworks/content/gameworkslibrary/nvtx/nvidia_tools_extension_library_nvtx.htm)
are a useful tool for capturing and observing events and code ranges.
Using the PyTorch APIs, e.g. `torch.cuda.nvtx.range_push("xxx")` and `torch.cuda.nvtx.range_pop()`, users can easily add their own NVTX range markers. These markers can then be observed in the NVIDIA Visual Profiler (NVVP).
If the inserted marker strings follow the pattern `"layer:your_string_here"`, e.g. `"layer:conv1"` or `"layer:encoder_layer_3_self_attention"`, then `pyprof` will display the strings `conv1` and `encoder_layer_3_self_attention` next to the associated kernels in the output of `prof.py` when it is used with the `-c layer` option.
NVTX range markers can be nested, and if users follow the above string pattern, the output of `prof.py` will show all the markers associated with a kernel.
The file `resnet.py` (a simplified version of the torchvision model) shows how users can add (nested) NVTX markers with information that can greatly aid in understanding and analyzing networks.
Note that the pattern `"layer:your_string_here"` was chosen to aid information extraction by `pyprof`. The tool works seamlessly even if there are other markers or no markers at all.
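As an illustration, here is a minimal sketch of nested `"layer:..."` markers around a sub-layer; the module and layer names are placeholders, not part of `resnet.py`:

```python
import torch.nn as nn
import torch.cuda.nvtx as nvtx

class Block(nn.Module):  # placeholder module, for illustration only
    def __init__(self):
        super(Block, self).__init__()
        self.conv1 = nn.Conv2d(3, 8, kernel_size=3, padding=1)

    def forward(self, x):
        nvtx.range_push("layer:Block_1")   # outer marker for the whole block
        nvtx.range_push("layer:Conv1")     # inner marker for one sub-layer
        out = self.conv1(x)                # kernels launched here carry both labels
        nvtx.range_pop()                   # pop "layer:Conv1"
        nvtx.range_pop()                   # pop "layer:Block_1"
        return out
```

With `prof.py -c layer`, kernels launched inside `self.conv1` should then appear with both the `Block_1` and `Conv1` labels.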
### To run
```sh
nvprof -fo resnet.sql --profile-from-start off python resnet.py
parse.py resnet.sql > resnet.dict
prof.py --csv -c idx,layer,dir,mod,op,kernel,params,sil resnet.dict
```
The file `resnet.sql` can also be opened with NVVP as usual.
#!/usr/bin/env python3
"""
An example showing use of nested NVTX markers.
"""
import torch
import torch.nn as nn
import torch.cuda.profiler as profiler
import torch.cuda.nvtx as nvtx
from apex import pyprof
pyprof.nvtx.init()
def conv3x3(in_planes, out_planes, stride=1, groups=1, dilation=1):
"""3x3 convolution with padding"""
return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
padding=dilation, groups=groups, bias=False, dilation=dilation)
def conv1x1(in_planes, out_planes, stride=1):
"""1x1 convolution"""
return nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False)
class Bottleneck(nn.Module):
expansion = 4
count = 1
def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1,
base_width=64, dilation=1, norm_layer=None):
super(Bottleneck, self).__init__()
if norm_layer is None:
norm_layer = nn.BatchNorm2d
width = int(planes * (base_width / 64.)) * groups
# Both self.conv2 and self.downsample layers downsample the input when stride != 1
self.conv1 = conv1x1(inplanes, width)
self.bn1 = norm_layer(width)
self.conv2 = conv3x3(width, width, stride, groups, dilation)
self.bn2 = norm_layer(width)
self.conv3 = conv1x1(width, planes * self.expansion)
self.bn3 = norm_layer(planes * self.expansion)
self.relu = nn.ReLU(inplace=True)
self.downsample = downsample
self.stride = stride
self.id = Bottleneck.count
Bottleneck.count += 1
def forward(self, x):
identity = x
nvtx.range_push("layer:Bottleneck_{}".format(self.id))
nvtx.range_push("layer:Conv1")
out = self.conv1(x)
nvtx.range_pop()
nvtx.range_push("layer:BN1")
out = self.bn1(out)
nvtx.range_pop()
nvtx.range_push("layer:ReLU")
out = self.relu(out)
nvtx.range_pop()
nvtx.range_push("layer:Conv2")
out = self.conv2(out)
nvtx.range_pop()
nvtx.range_push("layer:BN2")
out = self.bn2(out)
nvtx.range_pop()
nvtx.range_push("layer:ReLU")
out = self.relu(out)
nvtx.range_pop()
nvtx.range_push("layer:Conv3")
out = self.conv3(out)
nvtx.range_pop()
nvtx.range_push("layer:BN3")
out = self.bn3(out)
nvtx.range_pop()
if self.downsample is not None:
nvtx.range_push("layer:Downsample")
identity = self.downsample(x)
nvtx.range_pop()
nvtx.range_push("layer:Residual")
out += identity
nvtx.range_pop()
nvtx.range_push("layer:ReLU")
out = self.relu(out)
nvtx.range_pop()
nvtx.range_pop()
return out
class ResNet(nn.Module):
def __init__(self, block, layers, num_classes=1000,
groups=1, width_per_group=64, norm_layer=None):
super(ResNet, self).__init__()
if norm_layer is None:
norm_layer = nn.BatchNorm2d
self._norm_layer = norm_layer
self.inplanes = 64
self.dilation = 1
self.groups = groups
self.base_width = width_per_group
self.conv1 = nn.Conv2d(3, self.inplanes, kernel_size=7, stride=2, padding=3, bias=False)
self.bn1 = norm_layer(self.inplanes)
self.relu = nn.ReLU(inplace=True)
self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
self.layer1 = self._make_layer(block, 64, layers[0])
self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
self.layer4 = self._make_layer(block, 512, layers[3], stride=2)
self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
self.fc = nn.Linear(512 * block.expansion, num_classes)
for m in self.modules():
if isinstance(m, nn.Conv2d):
nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)):
nn.init.constant_(m.weight, 1)
nn.init.constant_(m.bias, 0)
def _make_layer(self, block, planes, blocks, stride=1, dilate=False):
norm_layer = self._norm_layer
downsample = None
previous_dilation = self.dilation
if dilate:
self.dilation *= stride
stride = 1
if stride != 1 or self.inplanes != planes * block.expansion:
downsample = nn.Sequential(
conv1x1(self.inplanes, planes * block.expansion, stride),
norm_layer(planes * block.expansion),
)
layers = []
layers.append(block(self.inplanes, planes, stride, downsample, self.groups,
self.base_width, previous_dilation, norm_layer))
self.inplanes = planes * block.expansion
for _ in range(1, blocks):
layers.append(block(self.inplanes, planes, groups=self.groups,
base_width=self.base_width, dilation=self.dilation,
norm_layer=norm_layer))
return nn.Sequential(*layers)
def forward(self, x):
nvtx.range_push("layer:conv1_x")
x = self.conv1(x)
x = self.bn1(x)
x = self.relu(x)
x = self.maxpool(x)
nvtx.range_pop()
nvtx.range_push("layer:conv2_x")
x = self.layer1(x)
nvtx.range_pop()
nvtx.range_push("layer:conv3_x")
x = self.layer2(x)
nvtx.range_pop()
nvtx.range_push("layer:conv4_x")
x = self.layer3(x)
nvtx.range_pop()
nvtx.range_push("layer:conv5_x")
x = self.layer4(x)
nvtx.range_pop()
x = self.avgpool(x)
x = torch.flatten(x, 1)
nvtx.range_push("layer:FC")
x = self.fc(x)
nvtx.range_pop()
return x
def resnet50():
return ResNet(Bottleneck, [3, 4, 6, 3])
#Create model
net = resnet50().cuda().half()
net.train()
#Create optimizer
criterion = nn.CrossEntropyLoss().cuda()
optimizer = torch.optim.SGD(net.parameters(), lr = 0.01, momentum=0.9)
#Create synthetic input and label
x = torch.rand(32, 3, 224, 224).cuda().half()
target = torch.empty(32, dtype=torch.long).random_(1000).cuda()
with torch.autograd.profiler.emit_nvtx():
profiler.start()
output = net(x)
loss = criterion(output, target)
optimizer.zero_grad()
loss.backward()
optimizer.step()
profiler.stop()
#!/bin/bash
set -e
SCRIPT=`realpath $0`
SCRIPTPATH=`dirname $SCRIPT`
PYPROF="$SCRIPTPATH/../.."
parse="python $PYPROF/parse/parse.py"
prof="python $PYPROF/prof/prof.py"
for f in *.py
do
base=`basename $f .py`
sql=$base.sql
dict=$base.dict
#NVprof
echo "nvprof -fo --profile-from-start off $sql python $f"
nvprof -fo $sql --profile-from-start off python $f
#Parse
echo $parse $sql
$parse $sql > $dict
#Prof
echo $prof $dict
#$prof -w 130 $dict
$prof --csv -c idx,layer,dir,mod,op,kernel,params,sil $dict
\rm $sql $dict
done
from .nvmarker import init
from .nvmarker import add_wrapper as wrap
"""
This file intercepts (monkey patches) the following functions and adds NVTX markers.
torch.*
torch.Tensor.*
torch.nn.functional.*
torch.nn.*.forward
The NVTX markers (one or more) contain the following information
call trace (a list of file_name:line_number)
extra_repr() from torch.nn modules
module/class name
function name
inputs (args and kwargs)
scalar: name, type and value
tensor: name, shape and datatype
numpy: name, shape and datatype
list/tuple: a sequence of scalars or tensors or numpy arrays
"""
import torch
import torch.cuda.nvtx as nvtx
import numpy
import inspect as ins
import traceback
import math
def isfunc(mod, f):
assert hasattr(mod, f)
attr = getattr(mod, f)
#Ignore functions like _add
if (len(f) >= 2):
if f[0] == "_" and f[1] != "_":
return False
#Ignore functions from this list
ignore = ['__all__', '__array__', '__array_priority__', '__array_wrap__', '__bool__', '__builtins__', '__cached__', '__class__', '__deepcopy__', '__delattr__', '__delitem__', '__dict__', '__dir__', '__doc__', '__file__', '__format__', '__getattribute__', '__getitem__', '__hash__', '__index__', '__init__', '__init_subclass__', '__iter__', '__len__', '__loader__', '__module__', '__name__', '__new__', '__nonzero__', '__package__', '__path__', '__reduce__', '__reduce_ex__', '__repr__', '__reversed__', '__setattr__', '__setitem__', '__setstate__', '__sizeof__', '__spec__', '__str__', '__subclasshook__', '__version__', '__weakref__']
#Add functions to this list if they cause recursion
ignore += ['size', 'tolist', 'dim', 'is_storage', 'item']
if f in ignore:
return False
return ins.ismethod(attr) or ins.isfunction(attr) or ins.ismethoddescriptor(attr) or ins.isbuiltin(attr)
def traceMarker(stack):
d = {}
cadena = []
for i in range(len(stack)-1):
fi = stack[i]
t = "{}:{}".format(fi.filename, fi.lineno)
cadena.append(t)
d['traceMarker'] = cadena
return str(d)
def modMarker(mod, fn_name, args):
"""
Returns the stringified extra_repr() of a module.
"""
assert(fn_name == 'forward')
assert(len(args) > 0)
d = {}
d['mod'] = mod.__name__
d['strRepr'] = args[0].extra_repr()
return str(d)
def add_wrapper(mod, fn_name):
assert isfunc(mod, fn_name)
# Get a pointer to the original function
func = getattr(mod, fn_name)
# Check if the mod has a string representation
# and is not a Script or Traced module (used by JIT)
s = hasattr(mod, "extra_repr") and (type(mod) is not torch.jit.ScriptModule) and (type(mod) is not torch.jit.TopLevelTracedModule)
def wrapper_func(*args, **kwargs):
# Extract the stacktrace
stack = traceback.extract_stack()
# Push trace marker
nvtx.range_push(traceMarker(stack))
# Push module marker
if s:
m = modMarker(mod, fn_name, args)
nvtx.range_push(m)
# Create and push argument marker
cadena = argMarker(mod, fn_name, args, kwargs)
nvtx.range_push(cadena)
# Call the original function
result = func(*args, **kwargs)
# Pop argument marker
nvtx.range_pop()
# Pop module marker
if s:
nvtx.range_pop()
# Pop trace marker
nvtx.range_pop()
return result
setattr(mod, fn_name, wrapper_func)
def argMarker(mod, op, args, kwargs):
#For this function args is a tuple and kwargs is a dict
def tensor(arg, name=""):
a = {}
a['name'] = name
a['type'] = "tensor"
a['shape'] = tuple(arg.size())
a['dtype'] = str(arg.dtype).split(".")[-1]
cadena['args'].append(a)
def ndarray(arg, name=""):
a = {}
a['name'] = name
a['type'] = "ndarray"
a['shape'] = arg.shape
a['dtype'] = str(arg.dtype).split(".")[-1]
cadena['args'].append(a)
def seq(arg, name=""):
assert issequence(arg)
a = {}
a['name'] = name
if isinstance(arg, list):
a['type'] = "list"
a['value'] = arg
else:
a['type'] = "tuple"
# The arg could be torch.Size, which is a subclass of tuple
# Therefore, explicitly convert to tuple
a['value'] = tuple(arg)
cadena['args'].append(a)
def scalar(arg, name=""):
a = {}
a['name'] = name
a['type'] = type(arg).__name__
#handle the case when the argument is +/- inf or nan
if arg == float('inf'):
a['value'] = "inf"
elif arg == float('-inf'):
a['value'] = "-inf"
elif isinstance(arg, float) and math.isnan(arg):
a['value'] = "nan"
else:
a['value'] = arg
cadena['args'].append(a)
def isscalar(arg):
return (type(arg) is int) or (type(arg) is float) or (type(arg) is bool) or (arg is None) or (type(arg) is str)
def issequence(arg):
return isinstance(arg, list) or isinstance(arg, tuple)
def foo(args, name):
#args should be an iterable sequence e.g. list or tuple
for arg in args:
if isinstance(arg, torch.Tensor):
if arg.dim() == 0:
scalar(arg.item(), name)
else:
tensor(arg, name)
elif isinstance(arg, numpy.ndarray):
ndarray(arg, name)
elif (isscalar(arg)):
scalar(arg, name)
elif issequence(arg):
if (len(arg) == 0) or isscalar(arg[0]): #An empty sequence or a sequence of scalars
seq(arg, name)
else: # A sequence of tensors or numpy arrays
foo(arg, name)
'''
else:
print("The following arg is none of Tensor, numpy array, scalar but a %s" % (str(type(arg))))
print("Mod: %s" % str(mod.__name__))
print("Op: %s" % str(op))
print(dir(arg))
'''
cadena = {}
cadena['mod'] = mod.__name__
cadena['op'] = op
cadena['args'] = []
foo(args, "")
for k,v in kwargs.items():
foo((v,), k)
return str(cadena)
def patchClass(cls):
for f in dir(cls):
if isfunc(cls, f):
add_wrapper(cls, f)
def init():
string = "\n\nPyprof has been moved to its own dedicated repository and will " + \
"soon be removed from Apex. Please visit\n" + \
"https://github.com/NVIDIA/PyProf\n" + \
"for the latest version.\n\n"
# print regardless of warning state
print(string)
print("Initializing NVTX monkey patches")
for cls in [torch, torch.Tensor, torch.nn.functional,]:
patchClass(cls)
for cls in [torch.nn.RNN, torch.nn.RNNCell, torch.nn.LSTM, torch.nn.LSTMCell, torch.nn.GRU, torch.nn.GRUCell]:
if isfunc(cls, 'forward'):
add_wrapper(cls, 'forward')
print("Done with NVTX monkey patching")