Commit aee5aff4 authored by Deyu Fu

Merge branch 'master' into deyuf/fused_optimizer_v2

parents 007c5947 880ab925
import sys
import argparse
def parseArgs():
"""
Print usage and parse arguments.
"""
def check_cols(value):
valid = ["idx", "seq", "altseq", "tid", "layer", "trace", "dir", "sub", "mod", "op", "kernel", "params", "sil", "tc", "device", "stream", "grid", "block", "flops", "bytes"]
cols = value.split(",")
for col in cols:
if col not in valid:
raise argparse.ArgumentTypeError("{} is not a valid column name. Valid column names are {}.".format(col, ",".join(valid)))
return cols
def openFile(f):
try:
d = open(f, "r")
return d
except IOError:
print("Error opening file {}. Exiting.".format(f), file=sys.stderr)
sys.exit(1)
parser = argparse.ArgumentParser(prog=sys.argv[0], description="PyTorch Profiler", formatter_class=argparse.RawTextHelpFormatter)
parser.add_argument("file",
nargs='?',
type=str,
default=None,
help="Output of parse.py (Python dictionary).")
parser.add_argument("-c",
type=check_cols,
default="idx,dir,sub,mod,op,kernel,params,sil",
help='''Comma separated names of columns to print.
idx: Index
seq: PyTorch Sequence Id
altseq: PyTorch Alternate Sequence Id
tid: Thread Id
layer: User annotated NVTX string (can be nested)
trace: Function Call Trace
dir: Direction
sub: Sub Sequence Id
mod: Module
op: Operation
kernel: Kernel Name
params: Parameters
sil: Silicon Time (in ns)
tc: Tensor Core Usage
device: GPU Device Id
stream: Stream Id
grid: Grid Dimensions
block: Block Dimensions
flops: Floating point ops (FMA = 2 FLOPs)
bytes: Number of bytes in and out of DRAM
e.g. -c idx,kernel,sil''')
group = parser.add_mutually_exclusive_group()
group.add_argument("--csv",
action="store_true",
default=False,
help="Print a CSV output.")
group.add_argument("-w",
type=int,
default=0,
help="Width of columnated output.")
args = parser.parse_args()
if args.file is None:
args.file = sys.stdin
else:
args.file = openFile(args.file)
return args
from functools import reduce
class Utility(object):
@staticmethod
def numElems(shape):
assert (type(shape) == tuple)
return reduce(lambda x,y: x*y, shape, 1)
@staticmethod
def typeToBytes(t):
if (t in ["uint8", "int8", "byte", "char"]):
return 1
elif (t in ["float16", "half", "int16", "short"]):
return 2
elif (t in ["float32", "float", "int32", "int"]):
return 4
elif (t in ["int64", "long", "float64", "double"]):
return 8
assert False
@staticmethod
def typeToString(t):
if (t in ["uint8", "byte", "char"]):
return "uint8"
elif (t in ["int8",]):
return "int8"
elif (t in ["int16", "short",]):
return "int16"
elif (t in ["float16", "half"]):
return "fp16"
elif (t in ["float32", "float"]):
return "fp32"
elif (t in ["int32", "int",]):
return "int32"
elif (t in ["int64", "long"]):
return "int64"
elif (t in ["float64", "double",]):
return "fp64"
assert False
@staticmethod
def hasNVTX(marker):
if type(marker) is str:
try:
marker = eval(marker)
except:
return False
if type(marker) is dict:
keys = marker.keys()
return ("mod" in keys) and ("op" in keys) and ("args" in keys)
else:
return False
@staticmethod
def isscalar(t):
return (t in ["float", "int"])
@@ -795,11 +795,13 @@ void cuda_layer_norm_gradient(
invvar->data<accscalar_t>(),
input,
n1,n2,
gamma->data<scalar_t_0>(),
beta->data<scalar_t_0>(),
// TMJ pass NULL argument for gamma, beta, grad_gamma and grad_beta
// if gamma Tensor is NULL on input.
gamma != NULL ? gamma->data<scalar_t_0>() : NULL,
gamma != NULL ? beta->data<scalar_t_0>() : NULL,
epsilon,
grad_input->data<scalar_t_0>(),
grad_gamma->data<scalar_t_0>(),
grad_beta->data<scalar_t_0>());
gamma != NULL ? grad_gamma->data<scalar_t_0>() : NULL,
gamma != NULL ? grad_beta->data<scalar_t_0>() : NULL);
)
}
Under construction...
# Mixed Precision DCGAN Training in PyTorch
`main_amp.py` is based on [https://github.com/pytorch/examples/tree/master/dcgan](https://github.com/pytorch/examples/tree/master/dcgan).
It implements Automatic Mixed Precision (Amp) training of the DCGAN example for different datasets. Command-line flags forwarded to `amp.initialize` are used to easily manipulate and switch between various pure and mixed precision "optimization levels" or `opt_level`s. For a detailed explanation of `opt_level`s, see the [updated API guide](https://nvidia.github.io/apex/amp.html).
We introduce these changes to the PyTorch DCGAN example as described in the [Multiple models/optimizers/losses](https://nvidia.github.io/apex/advanced.html#multiple-models-optimizers-losses) section of the documentation:
```
# Added after models and optimizers construction
[netD, netG], [optimizerD, optimizerG] = amp.initialize(
    [netD, netG], [optimizerD, optimizerG], opt_level=opt.opt_level, num_losses=3)
...
# loss.backward() changed to:
with amp.scale_loss(errD_real, optimizerD, loss_id=0) as errD_real_scaled:
    errD_real_scaled.backward()
...
with amp.scale_loss(errD_fake, optimizerD, loss_id=1) as errD_fake_scaled:
    errD_fake_scaled.backward()
...
with amp.scale_loss(errG, optimizerG, loss_id=2) as errG_scaled:
    errG_scaled.backward()
```
Note that we use different `loss_scalers` for each computed loss.
Using a separate loss scaler per loss is [optional, not required](https://nvidia.github.io/apex/advanced.html#optionally-have-amp-use-a-different-loss-scaler-per-loss).
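If per-loss scaling is not needed, the `loss_id` argument can simply be omitted and Amp falls back to a single default loss scaler. A minimal sketch, reusing the `errD_real`/`optimizerD` names from the snippet above:
```
# One shared loss scaler for every backward pass (loss_id omitted)
with amp.scale_loss(errD_real, optimizerD) as scaled_loss:
    scaled_loss.backward()
```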
To improve numerical stability, we replaced `nn.Sigmoid() + nn.BCELoss()` with `nn.BCEWithLogitsLoss()`, as sketched below.
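Concretely, the final `nn.Sigmoid()` layer is dropped from the discriminator so that it outputs raw logits, and the sigmoid is folded into the loss. A minimal before/after sketch (the commented lines reflect the original DCGAN example):
```
# Original example: discriminator ends with nn.Sigmoid(), plain BCE on probabilities
#   nn.Conv2d(ndf * 8, 1, 4, 1, 0, bias=False),
#   nn.Sigmoid()
# criterion = nn.BCELoss()

# This example: discriminator ends with the raw convolution output (logits)
#   nn.Conv2d(ndf * 8, 1, 4, 1, 0, bias=False),
criterion = nn.BCEWithLogitsLoss()
```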
With the new Amp API **you never need to explicitly convert your model, or the input data, to half().**
"Pure FP32" training:
```
$ python main_amp.py --opt_level O0
```
Recommended mixed precision training:
```
$ python main_amp.py --opt_level O1
```
Have a look at the original [DCGAN example](https://github.com/pytorch/examples/tree/master/dcgan) for more information about the remaining arguments.
To enable mixed precision training, we introduce the `--opt_level` argument.
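Amp also defines the `O2` ("almost FP16" mixed precision) and `O3` (pure FP16) optimization levels described in the API guide linked above; they can be selected the same way, for example:
```
$ python main_amp.py --opt_level O2
```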
from __future__ import print_function
import argparse
import os
import random
import torch
import torch.nn as nn
import torch.nn.parallel
import torch.backends.cudnn as cudnn
import torch.optim as optim
import torch.utils.data
import torchvision.datasets as dset
import torchvision.transforms as transforms
import torchvision.utils as vutils
try:
from apex import amp
except ImportError:
raise ImportError("Please install apex from https://www.github.com/nvidia/apex to run this example.")
parser = argparse.ArgumentParser()
parser.add_argument('--dataset', default='cifar10', help='cifar10 | lsun | mnist | imagenet | folder | lfw | fake')
parser.add_argument('--dataroot', default='./', help='path to dataset')
parser.add_argument('--workers', type=int, help='number of data loading workers', default=2)
parser.add_argument('--batchSize', type=int, default=64, help='input batch size')
parser.add_argument('--imageSize', type=int, default=64, help='the height / width of the input image to network')
parser.add_argument('--nz', type=int, default=100, help='size of the latent z vector')
parser.add_argument('--ngf', type=int, default=64)
parser.add_argument('--ndf', type=int, default=64)
parser.add_argument('--niter', type=int, default=25, help='number of epochs to train for')
parser.add_argument('--lr', type=float, default=0.0002, help='learning rate, default=0.0002')
parser.add_argument('--beta1', type=float, default=0.5, help='beta1 for adam. default=0.5')
parser.add_argument('--ngpu', type=int, default=1, help='number of GPUs to use')
parser.add_argument('--netG', default='', help="path to netG (to continue training)")
parser.add_argument('--netD', default='', help="path to netD (to continue training)")
parser.add_argument('--outf', default='.', help='folder to output images and model checkpoints')
parser.add_argument('--manualSeed', type=int, help='manual seed')
parser.add_argument('--classes', default='bedroom', help='comma separated list of classes for the lsun data set')
parser.add_argument('--opt_level', default='O1', help='amp opt_level, default="O1"')
opt = parser.parse_args()
print(opt)
try:
os.makedirs(opt.outf)
except OSError:
pass
if opt.manualSeed is None:
opt.manualSeed = 2809
print("Random Seed: ", opt.manualSeed)
random.seed(opt.manualSeed)
torch.manual_seed(opt.manualSeed)
cudnn.benchmark = True
if opt.dataset in ['imagenet', 'folder', 'lfw']:
# folder dataset
dataset = dset.ImageFolder(root=opt.dataroot,
transform=transforms.Compose([
transforms.Resize(opt.imageSize),
transforms.CenterCrop(opt.imageSize),
transforms.ToTensor(),
transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
]))
nc=3
elif opt.dataset == 'lsun':
classes = [ c + '_train' for c in opt.classes.split(',')]
dataset = dset.LSUN(root=opt.dataroot, classes=classes,
transform=transforms.Compose([
transforms.Resize(opt.imageSize),
transforms.CenterCrop(opt.imageSize),
transforms.ToTensor(),
transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
]))
nc=3
elif opt.dataset == 'cifar10':
dataset = dset.CIFAR10(root=opt.dataroot, download=True,
transform=transforms.Compose([
transforms.Resize(opt.imageSize),
transforms.ToTensor(),
transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
]))
nc=3
elif opt.dataset == 'mnist':
dataset = dset.MNIST(root=opt.dataroot, download=True,
transform=transforms.Compose([
transforms.Resize(opt.imageSize),
transforms.ToTensor(),
transforms.Normalize((0.5,), (0.5,)),
]))
nc=1
elif opt.dataset == 'fake':
dataset = dset.FakeData(image_size=(3, opt.imageSize, opt.imageSize),
transform=transforms.ToTensor())
nc=3
assert dataset
dataloader = torch.utils.data.DataLoader(dataset, batch_size=opt.batchSize,
shuffle=True, num_workers=int(opt.workers))
device = torch.device("cuda:0")
ngpu = int(opt.ngpu)
nz = int(opt.nz)
ngf = int(opt.ngf)
ndf = int(opt.ndf)
# custom weights initialization called on netG and netD
def weights_init(m):
classname = m.__class__.__name__
if classname.find('Conv') != -1:
m.weight.data.normal_(0.0, 0.02)
elif classname.find('BatchNorm') != -1:
m.weight.data.normal_(1.0, 0.02)
m.bias.data.fill_(0)
class Generator(nn.Module):
def __init__(self, ngpu):
super(Generator, self).__init__()
self.ngpu = ngpu
self.main = nn.Sequential(
# input is Z, going into a convolution
nn.ConvTranspose2d( nz, ngf * 8, 4, 1, 0, bias=False),
nn.BatchNorm2d(ngf * 8),
nn.ReLU(True),
# state size. (ngf*8) x 4 x 4
nn.ConvTranspose2d(ngf * 8, ngf * 4, 4, 2, 1, bias=False),
nn.BatchNorm2d(ngf * 4),
nn.ReLU(True),
# state size. (ngf*4) x 8 x 8
nn.ConvTranspose2d(ngf * 4, ngf * 2, 4, 2, 1, bias=False),
nn.BatchNorm2d(ngf * 2),
nn.ReLU(True),
# state size. (ngf*2) x 16 x 16
nn.ConvTranspose2d(ngf * 2, ngf, 4, 2, 1, bias=False),
nn.BatchNorm2d(ngf),
nn.ReLU(True),
# state size. (ngf) x 32 x 32
nn.ConvTranspose2d( ngf, nc, 4, 2, 1, bias=False),
nn.Tanh()
# state size. (nc) x 64 x 64
)
def forward(self, input):
if input.is_cuda and self.ngpu > 1:
output = nn.parallel.data_parallel(self.main, input, range(self.ngpu))
else:
output = self.main(input)
return output
netG = Generator(ngpu).to(device)
netG.apply(weights_init)
if opt.netG != '':
netG.load_state_dict(torch.load(opt.netG))
print(netG)
class Discriminator(nn.Module):
def __init__(self, ngpu):
super(Discriminator, self).__init__()
self.ngpu = ngpu
self.main = nn.Sequential(
# input is (nc) x 64 x 64
nn.Conv2d(nc, ndf, 4, 2, 1, bias=False),
nn.LeakyReLU(0.2, inplace=True),
# state size. (ndf) x 32 x 32
nn.Conv2d(ndf, ndf * 2, 4, 2, 1, bias=False),
nn.BatchNorm2d(ndf * 2),
nn.LeakyReLU(0.2, inplace=True),
# state size. (ndf*2) x 16 x 16
nn.Conv2d(ndf * 2, ndf * 4, 4, 2, 1, bias=False),
nn.BatchNorm2d(ndf * 4),
nn.LeakyReLU(0.2, inplace=True),
# state size. (ndf*4) x 8 x 8
nn.Conv2d(ndf * 4, ndf * 8, 4, 2, 1, bias=False),
nn.BatchNorm2d(ndf * 8),
nn.LeakyReLU(0.2, inplace=True),
# state size. (ndf*8) x 4 x 4
nn.Conv2d(ndf * 8, 1, 4, 1, 0, bias=False),
)
def forward(self, input):
if input.is_cuda and self.ngpu > 1:
output = nn.parallel.data_parallel(self.main, input, range(self.ngpu))
else:
output = self.main(input)
return output.view(-1, 1).squeeze(1)
netD = Discriminator(ngpu).to(device)
netD.apply(weights_init)
if opt.netD != '':
netD.load_state_dict(torch.load(opt.netD))
print(netD)
criterion = nn.BCEWithLogitsLoss()
fixed_noise = torch.randn(opt.batchSize, nz, 1, 1, device=device)
real_label = 1
fake_label = 0
# setup optimizer
optimizerD = optim.Adam(netD.parameters(), lr=opt.lr, betas=(opt.beta1, 0.999))
optimizerG = optim.Adam(netG.parameters(), lr=opt.lr, betas=(opt.beta1, 0.999))
[netD, netG], [optimizerD, optimizerG] = amp.initialize(
[netD, netG], [optimizerD, optimizerG], opt_level=opt.opt_level, num_losses=3)
for epoch in range(opt.niter):
for i, data in enumerate(dataloader, 0):
############################
# (1) Update D network: maximize log(D(x)) + log(1 - D(G(z)))
###########################
# train with real
netD.zero_grad()
real_cpu = data[0].to(device)
batch_size = real_cpu.size(0)
label = torch.full((batch_size,), real_label, device=device)
output = netD(real_cpu)
errD_real = criterion(output, label)
with amp.scale_loss(errD_real, optimizerD, loss_id=0) as errD_real_scaled:
errD_real_scaled.backward()
D_x = output.mean().item()
# train with fake
noise = torch.randn(batch_size, nz, 1, 1, device=device)
fake = netG(noise)
label.fill_(fake_label)
output = netD(fake.detach())
errD_fake = criterion(output, label)
with amp.scale_loss(errD_fake, optimizerD, loss_id=1) as errD_fake_scaled:
errD_fake_scaled.backward()
D_G_z1 = output.mean().item()
errD = errD_real + errD_fake
optimizerD.step()
############################
# (2) Update G network: maximize log(D(G(z)))
###########################
netG.zero_grad()
label.fill_(real_label) # fake labels are real for generator cost
output = netD(fake)
errG = criterion(output, label)
with amp.scale_loss(errG, optimizerG, loss_id=2) as errG_scaled:
errG_scaled.backward()
D_G_z2 = output.mean().item()
optimizerG.step()
print('[%d/%d][%d/%d] Loss_D: %.4f Loss_G: %.4f D(x): %.4f D(G(z)): %.4f / %.4f'
% (epoch, opt.niter, i, len(dataloader),
errD.item(), errG.item(), D_x, D_G_z1, D_G_z2))
if i % 100 == 0:
vutils.save_image(real_cpu,
'%s/real_samples.png' % opt.outf,
normalize=True)
fake = netG(fixed_noise)
vutils.save_image(fake.detach(),
'%s/amp_fake_samples_epoch_%03d.png' % (opt.outf, epoch),
normalize=True)
# do checkpointing
torch.save(netG.state_dict(), '%s/netG_epoch_%d.pth' % (opt.outf, epoch))
torch.save(netD.state_dict(), '%s/netD_epoch_%d.pth' % (opt.outf, epoch))
cxxfilt>=0.2.0
tqdm>=4.28.1
numpy>=1.15.3
PyYAML>=5.1
pytest>=3.5.1
@@ -2,7 +2,9 @@ import torch
from setuptools import setup, find_packages
import subprocess
from pip._internal import main as pipmain
import sys
import warnings
if not torch.cuda.is_available():
print("\nWarning: Torch did not find available GPUs on this system.\n",
@@ -19,6 +21,17 @@ if TORCH_MAJOR == 0 and TORCH_MINOR < 4:
cmdclass = {}
ext_modules = []
if "--pyprof" in sys.argv:
with open('requirements.txt') as f:
required_packages = f.read().splitlines()
pipmain(["install"] + required_packages)
try:
sys.argv.remove("--pyprof")
except:
pass
else:
warnings.warn("Option --pyprof not specified. Not installing PyProf dependencies!")
if "--cpp_ext" in sys.argv or "--cuda_ext" in sys.argv:
if TORCH_MAJOR == 0:
raise RuntimeError("--cpp_ext requires Pytorch 1.0 or later, "
@@ -4,38 +4,39 @@ import random
import torch
import apex
from torch.autograd import Variable
class TestFusedLayerNorm(unittest.TestCase):
def setUp(self):
self.module = apex.normalization.FusedLayerNorm(normalized_shape=[32, 64], elementwise_affine=False)
self.input_ = torch.randn(16, 32, 64)
# bias and weight are set to 0 and 1 respectively, so no need to copy parameters from cpu module to the gpu one
self.module_cpu_ = apex.normalization.FusedLayerNorm(normalized_shape=[32, 16], elementwise_affine=False).cpu()
self.module_cuda_ = apex.normalization.FusedLayerNorm(normalized_shape=[32, 16], elementwise_affine=False).cuda()
def _test_same_output(self, batch_size):
torch.cuda.manual_seed(42)
def forward_cpu(self, input_):
self.module.cpu()
return self.module(input_.cpu())
def forward_cuda(self, input_):
self.module.cuda()
return self.module(input_.cuda())
def test_forward_cuda(self):
out_ = self.forward_cuda(self.input_)
assert out_.is_cuda == True
def test_forward_cpu(self):
out_ = self.forward_cpu(self.input_)
assert out_.is_cuda == False
def test_same_output(self):
out_cpu = self.forward_cpu(self.input_)
out_cuda = self.forward_cuda(self.input_)
torch.testing.assert_allclose(out_cpu, out_cuda.cpu())
self.input_ = torch.randn((batch_size, *self.module_cpu_.normalized_shape), device="cpu").requires_grad_(True)
self.input_cuda_ = self.input_.cuda().detach().requires_grad_(True)
out_cpu_ = self.module_cpu_(self.input_)
gO = torch.rand_like(out_cpu_)
out_cpu_.backward(gO)
out_cuda_ = self.module_cuda_(self.input_cuda_)
gO = gO.cuda()
out_cuda_.backward(gO)
assert out_cpu_.is_cuda == False
assert out_cuda_.is_cuda == True
torch.testing.assert_allclose(out_cpu_, out_cuda_.cpu())
torch.testing.assert_allclose(self.input_.grad, self.input_cuda_.grad.cpu())
def test_layer_norm(self):
self._test_same_output(16)
def test_large_batch(self):
self._test_same_output(65536)
class TestFusedLayerNormElemWise(TestFusedLayerNorm):
def setUp(self):
self.module = apex.normalization.FusedLayerNorm(normalized_shape=[32, 64], elementwise_affine=True)
self.input_ = torch.randn(16, 32, 64)
torch.cuda.manual_seed(42)
\ No newline at end of file
self.module_cpu_ = apex.normalization.FusedLayerNorm(normalized_shape=[32, 16], elementwise_affine=True).cpu()
self.module_cuda_ = apex.normalization.FusedLayerNorm(normalized_shape=[32, 16], elementwise_affine=True).cuda()
import test_pyprof_nvtx.TestPyProfNvtx as TestPyProfNvtx
import inspect
import os
import torch
import torch.nn.functional as F
import unittest
from apex import pyprof
pyprof.nvtx.init()
# TODO: add tests for:
# F.bilinear, F.l1_loss, F.multilabel_soft_margin_loss, F.multi_margin_loss
class TestPyProfNvtx(unittest.TestCase):
def __init__(self, testName, dtype=torch.float16):
super().__init__(testName)
self.dtype = dtype
def setUp(self):
pass
def tearDown(self):
pass
def test_conv1d(self):
# Data and weight tensors
tensor1d_in_conv = torch.randn(32, 3, 224, device='cuda', dtype=self.dtype)
tensor1d_in_conv_grouped = torch.randn(32, 6, 224, device='cuda', dtype=self.dtype)
conv1d_filter = torch.randn(16, 3, 3, device='cuda', dtype=self.dtype)
conv1d_bias = torch.ones(16, device='cuda', dtype=self.dtype)
# Vanilla conv1d
conv1d_out_vanilla = F.conv1d(tensor1d_in_conv, conv1d_filter)
# conv1d with bias
conv1d_out_with_bias = F.conv1d(tensor1d_in_conv, conv1d_filter, bias=conv1d_bias)
# conv1d - stride > 1
conv1d_out_strided = F.conv1d(tensor1d_in_conv, conv1d_filter, stride=2)
# conv1d - dilation > 1
conv1d_out_dilated = F.conv1d(tensor1d_in_conv, conv1d_filter, dilation=2)
# conv1d - groups > 1
conv1d_out_grouped = F.conv1d(tensor1d_in_conv_grouped, conv1d_filter, groups=2)
# conv1d - padding with zeros
conv1d_out_padding_zeros = F.conv1d(tensor1d_in_conv, conv1d_filter, padding=6)
def test_conv2d(self):
# Data and weight tensors
tensor2d_in_conv = torch.randn(32, 3, 224, 224, device='cuda', dtype=self.dtype)
tensor2d_in_conv_grouped = torch.randn(32, 6, 224, 224, device='cuda', dtype=self.dtype)
conv2d_filter = torch.randn(16, 3, 3, 3, device='cuda', dtype=self.dtype)
conv2d_bias = torch.ones(16, device='cuda', dtype=self.dtype)
# Vanilla conv2d
conv2d_out_vanilla = F.conv2d(tensor2d_in_conv, conv2d_filter)
# conv2d with bias
conv2d_with_bias = F.conv2d(tensor2d_in_conv, conv2d_filter, bias=conv2d_bias)
# conv2d - stride > 1
conv2d_out_strided = F.conv2d(tensor2d_in_conv, conv2d_filter, stride=2)
# conv2d - dilation > 1
conv2d_out_dilated = F.conv2d(tensor2d_in_conv, conv2d_filter, dilation=2)
# conv2d - groups > 1
conv2d_out_grouped = F.conv2d(tensor2d_in_conv_grouped, conv2d_filter, groups=2)
# conv2d - padding with zeros
conv2d_out_padding_zeros = F.conv2d(tensor2d_in_conv, conv2d_filter, padding=6)
def test_conv3d(self):
# Data and weight tensors
tensor3d_in_conv = torch.randn(32, 3, 16, 224, 224, device='cuda', dtype=self.dtype)
tensor3d_in_conv_grouped = torch.randn(32, 6, 16, 224, 224, device='cuda', dtype=self.dtype)
conv3d_filter = torch.randn(16, 3, 3, 3, 3, device='cuda', dtype=self.dtype)
conv3d_bias = torch.ones(16, device='cuda', dtype=self.dtype)
# Vanilla conv3d
conv3d_out_vanilla = F.conv3d(tensor3d_in_conv, conv3d_filter)
# conv3d - stride > 1
conv3d_out_strided = F.conv3d(tensor3d_in_conv, conv3d_filter, stride=2)
# conv3d - dilation > 1
conv3d_out_dilated = F.conv3d(tensor3d_in_conv, conv3d_filter, dilation=2)
# conv3d - groups > 1
conv3d_out_grouped = F.conv3d(tensor3d_in_conv_grouped, conv3d_filter, groups=2)
# conv3d - padding with zeros
conv3d_out_padding_zeros = F.conv3d(tensor3d_in_conv, conv3d_filter, padding=6)
def test_conv_transpose1d(self):
# Data and weight tensors
conv_transpose1d_tensor = torch.randn(64, 16, 64, device='cuda', dtype=self.dtype)
conv_transpose1d_filter = torch.randn(16, 32, 3, device='cuda', dtype=self.dtype)
conv_transpose1d_bias = torch.randn(32, device='cuda', dtype=self.dtype)
# Conv transpose runs
conv_transpose1d_out = F.conv_transpose1d(conv_transpose1d_tensor, conv_transpose1d_filter)
conv_transpose1d_out_biased = F.conv_transpose1d(conv_transpose1d_tensor, conv_transpose1d_filter, bias=conv_transpose1d_bias)
conv_transpose1d_out_strided = F.conv_transpose1d(conv_transpose1d_tensor, conv_transpose1d_filter, stride=2)
conv_transpose1d_out_padded = F.conv_transpose1d(conv_transpose1d_tensor, conv_transpose1d_filter, padding=3)
conv_transpose1d_out2_padded = F.conv_transpose1d(conv_transpose1d_tensor, conv_transpose1d_filter, output_padding=2, dilation=3)
conv_transpose1d_out_grouped = F.conv_transpose1d(conv_transpose1d_tensor, conv_transpose1d_filter, groups=2)
conv_transpose1d_out_dilated = F.conv_transpose1d(conv_transpose1d_tensor, conv_transpose1d_filter, dilation=2)
def test_conv_transpose2d(self):
# Data and weight tensors
conv_transpose2d_tensor = torch.randn(64, 8, 5, 5, device='cuda', dtype=self.dtype)
conv_transpose2d_filter = torch.randn(8, 16, 3, 3, device='cuda', dtype=self.dtype)
conv_transpose2d_bias = torch.randn(16, device='cuda', dtype=self.dtype)
# Conv transpose runs
conv_transpose2d_out = F.conv_transpose2d(conv_transpose2d_tensor, conv_transpose2d_filter)
conv_transpose2d_out_biased = F.conv_transpose2d(conv_transpose2d_tensor, conv_transpose2d_filter, bias=conv_transpose2d_bias)
conv_transpose2d_out_strided = F.conv_transpose2d(conv_transpose2d_tensor, conv_transpose2d_filter, stride=2)
conv_transpose2d_out_padded = F.conv_transpose2d(conv_transpose2d_tensor, conv_transpose2d_filter, padding=3)
conv_transpose2d_out2_padded = F.conv_transpose2d(conv_transpose2d_tensor, conv_transpose2d_filter, output_padding=2, dilation=3)
conv_transpose2d_out_grouped = F.conv_transpose2d(conv_transpose2d_tensor, conv_transpose2d_filter, groups=2)
conv_transpose2d_out_dilated = F.conv_transpose2d(conv_transpose2d_tensor, conv_transpose2d_filter, dilation=2)
def test_conv_transpose3d(self):
# Data and weight tensors
conv_transpose3d_tensor = torch.randn(20, 16, 50, 10, 20, device='cuda', dtype=self.dtype)
conv_transpose3d_filter = torch.randn(16, 33, 3, 3, 3, device='cuda', dtype=self.dtype)
conv_transpose3d_bias = torch.randn(33, device='cuda', dtype=self.dtype)
# Conv transpose runs
conv_transpose3d_out = F.conv_transpose3d(conv_transpose3d_tensor, conv_transpose3d_filter)
conv_transpose3d_out_biased = F.conv_transpose3d(conv_transpose3d_tensor, conv_transpose3d_filter, bias=conv_transpose3d_bias)
conv_transpose3d_out_strided = F.conv_transpose3d(conv_transpose3d_tensor, conv_transpose3d_filter, stride=2)
conv_transpose3d_out_padded = F.conv_transpose3d(conv_transpose3d_tensor, conv_transpose3d_filter, padding=3)
conv_transpose3d_out2_padded = F.conv_transpose3d(conv_transpose3d_tensor, conv_transpose3d_filter, output_padding=2, dilation=3)
conv_transpose3d_out_grouped = F.conv_transpose3d(conv_transpose3d_tensor, conv_transpose3d_filter, groups=2)
conv_transpose3d_out_dilated = F.conv_transpose3d(conv_transpose3d_tensor, conv_transpose3d_filter, dilation=2)
def test_unfold(self):
inp = torch.randn(1, 3, 32, 32, device='cuda', dtype=self.dtype)
kernel_size = (4, 5)
inp_unf_dilated = F.unfold(inp, kernel_size, dilation=2)
inp_unf_padded = F.unfold(inp, kernel_size, padding=2)
inp_unf_strided = F.unfold(inp, kernel_size, stride=2)
def test_fold(self):
inp = torch.randn(3, 20, 20, device='cuda', dtype=self.dtype)
inp_folded = F.fold(inp, (4, 5), (1, 1))
def test_avg_pool1d(self):
inp = torch.randn(1, 1, 28, device='cuda', dtype=self.dtype)
out = F.avg_pool1d(inp, kernel_size=5, stride=2, padding=2, ceil_mode=True, count_include_pad=False)
def test_avg_pool2d(self):
inp = torch.randn(1, 3, 224, 224, device='cuda', dtype=self.dtype)
out = F.avg_pool2d(inp, kernel_size=5, stride=2, padding=2, ceil_mode=True, count_include_pad=False)
def test_avg_pool3d(self):
inp = torch.randn(1, 3, 16, 224, 224, device='cuda', dtype=self.dtype)
out = F.avg_pool3d(inp, kernel_size=5, stride=2, padding=2, ceil_mode=True, count_include_pad=False)
def test_adaptive_avg_pool1d(self):
inp = torch.randn(1, 1, 28, device='cuda', dtype=self.dtype)
out = F.adaptive_avg_pool1d(inp, output_size=5)
def test_adaptive_avg_pool2d(self):
inp = torch.randn(1, 16, 32, 32, device='cuda', dtype=self.dtype)
out = F.adaptive_avg_pool2d(inp, output_size=5)
def test_adaptive_avg_pool3d(self):
inp = torch.randn(1, 16, 16, 32, 32, device='cuda', dtype=self.dtype)
out = F.adaptive_avg_pool3d(inp, output_size=5)
def test_max_pool1d(self):
inp = torch.randn(1, 16, 32, device='cuda', dtype=self.dtype)
out = F.max_pool1d(inp, kernel_size=5, stride=2, padding=2, return_indices=True, ceil_mode=True)
def test_max_pool2d(self):
inp = torch.randn(1, 16, 32, 32, device='cuda', dtype=self.dtype)
out = F.max_pool2d(inp, kernel_size=5, stride=2, padding=2, return_indices=True, ceil_mode=True)
def test_max_pool3d(self):
inp = torch.randn(1, 16, 16, 32, 32, device='cuda', dtype=self.dtype)
out = F.max_pool3d(inp, kernel_size=5, stride=2, padding=2, return_indices=True, ceil_mode=True)
def test_adaptive_max_pool1d(self):
inp = torch.randn(1, 16, 28, device='cuda', dtype=self.dtype)
out = F.adaptive_max_pool1d(inp, output_size=5, return_indices=True)
def test_adaptive_max_pool2d(self):
inp = torch.randn(1, 16, 32, 32, device='cuda', dtype=self.dtype)
out = F.adaptive_max_pool2d(inp, output_size=5, return_indices=True)
def test_adaptive_max_pool3d(self):
inp = torch.randn(1, 16, 16, 32, 32, device='cuda', dtype=self.dtype)
out = F.adaptive_max_pool3d(inp, output_size=5, return_indices=True)
def test_max_unpool1d(self):
inp = torch.randn(1, 16, 32, device='cuda', dtype=self.dtype)
output, indices = F.max_pool1d(inp, kernel_size=5, stride=2, padding=2, return_indices=True, ceil_mode=True)
output = F.max_unpool1d(output, indices, kernel_size=2, stride=2, padding=2)
def test_max_unpool2d(self):
inp = torch.randn(1, 16, 32, 32, device='cuda', dtype=self.dtype)
output, indices = F.max_pool2d(inp, kernel_size=5, stride=2, padding=2, return_indices=True, ceil_mode=True)
output = F.max_unpool2d(output, indices, kernel_size=2, stride=2, padding=2)
def test_max_unpool3d(self):
inp = torch.randn(1, 16, 8, 32, 32, device='cuda', dtype=self.dtype)
output, indices = F.max_pool3d(inp, kernel_size=5, stride=2, padding=2, return_indices=True, ceil_mode=True)
output = F.max_unpool3d(output, indices, kernel_size=2, stride=2, padding=2)
def test_lp_pool1d(self):
inp = torch.randn(1, 32, 64, device='cuda', dtype=self.dtype)
output = F.lp_pool1d(inp, 2, 3, stride=2, ceil_mode=True)
def test_lp_pool2d(self):
#torch.nn.LPPool2d(norm_type, kernel_size, stride=None, ceil_mode=False)
inp = torch.randn(1, 32, 64, 64, device='cuda', dtype=self.dtype)
output = F.lp_pool2d(inp, 2, 3, stride=2, ceil_mode=True)
def test_threshold(self):
inp = torch.randn(1, 8, 32, 32, device='cuda', dtype=self.dtype)
output = F.threshold(inp, 6, 6, inplace=False)
def test_threshold_(self):
inp = torch.randn(1, 8, 32, 32, device='cuda', dtype=self.dtype)
output = F.threshold_(inp, 6, 6)
def test_relu(self):
inp = torch.randn(1, 3, 32, 32, device='cuda', dtype=self.dtype)
output = F.relu(inp, inplace=False)
def test_relu_(self):
inp = torch.randn(1, 3, 32, 32, device='cuda', dtype=self.dtype)
output = F.relu_(inp)
def test_hardtanh(self):
inp = torch.randn(1, 3, 32, 32, device='cuda', dtype=self.dtype)
output = F.hardtanh(inp, min_val=-1., max_val=1., inplace=False)
def test_hardtanh_(self):
inp = torch.randn(1, 3, 32, 32, device='cuda', dtype=self.dtype)
output = F.hardtanh_(inp, min_val=-1., max_val=1.)
def test_relu6(self):
inp = torch.randn(1, 3, 32, 32, device='cuda', dtype=self.dtype)
output = F.relu6(inp, inplace=False)
def test_elu(self):
inp = torch.randn(1, 3, 32, 32, device='cuda', dtype=self.dtype)
output = F.elu(inp, alpha=1.0, inplace=False)
def test_elu_(self):
inp = torch.randn(1, 3, 32, 32, device='cuda', dtype=self.dtype)
output = F.elu_(inp, alpha=1.0)
def test_selu(self):
inp = torch.randn(1, 3, 32, 32, device='cuda', dtype=self.dtype)
output = F.selu(inp)
def test_celu(self):
inp = torch.randn(1, 3, 32, 32, device='cuda', dtype=self.dtype)
output = F.celu(inp, alpha=1.0, inplace=False)
def test_leaky_relu(self):
inp = torch.randn(1, 3, 32, 32, device='cuda', dtype=self.dtype)
output = F.leaky_relu(inp, negative_slope=0.01, inplace=False)
def test_leaky_relu_(self):
inp = torch.randn(1, 3, 32, 32, device='cuda', dtype=self.dtype)
output = F.leaky_relu_(inp, negative_slope=0.01)
def test_prelu(self):
inp = torch.randn(1, 3, 32, 32, device='cuda', dtype=self.dtype)
weight = torch.randn(1, device='cuda', dtype=self.dtype)
output = F.prelu(inp, weight)
def test_rrelu(self):
inp = torch.randn(1, 3, 32, 32, device='cuda', dtype=self.dtype)
output = F.rrelu(inp, lower=1./8, upper=1./3, training=False, inplace=False)
def test_rrelu_(self):
inp = torch.randn(1, 3, 32, 32, device='cuda', dtype=self.dtype)
output = F.rrelu(inp, lower=1./8, upper=1./3, training=False)
def test_glu(self):
inp = torch.randn(1, 3, 32, 32, device='cuda', dtype=self.dtype)
output = F.glu(inp, dim=-1)
def test_logsigmoid(self):
inp = torch.randn(1, 3, 32, 32, device='cuda', dtype=self.dtype)
output = F.logsigmoid(inp)
def test_hardshrink(self):
inp = torch.randn(1, 3, 32, 32, device='cuda', dtype=self.dtype)
output = F.hardshrink(inp, lambd=0.5)
def test_tanhshrink(self):
inp = torch.randn(1, 3, 32, 32, device='cuda', dtype=self.dtype)
output = F.tanhshrink(inp)
def test_softsign(self):
inp = torch.randn(1, 3, 32, 32, device='cuda', dtype=self.dtype)
output = F.softsign(inp)
def test_softplus(self):
inp = torch.randn(1, 3, 32, 32, device='cuda', dtype=self.dtype)
output = F.softplus(inp, beta=1, threshold=20)
def test_softmin(self):
inp = torch.randn(16, 1024, device='cuda', dtype=self.dtype)
output = F.softmin(inp, dim=1, _stacklevel=3, dtype=self.dtype)
def test_softmax(self):
inp = torch.randn(16, 1024, device='cuda', dtype=self.dtype)
output = F.softmax(inp, dim=1, _stacklevel=3, dtype=self.dtype)
def test_softshrink(self):
inp = torch.randn(1, 3, 32, 32, device='cuda', dtype=self.dtype)
output = F.softshrink(inp, lambd=0.5)
def test_gumbel_softmax(self):
inp = torch.randn(16, 1024, device='cuda', dtype=self.dtype)
output = F.gumbel_softmax(inp, tau=1, hard=False, eps=1e-10, dim=-1)
def test_log_softmax(self):
inp = torch.randn(16, 1024, device='cuda', dtype=self.dtype)
output = F.log_softmax(inp, dim=-1, _stacklevel=3)
def test_tanh(self):
inp = torch.randn(1, 3, 32, 32, device='cuda', dtype=self.dtype)
output = torch.tanh(inp)
def test_sigmoid(self):
inp = torch.randn(1, 3, 32, 32, device='cuda', dtype=self.dtype)
output = torch.sigmoid(inp)
def test_batch_norm(self):
inp = torch.randn(1, 3, 32, 32, device='cuda', dtype=self.dtype)
# running_mean, running_var
running_mean = torch.randn(3, device='cuda', dtype=self.dtype)
running_var = torch.randn(3, device='cuda', dtype=self.dtype)
output = F.batch_norm(inp, running_mean, running_var, weight=None, bias=None, training=False, momentum=0.1, eps=1e-05)
def test_instance_norm(self):
inp = torch.randn(1, 3, 32, 32, device='cuda', dtype=self.dtype)
running_mean = torch.randn(3, device='cuda', dtype=self.dtype)
running_var = torch.randn(3, device='cuda', dtype=self.dtype)
output = F.instance_norm(inp, running_mean=running_mean, running_var=running_var, weight=None, bias=None, use_input_stats=True, momentum=0.1, eps=1e-05)
def test_layer_norm(self):
inp = torch.randn(1, 3, 32, 32, device='cuda', dtype=self.dtype)
output = F.layer_norm(inp, inp.size()[1:], weight=None, bias=None, eps=1e-05)
def test_local_response_norm(self):
inp = torch.randn(16, 8, 64, 64, device='cuda', dtype=self.dtype)
output = F.local_response_norm(inp, 2, alpha=0.0001, beta=0.75, k=1.0)
def test_normalize(self):
inp = torch.randn(16, 8, 64, 64, device='cuda', dtype=self.dtype)
output = F.normalize(inp, p=2, dim=1, eps=1e-12, out=None)
def test_linear(self):
inp = torch.randn(32, 64, 128, device='cuda', dtype=self.dtype)
weight = torch.randn(256, 128, device='cuda', dtype=self.dtype)
output = F.linear(inp, weight, bias=None)
def test_dropout(self):
inp = torch.randn(16, 8, 64, 64, device='cuda', dtype=self.dtype)
output = F.dropout(inp, p=0.5, training=True, inplace=False)
def test_alpha_dropout(self):
inp = torch.randn(16, 8, 64, 64, device='cuda', dtype=self.dtype)
output = F.alpha_dropout(inp, p=0.5, training=True, inplace=False)
def test_dropout2d(self):
inp = torch.randn(16, 8, 64, 64, device='cuda', dtype=self.dtype)
output = F.dropout2d(inp, p=0.5, training=True, inplace=False)
def test_dropout3d(self):
inp = torch.randn(16, 8, 32, 64, 64, device='cuda', dtype=self.dtype)
output = F.dropout3d(inp, p=0.5, training=True, inplace=False)
def test_embedding(self):
pre_embed_dim = 1024
post_embed_dim = 32
inp = torch.randint(0, pre_embed_dim, (128, 16), device='cuda')
weight = torch.randn(pre_embed_dim, post_embed_dim, device='cuda', dtype=self.dtype)
output = F.embedding(inp, weight, padding_idx=None, max_norm=None, norm_type=2.0, scale_grad_by_freq=False, sparse=False)
def test_embedding_bag(self):
pre_embed_dim = 1024
post_embed_dim = 32
inp = torch.randint(0, pre_embed_dim, (128, 16), device='cuda')
weight = torch.randn(pre_embed_dim, post_embed_dim, device='cuda', dtype=self.dtype)
output = F.embedding_bag(inp, weight, offsets=None, max_norm=None, norm_type=2,
scale_grad_by_freq=False, mode='mean', sparse=False)
def test_one_hot(self):
num_classes = 10
inp = torch.randint(0, num_classes, (128, 16), device='cuda')
output = F.one_hot(inp, num_classes=10)
def test_pairwise_distance(self):
inp1 = torch.randn(1024, 128, device='cuda', dtype=self.dtype)
inp2 = torch.randn(1024, 128, device='cuda', dtype=self.dtype)
output = F.pairwise_distance(inp1, inp2, p=2.0, eps=1e-06, keepdim=False)
def test_cosine_similarity(self):
inp1 = torch.randn(1024, 128, device='cuda', dtype=self.dtype)
inp2 = torch.randn(1024, 128, device='cuda', dtype=self.dtype)
output = F.cosine_similarity(inp1, inp2, dim=1, eps=1e-8)
def test_pdist(self):
# pdist is not implemented for fp16
inp = torch.randn(128, 128, device='cuda', dtype=torch.float32)
output = F.pdist(inp, p=2)
def test_binary_cross_entropy(self):
# binary_cross_entropy is not implemented for fp16
inp = torch.randn(32, 128, device='cuda', dtype=torch.float32, requires_grad=True)
target = torch.randn(32, 128, device='cuda', dtype=torch.float32, requires_grad=False)
output = F.binary_cross_entropy(torch.sigmoid(inp), target)
def test_binary_cross_entropy_with_logits(self):
inp = torch.randn(32, 128, device='cuda', dtype=self.dtype, requires_grad=True)
target = torch.empty_like(inp).random_(2)
output = F.binary_cross_entropy_with_logits(inp, target)
def test_poisson_nll_loss(self):
inp = torch.randn(32, 128, device='cuda', dtype=self.dtype, requires_grad=True)
target = torch.randn(32, 128, device='cuda', dtype=self.dtype, requires_grad=False)
output = F.poisson_nll_loss(inp, target, log_input=True, full=False,
size_average=None, eps=1e-08, reduce=None, reduction='mean')
def test_cosine_embedding_loss(self):
inp1 = torch.randn(32, 128, device='cuda', dtype=self.dtype, requires_grad=True)
inp2 = torch.randn(32, 128, device='cuda', dtype=self.dtype, requires_grad=True)
target = torch.randn(32, device='cuda', dtype=self.dtype, requires_grad=False)
output = F.cosine_embedding_loss(inp1, inp2, target, margin=0,
size_average=None, reduce=None, reduction='mean')
def test_cross_entropy(self):
inp = torch.randn(32, 128, device='cuda', dtype=self.dtype, requires_grad=True)
target = torch.randint(0, 100, (32,), device='cuda', dtype=torch.long, requires_grad=False)
output = F.cross_entropy(inp, target, weight=None, size_average=None,
ignore_index=-100, reduce=None, reduction='mean')
def test_ctc_loss(self):
# force fp32 because _th_normal_ (used by the next line) is not supported for fp16
log_probs = torch.randn(50, 16, 20, device='cuda', dtype=torch.float32).log_softmax(2).detach().requires_grad_()
targets = torch.randint(1, 20, (16, 30), device='cuda', dtype=torch.long)
input_lengths = torch.full((16,), 50, dtype=torch.long)
target_lengths = torch.randint(10, 30, (16,), dtype=torch.long)
loss = F.ctc_loss(log_probs, targets, input_lengths, target_lengths)
def test_hinge_embedding_loss(self):
inp = torch.randn(128, 32, device='cuda', dtype=self.dtype)
target = torch.randint(0, 1, (32,), device='cuda') - 1
output = F.hinge_embedding_loss(inp, target, margin=1.0, size_average=None, reduce=None, reduction='mean')
def test_kl_div(self):
inp = torch.randn(32, 128, device='cuda', dtype=self.dtype, requires_grad=True)
target = torch.randn(32, 128, device='cuda', dtype=self.dtype, requires_grad=True)
output = F.kl_div(inp, target, size_average=None, reduce=None, reduction='batchmean')
def test_mse_loss(self):
inp = torch.randn(32, 128, device='cuda', dtype=self.dtype, requires_grad=True)
target = torch.randn(32, 128, device='cuda', dtype=self.dtype, requires_grad=True)
output = F.mse_loss(inp, target, size_average=None, reduce=None, reduction='mean')
def test_margin_ranking_loss(self):
inp1 = torch.randn(32, 128, device='cuda', dtype=self.dtype, requires_grad=True)
inp2 = torch.randn(32, 128, device='cuda', dtype=self.dtype, requires_grad=True)
target = (torch.randint(0, 1, (128,), device='cuda') - 1).type_as(inp1)
output = F.margin_ranking_loss(inp1, inp2, target, margin=0, size_average=None, reduce=None, reduction='mean')
def test_multilabel_margin_loss(self):
inp = torch.randn(1024, device='cuda', dtype=self.dtype, requires_grad=True)
target = torch.randint(0, 10, (1024,), dtype=torch.long, device='cuda')
output = F.multilabel_margin_loss(inp, target, size_average=None, reduce=None, reduction='mean')
def test_nll_loss(self):
inp = torch.randn(64, 128, device='cuda', dtype=self.dtype, requires_grad=True)
target = torch.randint(0, 10, (64,), device='cuda', dtype=torch.long)
output = F.nll_loss(inp, target, weight=None, size_average=None, ignore_index=-100, reduce=None, reduction='mean')
def test_smooth_l1_loss(self):
inp = torch.randn(32, 128, device='cuda', dtype=self.dtype, requires_grad=True)
target = torch.randn(32, 128, device='cuda', dtype=self.dtype, requires_grad=False)
output = F.smooth_l1_loss(inp, target, size_average=None, reduce=None, reduction='mean')
def test_soft_margin_loss(self):
inp = torch.randn(32, 128, device='cuda', dtype=self.dtype, requires_grad=True)
target = torch.randn(32, 128, device='cuda', dtype=self.dtype, requires_grad=False)
output = F.soft_margin_loss(inp, target, size_average=None, reduce=None, reduction='mean')
def test_triplet_margin_loss(self):
inp1 = torch.randn(32, 128, device='cuda', dtype=self.dtype, requires_grad=True)
inp2 = torch.randn(32, 128, device='cuda', dtype=self.dtype, requires_grad=True)
inp3 = torch.randn(32, 128, device='cuda', dtype=self.dtype, requires_grad=True)
output = F.triplet_margin_loss(inp1, inp2, inp3, margin=1.0, p=2,
eps=1e-06, swap=False, size_average=None, reduce=None, reduction='mean')
def test_pixel_shuffle(self):
inp = torch.randn(16, 8, 64, 64, device='cuda', dtype=self.dtype)
output = torch.nn.functional.pixel_shuffle(inp, 2)
def test_pad(self):
inp = torch.randn(16, 8, 64, 64, device='cuda', dtype=self.dtype)
pad = (3, 3)
output = F.pad(inp, pad, mode='constant', value=0)
def test_interpolate(self):
inp = torch.randn(16, 8, 64, 64, device='cuda', dtype=self.dtype)
output = F.interpolate(inp, size=None, scale_factor=2, mode='nearest', align_corners=None)
def test_grid_sample(self):
inp = torch.randn(16, 8, 64, 64, device='cuda', dtype=self.dtype)
grid = torch.randn(16, 32, 32, 2, device='cuda', dtype=self.dtype)
output = F.grid_sample(inp, grid, mode='bilinear', padding_mode='zeros')
def test_affine_grid(self):
theta = torch.randn(32, 2, 3, device='cuda', dtype=self.dtype)
size = (32, 8, 32, 32)
output = F.affine_grid(theta, size)
def run_tests(precision):
dummy = TestPyProfNvtx('test_affine_grid', None)
test_cases = list(filter(lambda x: 'test_' in x, map(lambda x: x[0], inspect.getmembers(dummy, predicate=inspect.ismethod))))
print("Running tests for {}".format(precision))
suite = unittest.TestSuite()
for test_case in test_cases:
suite.addTest(TestPyProfNvtx(test_case, precision))
unittest.TextTestRunner().run(suite)
if __name__ == '__main__':
run_tests(torch.float32)
run_tests(torch.float16)
import unittest
import sys
test_dirs = ["run_amp", "run_fp16util", "run_fused_layer_norm", "run_optimizers"]
test_dirs = ["run_amp", "run_fp16util", "run_optimizers", "run_fused_layer_norm", "run_pyprof_nvtx"]
runner = unittest.TextTestRunner(verbosity=2)