Commit 843cdbe0 authored by Michael Carilli
Browse files

Merging in master

parents 724672d7 28097c99
import unittest
import functools as ft
import itertools as it
from apex import amp
import torch
from torch import nn
import torch.nn.functional as F
from utils import common_init, HALF, FLOAT,\
ALWAYS_HALF, ALWAYS_FLOAT, MATCH_INPUT
try:
import amp_C
from amp_C import multi_tensor_l2norm
from apex.multi_tensor_apply import MultiTensorApply
disabled = False
except ImportError as err:
print("amp_C fused kernels unavailable, disabling TestMultiTensorApply. ImportError was ", err)
disabled = True
class TestMultiTensorL2Norm(unittest.TestCase):
    """Checks the fused amp_C.multi_tensor_l2norm kernel against torch's norm().

    All tests are skipped when the amp_C extension is not built (the
    module-level `disabled` flag is set by the import guard above).
    """

    def setUp(self):
        common_init(self)
        # Fill value for every generated tensor; overflow_buf is the kernel's
        # shared inf/nan flag (must stay 0 for well-formed inputs).
        self.val = 4.0
        self.overflow_buf = torch.cuda.IntTensor(1).zero_()

    def tearDown(self):
        pass

    # The tensor creation here is written for convenience, not speed.
    def l2norm(self, sizea, sizeb, applier, repeat_tensors, in_type):
        """Run the fused L2 norm over a list of tensors and compare to a reference.

        sizea, sizeb: element counts of the two alternating tensors.
        applier: a MultiTensorApply instance (chunk size varies per test).
        repeat_tensors: how many (a, b) clone pairs go into the input list.
        in_type: dtype the inputs are cast to before the fused call.
        """
        self.overflow_buf.zero_()
        a = torch.cuda.FloatTensor(sizea).fill_(self.val)
        b = torch.cuda.FloatTensor(sizeb).fill_(self.val)

        in_list = []
        for i in range(repeat_tensors):
            in_list += [a.clone().to(in_type), b.clone().to(in_type)]

        norm = applier(multi_tensor_l2norm, self.overflow_buf, [in_list])

        # A single flat tensor with the same total element count and fill value
        # must have the same L2 norm as the fused result over the whole list.
        reference = torch.cuda.FloatTensor((sizea + sizeb)*repeat_tensors).fill_(self.val).norm()

        self.assertTrue(torch.allclose(norm, reference))
        self.assertTrue(self.overflow_buf.item() == 0)

    @unittest.skipIf(disabled, "amp_C is unavailable")
    def test_fuzz(self):
        # Sizes deliberately straddle the applier chunk boundaries (2048*32 +/- 1).
        input_size_pairs = (
            (7777*77, 555*555),
            (777, 555),
            (555, 2048*32+1),
            (2048*32+1, 555),
            (555, 2048*32),
            (2048*32, 555),
            (33333, 555),
            (555, 33333))
        appliers = (
            MultiTensorApply(2048*32),
            MultiTensorApply(333),
            MultiTensorApply(33333))
        repeat_tensors = (
            1,
            55)

        for sizea, sizeb in input_size_pairs:
            for applier in appliers:
                for repeat in repeat_tensors:
                    for in_type in (torch.float32, torch.float16):
                        self.l2norm(sizea, sizeb, applier, repeat, in_type, )
if __name__ == '__main__':
unittest.main()
......@@ -24,12 +24,11 @@ except ImportError as err:
class TestMultiTensorScale(unittest.TestCase):
def setUp(self):
common_init(self)
self.scale = 4.0
self.overflow_buf = torch.cuda.IntTensor(1).zero_()
self.ref = torch.cuda.FloatTensor([1.0])
common_init(self)
    def tearDown(self):
        # Nothing to clean up; CUDA tensors are released by normal GC.
        pass
......
import unittest
import functools as ft
import itertools as it
from apex import amp
from apex.amp import _amp_state
import torch
from torch import nn
import torch.nn.functional as F
from torch.nn import Parameter
from utils import common_init, HALF, FLOAT,\
ALWAYS_HALF, ALWAYS_FLOAT, MATCH_INPUT
class MyModel(torch.nn.Module):
    """Minimal two-parameter module for exercising mixed fp32/fp16 handling.

    ``weight0`` is fp32 and ``weight1`` is fp16; ``unique`` offsets the initial
    values so distinct instances produce distinguishable gradients in the tests.
    """

    def __init__(self, unique):
        super(MyModel, self).__init__()
        self.weight0 = Parameter(unique +
                                 torch.arange(2, device='cuda', dtype=torch.float32))
        self.weight1 = Parameter(1. + unique + torch.arange(2, device='cuda', dtype=torch.float16))

    @staticmethod
    def ops(input, weight0, weight1):
        """Elementwise product of input with both weights (promoted to fp32), reduced to a scalar."""
        w0 = weight0.float()
        w1 = weight1.float()
        product = input * w0 * w1
        return product.sum()

    def forward(self, input):
        """Apply ops() to this instance's own parameters."""
        return self.ops(input, self.weight0, self.weight1)
# Abandon all hope, ye who enter here.
# This is hands down the ugliest code I have ever written, but it succeeds in testing
# multiple models/optimizers/losses fairly thoroughly. Many of the different test cases
# require slightly divergent code in a way that seems near-impossible to genericize into a simple
# cross product or nested loops.
class TestMultipleModelsOptimizersLosses(unittest.TestCase):
    def setUp(self):
        """Shared fake input batch; every loss in these tests is a function of self.x."""
        self.x = torch.ones((2), device='cuda', dtype=torch.float32)
        common_init(self)
    def tearDown(self):
        # No per-test cleanup: amp handles are deactivated inside the tests themselves.
        pass
    def test_2models2losses1optimizer(self):
        """Two models, two losses, one optimizer holding both models' params.

        First runs a pure-fp32 reference for 2 iterations, recording grads and
        final params.  Then, across the cross product of opt levels / zeroing
        styles / loss-scaler counts / inf injections, re-runs under amp and
        checks that non-skipped iterations reproduce the reference exactly.
        """
        model0 = MyModel(1)
        model1 = MyModel(2)

        optimizer = torch.optim.SGD([{'params' : model0.parameters(), 'lr' : 0.25},
                                     {'params' : model1.parameters(), 'lr' : 0.5}],
                                    momentum=0.125)

        # Reference run (no amp, no skipping).
        reference_grads = []
        for i in range(2):
            optimizer.zero_grad()
            loss0 = model0(self.x)
            loss1 = model1(self.x)
            loss0.backward()
            loss1.backward()

            reference_grads.append([param.grad.data.clone() for param in model0.parameters()] +
                                   [param.grad.data.clone() for param in model1.parameters()])

            optimizer.step()

        final_params = [param.data.clone() for param in model0.parameters()] + \
                       [param.data.clone() for param in model1.parameters()]

        for opt_level in ("O0", "O1", "O2", "O3"):
            for how_to_zero in ("none", "model", "optimizer"):
                for use_multiple_loss_scalers in (True, False):
                    # Only O1/O2 use dynamic loss scaling, so only they can skip a step.
                    if opt_level == "O1" or opt_level == "O2":
                        inject_inf_iters = (-1, 0, 1)
                    else:
                        inject_inf_iters = (-1,)

                    for inject_inf in inject_inf_iters:
                        if inject_inf >= 0:
                            inject_inf_locs = ("fp16", "fp32")
                            which_backwards = (0, 1)
                        else:
                            inject_inf_locs = ("fdsa",)
                            which_backwards = (None,)

                        for inject_inf_loc in inject_inf_locs:
                            for which_backward in which_backwards:
                                if use_multiple_loss_scalers:
                                    num_losses = 2
                                    loss_ids = [0, 1]
                                else:
                                    num_losses = 1
                                    loss_ids = [0, 0]

                                # One extra iteration when a step will be skipped.
                                if inject_inf >= 0:
                                    iters = 3
                                else:
                                    iters = 2

                                model0 = MyModel(1)
                                model1 = MyModel(2)

                                models = [model0, model1]

                                optimizer = torch.optim.SGD([{'params' : model0.parameters(), 'lr' : 0.25},
                                                             {'params' : model1.parameters(), 'lr' : 0.5}],
                                                            momentum=0.125)

                                # MyModel deliberately mixes fp16/fp32 params; tell amp not to complain.
                                _amp_state.allow_incoming_model_not_fp32 = True
                                [model0, model1], optimizer = amp.initialize(
                                    [model0, model1],
                                    optimizer,
                                    opt_level=opt_level,
                                    verbosity=0,
                                    cast_model_type=False,
                                    num_losses=num_losses)
                                _amp_state.allow_incoming_model_not_fp32 = False

                                # Pin loss scales so grad values are deterministic.
                                _amp_state.loss_scalers[0]._loss_scale = 4.0
                                if use_multiple_loss_scalers:
                                    _amp_state.loss_scalers[1]._loss_scale = 16.0

                                unskipped = 0
                                for i in range(iters):
                                    if how_to_zero == "none":
                                        for model in models:
                                            for param in model.parameters():
                                                param.grad = None
                                    elif how_to_zero == "model":
                                        for model in models:
                                            model.zero_grad()
                                    else:
                                        optimizer.zero_grad()

                                    loss0 = model0(self.x)
                                    loss1 = model1(self.x)

                                    with amp.scale_loss(loss0, optimizer, loss_id=loss_ids[0]) as scaled_loss:
                                        scaled_loss.backward()
                                        # Poison one grad while still inside the scale_loss
                                        # context so amp's unscale/inf check sees it.
                                        if i == inject_inf and which_backward == 0:
                                            if inject_inf_loc == "fp32":
                                                model0.weight0.grad[0] = float('inf')
                                            elif inject_inf_loc == "fp16":
                                                model0.weight1.grad[0] = float('inf')
                                    with amp.scale_loss(loss1, optimizer, loss_id=loss_ids[1]) as scaled_loss:
                                        scaled_loss.backward()
                                        if i == inject_inf and which_backward == 1:
                                            if inject_inf_loc == "fp32":
                                                model1.weight0.grad[0] = float('inf')
                                            elif inject_inf_loc == "fp16":
                                                model1.weight1.grad[0] = float('inf')

                                    # Non-poisoned iterations must match the fp32 reference.
                                    if i != inject_inf:
                                        for param, reference_grad in zip(amp.master_params(optimizer),
                                                                         reference_grads[unskipped]):
                                            self.assertTrue(torch.allclose(param.grad.float(), reference_grad.float()))
                                        unskipped += 1

                                    optimizer.step()

                                model_params = [p for p in model0.parameters()] + [p for p in model1.parameters()]
                                for model, master, reference in zip(
                                        model_params,
                                        amp.master_params(optimizer),
                                        final_params):
                                    self.assertTrue(torch.allclose(model, reference))
                                    self.assertTrue(torch.allclose(model, master.to(model.dtype)))

                                if opt_level == "O1":
                                    _amp_state.handle._deactivate()
    def test_3models2losses1optimizer(self):
        """Three models, two losses (model2 feeds both), one optimizer.

        Same structure as test_2models2losses1optimizer, but when an inf is
        injected we also vary *which* of the models contributing to the
        poisoned loss receives the bad gradient.
        """
        model0 = MyModel(1)
        model1 = MyModel(2)
        model2 = MyModel(3)

        optimizer = torch.optim.SGD([{'params' : model0.parameters(), 'lr' : 0.25},
                                     {'params' : model1.parameters(), 'lr' : 0.5},
                                     {'params' : model2.parameters(), 'lr' : 0.125}],
                                    momentum=0.125)

        # Pure-fp32 reference run.
        reference_grads = []
        for i in range(2):
            optimizer.zero_grad()
            loss0 = model0(self.x) + model2(self.x)
            loss1 = model1(self.x) + model2(self.x)
            loss0.backward()
            loss1.backward()

            reference_grads.append([param.grad.data.clone() for param in model0.parameters()] +
                                   [param.grad.data.clone() for param in model1.parameters()] +
                                   [param.grad.data.clone() for param in model2.parameters()])

            optimizer.step()

        final_params = [param.data.clone() for param in model0.parameters()] + \
                       [param.data.clone() for param in model1.parameters()] + \
                       [param.data.clone() for param in model2.parameters()]

        for opt_level in ("O0", "O1", "O2", "O3"):
            for how_to_zero in ("none", "model", "optimizer"):
                for use_multiple_loss_scalers in (True, False):
                    # Only O1/O2 use dynamic loss scaling, so only they can skip a step.
                    if opt_level == "O1" or opt_level == "O2":
                        inject_inf_iters = (-1, 0, 1)
                    else:
                        inject_inf_iters = (-1,)

                    for inject_inf in inject_inf_iters:
                        if inject_inf >= 0:
                            inject_inf_locs = ("fp16", "fp32")
                            which_backwards = (0, 1)
                        else:
                            inject_inf_locs = ("fdsa",)
                            which_backwards = (None,)

                        for inject_inf_loc in inject_inf_locs:
                            for which_backward in which_backwards:
                                if use_multiple_loss_scalers:
                                    num_losses = 2
                                    loss_ids = [0, 1]
                                else:
                                    num_losses = 1
                                    loss_ids = [0, 0]

                                if inject_inf >= 0:
                                    iters = 3
                                    # model2 contributes to both losses, so it is a valid
                                    # injection target for either backward.
                                    if which_backward == 0:
                                        which_models = (0, 2)
                                    elif which_backward == 1:
                                        which_models = (1, 2)
                                else:
                                    iters = 2
                                    which_models = (None,)

                                for which_model in which_models:
                                    model0 = MyModel(1)
                                    model1 = MyModel(2)
                                    model2 = MyModel(3)

                                    models = [model0, model1, model2]

                                    optimizer = torch.optim.SGD([{'params' : model0.parameters(), 'lr' : 0.25},
                                                                 {'params' : model1.parameters(), 'lr' : 0.5},
                                                                 {'params' : model2.parameters(), 'lr' : 0.125}],
                                                                momentum=0.125)

                                    _amp_state.allow_incoming_model_not_fp32 = True
                                    [model0, model1, model2], optimizer = amp.initialize(
                                        [model0, model1, model2],
                                        optimizer,
                                        opt_level=opt_level,
                                        verbosity=0,
                                        cast_model_type=False,
                                        num_losses=num_losses)
                                    _amp_state.allow_incoming_model_not_fp32 = False

                                    # Pin loss scales so grad values are deterministic.
                                    _amp_state.loss_scalers[0]._loss_scale = 4.0
                                    if use_multiple_loss_scalers:
                                        _amp_state.loss_scalers[1]._loss_scale = 16.0

                                    unskipped = 0
                                    for i in range(iters):
                                        if how_to_zero == "none":
                                            for model in models:
                                                for param in model.parameters():
                                                    param.grad = None
                                        elif how_to_zero == "model":
                                            for model in models:
                                                model.zero_grad()
                                        else:
                                            optimizer.zero_grad()

                                        # print("opt_level {} i {} inject_inf {} which_backward {} inject_inf_loc {} which_model {} use_multiple_loss_scalers {}".format(opt_level, i, inject_inf, which_backward, inject_inf_loc, which_model, use_multiple_loss_scalers))

                                        loss0 = model0(self.x) + model2(self.x)
                                        loss1 = model1(self.x) + model2(self.x)

                                        with amp.scale_loss(loss0, optimizer, loss_id=loss_ids[0]) as scaled_loss:
                                            scaled_loss.backward()
                                            if i == inject_inf and which_backward == 0:
                                                if which_model == 0:
                                                    inj_model = model0
                                                elif which_model == 2:
                                                    inj_model = model2
                                                else:
                                                    # NOTE(review): which_model is an int here, so this
                                                    # concatenation would itself raise TypeError if reached.
                                                    raise RuntimeError(which_model + " invalid for loss 0")
                                                if inject_inf_loc == "fp32":
                                                    inj_model.weight0.grad[0] = float('inf')
                                                elif inject_inf_loc == "fp16":
                                                    inj_model.weight1.grad[0] = float('inf')
                                        with amp.scale_loss(loss1, optimizer, loss_id=loss_ids[1]) as scaled_loss:
                                            scaled_loss.backward()
                                            if i == inject_inf and which_backward == 1:
                                                if which_model == 1:
                                                    inj_model = model1
                                                elif which_model == 2:
                                                    inj_model = model2
                                                else:
                                                    raise RuntimeError(which_model + " invalid for loss 1 ")
                                                if inject_inf_loc == "fp32":
                                                    inj_model.weight0.grad[0] = float('inf')
                                                elif inject_inf_loc == "fp16":
                                                    inj_model.weight1.grad[0] = float('inf')

                                        # Non-poisoned iterations must match the fp32 reference.
                                        if i != inject_inf:
                                            for param, reference_grad in zip(amp.master_params(optimizer),
                                                                             reference_grads[unskipped]):
                                                self.assertTrue(torch.allclose(param.grad.float(), reference_grad.float()))
                                            unskipped += 1

                                        optimizer.step()

                                    model_params = [p for p in model0.parameters()] + \
                                                   [p for p in model1.parameters()] + \
                                                   [p for p in model2.parameters()]
                                    for model, master, reference in zip(
                                            model_params,
                                            amp.master_params(optimizer),
                                            final_params):
                                        self.assertTrue(torch.allclose(model, reference))
                                        self.assertTrue(torch.allclose(model, master.to(model.dtype)))

                                    if opt_level == "O1":
                                        _amp_state.handle._deactivate()
    def test_2models2losses2optimizers(self):
        """Two models, two losses, two optimizers (one per model).

        A skipped step now affects only the optimizer whose loss overflowed,
        so the reference runs precompute grads/params for every
        (skipped iteration, skipped backward) combination, indexed through
        what_got_skipped().
        """
        model0 = MyModel(1)
        model1 = MyModel(2)

        optimizer0 = torch.optim.SGD([{'params' : model0.parameters(), 'lr' : 0.25}],
                                     momentum=0.125)
        optimizer1 = torch.optim.SGD([{'params' : model1.parameters(), 'lr' : 0.5}],
                                     momentum=0.25)

        # Don't do it like this:  reference_grads = [[]]*5
        # because then it creates a list of 5 references to the same "[]" and appending
        # to any of them effectively makes you append to all of them, which multiplies
        # the resulting size of reference_grads by 5x and needless to say makes the test fail.
        reference_grads = [[], [], [], [], []]
        final_params = [None, None, None, None, None]

        # Baseline reference (nothing skipped) goes in slot 0.
        for i in range(2):
            optimizer0.zero_grad()
            optimizer1.zero_grad()
            loss0 = model0(self.x)
            loss1 = model1(self.x)
            loss0.backward()
            loss1.backward()

            reference_grads[0].append([param.grad.data.clone() for param in model0.parameters()] +
                                      [param.grad.data.clone() for param in model1.parameters()])

            optimizer0.step()
            optimizer1.step()

        final_params[0] = [param.data.clone() for param in model0.parameters()] + \
                          [param.data.clone() for param in model1.parameters()]

        def what_got_skipped(which_iter, which_backward):
            # Maps (skipped iteration, skipped backward) to a reference slot; 0 = no skip.
            if which_iter == 0 and which_backward == 0:
                return 1
            if which_iter == 0 and which_backward == 1:
                return 2
            if which_iter == 1 and which_backward == 0:
                return 3
            if which_iter == 1 and which_backward == 1:
                return 4
            return 0

        # References for each skip scenario: 3 iterations, holding back the
        # optimizer whose backward is "skipped" on the chosen iteration.
        for which_iter in (0,1):
            for which_backward in (0,1):
                model0 = MyModel(1)
                model1 = MyModel(2)

                optimizer0 = torch.optim.SGD([{'params' : model0.parameters(), 'lr' : 0.25}],
                                             momentum=0.125)
                optimizer1 = torch.optim.SGD([{'params' : model1.parameters(), 'lr' : 0.5}],
                                             momentum=0.25)

                for i in range(3):
                    optimizer0.zero_grad()
                    optimizer1.zero_grad()
                    loss0 = model0(self.x)
                    loss1 = model1(self.x)
                    loss0.backward()
                    loss1.backward()

                    if i != which_iter:
                        reference_grads[what_got_skipped(which_iter, which_backward)].append(
                            [param.grad.data.clone() for param in model0.parameters()] +
                            [param.grad.data.clone() for param in model1.parameters()])

                    if i == which_iter:
                        # Step only the optimizer that did NOT overflow.
                        if which_backward == 0:
                            optimizer1.step()
                        else:
                            optimizer0.step()
                    else:
                        optimizer0.step()
                        optimizer1.step()

                final_params[what_got_skipped(which_iter, which_backward)] = \
                    [param.data.clone() for param in model0.parameters()] + \
                    [param.data.clone() for param in model1.parameters()]

        for opt_level in ("O0", "O1", "O2", "O3"):
            for how_to_zero in ("none", "model", "optimizer"):
                for use_multiple_loss_scalers in (True, False):
                    # Only O1/O2 use dynamic loss scaling, so only they can skip a step.
                    if opt_level == "O1" or opt_level == "O2":
                        inject_inf_iters = (-1, 0, 1)
                    else:
                        inject_inf_iters = (-1,)

                    for inject_inf in inject_inf_iters:
                        if inject_inf >= 0:
                            inject_inf_locs = ("fp16", "fp32")
                            which_backwards = (0, 1)
                        else:
                            inject_inf_locs = ("fdsa",)
                            which_backwards = (None,)

                        for inject_inf_loc in inject_inf_locs:
                            for which_backward in which_backwards:
                                if use_multiple_loss_scalers:
                                    num_losses = 2
                                    loss_ids = [0, 1]
                                else:
                                    num_losses = 1
                                    loss_ids = [0, 0]

                                if inject_inf >= 0:
                                    iters = 3
                                else:
                                    iters = 2

                                model0 = MyModel(1)
                                model1 = MyModel(2)

                                models = [model0, model1]

                                optimizer0 = torch.optim.SGD([{'params' : model0.parameters(), 'lr' : 0.25}],
                                                             momentum=0.125)
                                optimizer1 = torch.optim.SGD([{'params' : model1.parameters(), 'lr' : 0.5}],
                                                             momentum=0.25)

                                _amp_state.allow_incoming_model_not_fp32 = True
                                [model0, model1], [optimizer0, optimizer1] = amp.initialize(
                                    [model0, model1],
                                    [optimizer0, optimizer1],
                                    opt_level=opt_level,
                                    verbosity=0,
                                    cast_model_type=False,
                                    num_losses=num_losses)
                                _amp_state.allow_incoming_model_not_fp32 = False

                                # Pin loss scales so grad values are deterministic.
                                _amp_state.loss_scalers[0]._loss_scale = 4.0
                                if use_multiple_loss_scalers:
                                    _amp_state.loss_scalers[1]._loss_scale = 16.0

                                unskipped = 0
                                for i in range(iters):
                                    if how_to_zero == "none":
                                        for model in models:
                                            for param in model.parameters():
                                                param.grad = None
                                    elif how_to_zero == "model":
                                        for model in models:
                                            model.zero_grad()
                                    else:
                                        optimizer0.zero_grad()
                                        optimizer1.zero_grad()

                                    loss0 = model0(self.x)
                                    loss1 = model1(self.x)

                                    with amp.scale_loss(loss0, optimizer0, loss_id=loss_ids[0]) as scaled_loss:
                                        scaled_loss.backward()
                                        if i == inject_inf and which_backward == 0:
                                            if inject_inf_loc == "fp32":
                                                model0.weight0.grad[0] = float('inf')
                                            elif inject_inf_loc == "fp16":
                                                model0.weight1.grad[0] = float('inf')
                                    with amp.scale_loss(loss1, optimizer1, loss_id=loss_ids[1]) as scaled_loss:
                                        scaled_loss.backward()
                                        if i == inject_inf and which_backward == 1:
                                            if inject_inf_loc == "fp32":
                                                model1.weight0.grad[0] = float('inf')
                                            elif inject_inf_loc == "fp16":
                                                model1.weight1.grad[0] = float('inf')

                                    # print("opt_level {} i {} inject_inf {} which_backward {} inject_inf_loc {} use_multiple_loss_scalers {}".format(opt_level, i, inject_inf, which_backward, inject_inf_loc, use_multiple_loss_scalers))

                                    # Non-poisoned iterations must match the matching skip-scenario reference.
                                    if i != inject_inf:
                                        master_params = list(amp.master_params(optimizer0)) + \
                                                        list(amp.master_params(optimizer1))
                                        for param, reference_grad in zip(master_params,
                                                reference_grads[what_got_skipped(inject_inf, which_backward)][unskipped]):
                                            self.assertTrue(torch.allclose(param.grad.float(), reference_grad.float()))
                                        unskipped += 1

                                    optimizer0.step()
                                    optimizer1.step()

                                model_params = [p for p in model0.parameters()] + [p for p in model1.parameters()]
                                master_params = [p for p in amp.master_params(optimizer0)] + \
                                                [p for p in amp.master_params(optimizer1)]
                                for model, master, reference in zip(
                                        model_params,
                                        master_params,
                                        final_params[what_got_skipped(inject_inf, which_backward)]):
                                    self.assertTrue(torch.allclose(model, reference))
                                    self.assertTrue(torch.allclose(model, master.to(model.dtype)))

                                if opt_level == "O1":
                                    _amp_state.handle._deactivate()
    def test_3models2losses2optimizers(self):
        """Three models, two losses, two optimizers.

        optimizer0 owns model0 and model1; optimizer1 owns model2.
        loss0 = model0 + model1, loss1 = model2 + model1.  loss1's backward is
        scaled against BOTH optimizers, so an overflow there skips both steps,
        while an overflow in loss0 skips only optimizer0.
        """
        model0 = MyModel(1)
        model1 = MyModel(2)
        model2 = MyModel(3)

        optimizer0 = torch.optim.SGD([{'params' : model0.parameters(), 'lr' : 0.25},
                                      {'params' : model1.parameters(), 'lr' : 1.0}],
                                     momentum=0.5)
        optimizer1 = torch.optim.SGD([{'params' : model2.parameters(), 'lr' : 0.5}],
                                     momentum=0.25)

        # Again, can't do this:  reference_grads = [[]]*9
        reference_grads = [[], [], [], [], [], [], [], [], []]
        final_params = [None, None, None, None, None, None, None, None, None]

        # Baseline reference (nothing skipped) goes in slot 0.
        for i in range(2):
            optimizer0.zero_grad()
            optimizer1.zero_grad()
            loss0 = model0(self.x) + model1(self.x)
            loss1 = model2(self.x) + model1(self.x)
            loss0.backward()
            loss1.backward()

            # NOTE(review): model2's grads are not recorded here, so the later
            # zip against the concatenated master params stops after model1's
            # params and model2's grads are never compared — confirm intended.
            reference_grads[0].append([param.grad.data.clone() for param in model0.parameters()] +
                                      [param.grad.data.clone() for param in model1.parameters()])

            optimizer0.step()
            optimizer1.step()

        final_params[0] = \
            [param.data.clone() for param in model0.parameters()] + \
            [param.data.clone() for param in model1.parameters()] + \
            [param.data.clone() for param in model2.parameters()]

        def what_got_skipped(which_iter, which_backward, which_model):
            # Maps (skipped iter, overflowed loss, poisoned model) to a
            # reference slot; 0 = nothing skipped.
            if which_iter == 0:
                if which_backward == 0:
                    if which_model == 0:
                        return 1
                    if which_model == 1:
                        return 2
                if which_backward == 1:
                    if which_model == 2:
                        return 3
                    if which_model == 1:
                        return 4
            if which_iter == 1:
                if which_backward == 0:
                    if which_model == 0:
                        return 5
                    if which_model == 1:
                        return 6
                if which_backward == 1:
                    if which_model == 2:
                        return 7
                    if which_model == 1:
                        return 8
            return 0

        for which_iter in (0,1):
            for which_backward in (0,1):
                if which_backward == 0:
                    which_models = (0,1)
                if which_backward == 1:
                    which_models = (2,1)
                for which_model in which_models:
                    model0 = MyModel(1)
                    model1 = MyModel(2)
                    model2 = MyModel(3)

                    optimizer0 = torch.optim.SGD([{'params' : model0.parameters(), 'lr' : 0.25},
                                                  {'params' : model1.parameters(), 'lr' : 1.0}],
                                                 momentum=0.5)
                    optimizer1 = torch.optim.SGD([{'params' : model2.parameters(), 'lr' : 0.5}],
                                                 momentum=0.25)

                    for i in range(3):
                        optimizer0.zero_grad()
                        optimizer1.zero_grad()
                        loss0 = model0(self.x) + model1(self.x)
                        loss1 = model2(self.x) + model1(self.x)
                        loss0.backward()
                        loss1.backward()

                        if i != which_iter:
                            reference_grads[what_got_skipped(which_iter,
                                which_backward, which_model)].append(
                                [param.grad.data.clone() for param in model0.parameters()] +
                                [param.grad.data.clone() for param in model1.parameters()])

                        if i == which_iter:
                            if which_backward == 0:
                                # An overflow in loss0 skips only optimizer0;
                                # optimizer1 still steps (same for either model).
                                # if which_model == 0:
                                optimizer1.step()
                                # if which_model == 1:
                                #     optimizer1.step()
                            if which_backward == 1:
                                # loss1 is scaled against both optimizers, so an
                                # overflow there skips both steps.
                                # if which_model == 2:
                                #     optimizer0.step()
                                # if which_model == 1:
                                continue
                        else:
                            optimizer0.step()
                            optimizer1.step()

                    final_params[what_got_skipped(which_iter, which_backward, which_model)] = \
                        [param.data.clone() for param in model0.parameters()] + \
                        [param.data.clone() for param in model1.parameters()] + \
                        [param.data.clone() for param in model2.parameters()]

        for opt_level in ("O0", "O1", "O2", "O3"):
            for how_to_zero in ("none", "model", "optimizer"):
                for use_multiple_loss_scalers in (True, False):
                    # Only O1/O2 use dynamic loss scaling, so only they can skip a step.
                    if opt_level == "O1" or opt_level == "O2":
                        inject_inf_iters = (-1, 0, 1)
                    else:
                        inject_inf_iters = (-1,)

                    for inject_inf in inject_inf_iters:
                        if inject_inf >= 0:
                            inject_inf_locs = ("fp16", "fp32")
                            which_backwards = (0, 1)
                        else:
                            inject_inf_locs = ("fdsa",)
                            which_backwards = (None,)

                        for inject_inf_loc in inject_inf_locs:
                            for which_backward in which_backwards:
                                if use_multiple_loss_scalers:
                                    num_losses = 2
                                    loss_ids = [0, 1]
                                else:
                                    num_losses = 1
                                    loss_ids = [0, 0]

                                if inject_inf >= 0:
                                    iters = 3
                                    # model1 contributes to both losses.
                                    if which_backward == 0:
                                        which_models = (0, 1)
                                    elif which_backward == 1:
                                        which_models = (2, 1)
                                else:
                                    iters = 2
                                    which_models = (None,)

                                for which_model in which_models:
                                    model0 = MyModel(1)
                                    model1 = MyModel(2)
                                    model2 = MyModel(3)

                                    models = [model0, model1, model2]

                                    optimizer0 = torch.optim.SGD([{'params' : model0.parameters(), 'lr' : 0.25},
                                                                  {'params' : model1.parameters(), 'lr' : 1.0}],
                                                                 momentum=0.5)
                                    optimizer1 = torch.optim.SGD([{'params' : model2.parameters(), 'lr' : 0.5}],
                                                                 momentum=0.25)

                                    _amp_state.allow_incoming_model_not_fp32 = True
                                    [model0, model1, model2], [optimizer0, optimizer1] = amp.initialize(
                                        [model0, model1, model2],
                                        [optimizer0, optimizer1],
                                        opt_level=opt_level,
                                        verbosity=0,
                                        cast_model_type=False,
                                        num_losses=num_losses)
                                    _amp_state.allow_incoming_model_not_fp32 = False

                                    # Pin loss scales so grad values are deterministic.
                                    _amp_state.loss_scalers[0]._loss_scale = 4.0
                                    if use_multiple_loss_scalers:
                                        _amp_state.loss_scalers[1]._loss_scale = 16.0

                                    unskipped = 0
                                    for i in range(iters):
                                        if how_to_zero == "none":
                                            for model in models:
                                                for param in model.parameters():
                                                    param.grad = None
                                        elif how_to_zero == "model":
                                            for model in models:
                                                model.zero_grad()
                                        else:
                                            optimizer0.zero_grad()
                                            optimizer1.zero_grad()

                                        loss0 = model0(self.x) + model1(self.x)
                                        loss1 = model2(self.x) + model1(self.x)

                                        with amp.scale_loss(loss0, optimizer0, loss_id=loss_ids[0]) as scaled_loss:
                                            scaled_loss.backward()
                                            if i == inject_inf and which_backward == 0:
                                                if which_model == 0:
                                                    inj_model = model0
                                                elif which_model == 1:
                                                    inj_model = model1
                                                else:
                                                    # NOTE(review): which_model is an int here, so this
                                                    # concatenation would itself raise TypeError if reached.
                                                    raise RuntimeError(which_model + " invalid for loss 0")
                                                if inject_inf_loc == "fp32":
                                                    inj_model.weight0.grad[0] = float('inf')
                                                elif inject_inf_loc == "fp16":
                                                    inj_model.weight1.grad[0] = float('inf')
                                        # loss1 touches params owned by both optimizers, so both are passed.
                                        with amp.scale_loss(loss1, [optimizer0, optimizer1], loss_id=loss_ids[1]) as scaled_loss:
                                            scaled_loss.backward()
                                            if i == inject_inf and which_backward == 1:
                                                if which_model == 2:
                                                    inj_model = model2
                                                elif which_model == 1:
                                                    inj_model = model1
                                                else:
                                                    raise RuntimeError(which_model + " invalid for loss 1 ")
                                                if inject_inf_loc == "fp32":
                                                    inj_model.weight0.grad[0] = float('inf')
                                                elif inject_inf_loc == "fp16":
                                                    inj_model.weight1.grad[0] = float('inf')

                                        # Non-poisoned iterations must match the matching skip-scenario reference.
                                        if i != inject_inf:
                                            master_params = list(amp.master_params(optimizer0)) + \
                                                            list(amp.master_params(optimizer1))
                                            for param, reference_grad in zip(master_params,
                                                    reference_grads[what_got_skipped(inject_inf,
                                                    which_backward, which_model)][unskipped]):
                                                self.assertTrue(torch.allclose(param.grad.float(), reference_grad.float()))
                                            unskipped += 1

                                        optimizer0.step()
                                        optimizer1.step()

                                    model_params = [p for p in model0.parameters()] + \
                                                   [p for p in model1.parameters()] + \
                                                   [p for p in model2.parameters()]
                                    master_params = [p for p in amp.master_params(optimizer0)] + \
                                                    [p for p in amp.master_params(optimizer1)]

                                    # print("opt_level {} i {} inject_inf {} which_backward {} inject_inf_loc {} use_multiple_loss_scalers {} which_model {}".format(opt_level, i, inject_inf, which_backward, inject_inf_loc, use_multiple_loss_scalers, which_model))

                                    for model, master, reference in zip(
                                            model_params,
                                            master_params,
                                            final_params[what_got_skipped(inject_inf, which_backward, which_model)]):
                                        self.assertTrue(torch.allclose(model, reference))
                                        self.assertTrue(torch.allclose(model, master.to(model.dtype)))

                                    if opt_level == "O1":
                                        _amp_state.handle._deactivate()
if __name__ == '__main__':
unittest.main()
import unittest
import functools as ft
import itertools as it
from apex import amp
import torch
from torch import nn
import torch.nn.functional as F
from utils import common_init, HALF, FLOAT,\
ALWAYS_HALF, ALWAYS_FLOAT, MATCH_INPUT
try:
import amp_C
scale_check_overflow = amp_C.scale_check_overflow
disabled = False
except ImportError as err:
print("amp_C fused kernel unavailable, disabling TestScale. ImportError was ", err)
disabled = True
class TestScale(unittest.TestCase):
    """Tests for the fused amp_C.scale_check_overflow kernel (downscale + inf/nan detection)."""

    def setUp(self):
        self.scale = 128.0
        self.nx = 999
        self.ny = 888
        self.overflow_buf = torch.cuda.IntTensor([0])
        self.fp16 = torch.ones((self.ny, self.nx), device='cuda', dtype=torch.float16)
        self.fp32 = torch.ones((self.ny, self.nx), device='cuda', dtype=torch.float32)
        # Broadcastable (1, 1) references for the allclose comparisons below.
        self.fp16_ref = torch.ones((1, 1), device='cuda', dtype=torch.float16)
        self.fp32_ref = torch.ones((1, 1), device='cuda', dtype=torch.float32)
        common_init(self)

    def tearDown(self):
        pass

    def downscale_test(self, input, output, ref):
        """Scale input up by self.scale, fused-downscale into output; expect ref, no overflow."""
        self.overflow_buf.zero_()
        input.fill_(1.0)
        if input is not output:
            # Sentinel value so we can tell the kernel actually wrote output.
            output.fill_(3.0)
        input.mul_(self.scale)
        scale_check_overflow(input, 1./self.scale, self.overflow_buf, output)
        self.assertTrue(torch.allclose(output, ref))
        self.assertTrue(self.overflow_buf.item() == 0)

    def find_inf_test(self, input, output, ref, x, y, val):
        """Poison input[x, y] with val (inf/nan) and expect the overflow flag to be set."""
        self.overflow_buf.zero_()
        input.fill_(1.0)
        if input is not output:
            output.fill_(3.0)
        input[x,y] = val
        scale_check_overflow(input, 1./self.scale, self.overflow_buf, output)
        self.assertTrue(self.overflow_buf.item())

    # Currently, the fused kernel gives a hard error if you attempt to downscale
    # into fp16 output, which imo is the desired behavior.  Maybe someday we
    # will learn otherwise.
    # @unittest.skipIf(disabled, "amp_C is unavailable")
    # def test_fp16_to_fp16(self):
    #     self.downscale_test(self.fp16, self.fp16, self.fp16_ref)

    @unittest.skipIf(disabled, "amp_C is unavailable")
    def test_fp16_to_fp32(self):
        self.downscale_test(self.fp16, self.fp32, self.fp32_ref)

    # @unittest.skipIf(disabled, "amp_C is unavailable")
    # def test_fp32_to_fp16(self):
    #     self.downscale_test(self.fp32, self.fp16, self.fp16_ref)

    @unittest.skipIf(disabled, "amp_C is unavailable")
    def test_fp32_to_fp32(self):
        self.downscale_test(self.fp32, self.fp32, self.fp32_ref)

    @unittest.skipIf(disabled, "amp_C is unavailable")
    def test_fp16_to_fp32_find_inf_nan(self):
        # Probe the first, middle, and last element positions.
        self.find_inf_test(self.fp16, self.fp32, self.fp32_ref, 0, 0, float('nan'))
        self.find_inf_test(self.fp16, self.fp32, self.fp32_ref, self.ny//2, self.nx//2, float('inf'))
        self.find_inf_test(self.fp16, self.fp32, self.fp32_ref, self.ny-1, self.nx-1, float('nan'))

    @unittest.skipIf(disabled, "amp_C is unavailable")
    def test_fp32_to_fp32_find_inf_nan(self):
        self.find_inf_test(self.fp32, self.fp32, self.fp32_ref, 0, 0, float('inf'))
        self.find_inf_test(self.fp32, self.fp32, self.fp32_ref, self.ny//2, self.nx//2, float('nan'))
        self.find_inf_test(self.fp32, self.fp32, self.fp32_ref, self.ny-1, self.nx-1, float('inf'))
if __name__ == '__main__':
unittest.main()
import unittest
import os
import random
import torch
import apex
class TestFusedLayerNorm(unittest.TestCase):
    """Checks apex FusedLayerNorm (no affine params) CPU vs CUDA execution."""

    def setUp(self):
        # elementwise_affine=False: no learnable gamma/beta.
        self.module = apex.normalization.FusedLayerNorm(normalized_shape=[32, 64], elementwise_affine=False)
        self.input_ = torch.randn(16, 32, 64)
        torch.cuda.manual_seed(42)

    def forward_cpu(self, input_):
        # NOTE(review): moves self.module to CPU in place as a side effect.
        self.module.cpu()
        return self.module(input_.cpu())

    def forward_cuda(self, input_):
        # NOTE(review): moves self.module to CUDA in place as a side effect.
        self.module.cuda()
        return self.module(input_.cuda())

    def test_forward_cuda(self):
        out_ = self.forward_cuda(self.input_)
        assert out_.is_cuda == True

    def test_forward_cpu(self):
        out_ = self.forward_cpu(self.input_)
        assert out_.is_cuda == False

    def test_same_output(self):
        # CPU and CUDA paths must agree numerically on the same input.
        out_cpu = self.forward_cpu(self.input_)
        out_cuda = self.forward_cuda(self.input_)
        torch.testing.assert_allclose(out_cpu, out_cuda.cpu())
class TestFusedLayerNormElemWise(TestFusedLayerNorm):
    """Same checks as TestFusedLayerNorm, but with learnable elementwise affine params."""

    def setUp(self):
        self.module = apex.normalization.FusedLayerNorm(normalized_shape=[32, 64], elementwise_affine=True)
        self.input_ = torch.randn(16, 32, 64)
        torch.cuda.manual_seed(42)
\ No newline at end of file
import unittest
import sys

# Test suites to discover and run.  The flattened merge left two consecutive
# assignments of test_dirs (the first, without run_fused_layer_norm, was dead
# code immediately overwritten) — only the updated list is kept.
test_dirs = ["run_amp", "run_fp16util", "run_mixed_adam", "run_fused_layer_norm"]

runner = unittest.TextTestRunner(verbosity=2)
......
......@@ -6,6 +6,7 @@ parser.add_argument('--opt-level', type=str)
parser.add_argument('--keep-batchnorm-fp32', type=str, default=None)
parser.add_argument('--loss-scale', type=str, default=None)
parser.add_argument('--fused-adam', action='store_true')
parser.add_argument('--use_baseline', action='store_true')
args = parser.parse_args()
base_file = str(args.opt_level) + "_" +\
......@@ -15,24 +16,49 @@ base_file = str(args.opt_level) + "_" +\
# Compare saved training logs: "with extensions" (file_e) vs "pure python"
# (file_p), and optionally against a stored baseline (file_b).
#
# The flattened merge left the OLD unconditional comparison loop in place
# ahead of its replacement (the `if not args.use_baseline: ... else: ...`
# form) — running both would print every row twice and ignore the flag, so
# only the conditional version is kept.
file_e = "True_" + base_file
file_p = "False_" + base_file
if args.use_baseline:
    file_b = "baselines/True_" + base_file

dict_e = torch.load(file_e)
dict_p = torch.load(file_p)
if args.use_baseline:
    dict_b = torch.load(file_b)

torch.set_printoptions(precision=10)

print(file_e)
print(file_p)
if args.use_baseline:
    print(file_b)

# ugly duplication here...
if not args.use_baseline:
    for n, (i_e, i_p) in enumerate(zip(dict_e["Iteration"], dict_p["Iteration"])):
        assert i_e == i_p, "i_e = {}, i_p = {}".format(i_e, i_p)

        loss_e = dict_e["Loss"][n]
        loss_p = dict_p["Loss"][n]

        assert loss_e == loss_p, "Iteration {}, loss_e = {}, loss_p = {}".format(i_e, loss_e, loss_p)

        print("{:4} {:15.10f} {:15.10f} {:15.10f} {:15.10f}".format(
            i_e,
            loss_e,
            loss_p,
            dict_e["Speed"][n],
            dict_p["Speed"][n]))
else:
    for n, (i_e, i_p) in enumerate(zip(dict_e["Iteration"], dict_p["Iteration"])):
        assert i_e == i_p, "i_e = {}, i_p = {}".format(i_e, i_p)

        loss_e = dict_e["Loss"][n]
        loss_p = dict_p["Loss"][n]
        loss_b = dict_b["Loss"][n]

        assert loss_e == loss_p, "Iteration {}, loss_e = {}, loss_p = {}".format(i_e, loss_e, loss_p)
        assert loss_e == loss_b, "Iteration {}, loss_e = {}, loss_b = {}".format(i_e, loss_e, loss_b)

        print("{:4} {:15.10f} {:15.10f} {:15.10f} {:15.10f} {:15.10f} {:15.10f}".format(
            i_e,
            loss_b,
            loss_e,
            loss_p,
            dict_b["Speed"][n],
            dict_e["Speed"][n],
            dict_p["Speed"][n]))
......@@ -365,6 +365,9 @@ def train(train_loader, model, criterion, optimizer, epoch):
batch_time.update(time.time() - end)
end = time.time()
# If you decide to refactor this test, like examples/imagenet, to sample the loss every
# print_freq iterations, make sure to move this prefetching below the accuracy calculation.
input, target = prefetcher.next()
if i % args.print_freq == 0 and i > 1:
......
......@@ -6,8 +6,15 @@ print_banner() {
print_banner "Distributed status: $1"
# DATADIR="/home/mcarilli/Desktop/pt18data/apex/examples/imagenet/bare_metal_train_val/"
DATADIR="/opt/home/apex/examples/imagenet/"
echo $2
DATADIR=$2
if [ -n "$3" ]
then
USE_BASELINE=""
else
USE_BASELINE="--use_baseline"
fi
if [ "$1" == "single_gpu" ]
then
......@@ -49,7 +56,7 @@ set -e
print_banner "Installing Apex with --cuda_ext and --cpp_ext"

# The flattened diff kept both the superseded `python setup.py install` line
# and its pip replacement; only the pip form should run.
pushd ../../..
pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" .
popd
for opt_level in "${opt_levels[@]}"
......@@ -86,7 +93,7 @@ done
print_banner "Reinstalling apex without extensions"

# Superseded `python setup.py install` line dropped; pip install is canonical.
pushd ../../..
pip install -v --no-cache-dir .
popd
for opt_level in "${opt_levels[@]}"
......@@ -124,7 +131,7 @@ do
fi
echo "${BASE_CMD} --opt-level ${opt_level} ${loss_scale} ${keep_batchnorm} [--has-ext] $DATADIR"
set -x
# Only the updated invocation (with --use_baseline) is kept; the flattened
# diff had also retained the old line without the flag.
python compare.py --opt-level ${opt_level} ${loss_scale} ${keep_batchnorm} --use_baseline
set +x
done
done
......@@ -133,5 +140,5 @@ done
print_banner "Reinstalling Apex with --cuda_ext and --cpp_ext"

# Superseded `python setup.py install` line dropped; pip install is canonical.
pushd ../../..
pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" .
popd
#!/bin/bash

# Single-GPU smoke test driver.  The flattened diff kept the old unparameterized
# `bash run_test.sh single_gpu` line alongside its replacement; only the
# parameterized invocation is kept.
DATADIR="/home/mcarilli/Desktop/pt18data/apex_stale/examples/imagenet/bare_metal_train_val/"
# DATADIR="/opt/home/apex/examples/imagenet/"
cp ../common/* .
bash run_test.sh single_gpu $1 $DATADIR yes
#!/bin/bash

# Distributed smoke test driver.  Only the updated invocation (forwarding $1)
# is kept; the flattened diff had also retained the old argument-less line.
cp ../common/* .
bash run_test.sh distributed $1
"""Two-process apex.amp sanity check for master parameters.

Each rank builds an identical Linear model, initializes it with amp at
opt_level O2, trains for 500 steps on rank-specific random data, and saves
both the model parameters and amp's master parameters to rank-tagged .pth
files.  A companion script (compare.py) then loads the saved files and
verifies they agree across ranks.

Launch under torch.distributed.launch, e.g.:
    python -m torch.distributed.launch --nproc_per_node=2 amp_master_params.py
"""
import torch
import argparse
import os
from apex import amp
# FOR DISTRIBUTED: (can also use torch.nn.parallel.DistributedDataParallel instead)
from apex.parallel import DistributedDataParallel

parser = argparse.ArgumentParser()
# FOR DISTRIBUTED: Parse for the local_rank argument, which will be supplied
# automatically by torch.distributed.launch.
parser.add_argument("--local_rank", default=0, type=int)
args = parser.parse_args()

# FOR DISTRIBUTED: If we are running under torch.distributed.launch,
# the 'WORLD_SIZE' environment variable will also be set automatically.
args.distributed = False
if 'WORLD_SIZE' in os.environ:
    args.distributed = int(os.environ['WORLD_SIZE']) > 1

if args.distributed:
    # FOR DISTRIBUTED: Set the device according to local_rank.
    torch.cuda.set_device(args.local_rank)

    # FOR DISTRIBUTED: Initialize the backend. torch.distributed.launch will provide
    # environment variables, and requires that you use init_method=`env://`.
    torch.distributed.init_process_group(backend='nccl',
                                         init_method='env://')

# Seed with the rank so each process draws a different fake batch below.
# NOTE(review): this call assumes the process group is initialized, i.e. the
# script is always launched via torch.distributed.launch — confirm with runner.
torch.manual_seed(torch.distributed.get_rank())

torch.backends.cudnn.benchmark = True

# Batch size, input features, output features for the toy Linear model.
N, D_in, D_out = 64, 1024, 16

# Each process receives its own batch of "fake input data" and "fake target data."
# The "training loop" in each process just uses this fake batch over and over.
# https://github.com/NVIDIA/apex/tree/master/examples/imagenet provides a more realistic
# example of distributed data sampling for both training and validation.
x = torch.randn(N, D_in, device='cuda')
y = torch.randn(N, D_out, device='cuda')

model = torch.nn.Linear(D_in, D_out).cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)

# amp.initialize must run before the model is wrapped for distributed training.
model, optimizer = amp.initialize(model, optimizer, opt_level="O2")

if args.distributed:
    # FOR DISTRIBUTED: After amp.initialize, wrap the model with
    # apex.parallel.DistributedDataParallel.
    model = DistributedDataParallel(model)
    # torch.nn.parallel.DistributedDataParallel is also fine, with some added args:
    # model = torch.nn.parallel.DistributedDataParallel(model,
    #                                                   device_ids=[args.local_rank],
    #                                                   output_device=args.local_rank)

loss_fn = torch.nn.MSELoss()

for t in range(500):
    optimizer.zero_grad()
    y_pred = model(x)
    loss = loss_fn(y_pred, y)
    # Scale the loss through amp so gradients are computed on the scaled value.
    with amp.scale_loss(loss, optimizer) as scaled_loss:
        scaled_loss.backward()
    optimizer.step()

# Only rank 0 prints, but every rank saves its own parameter files so that
# compare.py can check agreement across ranks.
if args.local_rank == 0:
    print("final loss = ", loss)

torch.save(list(model.parameters()), "rank{}model.pth".format(torch.distributed.get_rank()))
torch.save(list(amp.master_params(optimizer)), "rank{}master.pth".format(torch.distributed.get_rank()))
"""Verify the parameter files written by amp_master_params.py.

Loads the rank0/rank1 model and master parameter lists and asserts:
  * model params match across ranks,
  * master params match across ranks,
  * each rank-0 model (half) param matches its master (float) param
    within a tolerance that accounts for fp16 precision.
"""
import torch

# Load everything onto GPU 0 so tensors from both ranks are comparable.
model_params_rank0 = torch.load("rank0model.pth",
                                map_location = lambda storage, loc: storage.cuda(0))
model_params_rank1 = torch.load("rank1model.pth",
                                map_location = lambda storage, loc: storage.cuda(0))
master_params_rank0 = torch.load("rank0master.pth",
                                 map_location = lambda storage, loc: storage.cuda(0))
master_params_rank1 = torch.load("rank1master.pth",
                                 map_location = lambda storage, loc: storage.cuda(0))

# Walk the four parameter lists in lockstep, one tensor per model parameter.
for model_rank0, model_rank1, master_rank0, master_rank1 in zip(
        model_params_rank0,
        model_params_rank1,
        master_params_rank0,
        master_params_rank1):
    assert torch.allclose(model_rank0, model_rank1), "Model param mismatch"
    assert torch.allclose(master_rank0, master_rank1), "Master param mismatch"
    # Some debugging/investigation assistance code:
    # maxval, maxind = torch.max(((torch.abs(model_rank0).float())/torch.abs(master_rank0)).view(-1), 0)
    # offending_val_half = model_rank0.view(-1)[maxind.item()]
    # offending_val_float = master_rank0.view(-1)[maxind.item()]
    # print(maxval.item(), maxind.item(), offending_val_half.item(), offending_val_float.item(),
    #       offending_val_float.half().item())
    # rtol needs to be > 2^-11 because of denormals...
    assert torch.allclose(model_rank0, master_rank0.half(), rtol=.005), "Model-master mismatch"

print("OK: Model and master params match across ranks.")
#!/bin/bash
# Launch the two-process amp master-params example, then run the comparison
# script to check that saved model/master parameters agree across ranks.
python -m torch.distributed.launch --nproc_per_node=2 amp_master_params.py
python compare.py
#!/bin/bash

# Apex build-compatibility sweep: for each Docker image below, clone apex at
# $branch inside a fresh container and install it with the CUDA and C++
# extensions.  Per-image exit codes are collected and summarized at the end;
# the script exits nonzero if any image failed.

# Print $1 on a green banner surrounded by blank lines.
print_banner() {
  printf "\n\n\n\e[30m\e[42m$1\e[0m\n\n\n\n"
}

# Print $1 as a single green (success) line.
print_green() {
  printf "\e[30m\e[42m$1\e[0m\n"
}

# Print $1 as a single red (failure) line.
print_red() {
  printf "\e[30m\e[41m$1\e[0m\n"
}

images=(
"gitlab-master.nvidia.com:5005/dl/dgx/pytorch:19.03-py3-devel"
"gitlab-master.nvidia.com:5005/dl/dgx/pytorch:master-py3-devel"
"pytorch/pytorch:nightly-devel-cuda10.0-cudnn7"
"pytorch/pytorch:1.0.1-cuda10.0-cudnn7-devel"
"pytorch/pytorch:1.0-cuda10.0-cudnn7-devel"
"pytorch/pytorch:nightly-devel-cuda9.2-cudnn7"
)

branch="master"

# Associative array for exit codes, pre-filled with a "None" placeholder so
# every image appears in the summary even if its run never completes.
declare -A exit_codes
# BUGFIX: the original loop was `for image in images`, which iterates over the
# literal word "images" rather than the array elements, so only the bogus key
# "images" was initialized.  Expand the array properly.
for image in "${images[@]}"
do
  exit_codes[$image]="None"
done

for image in "${images[@]}"
do
  print_banner "$image"
  set -x
  docker pull $image
  # Trying python setup.py install instead of pip install to ensure direct access to error codes.
  # Maybe pip install would be ok too but this works.
  docker run --runtime=nvidia --rm $image /bin/bash -c "yes | pip uninstall apex; yes | pip uninstall apex; git clone https://github.com/NVIDIA/apex.git; cd apex; git checkout $branch; set -e; python setup.py install --cuda_ext --cpp_ext"
  exit_code=$?
  set +x
  # Quote the variable so the test is well-formed even if it is ever empty.
  if [ "$exit_code" != 0 ]
  then
    print_red "Exit code: $exit_code"
  else
    print_green "Exit code: $exit_code"
  fi
  exit_codes[$image]=$exit_code
done

# Summarize: report each image's recorded exit code and compute overall status.
success=0
for image in "${images[@]}"
do
  exit_code=${exit_codes[$image]}
  if [ "$exit_code" != 0 ]
  then
    print_red "$image : $exit_code"
    success=1
  else
    print_green "$image : $exit_code"
  fi
done

if [ $success != 0 ]
then
  print_red "Overall status: failure"
else
  print_green "Overall status: success"
fi

exit $success
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment