Commit 843cdbe0 authored by Michael Carilli

Merging in master

parents 724672d7 28097c99
import unittest
import functools as ft
import itertools as it

from apex import amp
import torch
from torch import nn
import torch.nn.functional as F

from utils import common_init, HALF, FLOAT,\
    ALWAYS_HALF, ALWAYS_FLOAT, MATCH_INPUT

try:
    import amp_C
    from amp_C import multi_tensor_l2norm
    from apex.multi_tensor_apply import MultiTensorApply
    disabled = False
except ImportError as err:
    print("amp_C fused kernels unavailable, disabling TestMultiTensorApply. ImportError was ", err)
    disabled = True


class TestMultiTensorL2Norm(unittest.TestCase):

    def setUp(self):
        common_init(self)
        self.val = 4.0
        self.overflow_buf = torch.cuda.IntTensor(1).zero_()

    def tearDown(self):
        pass

    # The tensor creation here is written for convenience, not speed.
    def l2norm(self, sizea, sizeb, applier, repeat_tensors, in_type):
        self.overflow_buf.zero_()
        a = torch.cuda.FloatTensor(sizea).fill_(self.val)
        b = torch.cuda.FloatTensor(sizeb).fill_(self.val)

        in_list = []
        for i in range(repeat_tensors):
            in_list += [a.clone().to(in_type), b.clone().to(in_type)]

        norm = applier(multi_tensor_l2norm, self.overflow_buf, [in_list])

        reference = torch.cuda.FloatTensor((sizea + sizeb)*repeat_tensors).fill_(self.val).norm()

        self.assertTrue(torch.allclose(norm, reference))
        self.assertTrue(self.overflow_buf.item() == 0)

    @unittest.skipIf(disabled, "amp_C is unavailable")
    def test_fuzz(self):
        input_size_pairs = (
            (7777*77, 555*555),
            (777, 555),
            (555, 2048*32+1),
            (2048*32+1, 555),
            (555, 2048*32),
            (2048*32, 555),
            (33333, 555),
            (555, 33333))
        appliers = (
            MultiTensorApply(2048*32),
            MultiTensorApply(333),
            MultiTensorApply(33333))
        repeat_tensors = (
            1,
            55)

        for sizea, sizeb in input_size_pairs:
            for applier in appliers:
                for repeat in repeat_tensors:
                    for in_type in (torch.float32, torch.float16):
                        self.l2norm(sizea, sizeb, applier, repeat, in_type)


if __name__ == '__main__':
    unittest.main()
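The reference computation in l2norm above defines the fused op's semantics: multi_tensor_l2norm returns a single L2 norm taken over every element of every tensor in the list, with MultiTensorApply only controlling the chunk size used to launch the kernel. A minimal pure-PyTorch sketch of that reference, assuming a CUDA device and using illustrative sizes rather than the test's:

import torch

# Hypothetical tensor list for illustration only.
tensors = [torch.full((1000,), 4.0, device='cuda', dtype=torch.float16),
           torch.full((333,), 4.0, device='cuda', dtype=torch.float32)]

# Equivalent of the test's reference: flatten everything, upcast to fp32, take one global norm.
reference = torch.cat([t.flatten().float() for t in tensors]).norm()
print(reference)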
@@ -24,12 +24,11 @@ except ImportError as err:
 class TestMultiTensorScale(unittest.TestCase):
     def setUp(self):
+        common_init(self)
         self.scale = 4.0
         self.overflow_buf = torch.cuda.IntTensor(1).zero_()
         self.ref = torch.cuda.FloatTensor([1.0])
-        common_init(self)
     def tearDown(self):
         pass
...
This diff is collapsed.
import unittest
import functools as ft
import itertools as it

from apex import amp
import torch
from torch import nn
import torch.nn.functional as F

from utils import common_init, HALF, FLOAT,\
    ALWAYS_HALF, ALWAYS_FLOAT, MATCH_INPUT

try:
    import amp_C
    scale_check_overflow = amp_C.scale_check_overflow
    disabled = False
except ImportError as err:
    print("amp_C fused kernel unavailable, disabling TestScale. ImportError was ", err)
    disabled = True


class TestScale(unittest.TestCase):

    def setUp(self):
        self.scale = 128.0
        self.nx = 999
        self.ny = 888
        self.overflow_buf = torch.cuda.IntTensor([0])

        self.fp16 = torch.ones((self.ny, self.nx), device='cuda', dtype=torch.float16)
        self.fp32 = torch.ones((self.ny, self.nx), device='cuda', dtype=torch.float32)

        self.fp16_ref = torch.ones((1, 1), device='cuda', dtype=torch.float16)
        self.fp32_ref = torch.ones((1, 1), device='cuda', dtype=torch.float32)

        common_init(self)

    def tearDown(self):
        pass

    def downscale_test(self, input, output, ref):
        self.overflow_buf.zero_()
        input.fill_(1.0)
        if input is not output:
            output.fill_(3.0)
        input.mul_(self.scale)
        scale_check_overflow(input, 1./self.scale, self.overflow_buf, output)
        self.assertTrue(torch.allclose(output, ref))
        self.assertTrue(self.overflow_buf.item() == 0)

    def find_inf_test(self, input, output, ref, x, y, val):
        self.overflow_buf.zero_()
        input.fill_(1.0)
        if input is not output:
            output.fill_(3.0)
        input[x, y] = val
        scale_check_overflow(input, 1./self.scale, self.overflow_buf, output)
        self.assertTrue(self.overflow_buf.item())

    # Currently, the fused kernel gives a hard error if you attempt to downscale
    # into fp16 output, which imo is the desired behavior. Maybe someday we
    # will learn otherwise.
    # @unittest.skipIf(disabled, "amp_C is unavailable")
    # def test_fp16_to_fp16(self):
    #     self.downscale_test(self.fp16, self.fp16, self.fp16_ref)

    @unittest.skipIf(disabled, "amp_C is unavailable")
    def test_fp16_to_fp32(self):
        self.downscale_test(self.fp16, self.fp32, self.fp32_ref)

    # @unittest.skipIf(disabled, "amp_C is unavailable")
    # def test_fp32_to_fp16(self):
    #     self.downscale_test(self.fp32, self.fp16, self.fp16_ref)

    @unittest.skipIf(disabled, "amp_C is unavailable")
    def test_fp32_to_fp32(self):
        self.downscale_test(self.fp32, self.fp32, self.fp32_ref)

    @unittest.skipIf(disabled, "amp_C is unavailable")
    def test_fp16_to_fp32_find_inf_nan(self):
        self.find_inf_test(self.fp16, self.fp32, self.fp32_ref, 0, 0, float('nan'))
        self.find_inf_test(self.fp16, self.fp32, self.fp32_ref, self.ny//2, self.nx//2, float('inf'))
        self.find_inf_test(self.fp16, self.fp32, self.fp32_ref, self.ny-1, self.nx-1, float('nan'))

    @unittest.skipIf(disabled, "amp_C is unavailable")
    def test_fp32_to_fp32_find_inf_nan(self):
        self.find_inf_test(self.fp32, self.fp32, self.fp32_ref, 0, 0, float('inf'))
        self.find_inf_test(self.fp32, self.fp32, self.fp32_ref, self.ny//2, self.nx//2, float('nan'))
        self.find_inf_test(self.fp32, self.fp32, self.fp32_ref, self.ny-1, self.nx-1, float('inf'))


if __name__ == '__main__':
    unittest.main()
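A rough pure-PyTorch sketch of what these tests expect from the fused kernel (the helper name below is ours, not part of amp_C): scale the input into the output tensor, and flip the overflow buffer if the input contains any inf or NaN.

import torch

def scale_check_overflow_reference(inp, scale, overflow_buf, output):
    # Illustrative stand-in for amp_C.scale_check_overflow, based only on the
    # behavior the tests above assert; the real kernel is fused on the GPU.
    output.copy_(inp.float() * scale)
    if not torch.isfinite(inp.float()).all():
        overflow_buf.fill_(1)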
import unittest
import os
import random

import torch
import apex


class TestFusedLayerNorm(unittest.TestCase):
    def setUp(self):
        self.module = apex.normalization.FusedLayerNorm(normalized_shape=[32, 64], elementwise_affine=False)
        self.input_ = torch.randn(16, 32, 64)
        torch.cuda.manual_seed(42)

    def forward_cpu(self, input_):
        self.module.cpu()
        return self.module(input_.cpu())

    def forward_cuda(self, input_):
        self.module.cuda()
        return self.module(input_.cuda())

    def test_forward_cuda(self):
        out_ = self.forward_cuda(self.input_)
        assert out_.is_cuda == True

    def test_forward_cpu(self):
        out_ = self.forward_cpu(self.input_)
        assert out_.is_cuda == False

    def test_same_output(self):
        out_cpu = self.forward_cpu(self.input_)
        out_cuda = self.forward_cuda(self.input_)
        torch.testing.assert_allclose(out_cpu, out_cuda.cpu())


class TestFusedLayerNormElemWise(TestFusedLayerNorm):
    def setUp(self):
        self.module = apex.normalization.FusedLayerNorm(normalized_shape=[32, 64], elementwise_affine=True)
        self.input_ = torch.randn(16, 32, 64)
        torch.cuda.manual_seed(42)
\ No newline at end of file
 import unittest
 import sys
-test_dirs = ["run_amp", "run_fp16util", "run_mixed_adam"]
+test_dirs = ["run_amp", "run_fp16util", "run_mixed_adam", "run_fused_layer_norm"]
 runner = unittest.TextTestRunner(verbosity=2)
...
@@ -6,6 +6,7 @@ parser.add_argument('--opt-level', type=str)
 parser.add_argument('--keep-batchnorm-fp32', type=str, default=None)
 parser.add_argument('--loss-scale', type=str, default=None)
 parser.add_argument('--fused-adam', action='store_true')
+parser.add_argument('--use_baseline', action='store_true')
 args = parser.parse_args()
 base_file = str(args.opt_level) + "_" +\
@@ -15,16 +16,24 @@ base_file = str(args.opt_level) + "_" +\
 file_e = "True_" + base_file
 file_p = "False_" + base_file
+if args.use_baseline:
+    file_b = "baselines/True_" + base_file
 dict_e = torch.load(file_e)
 dict_p = torch.load(file_p)
+if args.use_baseline:
+    dict_b = torch.load(file_b)
 torch.set_printoptions(precision=10)
 print(file_e)
 print(file_p)
+if args.use_baseline:
+    print(file_b)
-for n, (i_e, i_p) in enumerate(zip(dict_e["Iteration"], dict_p["Iteration"])):
+# ugly duplication here...
+if not args.use_baseline:
+    for n, (i_e, i_p) in enumerate(zip(dict_e["Iteration"], dict_p["Iteration"])):
         assert i_e == i_p, "i_e = {}, i_p = {}".format(i_e, i_p)
         loss_e = dict_e["Loss"][n]
@@ -36,3 +45,20 @@ for n, (i_e, i_p) in enumerate(zip(dict_e["Iteration"], dict_p["Iteration"])):
             loss_p,
             dict_e["Speed"][n],
             dict_p["Speed"][n]))
+else:
+    for n, (i_e, i_p) in enumerate(zip(dict_e["Iteration"], dict_p["Iteration"])):
+        assert i_e == i_p, "i_e = {}, i_p = {}".format(i_e, i_p)
+        loss_e = dict_e["Loss"][n]
+        loss_p = dict_p["Loss"][n]
+        loss_b = dict_b["Loss"][n]
+        assert loss_e == loss_p, "Iteration {}, loss_e = {}, loss_p = {}".format(i_e, loss_e, loss_p)
+        assert loss_e == loss_b, "Iteration {}, loss_e = {}, loss_b = {}".format(i_e, loss_e, loss_b)
+        print("{:4} {:15.10f} {:15.10f} {:15.10f} {:15.10f} {:15.10f} {:15.10f}".format(
+            i_e,
+            loss_b,
+            loss_e,
+            loss_p,
+            dict_b["Speed"][n],
+            dict_e["Speed"][n],
+            dict_p["Speed"][n]))
@@ -365,6 +365,9 @@ def train(train_loader, model, criterion, optimizer, epoch):
         batch_time.update(time.time() - end)
         end = time.time()
+        # If you decide to refactor this test, like examples/imagenet, to sample the loss every
+        # print_freq iterations, make sure to move this prefetching below the accuracy calculation.
         input, target = prefetcher.next()
         if i % args.print_freq == 0 and i > 1:
...
@@ -6,8 +6,15 @@ print_banner() {
 print_banner "Distributed status: $1"
-# DATADIR="/home/mcarilli/Desktop/pt18data/apex/examples/imagenet/bare_metal_train_val/"
-DATADIR="/opt/home/apex/examples/imagenet/"
+echo $2
+DATADIR=$2
+if [ -n "$3" ]
+then
+  USE_BASELINE=""
+else
+  USE_BASELINE="--use_baseline"
+fi
 if [ "$1" == "single_gpu" ]
 then
@@ -49,7 +56,7 @@ set -e
 print_banner "Installing Apex with --cuda_ext and --cpp_ext"
 pushd ../../..
-python setup.py install --cuda_ext --cpp_ext
+pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" .
 popd
 for opt_level in "${opt_levels[@]}"
@@ -86,7 +93,7 @@ done
 print_banner "Reinstalling apex without extensions"
 pushd ../../..
-python setup.py install
+pip install -v --no-cache-dir .
 popd
 for opt_level in "${opt_levels[@]}"
@@ -124,7 +131,7 @@ do
 fi
 echo "${BASE_CMD} --opt-level ${opt_level} ${loss_scale} ${keep_batchnorm} [--has-ext] $DATADIR"
 set -x
-python compare.py --opt-level ${opt_level} ${loss_scale} ${keep_batchnorm}
+python compare.py --opt-level ${opt_level} ${loss_scale} ${keep_batchnorm} --use_baseline
 set +x
 done
 done
@@ -133,5 +140,5 @@ done
 print_banner "Reinstalling Apex with --cuda_ext and --cpp_ext"
 pushd ../../..
-python setup.py install --cuda_ext --cpp_ext
+pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" .
 popd
 #!/bin/bash
+DATADIR="/home/mcarilli/Desktop/pt18data/apex_stale/examples/imagenet/bare_metal_train_val/"
+# DATADIR="/opt/home/apex/examples/imagenet/"
 cp ../common/* .
-bash run_test.sh single_gpu
+bash run_test.sh single_gpu $1 $DATADIR yes
 #!/bin/bash
 cp ../common/* .
-bash run_test.sh distributed
+bash run_test.sh distributed $1
import torch
import argparse
import os
from apex import amp
# FOR DISTRIBUTED: (can also use torch.nn.parallel.DistributedDataParallel instead)
from apex.parallel import DistributedDataParallel

parser = argparse.ArgumentParser()
# FOR DISTRIBUTED: Parse for the local_rank argument, which will be supplied
# automatically by torch.distributed.launch.
parser.add_argument("--local_rank", default=0, type=int)
args = parser.parse_args()

# FOR DISTRIBUTED: If we are running under torch.distributed.launch,
# the 'WORLD_SIZE' environment variable will also be set automatically.
args.distributed = False
if 'WORLD_SIZE' in os.environ:
    args.distributed = int(os.environ['WORLD_SIZE']) > 1

if args.distributed:
    # FOR DISTRIBUTED: Set the device according to local_rank.
    torch.cuda.set_device(args.local_rank)

    # FOR DISTRIBUTED: Initialize the backend. torch.distributed.launch will provide
    # environment variables, and requires that you use init_method=`env://`.
    torch.distributed.init_process_group(backend='nccl',
                                         init_method='env://')

torch.manual_seed(torch.distributed.get_rank())

torch.backends.cudnn.benchmark = True

N, D_in, D_out = 64, 1024, 16

# Each process receives its own batch of "fake input data" and "fake target data."
# The "training loop" in each process just uses this fake batch over and over.
# https://github.com/NVIDIA/apex/tree/master/examples/imagenet provides a more realistic
# example of distributed data sampling for both training and validation.
x = torch.randn(N, D_in, device='cuda')
y = torch.randn(N, D_out, device='cuda')

model = torch.nn.Linear(D_in, D_out).cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)

model, optimizer = amp.initialize(model, optimizer, opt_level="O2")

if args.distributed:
    # FOR DISTRIBUTED: After amp.initialize, wrap the model with
    # apex.parallel.DistributedDataParallel.
    model = DistributedDataParallel(model)
    # torch.nn.parallel.DistributedDataParallel is also fine, with some added args:
    # model = torch.nn.parallel.DistributedDataParallel(model,
    #                                                   device_ids=[args.local_rank],
    #                                                   output_device=args.local_rank)

loss_fn = torch.nn.MSELoss()

for t in range(500):
    optimizer.zero_grad()
    y_pred = model(x)
    loss = loss_fn(y_pred, y)
    with amp.scale_loss(loss, optimizer) as scaled_loss:
        scaled_loss.backward()
    optimizer.step()

if args.local_rank == 0:
    print("final loss = ", loss)

torch.save(list(model.parameters()), "rank{}model.pth".format(torch.distributed.get_rank()))
torch.save(list(amp.master_params(optimizer)), "rank{}master.pth".format(torch.distributed.get_rank()))
import torch

model_params_rank0 = torch.load("rank0model.pth",
                                map_location=lambda storage, loc: storage.cuda(0))
model_params_rank1 = torch.load("rank1model.pth",
                                map_location=lambda storage, loc: storage.cuda(0))
master_params_rank0 = torch.load("rank0master.pth",
                                 map_location=lambda storage, loc: storage.cuda(0))
master_params_rank1 = torch.load("rank1master.pth",
                                 map_location=lambda storage, loc: storage.cuda(0))

for model_rank0, model_rank1, master_rank0, master_rank1 in zip(
        model_params_rank0,
        model_params_rank1,
        master_params_rank0,
        master_params_rank1):
    assert torch.allclose(model_rank0, model_rank1), "Model param mismatch"
    assert torch.allclose(master_rank0, master_rank1), "Master param mismatch"
    # Some debugging/investigation assistance code:
    # maxval, maxind = torch.max(((torch.abs(model_rank0).float())/torch.abs(master_rank0)).view(-1), 0)
    # offending_val_half = model_rank0.view(-1)[maxind.item()]
    # offending_val_float = master_rank0.view(-1)[maxind.item()]
    # print(maxval.item(), maxind.item(), offending_val_half.item(), offending_val_float.item(),
    #       offending_val_float.half().item())
    # rtol needs to be > 2^-11 because of denormals...
    assert torch.allclose(model_rank0, master_rank0.half(), rtol=.005), "Model-master mismatch"

print("OK: Model and master params match across ranks.")
#!/bin/bash
python -m torch.distributed.launch --nproc_per_node=2 amp_master_params.py
python compare.py
#!/bin/bash

print_banner() {
  printf "\n\n\n\e[30m\e[42m$1\e[0m\n\n\n\n"
}

print_green() {
  printf "\e[30m\e[42m$1\e[0m\n"
}

print_red() {
  printf "\e[30m\e[41m$1\e[0m\n"
}

images=(
  "gitlab-master.nvidia.com:5005/dl/dgx/pytorch:19.03-py3-devel"
  "gitlab-master.nvidia.com:5005/dl/dgx/pytorch:master-py3-devel"
  "pytorch/pytorch:nightly-devel-cuda10.0-cudnn7"
  "pytorch/pytorch:1.0.1-cuda10.0-cudnn7-devel"
  "pytorch/pytorch:1.0-cuda10.0-cudnn7-devel"
  "pytorch/pytorch:nightly-devel-cuda9.2-cudnn7"
)

branch="master"

# Associative array for exit codes
declare -A exit_codes
for image in "${images[@]}"
do
  exit_codes[$image]="None"
done

for image in "${images[@]}"
do
  print_banner "$image"
  set -x
  docker pull $image
  # Trying python setup.py install instead of pip install to ensure direct access to error codes.
  # Maybe pip install would be ok too but this works.
  docker run --runtime=nvidia --rm $image /bin/bash -c "yes | pip uninstall apex; yes | pip uninstall apex; git clone https://github.com/NVIDIA/apex.git; cd apex; git checkout $branch; set -e; python setup.py install --cuda_ext --cpp_ext"
  exit_code=$?
  set +x
  if [ $exit_code != 0 ]
  then
    print_red "Exit code: $exit_code"
  else
    print_green "Exit code: $exit_code"
  fi
  exit_codes[$image]=$exit_code
done

success=0

for image in "${images[@]}"
do
  exit_code=${exit_codes[$image]}
  if [ $exit_code != 0 ]
  then
    print_red "$image : $exit_code"
    success=1
  else
    print_green "$image : $exit_code"
  fi
done

if [ $success != 0 ]
then
  print_red "Overall status: failure"
else
  print_green "Overall status: success"
fi

exit $success