Commit 2cbca1a4 authored by Michael Carilli

Merge branch 'master' into api_refactor

parents a9a3fe57 340e71a4
@@ -18,12 +18,23 @@ class Dictionary(object):
class Corpus(object):
def __init__(self, path):
def __init__(self, path, pad_to_multiple_of=1):
# Synthetic elements used to pad the dictionary length.
# It is assumed that these synthetic elements do not appear in the actual data files.
self.synthetic = ["vvvvvvvv" + str(i) for i in range(pad_to_multiple_of-1)]
self.dictionary = Dictionary()
self.train = self.tokenize(os.path.join(path, 'train.txt'))
self.valid = self.tokenize(os.path.join(path, 'valid.txt'))
self.test = self.tokenize(os.path.join(path, 'test.txt'))
# Pad dictionary size to desired multiple. For example, padding to a multiple of 8
# is necessary to ensure Tensor Core usage for the decoder.
pad_elem = pad_to_multiple_of - len(self.dictionary)%pad_to_multiple_of
if pad_elem != pad_to_multiple_of:
for i in range(pad_elem):
self.dictionary.add_word(self.synthetic[i])
def tokenize(self, path):
"""Tokenizes a text file."""
assert os.path.exists(path)
......
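The padding above exists so the decoder's output GEMM (whose inner dimension is the vocabulary size) keeps Tensor Core friendly shapes. A minimal sketch of the same arithmetic, using a plain list as a stand-in for the Dictionary class (pad_vocab is a hypothetical helper, not part of this diff):

# Round a vocabulary up to a multiple of 8 with throwaway synthetic tokens,
# mirroring the pad_to_multiple_of logic in Corpus.__init__ above.
def pad_vocab(vocab, multiple=8):
    remainder = len(vocab) % multiple
    if remainder != 0:
        vocab = vocab + ["vvvvvvvv" + str(i) for i in range(multiple - remainder)]
    assert len(vocab) % multiple == 0
    return vocab

print(len(pad_vocab(["tok" + str(i) for i in range(999)])))  # 999 -> 1000, and 1000 % 8 == 0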
@@ -17,9 +17,9 @@ parser.add_argument('--data', type=str, default='./data/wikitext-2',
help='location of the data corpus')
parser.add_argument('--model', type=str, default='LSTM',
help='type of recurrent net (RNN_TANH, RNN_RELU, LSTM, GRU)')
parser.add_argument('--emsize', type=int, default=200,
parser.add_argument('--emsize', type=int, default=1504,
help='size of word embeddings')
parser.add_argument('--nhid', type=int, default=200,
parser.add_argument('--nhid', type=int, default=1504,
help='number of hidden units per layer')
parser.add_argument('--nlayers', type=int, default=2,
help='number of layers')
@@ -29,11 +29,11 @@ parser.add_argument('--clip', type=float, default=0.25,
help='gradient clipping')
parser.add_argument('--epochs', type=int, default=40,
help='upper epoch limit')
parser.add_argument('--batch_size', type=int, default=20, metavar='N',
parser.add_argument('--batch_size', type=int, default=24, metavar='N',
help='batch size')
parser.add_argument('--bptt', type=int, default=35,
help='sequence length')
parser.add_argument('--dropout', type=float, default=0.2,
parser.add_argument('--dropout', type=float, default=0.65,
help='dropout applied to layers (0 = no dropout)')
parser.add_argument('--tied', action='store_true',
help='tie the word embedding and softmax weights')
@@ -47,7 +47,7 @@ parser.add_argument('--save', type=str, default='model.pt',
help='path to save the final model')
parser.add_argument('--fp16', action='store_true',
help='Run model in pseudo-fp16 mode (fp16 storage fp32 math).')
parser.add_argument('--static-loss-scale', type=float, default=1,
parser.add_argument('--static-loss-scale', type=float, default=128.0,
help='Static loss scale, positive power of 2 values can improve fp16 convergence.')
args = parser.parse_args()
@@ -64,7 +64,9 @@ if args.fp16 and not args.cuda:
# Load data
###############################################################################
corpus = data.Corpus(args.data)
# Ensure that the dictionary length is a multiple of 8,
# so that the decoder's GEMMs will use Tensor Cores.
corpus = data.Corpus(args.data, pad_to_multiple_of=8)
# Starting from sequential data, batchify arranges the dataset into columns.
# For instance, with the alphabet as the sequence and batch size 4, we'd get
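The rest of that comment (the column diagram) is not shown in this hunk. As a hedged sketch of the idea rather than the example's exact helper, batchify can be written as:

import torch

# Keep only whole columns, then reshape so that each of the bsz columns is a
# contiguous chunk of the original token sequence.
def batchify(data, bsz):
    nbatch = data.size(0) // bsz             # full rows per column
    data = data.narrow(0, 0, nbatch * bsz)   # drop the leftover tail
    return data.view(bsz, -1).t().contiguous()

# With a 26-token "alphabet" and bsz=4: four columns of six tokens each
# (a-f, g-l, m-r, s-x); y and z are dropped.
print(batchify(torch.arange(26), 4).shape)   # torch.Size([6, 4])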
@@ -99,6 +101,16 @@ test_data = batchify(corpus.test, eval_batch_size)
###############################################################################
ntokens = len(corpus.dictionary)
if args.fp16 and args.cuda:
if ntokens%8 != 0:
print("Warning: the dictionary size (ntokens = {}) should be a multiple of 8 to ensure "
"Tensor Core use for the decoder's GEMMs.".format(ntokens))
if args.emsize%8 != 0 or args.nhid%8 != 0 or args.batch_size%8 != 0:
print("Warning: emsize = {}, nhid = {}, batch_size = {} should all be multiples of 8 "
"to ensure Tensor Core use for the RNN's GEMMs.".format(
args.emsize, args.nhid, args.batch_size))
model = model.RNNModel(args.model, ntokens, args.emsize, args.nhid, args.nlayers, args.dropout, args.tied)
if args.cuda and args.fp16:
@@ -106,6 +118,12 @@ if args.cuda and args.fp16:
model_params, master_params = prep_param_lists(model)
elif args.cuda:
model.cuda()
if (not args.fp16) or (not args.cuda):
print("Warning: static_loss_scale != 1.0 is only necessary with --fp16. "
"Resetting static_loss_scale to 1.0")
args.static_loss_scale = 1.0
criterion = nn.CrossEntropyLoss()
###############################################################################
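Putting the new defaults together, a hedged example invocation that keeps every GEMM dimension a multiple of 8 (assuming this file is the example's main.py and that the usual --cuda flag exists; neither is shown explicitly in these hunks):

python main.py --cuda --fp16 --emsize 1504 --nhid 1504 --batch_size 24 --static-loss-scale 128.0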
@@ -172,21 +190,21 @@ def train():
loss = criterion(output.view(-1, ntokens), targets)
loss = loss * args.static_loss_scale
loss.backward()
loss = loss / args.static_loss_scale
# `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
# apex.fp16_utils.clip_grad_norm selects between "torch.nn.utils.clip_grad_norm"
# and "torch.nn.utils.clip_grad_norm_" based on Pytorch version.
# It's not FP16-specific, just a small fix to avoid deprecation warnings.
clip_grad_norm(model.parameters(), args.clip)
loss.data = loss.data / args.static_loss_scale
if args.fp16 and args.cuda:
model_grads_to_master_grads(model_params, master_params)
if args.static_loss_scale != 1:
for param in master_params:
param.grad.data = param.grad.data/args.static_loss_scale
clip_grad_norm(master_params, args.clip)
for param in master_params:
param.data = param.data - param.grad.data * (lr/args.static_loss_scale)
param.data = param.data - param.grad.data * lr
master_params_to_model_params(model_params, master_params)
else:
clip_grad_norm(model.parameters(), args.clip)
for p in model.parameters():
p.data.add_(-lr/args.static_loss_scale, p.grad.data)
p.data.add_(-lr, p.grad.data)
total_loss += loss.data
......
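A self-contained sketch of the static loss scaling pattern this hunk revises, written against a generic model and a torch optimizer for brevity (the example updates parameters manually and keeps separate model/master parameter lists; fp16_step is a hypothetical helper):

import torch

def fp16_step(model, optimizer, loss_fn, inputs, targets, scale=128.0, clip=0.25):
    optimizer.zero_grad()
    loss = loss_fn(model(inputs), targets)
    (loss * scale).backward()          # scale up so small fp16 gradients do not flush to zero
    for p in model.parameters():
        if p.grad is not None:
            p.grad.data.div_(scale)    # unscale before clipping and the weight update
    torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
    optimizer.step()
    return loss.item()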
@@ -17,9 +17,9 @@ parser.add_argument('--data', type=str, default='./data/wikitext-2',
help='location of the data corpus')
parser.add_argument('--model', type=str, default='LSTM',
help='type of recurrent net (RNN_TANH, RNN_RELU, LSTM, GRU)')
parser.add_argument('--emsize', type=int, default=200,
parser.add_argument('--emsize', type=int, default=1504,
help='size of word embeddings')
parser.add_argument('--nhid', type=int, default=200,
parser.add_argument('--nhid', type=int, default=1504,
help='number of hidden units per layer')
parser.add_argument('--nlayers', type=int, default=2,
help='number of layers')
@@ -29,7 +29,7 @@ parser.add_argument('--clip', type=float, default=0.25,
help='gradient clipping')
parser.add_argument('--epochs', type=int, default=40,
help='upper epoch limit')
parser.add_argument('--batch_size', type=int, default=20, metavar='N',
parser.add_argument('--batch_size', type=int, default=24, metavar='N',
help='batch size')
parser.add_argument('--bptt', type=int, default=35,
help='sequence length')
@@ -47,7 +47,7 @@ parser.add_argument('--save', type=str, default='model.pt',
help='path to save the final model')
parser.add_argument('--fp16', action='store_true',
help='Run model in pseudo-fp16 mode (fp16 storage fp32 math).')
parser.add_argument('--static-loss-scale', type=float, default=1,
parser.add_argument('--static-loss-scale', type=float, default=128.0,
help='Static loss scale, positive power of 2 values can improve fp16 convergence.')
parser.add_argument('--dynamic-loss-scale', action='store_true',
help='Use dynamic loss scaling. If supplied, this argument supersedes ' +
@@ -67,7 +67,9 @@ if args.fp16 and not args.cuda:
# Load data
###############################################################################
corpus = data.Corpus(args.data)
# Ensure that the dictionary length is a multiple of 8,
# so that the decoder's GEMMs will use Tensor Cores.
corpus = data.Corpus(args.data, pad_to_multiple_of=8)
# Starting from sequential data, batchify arranges the dataset into columns.
# For instance, with the alphabet as the sequence and batch size 4, we'd get
@@ -102,6 +104,16 @@ test_data = batchify(corpus.test, eval_batch_size)
###############################################################################
ntokens = len(corpus.dictionary)
if args.fp16 and args.cuda:
if ntokens%8 != 0:
print("Warning: the dictionary size (ntokens = {}) should be a multiple of 8 to ensure "
"Tensor Core use for the decoder's GEMMs.".format(ntokens))
if args.emsize%8 != 0 or args.nhid%8 != 0 or args.batch_size%8 != 0:
print("Warning: emsize = {}, nhid = {}, batch_size = {} should all be multiples of 8 "
"to ensure Tensor Core use for the RNN's GEMMs.".format(
args.emsize, args.nhid, args.batch_size))
model = model.RNNModel(args.model, ntokens, args.emsize, args.nhid, args.nlayers, args.dropout, args.tied)
if args.cuda and args.fp16:
......
@@ -36,6 +36,10 @@ if "--cuda_ext" in sys.argv:
if torch.utils.cpp_extension.CUDA_HOME is None:
print("Warning: nvcc is not available. Ignoring --cuda-ext")
else:
ext_modules.append(
CUDAExtension(name='amp_C',
sources=['csrc/scale_check_overflow.cpp',
'csrc/scale_check_overflow_kernel.cu']))
ext_modules.append(
CUDAExtension(name='fused_adam_cuda',
sources=['apex/optimizers/csrc/fused_adam_cuda.cpp',
......
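One way to exercise the branch above, assuming setup.py consumes the flag from sys.argv before calling setup() (the exact install command is not shown in this diff):

python setup.py install --cuda_ext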
@@ -34,8 +34,8 @@ class Model(Module):
return (input*self.a)*self.b
model = Model()
model = DDP(model, message_size=1, gradient_predivide_factor=8.0)
# model = DDP(model, delay_allreduce=True)
# model = DDP(model, message_size=1, gradient_predivide_factor=8.0)
model = DDP(model, delay_allreduce=True)
# model = DDP(model, message_size=1, allreduce_trigger_params=[model.b])
x = torch.cuda.FloatTensor(4096*4096)
......
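For context, a hedged minimal sketch of the delay_allreduce mode the test now exercises. The process-group setup is written out here, whereas the test performs its own initialization; launch under torch.distributed.launch or torchrun so the usual environment variables are set.

import os
import torch
import torch.distributed as dist
from apex.parallel import DistributedDataParallel as DDP

dist.init_process_group(backend='nccl', init_method='env://')
torch.cuda.set_device(int(os.environ.get('LOCAL_RANK', 0)))

model = torch.nn.Linear(10, 10).cuda()
model = DDP(model, delay_allreduce=True)   # one allreduce per iteration, after backward completes
# Alternatives shown commented out above: bucketed, overlapped allreduce via
# message_size/gradient_predivide_factor, or allreduce_trigger_params to key
# the allreduce off specific parameters.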
import unittest
import functools as ft
import itertools as it

from apex import amp
import torch
from torch import nn
import torch.nn.functional as F

from utils import common_init, HALF, FLOAT,\
    ALWAYS_HALF, ALWAYS_FLOAT, MATCH_INPUT

try:
    import amp_C
    scale_check_overflow = amp_C.scale_check_overflow
    disabled = False
except ImportError as err:
    print("amp_C fused kernel unavailable, disabling TestScale. ImportError was ", err)
    disabled = True


class TestScale(unittest.TestCase):
    def setUp(self):
        self.scale = 128.0
        self.nx = 999
        self.ny = 888
        self.overflow_buf = torch.cuda.IntTensor([0])
        self.fp16 = torch.ones((self.ny, self.nx), device='cuda', dtype=torch.float16)
        self.fp32 = torch.ones((self.ny, self.nx), device='cuda', dtype=torch.float32)
        self.fp16_ref = torch.ones((1, 1), device='cuda', dtype=torch.float16)
        self.fp32_ref = torch.ones((1, 1), device='cuda', dtype=torch.float32)
        common_init(self)

    def tearDown(self):
        pass

    def downscale_test(self, input, output, ref):
        self.overflow_buf.zero_()
        input.fill_(1.0)
        if input is not output:
            output.fill_(3.0)
        input.mul_(self.scale)
        scale_check_overflow(input, 1./self.scale, self.overflow_buf, output)
        self.assertTrue(torch.allclose(output, ref))
        self.assertTrue(self.overflow_buf.item() == 0)

    def find_inf_test(self, input, output, ref, x, y, val):
        self.overflow_buf.zero_()
        input.fill_(1.0)
        if input is not output:
            output.fill_(3.0)
        input[x,y] = val
        scale_check_overflow(input, 1./self.scale, self.overflow_buf, output)
        self.assertTrue(self.overflow_buf.item())

    # Currently, the fused kernel gives a hard error if you attempt to downscale
    # into fp16 output, which imo is the desired behavior. Maybe someday we
    # will learn otherwise.
    # @unittest.skipIf(disabled, "amp_C is unavailable")
    # def test_fp16_to_fp16(self):
    #     self.downscale_test(self.fp16, self.fp16, self.fp16_ref)

    @unittest.skipIf(disabled, "amp_C is unavailable")
    def test_fp16_to_fp32(self):
        self.downscale_test(self.fp16, self.fp32, self.fp32_ref)

    # @unittest.skipIf(disabled, "amp_C is unavailable")
    # def test_fp32_to_fp16(self):
    #     self.downscale_test(self.fp32, self.fp16, self.fp16_ref)

    @unittest.skipIf(disabled, "amp_C is unavailable")
    def test_fp32_to_fp32(self):
        self.downscale_test(self.fp32, self.fp32, self.fp32_ref)

    @unittest.skipIf(disabled, "amp_C is unavailable")
    def test_fp16_to_fp32_find_inf_nan(self):
        self.find_inf_test(self.fp16, self.fp32, self.fp32_ref, 0, 0, float('nan'))
        self.find_inf_test(self.fp16, self.fp32, self.fp32_ref, self.ny//2, self.nx//2, float('inf'))
        self.find_inf_test(self.fp16, self.fp32, self.fp32_ref, self.ny-1, self.nx-1, float('nan'))

    @unittest.skipIf(disabled, "amp_C is unavailable")
    def test_fp32_to_fp32_find_inf_nan(self):
        self.find_inf_test(self.fp32, self.fp32, self.fp32_ref, 0, 0, float('inf'))
        self.find_inf_test(self.fp32, self.fp32, self.fp32_ref, self.ny//2, self.nx//2, float('nan'))
        self.find_inf_test(self.fp32, self.fp32, self.fp32_ref, self.ny-1, self.nx-1, float('inf'))


if __name__ == '__main__':
    unittest.main()
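For readers without the fused extension, a hedged, unfused reference for what these tests expect from amp_C.scale_check_overflow: write input * scale into the output tensor and raise the overflow flag if the input contains inf or NaN (scale_check_overflow_ref is a hypothetical stand-in, run here on the CPU for simplicity):

import torch

def scale_check_overflow_ref(inp, scale, overflow_buf, out):
    # Flag overflow if any element is non-finite, then write the downscaled copy.
    if not torch.isfinite(inp.float()).all():
        overflow_buf.fill_(1)
    out.copy_(inp.float() * scale)

buf = torch.zeros(1, dtype=torch.int32)
x = torch.full((4, 4), 128.0, dtype=torch.float16)
y = torch.empty(4, 4, dtype=torch.float32)
scale_check_overflow_ref(x, 1.0 / 128.0, buf, y)
print(y[0, 0].item(), buf.item())   # 1.0 0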
@@ -6,6 +6,8 @@ import itertools as it
import torch
from apex.fp16_utils import FP16_Optimizer
# Currently no-ops (tested via examples).
# FP16_Optimizer to be deprecated and moved under unified Amp API.
class TestFP16Optimizer(unittest.TestCase):
def setUp(self):
N, D_in, D_out = 64, 1024, 16
......
import unittest
import os

import torch
import apex


class TestFP16Optimizer(unittest.TestCase):
    def setUp(self, max_abs_diff=1e-3, max_rel_diff=1, iters=7):
        self.max_abs_diff = max_abs_diff
        self.max_rel_diff = max_rel_diff
        self.iters = iters
        torch.cuda.manual_seed(13337)

        N, D_in, D_out = 64, 1024, 16
        self.N = N
        self.D_in = D_in
        self.D_out = D_out
        self.x = torch.randn((N, D_in), dtype=torch.float16, device='cuda')
        self.ref_model = torch.nn.Linear(D_in, D_out).cuda().half()
        self.tst_model = torch.nn.Linear(D_in, D_out).cuda().half()
        for p,q in zip(self.tst_model.parameters(), self.ref_model.parameters()):
            p.data.copy_(q.data)

    def get_max_diff(self, ref_param, tst_param):
        max_abs_diff = max_rel_diff = 0
        for p_ref, p_tst in zip(ref_param, tst_param):
            max_abs_diff_p = (p_ref - p_tst).abs().max().item()
            max_rel_diff_p = ((p_ref - p_tst) / p_ref).abs().max().item()
            if max_abs_diff_p > max_abs_diff: max_abs_diff = max_abs_diff_p
            if max_rel_diff_p > max_rel_diff: max_rel_diff = max_rel_diff_p
        return max_abs_diff, max_rel_diff

    def test_fp16_optimizer(self):
        ref_optim = torch.optim.Adam(self.ref_model.parameters())
        ref_optim = apex.fp16_utils.FP16_Optimizer(ref_optim, verbose=False)

        tst_optim = apex.optimizers.FusedAdam(self.tst_model.parameters())
        tst_optim = apex.optimizers.FP16_Optimizer(tst_optim)

        for i in range(self.iters):
            ref_loss = self.ref_model(self.x).sum()
            ref_optim.backward(ref_loss)
            ref_optim.step()

            tst_loss = self.tst_model(self.x).sum()
            tst_optim.backward(tst_loss)
            tst_optim.step()

            max_abs_diff, max_rel_diff = self.get_max_diff(self.ref_model.parameters(), self.tst_model.parameters())
            self.assertLessEqual(max_abs_diff, self.max_abs_diff)
            self.assertLessEqual(max_rel_diff, self.max_rel_diff)

    def test_loss_scaling(self):
        ref_optim = torch.optim.Adam(self.ref_model.parameters())
        ref_optim = apex.fp16_utils.FP16_Optimizer(ref_optim, static_loss_scale=128.0, verbose=False)

        tst_optim = apex.optimizers.FusedAdam(self.tst_model.parameters())
        tst_optim = apex.optimizers.FP16_Optimizer(tst_optim, static_loss_scale=128.0)

        for i in range(self.iters):
            ref_loss = self.ref_model(self.x).sum()
            ref_optim.backward(ref_loss)
            ref_optim.step()

            tst_loss = self.tst_model(self.x).sum()
            tst_optim.backward(tst_loss)
            tst_optim.step()

            max_abs_diff, max_rel_diff = self.get_max_diff(self.ref_model.parameters(), self.tst_model.parameters())
            self.assertLessEqual(max_abs_diff, self.max_abs_diff)
            self.assertLessEqual(max_rel_diff, self.max_rel_diff)

    def test_parameter_groups(self):
        ref_groups = [{'params': [self.ref_model.weight]},{'params': [self.ref_model.bias]}]
        ref_optim = torch.optim.Adam(ref_groups)
        ref_optim = apex.fp16_utils.FP16_Optimizer(ref_optim, verbose=False)

        tst_groups = [{'params': [self.tst_model.weight]},{'params': [self.tst_model.bias]}]
        tst_optim = apex.optimizers.FusedAdam(tst_groups)
        tst_optim = apex.optimizers.FP16_Optimizer(tst_optim)

        for i in range(self.iters):
            ref_loss = self.ref_model(self.x).sum()
            ref_optim.backward(ref_loss)
            ref_optim.step()

            tst_loss = self.tst_model(self.x).sum()
            tst_optim.backward(tst_loss)
            tst_optim.step()

            max_abs_diff, max_rel_diff = self.get_max_diff(self.ref_model.parameters(), self.tst_model.parameters())
            self.assertLessEqual(max_abs_diff, self.max_abs_diff)
            self.assertLessEqual(max_rel_diff, self.max_rel_diff)

    def test_grad_clip(self):
        ref_optim = torch.optim.Adam(self.ref_model.parameters())
        ref_optim = apex.fp16_utils.FP16_Optimizer(ref_optim, verbose=False)

        tst_optim = apex.optimizers.FusedAdam(self.tst_model.parameters(), max_grad_norm=0.01)
        tst_optim = apex.optimizers.FP16_Optimizer(tst_optim)

        for i in range(self.iters):
            ref_loss = self.ref_model(self.x).sum()
            ref_optim.backward(ref_loss)
            ref_optim.clip_master_grads(0.01)
            ref_optim.step()

            tst_loss = self.tst_model(self.x).sum()
            tst_optim.backward(tst_loss)
            tst_optim.step()

            max_abs_diff, max_rel_diff = self.get_max_diff(self.ref_model.parameters(), self.tst_model.parameters())
            self.assertLessEqual(max_abs_diff, self.max_abs_diff)
            self.assertLessEqual(max_rel_diff, self.max_rel_diff)

    @unittest.skip('Not support grad being None')
    def test_grad_None(self):
        self.fail()

    @unittest.skip('Not support same weight decay as pytorch')
    def test_weight_decay(self):
        self.fail()

    @unittest.skip('Not support empty parameter groups')
    def test_group_empty(self):
        self.fail()


if __name__ == '__main__':
    script_path = os.path.dirname(os.path.realpath(__file__))
    unittest.main()
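A hedged usage sketch of the fused path these tests exercise: a half-precision model driven by FusedAdam wrapped in apex.optimizers.FP16_Optimizer, mirroring the calls above (requires a CUDA build of apex, e.g. installed with --cuda_ext as in the setup.py change earlier):

import torch
import apex

model = torch.nn.Linear(1024, 16).cuda().half()
optimizer = apex.optimizers.FusedAdam(model.parameters())
optimizer = apex.optimizers.FP16_Optimizer(optimizer, static_loss_scale=128.0)

x = torch.randn(64, 1024, dtype=torch.float16, device='cuda')
for _ in range(7):
    loss = model(x).sum()
    optimizer.backward(loss)   # applies the loss scale and handles the fp32 master copies
    optimizer.step()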
import unittest
import sys
test_dirs = ["run_fp16_optimizer", "run_amp", "run_mixed_adam"]
test_dirs = ["run_amp", "run_mixed_adam"]
runner = unittest.TextTestRunner(verbosity=2)
......