"tests/vscode:/vscode.git/clone" did not exist on "7fb481f840b5d73982cafd1affe89f21a5c0b20b"
Commit b7f1b050 authored by Neel Kant

Lint whole repo

parent c99fa80c
@@ -27,7 +27,8 @@ try:
    from functools import lru_cache
except ImportError:
    # Just a dummy decorator to get the checks to run on python2
    # because honestly I don't want to support a byte-level unicode BPE
    # tokenizer on python 2 right now.
    def lru_cache():
        return lambda func: func
@@ -48,6 +49,7 @@ VOCAB_NAME = 'vocab.json'
MERGES_NAME = 'merges.txt'
SPECIAL_TOKENS_NAME = 'special_tokens.txt'


@lru_cache()
def bytes_to_unicode():
    """
@@ -60,17 +62,19 @@ def bytes_to_unicode():
    And avoids mapping to whitespace/control characters the bpe code barfs on.
    """
    _chr = unichr if sys.version_info[0] == 2 else chr
    bs = list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + \
        list(range(ord("®"), ord("ÿ") + 1))
    cs = bs[:]
    n = 0
    for b in range(2**8):
        if b not in bs:
            bs.append(b)
            cs.append(2**8 + n)
            n += 1
    cs = [_chr(n) for n in cs]
    return dict(zip(bs, cs))
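As a hedged aside (not part of the diff), the mapping built above can be sanity-checked with a standalone Python 3 sketch: every byte gets a printable unicode stand-in and the mapping is invertible, which is what lets the tokenizer treat raw bytes as text.

def _bytes_to_unicode_sketch():
    # Same idea as bytes_to_unicode() above, rewritten standalone for Python 3.
    bs = list(range(ord("!"), ord("~") + 1)) + \
        list(range(ord("¡"), ord("¬") + 1)) + \
        list(range(ord("®"), ord("ÿ") + 1))
    cs = bs[:]
    n = 0
    for b in range(2**8):
        if b not in bs:
            bs.append(b)
            cs.append(2**8 + n)
            n += 1
    return dict(zip(bs, [chr(c) for c in cs]))


byte_encoder = _bytes_to_unicode_sketch()
byte_decoder = {v: k for k, v in byte_encoder.items()}
text = "hello world"
encoded = "".join(byte_encoder[b] for b in text.encode("utf-8"))
decoded = bytearray(byte_decoder[c] for c in encoded).decode("utf-8")
assert decoded == text and len(byte_encoder) == 256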
def get_pairs(word):
    """Return set of symbol pairs in a word.
@@ -83,6 +87,7 @@ def get_pairs(word):
        prev_char = char
    return pairs


class GPT2Tokenizer(object):
    """
    GPT-2 BPE tokenizer. Peculiarities:
@@ -138,23 +143,31 @@ class GPT2Tokenizer(object):
            special_tokens = open(special_tokens_file, encoding='utf-8').read().split('\n')[:-1]
        else:
            special_tokens = kwargs.pop('special_tokens', [])
        tokenizer = cls(
            resolved_vocab_file,
            resolved_merges_file,
            special_tokens=special_tokens,
            *inputs,
            **kwargs)
        return tokenizer

    def __init__(self, vocab_file, merges_file, errors='replace',
                 special_tokens=None, max_len=None):
        self.max_len = max_len if max_len is not None else int(1e12)
        self.encoder = json.load(open(vocab_file))
        self.decoder = {v: k for k, v in self.encoder.items()}
        self.errors = errors  # how to handle errors in decoding
        self.byte_encoder = bytes_to_unicode()
        self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
        bpe_data = open(merges_file, encoding='utf-8').read().split('\n')[1:-1]
        bpe_merges = [tuple(merge.split()) for merge in bpe_data]
        self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
        self.cache = {}

        # Should have added re.IGNORECASE so BPE merges can happen for
        # capitalized versions of contractions
        self.pat = re.compile(
            r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")

        self.special_tokens = {}
        self.special_tokens_decoder = {}
@@ -172,8 +185,9 @@ class GPT2Tokenizer(object):
            self.special_tokens = {}
            self.special_tokens_decoder = {}
            return
        self.special_tokens = dict((tok, len(self.encoder) + i)
                                   for i, tok in enumerate(special_tokens))
        self.special_tokens_decoder = {v: k for k, v in self.special_tokens.items()}
        logger.info("Special tokens {}".format(self.special_tokens))
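A tiny illustrative check (toy names, not from the repo) of how the dict comprehension above appends special-token ids after the regular vocabulary:

encoder = {'hello': 0, 'world': 1}          # stand-in for self.encoder
special_tokens = ['<pad>', '<eos>']
special = dict((tok, len(encoder) + i) for i, tok in enumerate(special_tokens))
special_decoder = {v: k for k, v in special.items()}
assert special == {'<pad>': 2, '<eos>': 3}
assert special_decoder[3] == '<eos>'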
    def bpe(self, token):
@@ -186,7 +200,7 @@ class GPT2Tokenizer(object):
            return token

        while True:
            bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float('inf')))
            if bigram not in self.bpe_ranks:
                break
            first, second = bigram
@@ -197,12 +211,12 @@ class GPT2Tokenizer(object):
                    j = word.index(first, i)
                    new_word.extend(word[i:j])
                    i = j
                except BaseException:
                    new_word.extend(word[i:])
                    break

                if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
                    new_word.append(first + second)
                    i += 2
                else:
                    new_word.append(word[i])
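For readers following the bpe() loop, here is a hedged, self-contained sketch of the pair-ranking step it relies on; the toy merge table below is invented for illustration only.

def get_pairs(word):
    # mirrors the helper defined earlier in this file
    pairs = set()
    prev_char = word[0]
    for char in word[1:]:
        pairs.add((prev_char, char))
        prev_char = char
    return pairs


bpe_ranks = {('l', 'o'): 0, ('lo', 'w'): 1}   # toy merge table
word = ('l', 'o', 'w')
bigram = min(get_pairs(word), key=lambda pair: bpe_ranks.get(pair, float('inf')))
assert bigram == ('l', 'o')   # the lowest-ranked adjacent pair is merged first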
@@ -245,7 +259,8 @@ class GPT2Tokenizer(object):
            logger.warning(
                "Token indices sequence length is longer than the specified maximum "
                " sequence length for this OpenAI GPT model ({} > {}). Running this"
                " sequence through the model will result in indexing errors".format(
                    len(ids), self.max_len)
            )
        return ids
...
@@ -99,19 +99,19 @@ class BertTokenizer(object):
            [(ids, tok) for tok, ids in self.vocab.items()])
        self.do_basic_tokenize = do_basic_tokenize
        if do_basic_tokenize:
            self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case,
                                                  never_split=never_split)
        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)
        self.max_len = max_len if max_len is not None else int(1e12)

    def tokenize(self, text):
        if self.do_basic_tokenize:
            split_tokens = []
            for token in self.basic_tokenizer.tokenize(text):
                for sub_token in self.wordpiece_tokenizer.tokenize(token):
                    split_tokens.append(sub_token)
        else:
            split_tokens = self.wordpiece_tokenizer.tokenize(text)
        return split_tokens
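The nesting in tokenize() can be exercised with toy stand-ins (these are not the repo's BasicTokenizer or WordpieceTokenizer, just objects shaped like them):

class ToyBasicTokenizer:
    def tokenize(self, text):
        return text.lower().split()


class ToyWordpieceTokenizer:
    def tokenize(self, token):
        # pretend every token longer than 4 chars splits into a 4-char head + "##"-tail
        if len(token) <= 4:
            return [token]
        return [token[:4], "##" + token[4:]]


basic, wordpiece = ToyBasicTokenizer(), ToyWordpieceTokenizer()
split_tokens = []
for token in basic.tokenize("Tokenization example"):
    for sub_token in wordpiece.tokenize(token):
        split_tokens.append(sub_token)
assert split_tokens == ["toke", "##nization", "exam", "##ple"]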
    def convert_tokens_to_ids(self, tokens):
@@ -123,7 +123,8 @@ class BertTokenizer(object):
            logger.warning(
                "Token indices sequence length is longer than the specified maximum "
                " sequence length for this BERT model ({} > {}). Running this"
                " sequence through BERT will result in indexing errors".format(
                    len(ids), self.max_len)
            )
        return ids
...
@@ -28,15 +28,17 @@ from megatron.module import MegatronModule
FLOAT_TYPES = (torch.FloatTensor, torch.cuda.FloatTensor)
HALF_TYPES = (torch.HalfTensor, torch.cuda.HalfTensor)


def conversion_helper(val, conversion):
    """Apply conversion to val. Recursively apply conversion if `val` is a nested tuple/list structure."""
    if not isinstance(val, (tuple, list)):
        return conversion(val)
    rtn = [conversion_helper(v, conversion) for v in val]
    if isinstance(val, tuple):
        rtn = tuple(rtn)
    return rtn


def fp32_to_fp16(val):
    """Convert fp32 `val` to fp16"""
    def half_conversion(val):
@@ -48,6 +50,7 @@ def fp32_to_fp16(val):
        return val
    return conversion_helper(val, half_conversion)


def fp16_to_fp32(val):
    """Convert fp16 `val` to fp32"""
    def float_conversion(val):
@@ -59,6 +62,7 @@ def fp16_to_fp32(val):
        return val
    return conversion_helper(val, float_conversion)
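A minimal, dependency-free sketch of the recursion in conversion_helper, duplicated here so the example runs on its own: the conversion is applied to every leaf of a nested tuple/list while the container types are preserved.

def conversion_helper(val, conversion):
    # apply `conversion` to every leaf, keeping tuples as tuples and lists as lists
    if not isinstance(val, (tuple, list)):
        return conversion(val)
    rtn = [conversion_helper(v, conversion) for v in val]
    if isinstance(val, tuple):
        rtn = tuple(rtn)
    return rtn


nested = (1, [2, 3], (4, [5]))
doubled = conversion_helper(nested, lambda x: 2 * x)
assert doubled == (2, [4, 6], (8, [10]))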
class FP16_Module(MegatronModule):
    def __init__(self, module):
        super(FP16_Module, self).__init__()
@@ -79,9 +83,11 @@ class FP16_Module(MegatronModule):
        self.module.load_state_dict(state_dict, strict=strict)


# TODO: Update overflow check + downscale to use Carl's fused kernel.
class FP16_Optimizer(object):
    """
    :class:`FP16_Optimizer` is designed to wrap an existing PyTorch optimizer,
    and manage static or dynamic loss scaling and master weights in a manner transparent to the user.
    For standard use, only two lines must be changed: creating the :class:`FP16_Optimizer` instance,
    and changing the call to ``backward``.
@@ -104,45 +110,45 @@ class FP16_Optimizer(object):
        optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
                                   # optional arg to control dynamic loss scaling behavior
                                   # dynamic_loss_args={'scale_window' : 500})
                                   # Usually, dynamic_loss_args is not necessary.

    Args:
        init_optimizer (torch.optim.optimizer): Existing optimizer created with the parameters to optimize. Internally, :class:`FP16_Optimizer` replaces the passed optimizer's fp16 parameters, if any, with fp32 master parameters copied from the original ones. :class:`FP16_Optimizer` also stores references to the original fp16 parameters, and updates these fp16 parameters from the master fp32 copy at the end of each :attr:`step`.
        static_loss_scale (float, optional, default=1.0): Loss scale used internally to scale gradients computed by the model. Any fp16 gradients will be copied to fp32, then downscaled before being applied to the fp32 master params, so ``static_loss_scale`` should not affect learning rate.
        dynamic_loss_scale (bool, optional, default=False): Use dynamic loss scaling. If True, this will override any ``static_loss_scale`` option.
        dynamic_loss_args (dict, optional, default=None): Dict of kwargs that will be forwarded to the internal :class:`DynamicLossScaler` instance's constructor. Keys of this dict must match kwargs accepted by :class:`DynamicLossScaler`'s constructor. If ``dynamic_loss_args`` is unspecified, :class:`DynamicLossScaler`'s defaults will be used.
        verbose (bool, optional, default=True): By default, FP16_Optimizer's constructor prints out the parameters and parameter groups it is ingesting, as a sanity check. If this becomes annoying (e.g. for large models), it can be disabled by passing ``verbose=False``. ``verbose=False`` will not disable printing when the loss scale is readjusted during dynamic loss scaling.

    ``init_optimizer`` is expected to have been constructed in the ordinary way.
    It is recommended (although not required) that the newly constructed :class:`FP16_Optimizer` instance be
    named to replace ``init_optimizer``, for two reasons:
    First, it means that references to the same name
    later in the file will not have to change.
    Second, :class:`FP16_Optimizer` reserves the right (as an implementation detail) to
    modify ``init_optimizer``. If you do choose a unique name for the new
    :class:`FP16_Optimizer` instance, you should only work with this new instance,
    because the preexisting optimizer might no longer behave as expected.

    ``init_optimizer`` may be any Pytorch optimizer.
    It may contain a mixture of fp16 and fp32 parameters organized into any number of
    ``param_groups`` with different hyperparameters. The :class:`FP16_Optimizer` constructor will
    ingest these ``param_groups`` and remember them.

    Calls to ::

        loss.backward()

    must be replaced with ::

        optimizer.backward(loss)

    because :class:`FP16_Optimizer` requires ownership of the backward pass to implement
    loss scaling and copies to master gradients.

    .. note::
        Loss scaling, either static or dynamic, is orthogonal to learning rate, because gradients
        are downscaled before being applied. This means that adjusting the loss scale, or using
        dynamic loss scaling, should not require retuning the learning rate or any other
        hyperparameters.
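A hedged end-to-end sketch of the two-line change described above, assuming CUDA is available and that FP16_Optimizer is importable from this repo's fp16 package (import path assumed, not confirmed by the diff):

import torch
from megatron.fp16 import FP16_Optimizer   # assumed import path

model = torch.nn.Linear(16, 16).cuda().half()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
optimizer = FP16_Optimizer(optimizer, static_loss_scale=128.0)   # change 1: wrap the optimizer

data = torch.randn(4, 16, device='cuda', dtype=torch.half)
loss = model(data).float().pow(2).mean()
optimizer.backward(loss)    # change 2: replaces loss.backward()
optimizer.step()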
@@ -152,7 +158,7 @@ class FP16_Optimizer(object):
    See docstring for :attr:`step`.

    **Gradient clipping**: Use :attr:`clip_master_grads`.

    **Multiple losses**: If your model accumulates gradients from multiple losses,
    this can be made more efficient by supplying ``update_master_grads=False``
    to :attr:`backward`. See docstring for :attr:`backward`.
@@ -163,19 +169,19 @@ class FP16_Optimizer(object):
        optimizer.loss_scale = new_loss_scale

    For static loss scaling, manually adjusting the loss scale over time is a reasonable
    thing to do. During later epochs, gradients may become smaller, and a
    higher loss scale may be required, analogous to scheduling the learning rate. Dynamic loss
    scaling is more subtle (see :class:`DynamicLossScaler`) and in this case, manually adjusting
    the loss scale is not recommended.

    **Multi_GPU training**: If the wrapped ``init_optimizer`` was created from a model wrapped in
    Pytorch DistributedDataParallel or Apex DistributedDataParallel, :class:`FP16_Optimizer`
    should still work as intended.
    """

    def __init__(self,
                 init_optimizer,
                 static_loss_scale=1.0,
                 dynamic_loss_scale=False,
                 dynamic_loss_args=None,
                 verbose=False):
@@ -212,7 +218,7 @@ class FP16_Optimizer(object):
                        # Reset existing state dict key to the new master param.
                        # We still need to recast per-param state tensors, if any, to FP32.
                        if param in self.optimizer.state:
                            self.optimizer.state[master_param] = self.optimizer.state.pop(param)
                    elif param.type() == 'torch.cuda.FloatTensor':
                        self.maybe_print("FP16_Optimizer received torch.cuda.FloatTensor with {}"
                                         .format(param.size()))
@@ -220,9 +226,9 @@ class FP16_Optimizer(object):
                        param_group['params'][i] = param
                    else:
                        raise TypeError("Wrapped parameters must be either "
                                        "torch.cuda.FloatTensor or torch.cuda.HalfTensor. "
                                        "Received {}".format(param.type()))

            self.fp16_groups.append(fp16_params_this_group)
            self.fp32_from_fp16_groups.append(fp32_from_fp16_params_this_group)
            self.fp32_from_fp32_groups.append(fp32_params_this_group)
@@ -250,7 +256,7 @@ class FP16_Optimizer(object):
    def maybe_print(self, msg):
        if self.verbose:
            print(msg)

    def __getstate__(self):
        raise RuntimeError("FP16_Optimizer should be serialized using state_dict().")
@@ -265,13 +271,13 @@ class FP16_Optimizer(object):
        # because gradients are copied into the FP32 master params. However, we zero
        # all gradients owned by the optimizer, just to be safe:
        for group in self.optimizer.param_groups:
            for p in group['params']:
                if set_grads_to_None:
                    p.grad = None
                else:
                    if p.grad is not None:
                        p.grad.detach_()
                        p.grad.zero_()

        # Zero fp16 gradients owned by the model:
        for fp16_group in self.fp16_groups:
@@ -280,11 +286,11 @@ class FP16_Optimizer(object):
                    param.grad = None
                else:
                    if param.grad is not None:
                        param.grad.detach_()  # as in torch.optim.optimizer.zero_grad()
                        param.grad.zero_()

    def _check_overflow(self):
        params = []
        for group in self.fp16_groups:
            for param in group:
                params.append(param)
@@ -304,8 +310,9 @@ class FP16_Optimizer(object):
        for fp16_group, fp32_from_fp16_group in zip(self.fp16_groups, self.fp32_from_fp16_groups):
            master_params_to_model_params(fp32_from_fp16_group, fp16_group)

    # To consider: Integrate distributed with this wrapper by registering a hook on each variable
    # that does the overflow check, gradient copy + downscale, and fp32
    # allreduce in a different stream.
    def _model_grads_to_master_grads(self):
        for fp16_group, fp32_from_fp16_group in zip(self.fp16_groups, self.fp32_from_fp16_groups):
            model_grads_to_master_grads(fp16_group, fp32_from_fp16_group)
@@ -315,7 +322,7 @@ class FP16_Optimizer(object):
        for group in self.optimizer.param_groups:
            for param in group['params']:
                if param.grad is not None:
                    param.grad.data.mul_(1. / self.loss_scale)

    def clip_master_grads(self, max_norm, norm_type=2):
        """
@@ -364,9 +371,9 @@ class FP16_Optimizer(object):
    def load_state_dict(self, state_dict):
        """
        Loads a state_dict created by an earlier call to state_dict().
        If ``fp16_optimizer_instance`` was constructed from some ``init_optimizer``,
        whose parameters in turn came from ``model``, it is expected that the user
        will call ``model.load_state_dict()`` before
        ``fp16_optimizer_instance.load_state_dict()`` is called.
@@ -387,33 +394,34 @@ class FP16_Optimizer(object):
        self.first_closure_call_this_step = state_dict['first_closure_call_this_step']
        self.optimizer.load_state_dict(state_dict['optimizer_state_dict'])
        # At this point, the optimizer's references to the model's fp32 parameters are up to date.
        # The optimizer's hyperparameters and internal buffers are also up to date.
        # However, the fp32 master copies of the model's fp16 params stored by the optimizer are still
        # out of date. There are two options.
        # 1: Refresh the master params from the model's fp16 params.
        # This requires less storage but incurs precision loss.
        # 2: Save and restore the fp32 master copies separately.
        # We choose option 2.
        #
        # Pytorch Optimizer.load_state_dict casts saved buffers (e.g. momentum) to the type and device
        # of their associated parameters, because it's possible those buffers might not exist yet in
        # the current optimizer instance. In our case, as long as the current FP16_Optimizer has been
        # constructed in the same way as the one whose state_dict we are loading, the same master params
        # are guaranteed to exist, so we can just copy_() from the saved master params.
        for current_group, saved_group in zip(
                self.fp32_from_fp16_groups, state_dict['fp32_from_fp16']):
            for current, saved in zip(current_group, saved_group):
                current.data.copy_(saved.data)
    def step(self, closure=None):  # could add clip option.
        """
        If no closure is supplied, :attr:`step` should be called after
        ``fp16_optimizer_obj.backward(loss)``.
        :attr:`step` updates the fp32 master copy of parameters using the optimizer supplied to
        :class:`FP16_Optimizer`'s constructor, then copies the updated fp32 params into the fp16 params
        originally referenced by :class:`FP16_Optimizer`'s constructor, so the user may immediately run
        another forward pass using their model.

        If a closure is supplied, :attr:`step` may be called without a prior call to
        :attr:`backward(loss)`.
        This control flow is identical to `ordinary Pytorch optimizer use`_ with closures.
        However, the user should take care that any ``loss.backward()`` call within the closure
@@ -424,7 +432,7 @@ class FP16_Optimizer(object):
        Example with closure::

            # optimizer is assumed to be an FP16_Optimizer object, previously constructed from an
            # existing pytorch optimizer.
            for input, target in dataset:
                def closure():
@@ -448,9 +456,9 @@ class FP16_Optimizer(object):
        if self.overflow:
            self.maybe_print("OVERFLOW! Skipping step. Attempted loss scale: {}, reducing to {}"
                             .format(scale, self.loss_scale))
            return

        if closure is not None:
            retval = self._step_with_closure(closure)
        else:
@@ -472,7 +480,7 @@ class FP16_Optimizer(object):
                    self.first_closure_call_this_step = False
                else:
                    # If self.optimizer.step() internally calls wrapped_closure more than once,
                    # it may update the fp32 params after each call. However, self.optimizer
                    # doesn't know about the fp16 params at all. If the fp32 params get updated,
                    # we can't rely on self.optimizer to refresh the fp16 params. We need
                    # to handle that manually:
@@ -480,16 +488,16 @@ class FP16_Optimizer(object):
            # Our API expects the user to give us ownership of the backward() call by
            # replacing all calls to loss.backward() with optimizer.backward(loss).
            # This requirement holds whether or not the call to backward() is made within a closure.
            # If the user is properly calling optimizer.backward(loss) within "closure,"
            # calling closure() here will give the fp32 master params fresh gradients
            # for the optimizer to play with, so all wrapped_closure needs to do is call
            # closure() and return the loss.
            temp_loss = closure()
            while(self.overflow):
                scale = self.loss_scaler.loss_scale
                self._update_scale(self.overflow)
                self.maybe_print("OVERFLOW within closure! Skipping step. Attempted loss scale: {}, "
                                 "reducing to {}".format(scale, self.loss_scale))
                temp_loss = closure()
            return temp_loss
@@ -500,7 +508,7 @@ class FP16_Optimizer(object):
        return retval

    def backward(self, loss, update_master_grads=True, retain_graph=False):
        """
        :attr:`backward` performs the following conceptual steps:

        1. fp32_loss = loss.float() (see first Note below)
@@ -514,19 +522,19 @@ class FP16_Optimizer(object):
        .. note::
            :attr:`backward` internally converts the loss to fp32 before applying the loss scale.
            This provides some additional safety against overflow if the user has supplied an
            fp16 loss value.
            However, for maximum overflow safety, the user should
            compute the loss criterion (MSE, cross entropy, etc) in fp32 before supplying it to
            :attr:`backward`.

        .. warning::
            The gradients found in a model's leaves after the call to
            :attr:`backward` should not be regarded as valid in general,
            because it's possible
            they have been scaled (and in the case of dynamic loss scaling,
            the scale factor may change over time).
            If the user wants to inspect gradients after a call to :attr:`backward`,
            only the master gradients should be regarded as valid. These can be retrieved via
            :attr:`inspect_master_grad_data()`.
@@ -541,54 +549,55 @@ class FP16_Optimizer(object):
            optimizer.backward(loss)

            # Naive operation with multiple losses (technically valid, but less efficient):
            # fp32 grads will be correct after the second call, but
            # the first call incurs an unnecessary fp16->fp32 grad copy.
            optimizer.backward(loss1)
            optimizer.backward(loss2)

            # More efficient way to handle multiple losses:
            # The fp16->fp32 grad copy is delayed until fp16 grads from all
            # losses have been accumulated.
            optimizer.backward(loss1, update_master_grads=False)
            optimizer.backward(loss2, update_master_grads=False)
            optimizer.update_master_grads()
        """
        # To consider: try multiple backward passes using retain_grad=True to find
        # a loss scale that works. After you find a loss scale that works, do a final dummy
        # backward pass with retain_graph=False to tear down the graph. Doing this would avoid
        # discarding the iteration, but probably wouldn't improve overall efficiency.
        self.loss_scaler.backward(loss.float(), retain_graph=retain_graph)
        if update_master_grads:
            self.update_master_grads()
    def update_master_grads(self):
        """
        Copy the ``.grad`` attribute from stored references to fp16 parameters to
        the ``.grad`` attribute of the fp32 master parameters that are directly
        updated by the optimizer. :attr:`update_master_grads` only needs to be called if
        ``fp16_optimizer_obj.backward`` was called with ``update_master_grads=False``.
        """
        if self.dynamic_loss_scale:
            self._check_overflow()
            if self.overflow:
                return
        self._model_grads_to_master_grads()
        self._downscale_master()

    def inspect_master_grad_data(self):
        """
        When running with :class:`FP16_Optimizer`,
        ``.grad`` attributes of a model's fp16 leaves should not be
        regarded as truthful, because they might be scaled.
        After a call to :attr:`fp16_optimizer_obj.backward(loss)`, if no overflow was encountered,
        the fp32 master params' ``.grad``
        attributes will contain valid gradients properly divided by the loss scale. However,
        because :class:`FP16_Optimizer` flattens some parameters, accessing them may be
        nonintuitive. :attr:`inspect_master_grad_data`
        allows those gradients to be viewed with shapes corresponding to their associated model leaves.

        Returns:
            List of lists (one list for each parameter group). The list for each parameter group
            is a list of the ``.grad.data`` attributes of the fp32 master params belonging to that group.
        """
        if self.overflow:
            print("Warning: calling FP16_Optimizer.inspect_master_grad_data while in an overflow state. "
@@ -607,8 +616,8 @@ class FP16_Optimizer(object):
            master_grads_data.append(master_grads_this_group)
        return master_grads_data

    # Promote loss scale so it can be retrieved or set via "fp16_optimizer_instance.loss_scale"
    def _get_loss_scale(self):
        return self.loss_scaler.loss_scale
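The "promote loss scale" comment refers to the standard property pattern; a toy version (hypothetical class names, not the repo's) looks like this:

class ToyScaler:
    def __init__(self, scale):
        self.loss_scale = scale


class ToyOptimizerWrapper:
    def __init__(self, scaler):
        self.loss_scaler = scaler

    def _get_loss_scale(self):
        return self.loss_scaler.loss_scale

    def _set_loss_scale(self, value):
        self.loss_scaler.loss_scale = value

    # reads and writes of wrapper.loss_scale are forwarded to the scaler
    loss_scale = property(_get_loss_scale, _set_loss_scale)


opt = ToyOptimizerWrapper(ToyScaler(128.0))
opt.loss_scale = 256.0
assert opt.loss_scaler.loss_scale == 256.0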
...
@@ -102,6 +102,7 @@ class FP16Model(nn.Module):
def backwards_debug_hook(grad):
    raise RuntimeError("master_params recieved a gradient in the backward pass!")


def prep_param_lists(model, flat_master=False):
    """
    Creates a list of FP32 master parameters for a given model, as in
@@ -131,9 +132,9 @@ def prep_param_lists(model, flat_master=False):
            # flatten_dense_tensors returns a contiguous flat array.
            # http://pytorch.org/docs/master/_modules/torch/_utils.html
            master_params = _flatten_dense_tensors([param.data for param in model_params]).float()
        except BaseException:
            print("Error in prep_param_lists: model may contain a mixture of parameters "
                  "of different types. Use flat_master=False, or use F16_Optimizer.")
            raise
        master_params = torch.nn.Parameter(master_params)
        master_params.requires_grad = True
@@ -150,7 +151,7 @@ def prep_param_lists(model, flat_master=False):
def model_grads_to_master_grads(model_params, master_params, flat_master=False):
    """
    Copy model gradients to master gradients.

    Args:
        model_params: List of model parameters created by :func:`prep_param_lists`.
@@ -179,7 +180,7 @@ def master_params_to_model_params(model_params, master_params, flat_master=False
        master_params: List of FP32 master parameters created by :func:`prep_param_lists`. If ``master_params`` was created with ``flat_master=True``, ``flat_master=True`` should also be supplied to :func:`master_params_to_model_params`.
    """
    if flat_master:
        for model, master in zip(model_params,
                                 _unflatten_dense_tensors(master_params[0].data, model_params)):
            model.data.copy_(master)
    else:
@@ -188,17 +189,19 @@ def master_params_to_model_params(model_params, master_params, flat_master=False
# Backward compatibility fixes


def to_python_float(t):
    if hasattr(t, 'item'):
        return t.item()
    else:
        return t[0]


TORCH_MAJOR = int(torch.__version__.split('.')[0])
TORCH_MINOR = int(torch.__version__.split('.')[1])

clip_grad_norm = mpu.clip_grad_norm
# elif TORCH_MAJOR == 0 and TORCH_MINOR <= 4:
#     clip_grad_norm = torch.nn.utils.clip_grad_norm
# else:
#     clip_grad_norm = torch.nn.utils.clip_grad_norm_
@@ -17,18 +17,21 @@ import torch
from megatron import mpu


# item() is a recent addition, so this helps with backward compatibility.
def to_python_float(t):
    if hasattr(t, 'item'):
        return t.item()
    else:
        return t[0]
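A quick check of the backward-compatibility shim above; torch is assumed to be installed:

import torch


def to_python_float(t):
    if hasattr(t, 'item'):
        return t.item()
    else:
        return t[0]


assert to_python_float(torch.tensor(3.5)) == 3.5   # modern path via .item()
assert to_python_float([2.0]) == 2.0               # old path via indexing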
class LossScaler:
    """
    Class that manages a static loss scale. This class is intended to interact with
    :class:`FP16_Optimizer`, and should not be directly manipulated by the user.

    Use of :class:`LossScaler` is enabled via the ``static_loss_scale`` argument to
    :class:`FP16_Optimizer`'s constructor.

    Args:
@@ -57,13 +60,14 @@ class LossScaler:
        return tuple(self.loss_scale * g for g in grad_in)

    def backward(self, loss, retain_graph=False):
        scaled_loss = loss * self.loss_scale
        scaled_loss.backward(retain_graph=retain_graph)
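A small numeric illustration (not from the repo) of what LossScaler.backward does: gradients come out multiplied by the loss scale, which is why the optimizer later divides them back down before applying them.

import torch

loss_scale = 8.0
w = torch.tensor([2.0], requires_grad=True)
loss = (w * 3.0).sum()          # dloss/dw = 3
scaled_loss = loss * loss_scale
scaled_loss.backward()
assert torch.allclose(w.grad, torch.tensor([3.0 * loss_scale]))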
class DynamicLossScaler:
    """
    Class that manages dynamic loss scaling. It is recommended to use :class:`DynamicLossScaler`
    indirectly, by supplying ``dynamic_loss_scale=True`` to the constructor of
    :class:`FP16_Optimizer`. However, it's important to understand how :class:`DynamicLossScaler`
    operates, because the default options can be changed using the
    ``dynamic_loss_args`` argument to :class:`FP16_Optimizer`'s constructor.
@@ -71,18 +75,18 @@ class DynamicLossScaler:
    Loss scaling is designed to combat the problem of underflowing gradients encountered at long
    times when training fp16 networks. Dynamic loss scaling begins by attempting a very high loss
    scale. Ironically, this may result in OVERflowing gradients. If overflowing gradients are
    encountered, :class:`DynamicLossScaler` informs :class:`FP16_Optimizer` that an overflow has
    occurred.
    :class:`FP16_Optimizer` then skips the update step for this particular iteration/minibatch,
    and :class:`DynamicLossScaler` adjusts the loss scale to a lower value.
    If a certain number of iterations occur without overflowing gradients detected,
    :class:`DynamicLossScaler` increases the loss scale once more.
    In this way :class:`DynamicLossScaler` attempts to "ride the edge" of
    always using the highest loss scale possible without incurring overflow.

    Args:
        init_scale (float, optional, default=2**32): Initial loss scale attempted by :class:`DynamicLossScaler.`
        scale_factor (float, optional, default=2.0): Factor used when adjusting the loss scale. If an overflow is encountered, the loss scale is readjusted to loss scale/``scale_factor``. If ``scale_window`` consecutive iterations take place without an overflow, the loss scale is readjusted to loss_scale*``scale_factor``.
        scale_window (int, optional, default=1000): Number of consecutive iterations without an overflow to wait before increasing the loss scale.
    """
@@ -122,12 +126,12 @@ class DynamicLossScaler:
            overflow = overflow_gpu[0].item()
        return bool(overflow)

    # `x` is a torch.Tensor
    def _has_inf_or_nan(x):
        try:
            # if x is half, the .float() incurs an additional deep copy, but it's necessary if
            # Pytorch's .sum() creates a one-element tensor of the same type as x
            # (which is true for some recent version of pytorch).
            cpu_sum = float(x.float().sum())
            # More efficient version that can be used if .sum() returns a Python scalar
@@ -158,7 +162,7 @@ class DynamicLossScaler:
        if overflow:
            # self.cur_scale /= self.scale_factor
            if self.delayed_shift == 1 or self.cur_hysteresis == 1:
                self.cur_scale = max(self.cur_scale / self.scale_factor, self.min_scale)
            else:
                self.cur_hysteresis -= 1
            self.last_overflow_iter = self.cur_iter
@@ -179,10 +183,11 @@ class DynamicLossScaler:
        return tuple(self.loss_scale * g for g in grad_in)

    def backward(self, loss, retain_graph=False):
        scaled_loss = loss * self.loss_scale
        scaled_loss.backward(retain_graph=retain_graph)


##############################################################
# Example usage below here -- assuming it's in a separate file
##############################################################
""" """
...@@ -218,10 +223,10 @@ if __name__ == "__main__": ...@@ -218,10 +223,10 @@ if __name__ == "__main__":
# Run backprop # Run backprop
optimizer.zero_grad() optimizer.zero_grad()
loss.backward() loss.backward()
# Check for overflow # Check for overflow
has_overflow = DynamicLossScaler.has_overflow(parameters) has_overflow = DynamicLossScaler.has_overflow(parameters)
# If no overflow, unscale grad and update as usual # If no overflow, unscale grad and update as usual
if not has_overflow: if not has_overflow:
for param in parameters: for param in parameters:
......
@@ -124,7 +124,7 @@ def _set_adlr_autoresume(args):
    sys.path.append(os.environ.get('SUBMIT_SCRIPTS', '.'))
    try:
        from userlib.auto_resume import AutoResume
    except BaseException:
        print('ADLR autoresume is not available, exiting ...')
        sys.exit()
...
@@ -48,7 +48,6 @@ class AnnealingLR(object):
        print_rank_0('> learning rate decay style: {}'.format(self.decay_style))

    def get_lr(self):
        """Learning rate decay functions from:
        https://openreview.net/pdf?id=BJYwwY9ll pg. 4"""
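The decay functions themselves fall outside this hunk; as a non-authoritative illustration of the kind of schedule that paper describes, a warmup-then-cosine curve can be sketched as follows (the repo's actual get_lr() may differ in details):

import math


def cosine_lr(step, start_lr, warmup_steps, total_steps, min_lr=0.0):
    # generic warmup + cosine decay sketch, clamped at min_lr
    if warmup_steps and step < warmup_steps:
        return start_lr * step / warmup_steps
    frac = (step - warmup_steps) / max(1, total_steps - warmup_steps)
    lr = 0.5 * start_lr * (1.0 + math.cos(math.pi * frac))
    return max(lr, min_lr)


assert cosine_lr(0, 1e-3, 100, 1000) == 0.0
assert abs(cosine_lr(550, 1e-3, 100, 1000) - 5e-4) < 1e-9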
...@@ -71,7 +70,6 @@ class AnnealingLR(object): ...@@ -71,7 +70,6 @@ class AnnealingLR(object):
lr = self.start_lr lr = self.start_lr
return max(lr, self.min_lr) return max(lr, self.min_lr)
def step(self, step_num=None): def step(self, step_num=None):
"""Set lr for all parameters groups.""" """Set lr for all parameters groups."""
if step_num is None: if step_num is None:
...@@ -81,7 +79,6 @@ class AnnealingLR(object): ...@@ -81,7 +79,6 @@ class AnnealingLR(object):
for group in self.optimizer.param_groups: for group in self.optimizer.param_groups:
group['lr'] = new_lr group['lr'] = new_lr
def state_dict(self): def state_dict(self):
state_dict = { state_dict = {
'start_lr': self.start_lr, 'start_lr': self.start_lr,
...@@ -93,7 +90,6 @@ class AnnealingLR(object): ...@@ -93,7 +90,6 @@ class AnnealingLR(object):
} }
return state_dict return state_dict
def _check_and_set(self, cls_value, sd_value, name): def _check_and_set(self, cls_value, sd_value, name):
"""Auxiliary function for checking the values in the checkpoint and """Auxiliary function for checking the values in the checkpoint and
setting them.""" setting them."""
...@@ -108,7 +104,6 @@ class AnnealingLR(object): ...@@ -108,7 +104,6 @@ class AnnealingLR(object):
name)) name))
return sd_value return sd_value
def load_state_dict(self, sd): def load_state_dict(self, sd):
self.start_lr = self._check_and_set(self.start_lr, sd['start_lr'], self.start_lr = self._check_and_set(self.start_lr, sd['start_lr'],
......
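For reference, the cosine decay style mentioned in the get_lr docstring can be sketched on its own as below; warmup and the exact clamping used by AnnealingLR are omitted, and the constants are illustrative:

# Stand-alone sketch of a cosine learning-rate decay schedule.
import math

def cosine_lr(step, total_steps, start_lr, min_lr=0.0):
    # Anneal from start_lr down to min_lr over total_steps.
    progress = min(step, total_steps) / total_steps
    lr = start_lr * 0.5 * (1.0 + math.cos(math.pi * progress))
    return max(lr, min_lr)

print(cosine_lr(0, 1000, 1e-4))      # ~1e-4 at the start
print(cosine_lr(500, 1000, 1e-4))    # ~5e-5 at the midpoint
print(cosine_lr(1000, 1000, 1e-4))   # reaches min_lr at the end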
...@@ -66,7 +66,6 @@ def bert_position_ids(token_ids): ...@@ -66,7 +66,6 @@ def bert_position_ids(token_ids):
return position_ids return position_ids
class BertLMHead(MegatronModule): class BertLMHead(MegatronModule):
"""Masked LM head for Bert """Masked LM head for Bert
...@@ -77,6 +76,7 @@ class BertLMHead(MegatronModule): ...@@ -77,6 +76,7 @@ class BertLMHead(MegatronModule):
layernorm_epsilon: tolerance for layer norm divisions layernorm_epsilon: tolerance for layer norm divisions
parallel_output: whether output logits are distributed or not. parallel_output: whether output logits are distributed or not.
""" """
def __init__(self, mpu_vocab_size, hidden_size, init_method, def __init__(self, mpu_vocab_size, hidden_size, init_method,
layernorm_epsilon, parallel_output): layernorm_epsilon, parallel_output):
...@@ -91,7 +91,6 @@ class BertLMHead(MegatronModule): ...@@ -91,7 +91,6 @@ class BertLMHead(MegatronModule):
self.dense = get_linear_layer(hidden_size, hidden_size, init_method) self.dense = get_linear_layer(hidden_size, hidden_size, init_method)
self.layernorm = LayerNorm(hidden_size, eps=layernorm_epsilon) self.layernorm = LayerNorm(hidden_size, eps=layernorm_epsilon)
def forward(self, hidden_states, word_embeddings_weight): def forward(self, hidden_states, word_embeddings_weight):
hidden_states = self.dense(hidden_states) hidden_states = self.dense(hidden_states)
hidden_states = gelu(hidden_states) hidden_states = gelu(hidden_states)
...@@ -103,7 +102,6 @@ class BertLMHead(MegatronModule): ...@@ -103,7 +102,6 @@ class BertLMHead(MegatronModule):
return output return output
class BertModel(MegatronModule): class BertModel(MegatronModule):
"""Bert Language model.""" """Bert Language model."""
...@@ -136,7 +134,6 @@ class BertModel(MegatronModule): ...@@ -136,7 +134,6 @@ class BertModel(MegatronModule):
init_method) init_method)
self._binary_head_key = 'binary_head' self._binary_head_key = 'binary_head'
def forward(self, input_ids, attention_mask, tokentype_ids=None): def forward(self, input_ids, attention_mask, tokentype_ids=None):
extended_attention_mask = bert_extended_attention_mask( extended_attention_mask = bert_extended_attention_mask(
...@@ -166,7 +163,6 @@ class BertModel(MegatronModule): ...@@ -166,7 +163,6 @@ class BertModel(MegatronModule):
return lm_logits, None return lm_logits, None
def state_dict_for_save_checkpoint(self, destination=None, prefix='', def state_dict_for_save_checkpoint(self, destination=None, prefix='',
keep_vars=False): keep_vars=False):
"""For easy load when model is combined with other heads, """For easy load when model is combined with other heads,
...@@ -184,7 +180,6 @@ class BertModel(MegatronModule): ...@@ -184,7 +180,6 @@ class BertModel(MegatronModule):
= self.binary_head.state_dict(destination, prefix, keep_vars) = self.binary_head.state_dict(destination, prefix, keep_vars)
return state_dict_ return state_dict_
def load_state_dict(self, state_dict, strict=True): def load_state_dict(self, state_dict, strict=True):
"""Customized load.""" """Customized load."""
......
...@@ -53,7 +53,6 @@ class Classification(MegatronModule): ...@@ -53,7 +53,6 @@ class Classification(MegatronModule):
init_method) init_method)
self._classification_head_key = 'classification_head' self._classification_head_key = 'classification_head'
def forward(self, input_ids, attention_mask, tokentype_ids): def forward(self, input_ids, attention_mask, tokentype_ids):
extended_attention_mask = bert_extended_attention_mask( extended_attention_mask = bert_extended_attention_mask(
...@@ -74,7 +73,6 @@ class Classification(MegatronModule): ...@@ -74,7 +73,6 @@ class Classification(MegatronModule):
return classification_logits return classification_logits
def state_dict_for_save_checkpoint(self, destination=None, prefix='', def state_dict_for_save_checkpoint(self, destination=None, prefix='',
keep_vars=False): keep_vars=False):
"""For easy load when model is combined with other heads, """For easy load when model is combined with other heads,
...@@ -89,7 +87,6 @@ class Classification(MegatronModule): ...@@ -89,7 +87,6 @@ class Classification(MegatronModule):
destination, prefix, keep_vars) destination, prefix, keep_vars)
return state_dict_ return state_dict_
def load_state_dict(self, state_dict, strict=True): def load_state_dict(self, state_dict, strict=True):
"""Customized load.""" """Customized load."""
......
...@@ -71,8 +71,8 @@ class DistributedDataParallel(MegatronModule): ...@@ -71,8 +71,8 @@ class DistributedDataParallel(MegatronModule):
def allreduce_hook(*unused): def allreduce_hook(*unused):
Variable._execution_engine.queue_callback(allreduce_params) Variable._execution_engine.queue_callback(allreduce_params)
# handle = param.register_hook(allreduce_hook) # handle = param.register_hook(allreduce_hook)
#self.hooks.append(allreduce_hook) # self.hooks.append(allreduce_hook)
#self.hook_handles.append(handle) # self.hook_handles.append(handle)
self.allreduce_params = allreduce_params self.allreduce_params = allreduce_params
def forward(self, *inputs, **kwargs): def forward(self, *inputs, **kwargs):
...@@ -114,4 +114,3 @@ class DistributedDataParallel(MegatronModule): ...@@ -114,4 +114,3 @@ class DistributedDataParallel(MegatronModule):
super(DistributedDataParallel, self).train(mode) super(DistributedDataParallel, self).train(mode)
self.module.train(mode) self.module.train(mode)
''' '''
...@@ -28,7 +28,7 @@ from .utils import scaled_init_method_normal ...@@ -28,7 +28,7 @@ from .utils import scaled_init_method_normal
def gpt2_attention_mask_func(attention_scores, ltor_mask): def gpt2_attention_mask_func(attention_scores, ltor_mask):
attention_scores = torch.mul(attention_scores, ltor_mask) - \ attention_scores = torch.mul(attention_scores, ltor_mask) - \
10000.0 * (1.0 - ltor_mask) 10000.0 * (1.0 - ltor_mask)
return attention_scores return attention_scores
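The masking above is additive: scores at disallowed positions are pushed to roughly -10000 so they contribute ~0 after softmax. A small check, assuming a 4-token left-to-right (lower-triangular) mask:

# Sketch of the additive causal mask used by gpt2_attention_mask_func.
import torch

scores = torch.randn(1, 1, 4, 4)                        # [b, np, s, s]
ltor_mask = torch.tril(torch.ones(4, 4)).view(1, 1, 4, 4)

masked = scores * ltor_mask - 10000.0 * (1.0 - ltor_mask)
probs = torch.softmax(masked, dim=-1)
print(probs[0, 0])   # upper-triangular (future) positions are ~0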
...@@ -49,7 +49,6 @@ class GPT2Model(MegatronModule): ...@@ -49,7 +49,6 @@ class GPT2Model(MegatronModule):
scaled_init_method=scaled_init_method_normal(args.init_method_std, scaled_init_method=scaled_init_method_normal(args.init_method_std,
args.num_layers)) args.num_layers))
def forward(self, input_ids, position_ids, attention_mask, def forward(self, input_ids, position_ids, attention_mask,
tokentype_ids=None, layer_past=None, get_key_value=False, tokentype_ids=None, layer_past=None, get_key_value=False,
forward_method_parallel_output=None): forward_method_parallel_output=None):
...@@ -79,7 +78,6 @@ class GPT2Model(MegatronModule): ...@@ -79,7 +78,6 @@ class GPT2Model(MegatronModule):
return output return output
def state_dict_for_save_checkpoint(self, destination=None, prefix='', def state_dict_for_save_checkpoint(self, destination=None, prefix='',
keep_vars=False): keep_vars=False):
...@@ -89,7 +87,6 @@ class GPT2Model(MegatronModule): ...@@ -89,7 +87,6 @@ class GPT2Model(MegatronModule):
destination, prefix, keep_vars) destination, prefix, keep_vars)
return state_dict_ return state_dict_
def load_state_dict(self, state_dict, strict=True): def load_state_dict(self, state_dict, strict=True):
"""Customized load.""" """Customized load."""
......
...@@ -62,7 +62,6 @@ def get_language_model(attention_mask_func, num_tokentypes, add_pooler, ...@@ -62,7 +62,6 @@ def get_language_model(attention_mask_func, num_tokentypes, add_pooler,
return language_model, language_model_key return language_model, language_model_key
class Pooler(MegatronModule): class Pooler(MegatronModule):
"""Pooler layer. """Pooler layer.
...@@ -74,11 +73,11 @@ class Pooler(MegatronModule): ...@@ -74,11 +73,11 @@ class Pooler(MegatronModule):
init_method: weight initialization method for the linear layer. init_method: weight initialization method for the linear layer.
bias is set to zero. bias is set to zero.
""" """
def __init__(self, hidden_size, init_method): def __init__(self, hidden_size, init_method):
super(Pooler, self).__init__() super(Pooler, self).__init__()
self.dense = get_linear_layer(hidden_size, hidden_size, init_method) self.dense = get_linear_layer(hidden_size, hidden_size, init_method)
def forward(self, hidden_states, sequence_index=0): def forward(self, hidden_states, sequence_index=0):
# hidden_states: [b, s, h] # hidden_states: [b, s, h]
# sequence_index: index of the token to pool. # sequence_index: index of the token to pool.
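The pooling itself is just indexing one position of the sequence; a minimal shape sketch follows (the real Pooler additionally applies a linear layer and tanh to the selected token):

# [b, s, h] -> [b, h] by selecting one token's hidden state.
import torch

hidden_states = torch.randn(2, 10, 64)
pooled = hidden_states[:, 0, :]   # sequence_index = 0
print(pooled.shape)               # torch.Size([2, 64])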
...@@ -101,6 +100,7 @@ class Embedding(MegatronModule): ...@@ -101,6 +100,7 @@ class Embedding(MegatronModule):
num_tokentypes: size of the token-type embeddings. 0 value num_tokentypes: size of the token-type embeddings. 0 value
will ignore this embedding will ignore this embedding
""" """
def __init__(self, def __init__(self,
hidden_size, hidden_size,
vocab_size, vocab_size,
...@@ -142,7 +142,6 @@ class Embedding(MegatronModule): ...@@ -142,7 +142,6 @@ class Embedding(MegatronModule):
# Embeddings dropout # Embeddings dropout
self.embedding_dropout = torch.nn.Dropout(embedding_dropout_prob) self.embedding_dropout = torch.nn.Dropout(embedding_dropout_prob)
def add_tokentype_embeddings(self, num_tokentypes): def add_tokentype_embeddings(self, num_tokentypes):
"""Add token-type embedding. This function is provided so we can add """Add token-type embedding. This function is provided so we can add
token-type embeddings in case the pretrained model does not have it. token-type embeddings in case the pretrained model does not have it.
...@@ -159,7 +158,6 @@ class Embedding(MegatronModule): ...@@ -159,7 +158,6 @@ class Embedding(MegatronModule):
# Initialize the token-type embeddings. # Initialize the token-type embeddings.
self.init_method(self.tokentype_embeddings.weight) self.init_method(self.tokentype_embeddings.weight)
def forward(self, input_ids, position_ids, tokentype_ids=None): def forward(self, input_ids, position_ids, tokentype_ids=None):
# Embeddings. # Embeddings.
words_embeddings = self.word_embeddings(input_ids) words_embeddings = self.word_embeddings(input_ids)
...@@ -176,7 +174,6 @@ class Embedding(MegatronModule): ...@@ -176,7 +174,6 @@ class Embedding(MegatronModule):
return embeddings return embeddings
def state_dict_for_save_checkpoint(self, destination=None, prefix='', def state_dict_for_save_checkpoint(self, destination=None, prefix='',
keep_vars=False): keep_vars=False):
"""For easy load.""" """For easy load."""
...@@ -194,7 +191,6 @@ class Embedding(MegatronModule): ...@@ -194,7 +191,6 @@ class Embedding(MegatronModule):
return state_dict_ return state_dict_
def load_state_dict(self, state_dict, strict=True): def load_state_dict(self, state_dict, strict=True):
"""Customized load.""" """Customized load."""
...@@ -223,7 +219,7 @@ class Embedding(MegatronModule): ...@@ -223,7 +219,7 @@ class Embedding(MegatronModule):
self.position_embeddings.load_state_dict(state_dict_, strict=strict) self.position_embeddings.load_state_dict(state_dict_, strict=strict)
# Tokentype embedding. # Tokentype embedding.
if self.num_tokentypes > 0: if self.num_tokentypes > 0:
state_dict_ = {} state_dict_ = {}
if self._tokentype_embeddings_key in state_dict: if self._tokentype_embeddings_key in state_dict:
state_dict_ = state_dict[self._tokentype_embeddings_key] state_dict_ = state_dict[self._tokentype_embeddings_key]
...@@ -241,7 +237,6 @@ class Embedding(MegatronModule): ...@@ -241,7 +237,6 @@ class Embedding(MegatronModule):
'checkpoint but could not find it', flush=True) 'checkpoint but could not find it', flush=True)
class TransformerLanguageModel(MegatronModule): class TransformerLanguageModel(MegatronModule):
"""Transformer language model. """Transformer language model.
...@@ -260,6 +255,7 @@ class TransformerLanguageModel(MegatronModule): ...@@ -260,6 +255,7 @@ class TransformerLanguageModel(MegatronModule):
num_tokentypes: size of the token-type embeddings. 0 value num_tokentypes: size of the token-type embeddings. 0 value
will ignore this embedding will ignore this embedding
""" """
def __init__(self, def __init__(self,
attention_mask_func, attention_mask_func,
mlp_activation_func, mlp_activation_func,
...@@ -295,7 +291,6 @@ class TransformerLanguageModel(MegatronModule): ...@@ -295,7 +291,6 @@ class TransformerLanguageModel(MegatronModule):
self.pooler = Pooler(self.hidden_size, self.init_method) self.pooler = Pooler(self.hidden_size, self.init_method)
self._pooler_key = 'pooler' self._pooler_key = 'pooler'
def forward(self, input_ids, position_ids, attention_mask, def forward(self, input_ids, position_ids, attention_mask,
tokentype_ids=None, layer_past=None, get_key_value=False, tokentype_ids=None, layer_past=None, get_key_value=False,
pooling_sequence_index=0): pooling_sequence_index=0):
...@@ -317,7 +312,6 @@ class TransformerLanguageModel(MegatronModule): ...@@ -317,7 +312,6 @@ class TransformerLanguageModel(MegatronModule):
return transformer_output return transformer_output
def state_dict_for_save_checkpoint(self, destination=None, prefix='', def state_dict_for_save_checkpoint(self, destination=None, prefix='',
keep_vars=False): keep_vars=False):
"""For easy load.""" """For easy load."""
...@@ -336,7 +330,6 @@ class TransformerLanguageModel(MegatronModule): ...@@ -336,7 +330,6 @@ class TransformerLanguageModel(MegatronModule):
return state_dict_ return state_dict_
def load_state_dict(self, state_dict, strict=True): def load_state_dict(self, state_dict, strict=True):
"""Customized load.""" """Customized load."""
......
...@@ -51,7 +51,6 @@ class MultipleChoice(MegatronModule): ...@@ -51,7 +51,6 @@ class MultipleChoice(MegatronModule):
init_method) init_method)
self._multichoice_head_key = 'multichoice_head' self._multichoice_head_key = 'multichoice_head'
def forward(self, input_ids, attention_mask, tokentype_ids): def forward(self, input_ids, attention_mask, tokentype_ids):
# [batch, choices, sequence] --> [batch * choices, sequence] --> # [batch, choices, sequence] --> [batch * choices, sequence] -->
...@@ -86,7 +85,6 @@ class MultipleChoice(MegatronModule): ...@@ -86,7 +85,6 @@ class MultipleChoice(MegatronModule):
return multichoice_logits return multichoice_logits
def state_dict_for_save_checkpoint(self, destination=None, prefix='', def state_dict_for_save_checkpoint(self, destination=None, prefix='',
keep_vars=False): keep_vars=False):
"""For easy load when model is combined with other heads, """For easy load when model is combined with other heads,
...@@ -101,7 +99,6 @@ class MultipleChoice(MegatronModule): ...@@ -101,7 +99,6 @@ class MultipleChoice(MegatronModule):
destination, prefix, keep_vars) destination, prefix, keep_vars)
return state_dict_ return state_dict_
def load_state_dict(self, state_dict, strict=True): def load_state_dict(self, state_dict, strict=True):
"""Customized load.""" """Customized load."""
......
...@@ -46,6 +46,7 @@ from megatron.module import MegatronModule ...@@ -46,6 +46,7 @@ from megatron.module import MegatronModule
unmasked-attention-scores, attention-mask) unmasked-attention-scores, attention-mask)
""" """
class ParallelMLP(MegatronModule): class ParallelMLP(MegatronModule):
"""MLP. """MLP.
...@@ -63,7 +64,7 @@ class ParallelMLP(MegatronModule): ...@@ -63,7 +64,7 @@ class ParallelMLP(MegatronModule):
# Project to 4h. # Project to 4h.
self.dense_h_to_4h = mpu.ColumnParallelLinear( self.dense_h_to_4h = mpu.ColumnParallelLinear(
args.hidden_size, args.hidden_size,
4*args.hidden_size, 4 * args.hidden_size,
gather_output=False, gather_output=False,
init_method=init_method) init_method=init_method)
...@@ -71,14 +72,13 @@ class ParallelMLP(MegatronModule): ...@@ -71,14 +72,13 @@ class ParallelMLP(MegatronModule):
# Project back to h. # Project back to h.
self.dense_4h_to_h = mpu.RowParallelLinear( self.dense_4h_to_h = mpu.RowParallelLinear(
4*args.hidden_size, 4 * args.hidden_size,
args.hidden_size, args.hidden_size,
input_is_parallel=True, input_is_parallel=True,
init_method=output_layer_init_method) init_method=output_layer_init_method)
self.dropout = torch.nn.Dropout(args.hidden_dropout) self.dropout = torch.nn.Dropout(args.hidden_dropout)
def forward(self, hidden_states): def forward(self, hidden_states):
# [b, s, 4hp] # [b, s, 4hp]
...@@ -91,13 +91,13 @@ class ParallelMLP(MegatronModule): ...@@ -91,13 +91,13 @@ class ParallelMLP(MegatronModule):
return output return output
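The MLP above expands the hidden size by 4x and projects back down; a single-GPU sketch of the same shape using plain nn.Linear in place of the mpu column/row-parallel layers (sizes are illustrative):

# Non-parallel sketch of the h -> 4h -> h MLP block.
import torch

class SimpleMLP(torch.nn.Module):
    def __init__(self, hidden_size, dropout=0.1):
        super().__init__()
        self.dense_h_to_4h = torch.nn.Linear(hidden_size, 4 * hidden_size)
        self.dense_4h_to_h = torch.nn.Linear(4 * hidden_size, hidden_size)
        self.dropout = torch.nn.Dropout(dropout)

    def forward(self, hidden_states):                                    # [b, s, h]
        x = torch.nn.functional.gelu(self.dense_h_to_4h(hidden_states))  # [b, s, 4h]
        return self.dropout(self.dense_4h_to_h(x))                       # [b, s, h]

out = SimpleMLP(hidden_size=64)(torch.randn(2, 8, 64))
print(out.shape)   # torch.Size([2, 8, 64])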
class ParallelSelfAttention(MegatronModule): class ParallelSelfAttention(MegatronModule):
"""Parallel self-attention layer abstract class. """Parallel self-attention layer abstract class.
Self-attention layer takes input with size [b, s, h] Self-attention layer takes input with size [b, s, h]
and returns output of the same size. and returns output of the same size.
""" """
def __init__(self, attention_mask_func, init_method, def __init__(self, attention_mask_func, init_method,
output_layer_init_method, layer_number): output_layer_init_method, layer_number):
super(ParallelSelfAttention, self).__init__() super(ParallelSelfAttention, self).__init__()
...@@ -123,7 +123,7 @@ class ParallelSelfAttention(MegatronModule): ...@@ -123,7 +123,7 @@ class ParallelSelfAttention(MegatronModule):
# Strided linear layer. # Strided linear layer.
self.query_key_value = mpu.ColumnParallelLinear( self.query_key_value = mpu.ColumnParallelLinear(
args.hidden_size, args.hidden_size,
3*args.hidden_size, 3 * args.hidden_size,
stride=3, stride=3,
gather_output=False, gather_output=False,
init_method=init_method) init_method=init_method)
...@@ -141,18 +141,16 @@ class ParallelSelfAttention(MegatronModule): ...@@ -141,18 +141,16 @@ class ParallelSelfAttention(MegatronModule):
init_method=output_layer_init_method) init_method=output_layer_init_method)
self.output_dropout = torch.nn.Dropout(args.hidden_dropout) self.output_dropout = torch.nn.Dropout(args.hidden_dropout)
def _transpose_for_scores(self, tensor): def _transpose_for_scores(self, tensor):
"""Transpose a 3D tensor [b, s, np*hn] into a 4D tensor with """Transpose a 3D tensor [b, s, np*hn] into a 4D tensor with
size [b, np, s, hn]. size [b, np, s, hn].
""" """
new_tensor_shape = tensor.size()[:-1] + \ new_tensor_shape = tensor.size()[:-1] + \
(self.num_attention_heads_per_partition, (self.num_attention_heads_per_partition,
self.hidden_size_per_attention_head) self.hidden_size_per_attention_head)
tensor = tensor.view(*new_tensor_shape) tensor = tensor.view(*new_tensor_shape)
return tensor.permute(0, 2, 1, 3) return tensor.permute(0, 2, 1, 3)
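A quick shape check of that reshape, with illustrative sizes (b=2, s=5, np=4, hn=8):

# [b, s, np*hn] -> [b, np, s, hn]
import torch

b, s, np_, hn = 2, 5, 4, 8
tensor = torch.randn(b, s, np_ * hn)
tensor = tensor.view(b, s, np_, hn).permute(0, 2, 1, 3)
print(tensor.shape)   # torch.Size([2, 4, 5, 8])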
def _get_query_key_value(self, hidden_states): def _get_query_key_value(self, hidden_states):
"""Get query, key, and value and transpose to """Get query, key, and value and transpose to
get size [b, np, s, hn]. get size [b, np, s, hn].
...@@ -170,7 +168,6 @@ class ParallelSelfAttention(MegatronModule): ...@@ -170,7 +168,6 @@ class ParallelSelfAttention(MegatronModule):
return query_layer, key_layer, value_layer return query_layer, key_layer, value_layer
def _get_unmasked_attention_scores(self, query_layer, key_layer): def _get_unmasked_attention_scores(self, query_layer, key_layer):
"""Unmasked attention scores with size [b, np, s, s].""" """Unmasked attention scores with size [b, np, s, s]."""
coeff = 1 coeff = 1
...@@ -179,9 +176,8 @@ class ParallelSelfAttention(MegatronModule): ...@@ -179,9 +176,8 @@ class ParallelSelfAttention(MegatronModule):
norm_factor = math.sqrt(coeff * norm_factor = math.sqrt(coeff *
math.sqrt(self.hidden_size_per_attention_head)) math.sqrt(self.hidden_size_per_attention_head))
# Raw attention scores. [b, np, s, s] # Raw attention scores. [b, np, s, s]
return torch.matmul(query_layer/norm_factor, return torch.matmul(query_layer / norm_factor,
key_layer.transpose(-1, -2)/norm_factor) key_layer.transpose(-1, -2) / norm_factor)
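Dividing query and key each by norm_factor is equivalent to dividing the raw scores by coeff * sqrt(hn), the usual scaled-dot-product factor, while keeping the intermediate magnitudes smaller. A numerical check with illustrative sizes:

# Verify (q / n) @ (k^T / n) == (q @ k^T) / (coeff * sqrt(hn)) with n = sqrt(coeff * sqrt(hn)).
import math
import torch

hn, coeff = 64, 1.0
q, k = torch.randn(2, 4, 5, hn), torch.randn(2, 4, 5, hn)
norm_factor = math.sqrt(coeff * math.sqrt(hn))

a = torch.matmul(q / norm_factor, k.transpose(-1, -2) / norm_factor)
b = torch.matmul(q, k.transpose(-1, -2)) / (coeff * math.sqrt(hn))
print(torch.allclose(a, b, atol=1e-5))   # True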
def _get_attention_probs(self, attention_scores): def _get_attention_probs(self, attention_scores):
"""Attention probabilies with dropout. The output has """Attention probabilies with dropout. The output has
...@@ -198,7 +194,6 @@ class ParallelSelfAttention(MegatronModule): ...@@ -198,7 +194,6 @@ class ParallelSelfAttention(MegatronModule):
return attention_probs return attention_probs
def _get_attended_context(self, attention_probs, value_layer): def _get_attended_context(self, attention_probs, value_layer):
"""Final attended tesnor and transposed back to [b, s, hp].""" """Final attended tesnor and transposed back to [b, s, hp]."""
# Context layer. # Context layer.
...@@ -207,13 +202,12 @@ class ParallelSelfAttention(MegatronModule): ...@@ -207,13 +202,12 @@ class ParallelSelfAttention(MegatronModule):
# [b, s, np, hn] # [b, s, np, hn]
context_layer = context_layer.permute(0, 2, 1, 3).contiguous() context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
new_context_layer_shape = context_layer.size()[:-2] + \ new_context_layer_shape = context_layer.size()[:-2] + \
(self.hidden_size_per_partition,) (self.hidden_size_per_partition,)
# [b, s, hp] # [b, s, hp]
context_layer = context_layer.view(*new_context_layer_shape) context_layer = context_layer.view(*new_context_layer_shape)
return context_layer return context_layer
def _get_output(self, context_layer): def _get_output(self, context_layer):
"""Output layer with dropout.""" """Output layer with dropout."""
# Output. [b, s, h] # Output. [b, s, h]
...@@ -222,7 +216,6 @@ class ParallelSelfAttention(MegatronModule): ...@@ -222,7 +216,6 @@ class ParallelSelfAttention(MegatronModule):
return output return output
def forward(self, hidden_states, attention_mask, layer_past=None, def forward(self, hidden_states, attention_mask, layer_past=None,
get_key_value=False): get_key_value=False):
# hidden_states: [b, s, h] # hidden_states: [b, s, h]
...@@ -254,7 +247,7 @@ class ParallelSelfAttention(MegatronModule): ...@@ -254,7 +247,7 @@ class ParallelSelfAttention(MegatronModule):
if layer_past is not None: if layer_past is not None:
attention_mask = attention_mask[ attention_mask = attention_mask[
..., ...,
attention_scores.size(3)-1, attention_scores.size(3) - 1,
:attention_scores.size(3)].unsqueeze(2) :attention_scores.size(3)].unsqueeze(2)
else: else:
attention_mask = attention_mask[ attention_mask = attention_mask[
...@@ -283,13 +276,13 @@ class ParallelSelfAttention(MegatronModule): ...@@ -283,13 +276,13 @@ class ParallelSelfAttention(MegatronModule):
return output return output
class ParallelTransformerLayer(MegatronModule): class ParallelTransformerLayer(MegatronModule):
"""A single transformer layer. """A single transformer layer.
Transformer layer takes input with size [b, s, h] and returns an Transformer layer takes input with size [b, s, h] and returns an
output of the same size. output of the same size.
""" """
def __init__(self, attention_mask_func, mlp_activation_func, def __init__(self, attention_mask_func, mlp_activation_func,
init_method, output_layer_init_method, layer_number): init_method, output_layer_init_method, layer_number):
args = get_args() args = get_args()
...@@ -319,7 +312,6 @@ class ParallelTransformerLayer(MegatronModule): ...@@ -319,7 +312,6 @@ class ParallelTransformerLayer(MegatronModule):
self.mlp = ParallelMLP(mlp_activation_func, init_method, self.mlp = ParallelMLP(mlp_activation_func, init_method,
output_layer_init_method) output_layer_init_method)
def forward(self, hidden_states, attention_mask, layer_past=None, def forward(self, hidden_states, attention_mask, layer_past=None,
get_key_value=False): get_key_value=False):
# hidden_states: [b, s, h] # hidden_states: [b, s, h]
...@@ -375,14 +367,13 @@ class ParallelTransformer(MegatronModule): ...@@ -375,14 +367,13 @@ class ParallelTransformer(MegatronModule):
# Transformer layers. # Transformer layers.
self.layers = torch.nn.ModuleList( self.layers = torch.nn.ModuleList(
[get_layer(i+1) for i in range(args.num_layers)]) [get_layer(i + 1) for i in range(args.num_layers)])
# Final layer norm before output. # Final layer norm before output.
self.final_layernorm = LayerNorm( self.final_layernorm = LayerNorm(
args.hidden_size, args.hidden_size,
eps=args.layernorm_epsilon) eps=args.layernorm_epsilon)
def _checkpointed_forward(self, hidden_states, attention_mask): def _checkpointed_forward(self, hidden_states, attention_mask):
"""Forward method with activation checkpointing.""" """Forward method with activation checkpointing."""
def custom(start, end): def custom(start, end):
...@@ -398,13 +389,12 @@ class ParallelTransformer(MegatronModule): ...@@ -398,13 +389,12 @@ class ParallelTransformer(MegatronModule):
num_layers = len(self.layers) num_layers = len(self.layers)
while l < num_layers: while l < num_layers:
hidden_states = mpu.checkpoint( hidden_states = mpu.checkpoint(
custom(l, l+self.checkpoint_num_layers), custom(l, l + self.checkpoint_num_layers),
hidden_states, attention_mask) hidden_states, attention_mask)
l += self.checkpoint_num_layers l += self.checkpoint_num_layers
return hidden_states return hidden_states
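The same chunked checkpointing pattern can be sketched with torch.utils.checkpoint in place of mpu.checkpoint (the mpu version additionally tracks the model-parallel RNG state); layer count and sizes below are illustrative:

# Recompute each chunk of layers during backward instead of storing activations.
import torch
from torch.utils.checkpoint import checkpoint

layers = torch.nn.ModuleList([torch.nn.Linear(32, 32) for _ in range(8)])
chunk = 2

def custom(start, end):
    def forward(x):
        for layer in layers[start:end]:
            x = torch.relu(layer(x))
        return x
    return forward

x = torch.randn(4, 32, requires_grad=True)
l = 0
while l < len(layers):
    x = checkpoint(custom(l, l + chunk), x)   # this chunk is recomputed in backward
    l += chunk
x.sum().backward()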
def forward(self, hidden_states, attention_mask, layer_past=None, def forward(self, hidden_states, attention_mask, layer_past=None,
get_key_value=False): get_key_value=False):
......
...@@ -33,6 +33,7 @@ def init_method_normal(sigma): ...@@ -33,6 +33,7 @@ def init_method_normal(sigma):
def scaled_init_method_normal(sigma, num_layers): def scaled_init_method_normal(sigma, num_layers):
"""Init method based on N(0, sigma/sqrt(2*num_layers).""" """Init method based on N(0, sigma/sqrt(2*num_layers)."""
std = sigma / math.sqrt(2.0 * num_layers) std = sigma / math.sqrt(2.0 * num_layers)
def init_(tensor): def init_(tensor):
return torch.nn.init.normal_(tensor, mean=0.0, std=std) return torch.nn.init.normal_(tensor, mean=0.0, std=std)
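As a worked example of the scaling (numbers are illustrative, not defaults from this repo): with sigma = 0.02 and 24 layers, the output-layer std becomes 0.02 / sqrt(48):

import math
sigma, num_layers = 0.02, 24
print(sigma / math.sqrt(2.0 * num_layers))   # ~0.00289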
...@@ -54,6 +55,7 @@ def gelu_impl(x): ...@@ -54,6 +55,7 @@ def gelu_impl(x):
return 0.5 * x * (1.0 + torch.tanh(0.7978845608028654 * x * return 0.5 * x * (1.0 + torch.tanh(0.7978845608028654 * x *
(1.0 + 0.044715 * x * x))) (1.0 + 0.044715 * x * x)))
def gelu(x): def gelu(x):
return gelu_impl(x) return gelu_impl(x)
......
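gelu_impl above is the tanh approximation of GELU, 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3))). Assuming a PyTorch version where F.gelu accepts approximate='tanh', the two agree numerically:

# Compare the hand-written tanh GELU against PyTorch's tanh-approximated GELU.
import torch

def gelu_impl(x):
    return 0.5 * x * (1.0 + torch.tanh(0.7978845608028654 * x *
                                       (1.0 + 0.044715 * x * x)))

x = torch.linspace(-3, 3, 7)
print(torch.allclose(gelu_impl(x),
                     torch.nn.functional.gelu(x, approximate='tanh'),
                     atol=1e-6))   # True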
...@@ -21,11 +21,9 @@ import torch ...@@ -21,11 +21,9 @@ import torch
class MegatronModule(torch.nn.Module): class MegatronModule(torch.nn.Module):
"""Megatron specific extentions of torch Module.""" """Megatron specific extentions of torch Module."""
def __init__(self): def __init__(self):
super(MegatronModule, self).__init__() super(MegatronModule, self).__init__()
def state_dict_for_save_checkpoint(self, destination=None, prefix='', def state_dict_for_save_checkpoint(self, destination=None, prefix='',
keep_vars=False): keep_vars=False):
"""Use this function to override the state dict for """Use this function to override the state dict for
......
...@@ -72,7 +72,6 @@ class _VocabParallelCrossEntropy(torch.autograd.Function): ...@@ -72,7 +72,6 @@ class _VocabParallelCrossEntropy(torch.autograd.Function):
op=torch.distributed.ReduceOp.SUM, op=torch.distributed.ReduceOp.SUM,
group=get_model_parallel_group()) group=get_model_parallel_group())
# Loss = log(sum(exp(logits))) - predicted-logit. # Loss = log(sum(exp(logits))) - predicted-logit.
loss = torch.log(sum_exp_logits) - predicted_logits loss = torch.log(sum_exp_logits) - predicted_logits
......
...@@ -89,6 +89,7 @@ class VocabParallelEmbedding(torch.nn.Module): ...@@ -89,6 +89,7 @@ class VocabParallelEmbedding(torch.nn.Module):
embedding_dim: size of hidden state. embedding_dim: size of hidden state.
init_method: method to initialize weights. init_method: method to initialize weights.
""" """
def __init__(self, num_embeddings, embedding_dim, def __init__(self, num_embeddings, embedding_dim,
init_method=init.xavier_normal_): init_method=init.xavier_normal_):
super(VocabParallelEmbedding, self).__init__() super(VocabParallelEmbedding, self).__init__()
...@@ -108,7 +109,7 @@ class VocabParallelEmbedding(torch.nn.Module): ...@@ -108,7 +109,7 @@ class VocabParallelEmbedding(torch.nn.Module):
self.num_embeddings, get_model_parallel_rank(), self.num_embeddings, get_model_parallel_rank(),
get_model_parallel_world_size()) get_model_parallel_world_size())
self.num_embeddings_per_partition = self.vocab_end_index - \ self.num_embeddings_per_partition = self.vocab_end_index - \
self.vocab_start_index self.vocab_start_index
# Allocate weights. # Allocate weights.
self.weight = Parameter(torch.Tensor(self.num_embeddings_per_partition, self.weight = Parameter(torch.Tensor(self.num_embeddings_per_partition,
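The partitioning arithmetic above gives each model-parallel rank a contiguous slice of the vocabulary. A stand-alone sketch of that arithmetic with illustrative sizes (the helper name below is hypothetical, and it assumes the vocab size divides evenly by the world size):

# Each rank owns [start, end) of the vocabulary.
def vocab_range(global_vocab_size, rank, world_size):
    per_partition = global_vocab_size // world_size
    start = rank * per_partition
    return start, start + per_partition

start, end = vocab_range(50304, rank=1, world_size=4)
print(start, end, end - start)   # 12576 25152 12576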
...@@ -147,6 +148,7 @@ class ParallelEmbedding(torch.nn.Module): ...@@ -147,6 +148,7 @@ class ParallelEmbedding(torch.nn.Module):
embedding_dim: size of hidden state. embedding_dim: size of hidden state.
init_method: method to initialize weights. init_method: method to initialize weights.
""" """
def __init__(self, num_embeddings, embedding_dim, def __init__(self, num_embeddings, embedding_dim,
init_method=init.xavier_normal_, init_method=init.xavier_normal_,
keep_master_weight_for_test=False): keep_master_weight_for_test=False):
...@@ -205,6 +207,7 @@ class ColumnParallelLinear(torch.nn.Module): ...@@ -205,6 +207,7 @@ class ColumnParallelLinear(torch.nn.Module):
set to False. It returns the master weights set to False. It returns the master weights
used for initialization. used for initialization.
""" """
def __init__(self, input_size, output_size, bias=True, gather_output=True, def __init__(self, input_size, output_size, bias=True, gather_output=True,
init_method=init.xavier_normal_, stride=1, init_method=init.xavier_normal_, stride=1,
keep_master_weight_for_test=False): keep_master_weight_for_test=False):
...@@ -279,6 +282,7 @@ class RowParallelLinear(torch.nn.Module): ...@@ -279,6 +282,7 @@ class RowParallelLinear(torch.nn.Module):
set to False. It returns the master weights set to False. It returns the master weights
used for initialization. used for initialization.
""" """
def __init__(self, input_size, output_size, bias=True, def __init__(self, input_size, output_size, bias=True,
input_is_parallel=False, input_is_parallel=False,
init_method=init.xavier_normal_, stride=1, init_method=init.xavier_normal_, stride=1,
...@@ -327,4 +331,3 @@ class RowParallelLinear(torch.nn.Module): ...@@ -327,4 +331,3 @@ class RowParallelLinear(torch.nn.Module):
else: else:
output = output_ output = output_
return output return output
...@@ -131,11 +131,14 @@ class _GatherFromModelParallelRegion(torch.autograd.Function): ...@@ -131,11 +131,14 @@ class _GatherFromModelParallelRegion(torch.autograd.Function):
def copy_to_model_parallel_region(input_): def copy_to_model_parallel_region(input_):
return _CopyToModelParallelRegion.apply(input_) return _CopyToModelParallelRegion.apply(input_)
def reduce_from_model_parallel_region(input_): def reduce_from_model_parallel_region(input_):
return _ReduceFromModelParallelRegion.apply(input_) return _ReduceFromModelParallelRegion.apply(input_)
def scatter_to_model_parallel_region(input_): def scatter_to_model_parallel_region(input_):
return _ScatterToModelParallelRegion.apply(input_) return _ScatterToModelParallelRegion.apply(input_)
def gather_from_model_parallel_region(input_): def gather_from_model_parallel_region(input_):
return _GatherFromModelParallelRegion.apply(input_) return _GatherFromModelParallelRegion.apply(input_)
...@@ -73,6 +73,7 @@ class CudaRNGStatesTracker: ...@@ -73,6 +73,7 @@ class CudaRNGStatesTracker:
rng state, we can perform operations and return to our starting rng state, we can perform operations and return to our starting
cuda state. cuda state.
""" """
def __init__(self): def __init__(self):
# Map from a string name to the cuda rng state. # Map from a string name to the cuda rng state.
self.states_ = {} self.states_ = {}
......