Merge master into realm-mlm

5e56e563 · Neel Kant · 6c0a5bd8 · 569b3dab · 5e56e563 · 5e56e563
Commit 5e56e563 authored Apr 28, 2020 by Neel Kant
20 changed files
--- a/megatron/deprecated_data_utils/tokenization.py
+++ b/megatron/deprecated_data_utils/tokenization.py
--- a/megatron/deprecated_data_utils/tokenization_gpt2.py
+++ b/megatron/deprecated_data_utils/tokenization_gpt2.py
@@ -27,7 +27,8 @@ try:
    from functools import lru_cache
 except ImportError:
    # Just a dummy decorator to get the checks to run on python2
-    # because honestly I don't want to support a byte-level unicode BPE tokenizer on python 2 right now.
+    # because honestly I don't want to support a byte-level unicode BPE
+    # tokenizer on python 2 right now.
    def lru_cache():
        return lambda func: func
@@ -48,6 +49,7 @@ VOCAB_NAME = 'vocab.json'
 MERGES_NAME = 'merges.txt'
 SPECIAL_TOKENS_NAME = 'special_tokens.txt'
 @lru_cache()
 def bytes_to_unicode():
    """
@@ -60,17 +62,19 @@ def bytes_to_unicode():
    And avoids mapping to whitespace/control characters the bpe code barfs on.
    """
    _chr = unichr if sys.version_info[0] == 2 else chr
-    bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
+    bs = list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + \
+        list(range(ord("®"), ord("ÿ") + 1))
    cs = bs[:]
    n = 0
    for b in range(2**8):
        if b not in bs:
            bs.append(b)
-            cs.append(2**8+n)
+            cs.append(2**8 + n)
            n += 1
    cs = [_chr(n) for n in cs]
    return dict(zip(bs, cs))
 def get_pairs(word):
    """Return set of symbol pairs in a word.
@@ -83,6 +87,7 @@ def get_pairs(word):
        prev_char = char
    return pairs
 class GPT2Tokenizer(object):
    """
    GPT-2 BPE tokenizer. Peculiarities:
@@ -138,23 +143,31 @@ class GPT2Tokenizer(object):
            special_tokens = open(special_tokens_file, encoding='utf-8').read().split('\n')[:-1]
        else:
            special_tokens = kwargs.pop('special_tokens', [])
-        tokenizer = cls(resolved_vocab_file, resolved_merges_file, special_tokens=special_tokens, *inputs, **kwargs)
+        tokenizer = cls(
+            resolved_vocab_file,
+            resolved_merges_file,
+            special_tokens=special_tokens,
+            *inputs,
+            **kwargs)
        return tokenizer
-    def __init__(self, vocab_file, merges_file, errors='replace', special_tokens=None, max_len=None):
+    def __init__(self, vocab_file, merges_file, errors='replace',
+                 special_tokens=None, max_len=None):
        self.max_len = max_len if max_len is not None else int(1e12)
        self.encoder = json.load(open(vocab_file))
-        self.decoder = {v:k for k,v in self.encoder.items()}
+        self.decoder = {v: k for k, v in self.encoder.items()}
-        self.errors = errors # how to handle errors in decoding
+        self.errors = errors  # how to handle errors in decoding
        self.byte_encoder = bytes_to_unicode()
-        self.byte_decoder = {v:k for k, v in self.byte_encoder.items()}
+        self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
        bpe_data = open(merges_file, encoding='utf-8').read().split('\n')[1:-1]
        bpe_merges = [tuple(merge.split()) for merge in bpe_data]
        self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
        self.cache = {}
-        # Should haved added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions
+        # Should have added re.IGNORECASE so BPE merges can happen for
-        self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
+        # capitalized versions of contractions
+        self.pat = re.compile(
+            r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
        self.special_tokens = {}
        self.special_tokens_decoder = {}
@@ -172,8 +185,9 @@ class GPT2Tokenizer(object):
            self.special_tokens = {}
            self.special_tokens_decoder = {}
            return
-        self.special_tokens = dict((tok, len(self.encoder) + i) for i, tok in enumerate(special_tokens))
+        self.special_tokens = dict((tok, len(self.encoder) + i)
-        self.special_tokens_decoder = {v:k for k, v in self.special_tokens.items()}
+                                   for i, tok in enumerate(special_tokens))
+        self.special_tokens_decoder = {v: k for k, v in self.special_tokens.items()}
        logger.info("Special tokens {}".format(self.special_tokens))
    def bpe(self, token):
@@ -186,7 +200,7 @@ class GPT2Tokenizer(object):
            return token
        while True:
-            bigram = min(pairs, key = lambda pair: self.bpe_ranks.get(pair, float('inf')))
+            bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float('inf')))
            if bigram not in self.bpe_ranks:
                break
            first, second = bigram
@@ -197,12 +211,12 @@ class GPT2Tokenizer(object):
                    j = word.index(first, i)
                    new_word.extend(word[i:j])
                    i = j
-                except:
+                except BaseException:
                    new_word.extend(word[i:])
                    break
-                if word[i] == first and i < len(word)-1 and word[i+1] == second:
+                if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
-                    new_word.append(first+second)
+                    new_word.append(first + second)
                    i += 2
                else:
                    new_word.append(word[i])
@@ -245,7 +259,8 @@ class GPT2Tokenizer(object):
            logger.warning(
                "Token indices sequence length is longer than the specified maximum "
                " sequence length for this OpenAI GPT model ({} > {}). Running this"
-                " sequence through the model will result in indexing errors".format(len(ids), self.max_len)
+                " sequence through the model will result in indexing errors".format(
+                    len(ids), self.max_len)
            )
        return ids

--- a/megatron/deprecated_data_utils/wordpiece.py
+++ b/megatron/deprecated_data_utils/wordpiece.py
@@ -99,19 +99,19 @@ class BertTokenizer(object):
            [(ids, tok) for tok, ids in self.vocab.items()])
        self.do_basic_tokenize = do_basic_tokenize
        if do_basic_tokenize:
-          self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case,
+            self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case,
-                                                never_split=never_split)
+                                                  never_split=never_split)
        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)
        self.max_len = max_len if max_len is not None else int(1e12)
    def tokenize(self, text):
        if self.do_basic_tokenize:
-          split_tokens = []
+            split_tokens = []
-          for token in self.basic_tokenizer.tokenize(text):
+            for token in self.basic_tokenizer.tokenize(text):
-              for sub_token in self.wordpiece_tokenizer.tokenize(token):
+                for sub_token in self.wordpiece_tokenizer.tokenize(token):
-                  split_tokens.append(sub_token)
+                    split_tokens.append(sub_token)
        else:
-          split_tokens = self.wordpiece_tokenizer.tokenize(text)
+            split_tokens = self.wordpiece_tokenizer.tokenize(text)
        return split_tokens
    def convert_tokens_to_ids(self, tokens):
@@ -123,7 +123,8 @@ class BertTokenizer(object):
            logger.warning(
                "Token indices sequence length is longer than the specified maximum "
                " sequence length for this BERT model ({} > {}). Running this"
-                " sequence through BERT will result in indexing errors".format(len(ids), self.max_len)
+                " sequence through BERT will result in indexing errors".format(
+                    len(ids), self.max_len)
            )
        return ids

--- a/megatron/fp16/__init__.py
+++ b/megatron/fp16/__init__.py
 # coding=utf-8
-# Copyright (c) 2019, NVIDIA CORPORATION.  All rights reserved.
+# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

--- a/megatron/fp16/fp16.py
+++ b/megatron/fp16/fp16.py
--- a/megatron/fp16/fp16util.py
+++ b/megatron/fp16/fp16util.py
 # coding=utf-8
-# Copyright (c) 2019, NVIDIA CORPORATION.  All rights reserved.
+# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -18,6 +18,9 @@ import torch.nn as nn
 from torch.autograd import Variable
 from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors
+from apex.multi_tensor_apply import multi_tensor_applier
+import amp_C
 from megatron import mpu
@@ -102,6 +105,7 @@ class FP16Model(nn.Module):
 def backwards_debug_hook(grad):
    raise RuntimeError("master_params recieved a gradient in the backward pass!")
 def prep_param_lists(model, flat_master=False):
    """
    Creates a list of FP32 master parameters for a given model, as in
@@ -131,9 +135,9 @@ def prep_param_lists(model, flat_master=False):
            # flatten_dense_tensors returns a contiguous flat array.
            # http://pytorch.org/docs/master/_modules/torch/_utils.html
            master_params = _flatten_dense_tensors([param.data for param in model_params]).float()
-        except:
+        except BaseException:
            print("Error in prep_param_lists:  model may contain a mixture of parameters "
-                      "of different types.  Use flat_master=False, or use F16_Optimizer.")
+                  "of different types.  Use flat_master=False, or use F16_Optimizer.")
            raise
        master_params = torch.nn.Parameter(master_params)
        master_params.requires_grad = True
@@ -150,7 +154,7 @@ def prep_param_lists(model, flat_master=False):
 def model_grads_to_master_grads(model_params, master_params, flat_master=False):
    """
-    Copy model gradients to master gradients.  
+    Copy model gradients to master gradients.
    Args:
        model_params:  List of model parameters created by :func:`prep_param_lists`.
@@ -165,9 +169,15 @@ def model_grads_to_master_grads(model_params, master_params, flat_master=False):
            if model.grad is not None:
                if master.grad is None:
                    master.grad = Variable(master.data.new(*master.data.size()))
-                master.grad.data.copy_(model.grad.data)
            else:
                master.grad = None
+        model_grads = [p.grad for p in model_params if p.grad is not None]
+        master_grads = [p.grad for p in master_params if p.grad is not None]
+        _overflow_buf = torch.cuda.IntTensor([0])
+        multi_tensor_applier(amp_C.multi_tensor_scale,
+                             _overflow_buf,
+                             [model_grads, master_grads],
+                             1.0)
 def master_params_to_model_params(model_params, master_params, flat_master=False):
@@ -179,7 +189,7 @@ def master_params_to_model_params(model_params, master_params, flat_master=False
        master_params:  List of FP32 master parameters created by :func:`prep_param_lists`.  If ``master_params`` was created with ``flat_master=True``, ``flat_master=True`` should also be supplied to :func:`master_params_to_model_params`.
    """
    if flat_master:
-        for model, master in zip(model_params, 
+        for model, master in zip(model_params,
                                 _unflatten_dense_tensors(master_params[0].data, model_params)):
            model.data.copy_(master)
    else:
@@ -188,17 +198,19 @@ def master_params_to_model_params(model_params, master_params, flat_master=False
 # Backward compatibility fixes
 def to_python_float(t):
    if hasattr(t, 'item'):
        return t.item()
    else:
        return t[0]
 TORCH_MAJOR = int(torch.__version__.split('.')[0])
 TORCH_MINOR = int(torch.__version__.split('.')[1])
 clip_grad_norm = mpu.clip_grad_norm
-#elif TORCH_MAJOR == 0 and TORCH_MINOR <= 4:
+# elif TORCH_MAJOR == 0 and TORCH_MINOR <= 4:
 #    clip_grad_norm = torch.nn.utils.clip_grad_norm
-#else:
+# else:
 #    clip_grad_norm = torch.nn.utils.clip_grad_norm_
--- a/megatron/fp16/loss_scaler.py
+++ b/megatron/fp16/loss_scaler.py
 # coding=utf-8
-# Copyright (c) 2019, NVIDIA CORPORATION.  All rights reserved.
+# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -14,21 +14,28 @@
 # limitations under the License.
 import torch
+from apex.multi_tensor_apply import multi_tensor_applier
+import amp_C
 from megatron import mpu
 # item() is a recent addition, so this helps with backward compatibility.
 def to_python_float(t):
    if hasattr(t, 'item'):
        return t.item()
    else:
        return t[0]
 class LossScaler:
    """
    Class that manages a static loss scale.  This class is intended to interact with
    :class:`FP16_Optimizer`, and should not be directly manipulated by the user.
-    Use of :class:`LossScaler` is enabled via the ``static_loss_scale`` argument to 
+    Use of :class:`LossScaler` is enabled via the ``static_loss_scale`` argument to
    :class:`FP16_Optimizer`'s constructor.
    Args:
@@ -54,16 +61,22 @@ class LossScaler:
        return self.cur_scale
    def scale_gradient(self, module, grad_in, grad_out):
-        return tuple(self.loss_scale * g for g in grad_in)
+        _overflow_buf = torch.cuda.IntTensor([0])
+        multi_tensor_applier(amp_C.multi_tensor_scale,
+                             _overflow_buf,
+                             [grad_in, grad_in],
+                             self.loss_scale)
+        return grad_in
    def backward(self, loss, retain_graph=False):
-        scaled_loss = loss*self.loss_scale
+        scaled_loss = loss * self.loss_scale
        scaled_loss.backward(retain_graph=retain_graph)
 class DynamicLossScaler:
    """
    Class that manages dynamic loss scaling.  It is recommended to use :class:`DynamicLossScaler`
-    indirectly, by supplying ``dynamic_loss_scale=True`` to the constructor of 
+    indirectly, by supplying ``dynamic_loss_scale=True`` to the constructor of
    :class:`FP16_Optimizer`.  However, it's important to understand how :class:`DynamicLossScaler`
    operates, because the default options can be changed using the
    ``dynamic_loss_args`` argument to :class:`FP16_Optimizer`'s constructor.
@@ -71,18 +84,18 @@ class DynamicLossScaler:
    Loss scaling is designed to combat the problem of underflowing gradients encountered at long
    times when training fp16 networks.  Dynamic loss scaling begins by attempting a very high loss
    scale.  Ironically, this may result in OVERflowing gradients.  If overflowing gradients are
-    encountered, :class:`DynamicLossScaler` informs :class:`FP16_Optimizer` that an overflow has 
+    encountered, :class:`DynamicLossScaler` informs :class:`FP16_Optimizer` that an overflow has
    occurred.
    :class:`FP16_Optimizer` then skips the update step for this particular iteration/minibatch,
-    and :class:`DynamicLossScaler` adjusts the loss scale to a lower value.  
+    and :class:`DynamicLossScaler` adjusts the loss scale to a lower value.
    If a certain number of iterations occur without overflowing gradients detected,
    :class:`DynamicLossScaler` increases the loss scale once more.
-    In this way :class:`DynamicLossScaler` attempts to "ride the edge" of 
+    In this way :class:`DynamicLossScaler` attempts to "ride the edge" of
    always using the highest loss scale possible without incurring overflow.
    Args:
        init_scale (float, optional, default=2**32):  Initial loss scale attempted by :class:`DynamicLossScaler.`
-        scale_factor (float, optional, default=2.0):  Factor used when adjusting the loss scale. If an overflow is encountered, the loss scale is readjusted to loss scale/``scale_factor``.  If ``scale_window`` consecutive iterations take place without an overflow, the loss scale is readjusted to loss_scale*``scale_factor``. 
+        scale_factor (float, optional, default=2.0):  Factor used when adjusting the loss scale. If an overflow is encountered, the loss scale is readjusted to loss scale/``scale_factor``.  If ``scale_window`` consecutive iterations take place without an overflow, the loss scale is readjusted to loss_scale*``scale_factor``.
        scale_window (int, optional, default=1000):  Number of consecutive iterations without an overflow to wait before increasing the loss scale.
    """
@@ -122,12 +135,12 @@ class DynamicLossScaler:
        overflow = overflow_gpu[0].item()
        return bool(overflow)
    # `x` is a torch.Tensor
    def _has_inf_or_nan(x):
        try:
-            # if x is half, the .float() incurs an additional deep copy, but it's necessary if 
+            # if x is half, the .float() incurs an additional deep copy, but it's necessary if
-            # Pytorch's .sum() creates a one-element tensor of the same type as x 
+            # Pytorch's .sum() creates a one-element tensor of the same type as x
            # (which is true for some recent version of pytorch).
            cpu_sum = float(x.float().sum())
            # More efficient version that can be used if .sum() returns a Python scalar
@@ -158,7 +171,7 @@ class DynamicLossScaler:
        if overflow:
            # self.cur_scale /= self.scale_factor
            if self.delayed_shift == 1 or self.cur_hysteresis == 1:
-                self.cur_scale = max(self.cur_scale/self.scale_factor, self.min_scale)
+                self.cur_scale = max(self.cur_scale / self.scale_factor, self.min_scale)
            else:
                self.cur_hysteresis -= 1
            self.last_overflow_iter = self.cur_iter
@@ -176,13 +189,19 @@ class DynamicLossScaler:
        return self.cur_scale
    def scale_gradient(self, module, grad_in, grad_out):
-        return tuple(self.loss_scale * g for g in grad_in)
+        _overflow_buf = torch.cuda.IntTensor([0])
+        multi_tensor_applier(amp_C.multi_tensor_scale,
+                             _overflow_buf,
+                             [grad_in, grad_in],
+                             self.loss_scale)
+        return grad_in
    def backward(self, loss, retain_graph=False):
-        scaled_loss = loss*self.loss_scale
+        scaled_loss = loss * self.loss_scale
        scaled_loss.backward(retain_graph=retain_graph)
-##############################################################        
+##############################################################
 # Example usage below here -- assuming it's in a separate file
 ##############################################################
 """
@@ -218,10 +237,10 @@ if __name__ == "__main__":
        # Run backprop
        optimizer.zero_grad()
        loss.backward()
        # Check for overflow
        has_overflow = DynamicLossScaler.has_overflow(parameters)
        # If no overflow, unscale grad and update as usual
        if not has_overflow:
            for param in parameters:

--- a/megatron/global_vars.py
+++ b/megatron/global_vars.py
 # coding=utf-8
-# Copyright (c) 2019, NVIDIA CORPORATION.  All rights reserved.
+# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -61,22 +61,26 @@ def get_timers():
    return _GLOBAL_TIMERS
-def set_global_variables(extra_args_provider=None, args_defaults={}):
+def set_global_variables(extra_args_provider=None, args_defaults={},
+                         ignore_unknown_args=False):
    """Set args, tokenizer, tensorboard-writer, adlr-autoresume, and timers."""
    args = _parse_args(extra_args_provider=extra_args_provider,
-                       defaults=args_defaults)
+                       defaults=args_defaults,
+                       ignore_unknown_args=ignore_unknown_args)
    _ = _build_tokenizer(args)
    _set_tensorboard_writer(args)
    _set_adlr_autoresume(args)
    _set_timers()
-def _parse_args(extra_args_provider=None, defaults={}):
+def _parse_args(extra_args_provider=None, defaults={},
+                ignore_unknown_args=False):
    """Parse entire arguments."""
    global _GLOBAL_ARGS
    _ensure_var_is_not_initialized(_GLOBAL_ARGS, 'args')
    _GLOBAL_ARGS = parse_args(extra_args_provider=extra_args_provider,
-                              defaults=defaults)
+                              defaults=defaults,
+                              ignore_unknown_args=ignore_unknown_args)
    return _GLOBAL_ARGS
@@ -124,7 +128,7 @@ def _set_adlr_autoresume(args):
        sys.path.append(os.environ.get('SUBMIT_SCRIPTS', '.'))
        try:
            from userlib.auto_resume import AutoResume
-        except:
+        except BaseException:
            print('ADLR autoresume is not available, exiting ...')
            sys.exit()

--- a/megatron/initialize.py
+++ b/megatron/initialize.py
 # coding=utf-8
-# Copyright (c) 2019, NVIDIA CORPORATION.  All rights reserved.
+# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -28,7 +28,8 @@ from megatron import mpu
 from megatron.global_vars import set_global_variables
-def initialize_megatron(extra_args_provider=None, args_defaults={}):
+def initialize_megatron(extra_args_provider=None, args_defaults={},
+                        ignore_unknown_args=False):
    """Set global variables, initialize distributed, and
    set autoresume and random seeds."""
    # Make sure cuda is available.
@@ -37,7 +38,8 @@ def initialize_megatron(extra_args_provider=None, args_defaults={}):
    # Parse args, build tokenizer, and set adlr-autoresume,
    # tensorboard-writer, and timers.
    set_global_variables(extra_args_provider=extra_args_provider,
-                         args_defaults=args_defaults)
+                         args_defaults=args_defaults,
+                         ignore_unknown_args=ignore_unknown_args)
    # Pytorch distributed.
    _initialize_distributed()

--- a/megatron/learning_rates.py
+++ b/megatron/learning_rates.py
 # coding=utf-8
-# Copyright (c) 2019, NVIDIA CORPORATION.  All rights reserved.
+# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -48,7 +48,6 @@ class AnnealingLR(object):
        print_rank_0('> learning rate decay style: {}'.format(self.decay_style))
    def get_lr(self):
        """Learning rate decay functions from:
              https://openreview.net/pdf?id=BJYwwY9ll pg. 4"""
@@ -71,7 +70,6 @@ class AnnealingLR(object):
            lr = self.start_lr
        return max(lr, self.min_lr)
    def step(self, step_num=None):
        """Set lr for all parameters groups."""
        if step_num is None:
@@ -81,7 +79,6 @@ class AnnealingLR(object):
        for group in self.optimizer.param_groups:
            group['lr'] = new_lr
    def state_dict(self):
        state_dict = {
            'start_lr': self.start_lr,
@@ -93,7 +90,6 @@ class AnnealingLR(object):
        }
        return state_dict
    def _check_and_set(self, cls_value, sd_value, name):
        """Auxiliary function for checking the values in the checkpoint and
        setting them."""
@@ -108,7 +104,6 @@ class AnnealingLR(object):
                                                                  name))
        return sd_value
    def load_state_dict(self, sd):
        self.start_lr = self._check_and_set(self.start_lr, sd['start_lr'],

--- a/megatron/model/__init__.py
+++ b/megatron/model/__init__.py
 # coding=utf-8
-# Copyright (c) 2019, NVIDIA CORPORATION.  All rights reserved.
+# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

--- a/megatron/model/bert_model.py
+++ b/megatron/model/bert_model.py
 # coding=utf-8
-# Copyright (c) 2019, NVIDIA CORPORATION.  All rights reserved.
+# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -22,16 +22,15 @@ import torch
 import torch.nn.functional as F
 from megatron import get_args
+from megatron.model.language_model import parallel_lm_logits
+from megatron.model.language_model import get_language_model
+from megatron.model.transformer import LayerNorm
+from megatron.model.utils import openai_gelu
+from megatron.model.utils import get_linear_layer
+from megatron.model.utils import init_method_normal
+from megatron.model.utils import scaled_init_method_normal
 from megatron.module import MegatronModule
-from .language_model import parallel_lm_logits
-from .language_model import get_language_model
-from .transformer import LayerNorm
-from .utils import gelu
-from .utils import get_linear_layer
-from .utils import init_method_normal
-from .utils import scaled_init_method_normal
 def bert_attention_mask_func(attention_scores, attention_mask):
    attention_scores = attention_scores + attention_mask
@@ -70,7 +69,6 @@ def bert_position_ids(token_ids):
    return position_ids
 class BertLMHead(MegatronModule):
    """Masked LM head for Bert
@@ -81,11 +79,14 @@ class BertLMHead(MegatronModule):
        layernorm_epsilon: tolerance for layer norm divisions
        parallel_output: whether output logits being distributed or not.
    """
    def __init__(self, mpu_vocab_size, hidden_size, init_method,
                 layernorm_epsilon, parallel_output):
        super(BertLMHead, self).__init__()
+        args = get_args()
        self.bias = torch.nn.Parameter(torch.zeros(mpu_vocab_size))
        self.bias.model_parallel = True
        self.bias.partition_dim = 0
@@ -94,11 +95,13 @@ class BertLMHead(MegatronModule):
        self.dense = get_linear_layer(hidden_size, hidden_size, init_method)
        self.layernorm = LayerNorm(hidden_size, eps=layernorm_epsilon)
+        self.gelu = torch.nn.functional.gelu
+        if args.openai_gelu:
+            self.gelu = openai_gelu
    def forward(self, hidden_states, word_embeddings_weight):
        hidden_states = self.dense(hidden_states)
-        hidden_states = gelu(hidden_states)
+        hidden_states = self.gelu(hidden_states)
        hidden_states = self.layernorm(hidden_states)
        output = parallel_lm_logits(hidden_states,
                                    word_embeddings_weight,
@@ -107,7 +110,6 @@ class BertLMHead(MegatronModule):
        return output
 class BertModel(MegatronModule):
    """Bert Language model."""
@@ -184,7 +186,6 @@ class BertModel(MegatronModule):
        return lm_logits, None
    def state_dict_for_save_checkpoint(self, destination=None, prefix='',
                                       keep_vars=False):
        """For easy load when model is combined with other heads,
@@ -206,7 +207,6 @@ class BertModel(MegatronModule):
                = self.ict_head.state_dict(destination, prefix, keep_vars)
        return state_dict_
    def load_state_dict(self, state_dict, strict=True):
        """Customized load."""
@@ -224,8 +224,6 @@ class BertModel(MegatronModule):
 class REALMBertModel(MegatronModule):
-    # TODO: load BertModel checkpoint
    def __init__(self, retriever):
        super(REALMBertModel, self).__init__()
        bert_args = dict(

--- a/megatron/model/classification.py
+++ b/megatron/model/classification.py
 # coding=utf-8
-# Copyright (c) 2019, NVIDIA CORPORATION.  All rights reserved.
+# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -53,7 +53,6 @@ class Classification(MegatronModule):
                                                    init_method)
        self._classification_head_key = 'classification_head'
    def forward(self, input_ids, attention_mask, tokentype_ids):
        extended_attention_mask = bert_extended_attention_mask(
@@ -74,7 +73,6 @@ class Classification(MegatronModule):
        return classification_logits
    def state_dict_for_save_checkpoint(self, destination=None, prefix='',
                                       keep_vars=False):
        """For easy load when model is combined with other heads,
@@ -89,7 +87,6 @@ class Classification(MegatronModule):
                destination, prefix, keep_vars)
        return state_dict_
    def load_state_dict(self, state_dict, strict=True):
        """Customized load."""

--- a/megatron/model/distributed.py
+++ b/megatron/model/distributed.py
 # coding=utf-8
-# Copyright (c) 2019, NVIDIA CORPORATION.  All rights reserved.
+# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -31,10 +31,6 @@ class DistributedDataParallel(MegatronModule):
        self.module = module
        self.data_parallel_group = mpu.get_data_parallel_group()
-        src_rank = mpu.get_model_parallel_rank()
-        for p in self.module.parameters():
-            if torch.is_tensor(p):
-                dist.broadcast(p, src_rank, group=self.data_parallel_group)
        def allreduce_params(reduce_after=True, no_scale=False, fp32_allreduce=False):
            if(self.needs_reduction):
@@ -71,8 +67,8 @@ class DistributedDataParallel(MegatronModule):
            def allreduce_hook(*unused):
                Variable._execution_engine.queue_callback(allreduce_params)
        #    handle = param.register_hook(allreduce_hook)
-            #self.hooks.append(allreduce_hook)
+            # self.hooks.append(allreduce_hook)
-            #self.hook_handles.append(handle)
+            # self.hook_handles.append(handle)
        self.allreduce_params = allreduce_params
    def forward(self, *inputs, **kwargs):
@@ -114,4 +110,3 @@ class DistributedDataParallel(MegatronModule):
        super(DistributedDataParallel, self).train(mode)
        self.module.train(mode)
    '''
--- a/megatron/model/gpt2_model.py
+++ b/megatron/model/gpt2_model.py
 # coding=utf-8
-# Copyright (c) 2019, NVIDIA CORPORATION.  All rights reserved.
+# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -27,8 +27,7 @@ from .utils import scaled_init_method_normal
 def gpt2_attention_mask_func(attention_scores, ltor_mask):
-    attention_scores = torch.mul(attention_scores, ltor_mask) - \
+    attention_scores.masked_fill_(ltor_mask, -10000.0)
-                       10000.0 * (1.0 - ltor_mask)
    return attention_scores
@@ -49,7 +48,6 @@ class GPT2Model(MegatronModule):
            scaled_init_method=scaled_init_method_normal(args.init_method_std,
                                                         args.num_layers))
    def forward(self, input_ids, position_ids, attention_mask,
                tokentype_ids=None, layer_past=None, get_key_value=False,
                forward_method_parallel_output=None):
@@ -79,7 +77,6 @@ class GPT2Model(MegatronModule):
        return output
    def state_dict_for_save_checkpoint(self, destination=None, prefix='',
                                       keep_vars=False):
@@ -89,7 +86,6 @@ class GPT2Model(MegatronModule):
                destination, prefix, keep_vars)
        return state_dict_
    def load_state_dict(self, state_dict, strict=True):
        """Customized load."""

--- a/megatron/model/language_model.py
+++ b/megatron/model/language_model.py
 # coding=utf-8
-# Copyright (c) 2019, NVIDIA CORPORATION.  All rights reserved.
+# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -21,9 +21,8 @@ import torch.nn.functional as F
 from megatron import get_args
 from megatron import mpu
 from megatron.module import MegatronModule
 from megatron.model.transformer import ParallelTransformer
-from megatron.model.utils import gelu
+from megatron.model.utils import openai_gelu
 from megatron.model.utils import get_linear_layer
@@ -47,7 +46,13 @@ def parallel_lm_logits(input_, word_embeddings_weight, parallel_output,
 def get_language_model(attention_mask_func, num_tokentypes, add_pooler,
                       init_method, scaled_init_method, max_pos_embeds=None):
    """Build language model and return along with the key to save."""
+    args = get_args()
+    # Use torch gelu unless otherwise forced.
+    gelu = F.gelu
+    if args.openai_gelu:
+        gelu = openai_gelu
    # Language model.
    language_model = TransformerLanguageModel(
        attention_mask_func=attention_mask_func,
@@ -63,7 +68,6 @@ def get_language_model(attention_mask_func, num_tokentypes, add_pooler,
    return language_model, language_model_key
 class Pooler(MegatronModule):
    """Pooler layer.
@@ -75,11 +79,11 @@ class Pooler(MegatronModule):
        init_method: weight initialization method for the linear layer.
            bias is set to zero.
    """
    def __init__(self, hidden_size, init_method):
        super(Pooler, self).__init__()
        self.dense = get_linear_layer(hidden_size, hidden_size, init_method)
    def forward(self, hidden_states, sequence_index=0):
        # hidden_states: [b, s, h]
        # sequence_index: index of the token to pool.
@@ -102,6 +106,7 @@ class Embedding(MegatronModule):
        num_tokentypes: size of the token-type embeddings. 0 value
                        will ignore this embedding
    """
    def __init__(self,
                 hidden_size,
                 vocab_size,
@@ -143,7 +148,6 @@ class Embedding(MegatronModule):
        # Embeddings dropout
        self.embedding_dropout = torch.nn.Dropout(embedding_dropout_prob)
    def add_tokentype_embeddings(self, num_tokentypes):
        """Add token-type embedding. This function is provided so we can add
        token-type embeddings in case the pretrained model does not have it.
@@ -160,7 +164,6 @@ class Embedding(MegatronModule):
        # Initialize the token-type embeddings.
        self.init_method(self.tokentype_embeddings.weight)
    def forward(self, input_ids, position_ids, tokentype_ids=None):
        # Embeddings.
        words_embeddings = self.word_embeddings(input_ids)
@@ -177,7 +180,6 @@ class Embedding(MegatronModule):
        return embeddings
    def state_dict_for_save_checkpoint(self, destination=None, prefix='',
                                       keep_vars=False):
        """For easy load."""
@@ -195,7 +197,6 @@ class Embedding(MegatronModule):
        return state_dict_
    def load_state_dict(self, state_dict, strict=True):
        """Customized load."""
@@ -224,7 +225,7 @@ class Embedding(MegatronModule):
        self.position_embeddings.load_state_dict(state_dict_, strict=strict)
        # Tokentype embedding.
-        if  self.num_tokentypes > 0:
+        if self.num_tokentypes > 0:
            state_dict_ = {}
            if self._tokentype_embeddings_key in state_dict:
                state_dict_ = state_dict[self._tokentype_embeddings_key]
@@ -242,7 +243,6 @@ class Embedding(MegatronModule):
                      'checkpoint but could not find it', flush=True)
 class TransformerLanguageModel(MegatronModule):
    """Transformer language model.
@@ -261,6 +261,7 @@ class TransformerLanguageModel(MegatronModule):
        num_tokentypes: size of the token-type embeddings. 0 value
                        will ignore this embedding
    """
    def __init__(self,
                 attention_mask_func,
                 mlp_activation_func,
@@ -298,7 +299,6 @@ class TransformerLanguageModel(MegatronModule):
            self.pooler = Pooler(self.hidden_size, self.init_method)
            self._pooler_key = 'pooler'
    def forward(self, input_ids, position_ids, attention_mask,
                tokentype_ids=None, layer_past=None, get_key_value=False,
                pooling_sequence_index=0):
@@ -320,7 +320,6 @@ class TransformerLanguageModel(MegatronModule):
        return transformer_output
    def state_dict_for_save_checkpoint(self, destination=None, prefix='',
                                       keep_vars=False):
        """For easy load."""
@@ -339,7 +338,6 @@ class TransformerLanguageModel(MegatronModule):
        return state_dict_
    def load_state_dict(self, state_dict, strict=True):
        """Customized load."""

--- a/megatron/model/multiple_choice.py
+++ b/megatron/model/multiple_choice.py
 # coding=utf-8
-# Copyright (c) 2019, NVIDIA CORPORATION.  All rights reserved.
+# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -51,7 +51,6 @@ class MultipleChoice(MegatronModule):
                                                 init_method)
        self._multichoice_head_key = 'multichoice_head'
    def forward(self, input_ids, attention_mask, tokentype_ids):
        # [batch, choices, sequence] --> [batch * choices, sequence] -->
@@ -86,7 +85,6 @@ class MultipleChoice(MegatronModule):
        return multichoice_logits
    def state_dict_for_save_checkpoint(self, destination=None, prefix='',
                                       keep_vars=False):
        """For easy load when model is combined with other heads,
@@ -101,7 +99,6 @@ class MultipleChoice(MegatronModule):
                destination, prefix, keep_vars)
        return state_dict_
    def load_state_dict(self, state_dict, strict=True):
        """Customized load."""

--- a/megatron/model/transformer.py
+++ b/megatron/model/transformer.py
--- a/megatron/model/utils.py
+++ b/megatron/model/utils.py
 # coding=utf-8
-# Copyright (c) 2019, NVIDIA CORPORATION.  All rights reserved.
+# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -33,6 +33,7 @@ def init_method_normal(sigma):
 def scaled_init_method_normal(sigma, num_layers):
    """Init method based on N(0, sigma/sqrt(2*num_layers))."""
    std = sigma / math.sqrt(2.0 * num_layers)
    def init_(tensor):
        return torch.nn.init.normal_(tensor, mean=0.0, std=std)
@@ -53,8 +54,7 @@ def gelu_impl(x):
    """OpenAI's gelu implementation."""
    return 0.5 * x * (1.0 + torch.tanh(0.7978845608028654 * x *
                                       (1.0 + 0.044715 * x * x)))
+def openai_gelu(x):
-def gelu(x):
    return gelu_impl(x)

--- a/megatron/module.py
+++ b/megatron/module.py
 # coding=utf-8
-# Copyright (c) 2019, NVIDIA CORPORATION.  All rights reserved.
+# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -21,11 +21,9 @@ import torch
 class MegatronModule(torch.nn.Module):
    """Megatron specific extensions of torch Module."""
    def __init__(self):
        super(MegatronModule, self).__init__()
    def state_dict_for_save_checkpoint(self, destination=None, prefix='',
                                       keep_vars=False):
        """Use this function to override the state dict for