Commit 9350ee08 authored by Neel Kant

Merge staging into ict-stable

parents 9238c57a 63262827
# coding=utf-8
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Megatron global variables."""
import os
import sys
import time
import torch
from megatron.tokenizer import build_tokenizer
from .arguments import parse_args
_GLOBAL_ARGS = None
_GLOBAL_TOKENIZER = None
_GLOBAL_TENSORBOARD_WRITER = None
_GLOBAL_ADLR_AUTORESUME = None
_GLOBAL_TIMERS = None
def get_args():
"""Return arguments."""
_ensure_var_is_initialized(_GLOBAL_ARGS, 'args')
return _GLOBAL_ARGS
def get_tokenizer():
"""Return tokenizer."""
_ensure_var_is_initialized(_GLOBAL_TOKENIZER, 'tokenizer')
return _GLOBAL_TOKENIZER
def get_tensorboard_writer():
"""Return tensorboard writer. It can be None so no need
to check if it is initialized."""
return _GLOBAL_TENSORBOARD_WRITER
def get_adlr_autoresume():
"""ADLR autoresume object. It can be None so no need
to check if it is initialized."""
return _GLOBAL_ADLR_AUTORESUME
def get_timers():
"""Return timers."""
_ensure_var_is_initialized(_GLOBAL_TIMERS, 'timers')
return _GLOBAL_TIMERS
def set_global_variables(extra_args_provider=None, args_defaults={}):
"""Set args, tokenizer, tensorboard-writer, adlr-autoresume, and timers."""
args = _parse_args(extra_args_provider=extra_args_provider,
defaults=args_defaults)
_build_tokenizer(args)
_set_tensorboard_writer(args)
_set_adlr_autoresume(args)
_set_timers()
def _parse_args(extra_args_provider=None, defaults={}):
"""Parse entire arguments."""
global _GLOBAL_ARGS
_ensure_var_is_not_initialized(_GLOBAL_ARGS, 'args')
_GLOBAL_ARGS = parse_args(extra_args_provider=extra_args_provider,
defaults=defaults)
return _GLOBAL_ARGS
def _build_tokenizer(args):
"""Initialize tokenizer."""
global _GLOBAL_TOKENIZER
_ensure_var_is_not_initialized(_GLOBAL_TOKENIZER, 'tokenizer')
_GLOBAL_TOKENIZER = build_tokenizer(args)
def _set_tensorboard_writer(args):
"""Set tensorboard writer."""
global _GLOBAL_TENSORBOARD_WRITER
_ensure_var_is_not_initialized(_GLOBAL_TENSORBOARD_WRITER,
'tensorboard writer')
if hasattr(args, 'tensorboard_dir') and \
args.tensorboard_dir and args.rank == 0:
try:
from torch.utils.tensorboard import SummaryWriter
print('> setting tensorboard ...')
_GLOBAL_TENSORBOARD_WRITER = SummaryWriter(
log_dir=args.tensorboard_dir)
except ModuleNotFoundError:
print('WARNING: TensorBoard writing requested but is not '
'available (are you using PyTorch 1.1.0 or later?), '
'no TensorBoard logs will be written.', flush=True)
def _set_adlr_autoresume(args):
"""Initialize ADLR autoresume."""
global _GLOBAL_ADLR_AUTORESUME
_ensure_var_is_not_initialized(_GLOBAL_ADLR_AUTORESUME, 'adlr autoresume')
if args.adlr_autoresume:
if args.rank == 0:
print('enabling autoresume ...', flush=True)
sys.path.append(os.environ.get('SUBMIT_SCRIPTS', '.'))
try:
from userlib.auto_resume import AutoResume
except:
print('ADLR autoresume is not available, exiting ...')
sys.exit()
_GLOBAL_ADLR_AUTORESUME = AutoResume
def _set_timers():
"""Initialize timers."""
global _GLOBAL_TIMERS
_ensure_var_is_not_initialized(_GLOBAL_TIMERS, 'timers')
_GLOBAL_TIMERS = Timers()
def _ensure_var_is_initialized(var, name):
"""Make sure the input variable is not None."""
assert var is not None, '{} is not initialized.'.format(name)
def _ensure_var_is_not_initialized(var, name):
"""Make sure the input variable is not None."""
assert var is None, '{} is already initialized.'.format(name)
class _Timer:
"""Timer."""
def __init__(self, name):
self.name_ = name
self.elapsed_ = 0.0
self.started_ = False
self.start_time = time.time()
def start(self):
"""Start the timer."""
assert not self.started_, 'timer has already been started'
torch.cuda.synchronize()
self.start_time = time.time()
self.started_ = True
def stop(self):
"""Stop the timer."""
assert self.started_, 'timer is not started'
torch.cuda.synchronize()
self.elapsed_ += (time.time() - self.start_time)
self.started_ = False
def reset(self):
"""Reset timer."""
self.elapsed_ = 0.0
self.started_ = False
def elapsed(self, reset=True):
"""Calculate the elapsed time."""
started_ = self.started_
        # If timing is in progress, stop it first.
if self.started_:
self.stop()
# Get the elapsed time.
elapsed_ = self.elapsed_
# Reset the elapsed time
if reset:
self.reset()
# If timing was in progress, set it back.
if started_:
self.start()
return elapsed_
class Timers:
"""Group of timers."""
def __init__(self):
self.timers = {}
def __call__(self, name):
if name not in self.timers:
self.timers[name] = _Timer(name)
return self.timers[name]
def write(self, names, writer, iteration, normalizer=1.0, reset=False):
"""Write timers to a tensorboard writer"""
# currently when using add_scalars,
# torch.utils.add_scalars makes each timer its own run, which
# polutes the runs list, so we just add each as a scalar
assert normalizer > 0.0
for name in names:
value = self.timers[name].elapsed(reset=reset) / normalizer
writer.add_scalar(name + '_time', value, iteration)
def log(self, names, normalizer=1.0, reset=True):
"""Log a group of timers."""
assert normalizer > 0.0
string = 'time (ms)'
for name in names:
elapsed_time = self.timers[name].elapsed(
reset=reset) * 1000.0 / normalizer
string += ' | {}: {:.2f}'.format(name, elapsed_time)
if torch.distributed.is_initialized():
if torch.distributed.get_rank() == 0:
print(string, flush=True)
else:
print(string, flush=True)
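
A minimal usage sketch of the timers defined above; the phase names 'forward' and 'backward' are illustrative, not names required by the API, and CUDA plus a prior call to set_global_variables() are assumed.

from megatron.global_vars import get_timers

timers = get_timers()             # assumes set_global_variables() has already run
timers('forward').start()         # start()/stop() call torch.cuda.synchronize()
# ... forward pass ...
timers('forward').stop()
timers('backward').start()
# ... backward pass ...
timers('backward').stop()
# On rank 0 this prints e.g.: time (ms) | forward: 12.34 | backward: 23.45
timers.log(['forward', 'backward'])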
# coding=utf-8
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Megatron initialization."""
import random
import os
import numpy as np
import torch
from megatron import get_adlr_autoresume
from megatron import get_args
from megatron import get_tensorboard_writer
from megatron import mpu
from megatron.global_vars import set_global_variables
def initialize_megatron(extra_args_provider=None, args_defaults={}):
"""Set global variables, initialize distributed, and
set autoresume and random seeds."""
# Make sure cuda is available.
assert torch.cuda.is_available(), 'Megatron requires CUDA.'
# Parse args, build tokenizer, and set adlr-autoresume,
# tensorboard-writer, and timers.
set_global_variables(extra_args_provider=extra_args_provider,
args_defaults=args_defaults)
# Pytorch distributed.
_initialize_distributed()
# Autoresume.
_init_autoresume()
# Random seeds for reproducibility.
args = get_args()
if args.rank == 0:
print('> setting random seeds to {} ...'.format(args.seed))
_set_random_seed(args.seed)
# Write arguments to tensorboard.
_write_args_to_tensorboard()
def _initialize_distributed():
"""Initialize torch.distributed and mpu."""
args = get_args()
if torch.distributed.is_initialized():
if args.rank == 0:
print('torch distributed is already initialized, '
'skipping initialization ...', flush=True)
args.rank = torch.distributed.get_rank()
args.world_size = torch.distributed.get_world_size()
device = torch.cuda.current_device()
local_rank = args.rank % torch.cuda.device_count()
assert local_rank == device, \
'expected local-rank to be the same as rank % device-count.'
else:
if args.rank == 0:
print('> initializing torch distributed ...', flush=True)
# Manually set the device ids.
device = args.rank % torch.cuda.device_count()
if args.local_rank is not None:
assert args.local_rank == device, \
'expected local-rank to be the same as rank % device-count.'
else:
args.local_rank = device
torch.cuda.set_device(device)
# Call the init process
init_method = 'tcp://'
master_ip = os.getenv('MASTER_ADDR', 'localhost')
master_port = os.getenv('MASTER_PORT', '6000')
init_method += master_ip + ':' + master_port
torch.distributed.init_process_group(
backend=args.distributed_backend,
world_size=args.world_size, rank=args.rank,
init_method=init_method)
# Set the model-parallel / data-parallel communicators.
mpu.initialize_model_parallel(args.model_parallel_size)
def _init_autoresume():
"""Set autoresume start time."""
autoresume = get_adlr_autoresume()
if autoresume:
torch.distributed.barrier()
autoresume.init()
torch.distributed.barrier()
def _set_random_seed(seed):
"""Set random seed for reproducability."""
if seed is not None and seed > 0:
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
mpu.model_parallel_cuda_manual_seed(seed)
else:
raise ValueError('Seed ({}) should be a positive integer.'.format(seed))
def _write_args_to_tensorboard():
"""Write arguments to tensorboard."""
args = get_args()
writer = get_tensorboard_writer()
if writer:
for arg in vars(args):
writer.add_text(arg, str(getattr(args, arg)))
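
A sketch of how a training script might drive the initialization above, assuming this module is importable as megatron.initialize. The launch line, the --my-task-flag argument, and the assumption that extra_args_provider receives and returns the argparse parser are illustrative; MASTER_ADDR/MASTER_PORT and the seed argument correspond to the code in this file.

# Typical single-node launch (illustrative):
#   MASTER_ADDR=localhost MASTER_PORT=6000 \
#   python -m torch.distributed.launch --nproc_per_node=8 pretrain_script.py ...
from megatron.initialize import initialize_megatron

def my_extra_args(parser):
    # Hypothetical task-specific argument added on top of megatron/arguments.py.
    group = parser.add_argument_group(title='my task')
    group.add_argument('--my-task-flag', action='store_true')
    return parser

initialize_megatron(extra_args_provider=my_extra_args,
                    args_defaults={'seed': 1234})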
@@ -12,59 +12,68 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.

-"""PyTorch DataLoader for TFRecords"""
+"""Learning rate decay functions."""

-import torch
-from torch.optim.lr_scheduler import _LRScheduler
 import math
-from megatron.utils import print_rank_0
+
+from megatron import print_rank_0


-class AnnealingLR(_LRScheduler):
-    """Anneals the learning rate"""
-
-    DECAY_STYLES = ['linear', 'cosine', 'exponential', 'constant', 'None']
+class AnnealingLR(object):
+    """Anneals the learning rate."""

-    def __init__(self, optimizer, start_lr, warmup_iter, num_iters,
-                 decay_style=None, last_iter=-1, min_lr=0.0,
+    def __init__(self, optimizer, start_lr,
+                 warmup_iter, total_iters,
+                 decay_style, last_iter, min_lr=0.0,
                  use_checkpoint_lr_scheduler=True,
                  override_lr_scheduler=False):
+
+        # Class values.
         self.optimizer = optimizer
         self.start_lr = start_lr
         self.min_lr = min_lr
         self.warmup_iter = warmup_iter
-        self.num_iters = last_iter + 1
-        self.end_iter = num_iters
-        self.decay_style = decay_style.lower() if isinstance(decay_style, str) \
-                           else None
+        self.num_iters = last_iter
+        self.end_iter = total_iters
+        assert self.end_iter > 0
+        self.decay_style = decay_style
         self.override_lr_scheduler = override_lr_scheduler
         self.use_checkpoint_lr_scheduler = use_checkpoint_lr_scheduler
         if self.override_lr_scheduler:
             assert not self.use_checkpoint_lr_scheduler, 'both override and '\
                 'use-checkpoint are set.'
+        # Set the learning rate
         self.step(self.num_iters)
-        if torch.distributed.get_rank() == 0:
-            print('learning rate decaying', decay_style)
+
+        print_rank_0('> learning rate decay style: {}'.format(self.decay_style))

     def get_lr(self):
-        # https://openreview.net/pdf?id=BJYwwY9ll pg. 4
+        """Learning rate decay functions from:
+              https://openreview.net/pdf?id=BJYwwY9ll pg. 4"""
+
         num_iters_ = min(self.num_iters, self.end_iter - self.warmup_iter)
+        # Warmup.
         if self.warmup_iter > 0 and self.num_iters <= self.warmup_iter:
             return float(self.start_lr) * num_iters_ / self.warmup_iter
+
+        num_iters_ = num_iters_ - self.warmup_iter
+        if self.decay_style == 'linear':
+            lr = self.start_lr * (self.end_iter - num_iters_) / self.end_iter
+        elif self.decay_style == 'cosine':
+            lr = self.start_lr / 2.0 * (math.cos(
+                math.pi * num_iters_ / self.end_iter) + 1)
+        elif self.decay_style == 'exponential':
+            # exp(-0.693) = 1/2
+            lr = self.start_lr * math.exp(-0.693 * num_iters_ / self.end_iter)
         else:
-            if self.decay_style == self.DECAY_STYLES[0]:
-                lr = self.start_lr * ((self.end_iter - (num_iters_ - self.warmup_iter)) / self.end_iter)
-            elif self.decay_style == self.DECAY_STYLES[1]:
-                lr = self.start_lr / 2.0 * (math.cos(math.pi * (num_iters_ - self.warmup_iter) / self.end_iter) + 1)
-            elif self.decay_style == self.DECAY_STYLES[2]:
-                # exp(-0.693) = 1/2
-                lr = self.start_lr * math.exp(-0.693 * (num_iters_ - self.warmup_iter) / self.end_iter)
-            else:
-                lr = self.start_lr
-            return max(lr, self.min_lr)
+            lr = self.start_lr
+        return max(lr, self.min_lr)

     def step(self, step_num=None):
+        """Set lr for all parameters groups."""
         if step_num is None:
             step_num = self.num_iters + 1
         self.num_iters = step_num
@@ -72,42 +81,46 @@ class AnnealingLR(_LRScheduler):
         for group in self.optimizer.param_groups:
             group['lr'] = new_lr

     def state_dict(self):
-        sd = {
+        state_dict = {
             'start_lr': self.start_lr,
             'warmup_iter': self.warmup_iter,
             'num_iters': self.num_iters,
             'decay_style': self.decay_style,
             'end_iter': self.end_iter,
             'min_lr': self.min_lr
         }
-        return sd
+        return state_dict

-    def check_and_set_(self, cls_value, sd_value, name):
+    def _check_and_set(self, cls_value, sd_value, name):
+        """Auxiliary function for checking the values in the checkpoint and
+        setting them."""
         if self.override_lr_scheduler:
             print_rank_0(' > overriding {} value to {}'.format(name, cls_value))
             return cls_value
-        else:
-            if not self.use_checkpoint_lr_scheduler:
-                assert cls_value == sd_value, 'AnnealingLR: class input value' \
-                    'and checkpoint values for {} do not match'.format(name)
-            print_rank_0(' > using checkpoint value {} for {}'.format(sd_value,
-                                                                      name))
-            return sd_value
+
+        if not self.use_checkpoint_lr_scheduler:
+            assert cls_value == sd_value, 'AnnealingLR: class input value' \
+                'and checkpoint values for {} do not match'.format(name)
+        print_rank_0(' > using checkpoint value {} for {}'.format(sd_value,
+                                                                  name))
+        return sd_value

     def load_state_dict(self, sd):
-        self.start_lr = self.check_and_set_(self.start_lr, sd['start_lr'],
-                                            'learning rate')
-        self.min_lr = self.check_and_set_(self.min_lr, sd['min_lr'],
-                                          'minimum learning rate')
-        self.warmup_iter = self.check_and_set_(self.warmup_iter,
-                                               sd['warmup_iter'],
-                                               'warmup iterations')
-        self.end_iter = self.check_and_set_(self.end_iter, sd['end_iter'],
-                                            'total number of iterations')
-        self.decay_style = self.check_and_set_(self.decay_style,
-                                               sd['decay_style'],
-                                               'decay style')
+
+        self.start_lr = self._check_and_set(self.start_lr, sd['start_lr'],
+                                            'learning rate')
+        self.min_lr = self._check_and_set(self.min_lr, sd['min_lr'],
+                                          'minimum learning rate')
+        self.warmup_iter = self._check_and_set(self.warmup_iter,
+                                               sd['warmup_iter'],
+                                               'warmup iterations')
+        self.end_iter = self._check_and_set(self.end_iter, sd['end_iter'],
+                                            'total number of iterations')
+        self.decay_style = self._check_and_set(self.decay_style,
+                                               sd['decay_style'],
+                                               'decay style')
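
A small worked example of the decay branches in the new get_lr() above, with illustrative hyperparameter values; it mirrors the committed formulas rather than importing AnnealingLR, which needs an optimizer.

# Worked example of the decay formulas (values are illustrative).
import math

start_lr, min_lr = 1.0e-4, 1.0e-5
warmup_iter, end_iter = 100, 1000

def lr_at(num_iters, decay_style='cosine'):
    # Mirrors AnnealingLR.get_lr() for the new decay styles.
    n = min(num_iters, end_iter - warmup_iter)
    if warmup_iter > 0 and num_iters <= warmup_iter:
        return start_lr * n / warmup_iter
    n = n - warmup_iter
    if decay_style == 'linear':
        lr = start_lr * (end_iter - n) / end_iter
    elif decay_style == 'cosine':
        lr = start_lr / 2.0 * (math.cos(math.pi * n / end_iter) + 1)
    elif decay_style == 'exponential':
        lr = start_lr * math.exp(-0.693 * n / end_iter)
    else:
        lr = start_lr
    return max(lr, min_lr)

print(lr_at(50))              # warmup: half of start_lr
print(lr_at(550, 'linear'))   # linear decay roughly half-way through training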
@@ -17,6 +17,7 @@

 import torch

+from megatron import get_args
 from megatron.module import MegatronModule
 from .language_model import parallel_lm_logits
@@ -106,27 +107,10 @@ class BertLMHead(MegatronModule):
 class BertModel(MegatronModule):
     """Bert Language model."""

-    def __init__(self,
-                 num_layers,
-                 vocab_size,
-                 hidden_size,
-                 num_attention_heads,
-                 embedding_dropout_prob,
-                 attention_dropout_prob,
-                 output_dropout_prob,
-                 max_sequence_length,
-                 checkpoint_activations,
-                 checkpoint_num_layers=1,
-                 add_binary_head=False,
-                 ict_head_size=None,
-                 layernorm_epsilon=1.0e-5,
-                 init_method_std=0.02,
-                 num_tokentypes=0,
-                 parallel_output=True,
-                 apply_query_key_layer_scaling=False,
-                 attention_softmax_in_fp32=False):
+    def __init__(self, num_tokentypes=2, add_binary_head=True,
+                 ict_head_size=None, parallel_output=True):
         super(BertModel, self).__init__()
+        args = get_args()

         self.add_binary_head = add_binary_head
         self.ict_head_size = ict_head_size
@@ -134,46 +118,31 @@ class BertModel(MegatronModule):
         assert not (self.add_binary_head and self.add_ict_head)
         self.parallel_output = parallel_output

-        init_method = init_method_normal(init_method_std)
+        init_method = init_method_normal(args.init_method_std)
         add_pooler = self.add_binary_head or self.add_ict_head
+        scaled_init_method = scaled_init_method_normal(args.init_method_std,
+                                                       args.num_layers)

         self.language_model, self._language_model_key = get_language_model(
-            num_layers=num_layers,
-            vocab_size=vocab_size,
-            hidden_size=hidden_size,
-            num_attention_heads=num_attention_heads,
-            embedding_dropout_prob=embedding_dropout_prob,
-            attention_dropout_prob=attention_dropout_prob,
-            output_dropout_prob=output_dropout_prob,
-            max_sequence_length=max_sequence_length,
+            attention_mask_func=bert_attention_mask_func,
             num_tokentypes=num_tokentypes,
             add_pooler=add_pooler,
-            attention_mask_func=bert_attention_mask_func,
-            checkpoint_activations=checkpoint_activations,
-            checkpoint_num_layers=checkpoint_num_layers,
-            layernorm_epsilon=layernorm_epsilon,
             init_method=init_method,
-            scaled_init_method=scaled_init_method_normal(init_method_std,
-                                                         num_layers),
-            residual_connection_post_layernorm=False,
-            apply_query_key_layer_scaling=apply_query_key_layer_scaling,
-            attention_softmax_in_fp32=attention_softmax_in_fp32)
+            scaled_init_method=scaled_init_method)

         if not self.add_ict_head:
             self.lm_head = BertLMHead(
                 self.language_model.embedding.word_embeddings.weight.size(0),
-                hidden_size, init_method, layernorm_epsilon, parallel_output)
+                args.hidden_size, init_method, args.layernorm_epsilon, parallel_output)
             self._lm_head_key = 'lm_head'

         if self.add_binary_head:
-            self.binary_head = get_linear_layer(hidden_size, 2, init_method)
+            self.binary_head = get_linear_layer(args.hidden_size, 2,
                                                 init_method)
             self._binary_head_key = 'binary_head'
         elif self.add_ict_head:
-            self.ict_head = get_linear_layer(hidden_size, ict_head_size, init_method)
+            self.ict_head = get_linear_layer(args.hidden_size, ict_head_size, init_method)
             self._ict_head_key = 'ict_head'

-    def forward(self, input_ids, attention_mask,
-                tokentype_ids=None):
+    def forward(self, input_ids, attention_mask, tokentype_ids=None):

         extended_attention_mask = bert_extended_attention_mask(
             attention_mask, next(self.language_model.parameters()).dtype)
# coding=utf-8
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Classification model."""
import torch
from megatron import get_args
from megatron.model.bert_model import bert_attention_mask_func
from megatron.model.bert_model import bert_extended_attention_mask
from megatron.model.bert_model import bert_position_ids
from megatron.model.language_model import get_language_model
from megatron.model.utils import get_linear_layer
from megatron.model.utils import init_method_normal
from megatron.model.utils import scaled_init_method_normal
from megatron.module import MegatronModule
from megatron import print_rank_0
class Classification(MegatronModule):
def __init__(self, num_classes, num_tokentypes=2):
super(Classification, self).__init__()
args = get_args()
self.num_classes = num_classes
init_method = init_method_normal(args.init_method_std)
self.language_model, self._language_model_key = get_language_model(
attention_mask_func=bert_attention_mask_func,
num_tokentypes=num_tokentypes,
add_pooler=True,
init_method=init_method,
scaled_init_method=scaled_init_method_normal(args.init_method_std,
args.num_layers))
# Multi-choice head.
self.classification_dropout = torch.nn.Dropout(args.hidden_dropout)
self.classification_head = get_linear_layer(args.hidden_size,
self.num_classes,
init_method)
self._classification_head_key = 'classification_head'
def forward(self, input_ids, attention_mask, tokentype_ids):
extended_attention_mask = bert_extended_attention_mask(
attention_mask, next(self.language_model.parameters()).dtype)
position_ids = bert_position_ids(input_ids)
_, pooled_output = self.language_model(input_ids,
position_ids,
extended_attention_mask,
tokentype_ids=tokentype_ids)
# Output.
classification_output = self.classification_dropout(pooled_output)
classification_logits = self.classification_head(classification_output)
        # Flatten logits to [batch-size, num-classes].
classification_logits = classification_logits.view(-1, self.num_classes)
return classification_logits
def state_dict_for_save_checkpoint(self, destination=None, prefix='',
keep_vars=False):
"""For easy load when model is combined with other heads,
add an extra key."""
state_dict_ = {}
state_dict_[self._language_model_key] \
= self.language_model.state_dict_for_save_checkpoint(
destination, prefix, keep_vars)
state_dict_[self._classification_head_key] \
= self.classification_head.state_dict(
destination, prefix, keep_vars)
return state_dict_
def load_state_dict(self, state_dict, strict=True):
"""Customized load."""
self.language_model.load_state_dict(
state_dict[self._language_model_key], strict=strict)
if self._classification_head_key in state_dict:
self.classification_head.load_state_dict(
state_dict[self._classification_head_key], strict=strict)
else:
print_rank_0('***WARNING*** could not find {} in the checkpoint, '
'initializing to random'.format(
self._classification_head_key))
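
A shape sketch for Classification.forward; the sizes are illustrative, and the snippet assumes megatron has already been initialized (initialize_megatron) so that get_args() and the model-parallel state are available.

# Illustrative shapes only; requires an initialized Megatron environment.
import torch

batch_size, seq_length, num_classes = 4, 128, 3
input_ids = torch.randint(0, 30000, (batch_size, seq_length)).cuda()
attention_mask = torch.ones(batch_size, seq_length, dtype=torch.long).cuda()
tokentype_ids = torch.zeros(batch_size, seq_length, dtype=torch.long).cuda()

model = Classification(num_classes=num_classes).cuda()
logits = model(input_ids, attention_mask, tokentype_ids)
assert logits.shape == (batch_size, num_classes)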
@@ -17,6 +17,7 @@

 import torch

+from megatron import get_args
 from megatron.module import MegatronModule
 from .language_model import parallel_lm_logits
@@ -34,53 +35,24 @@ def gpt2_attention_mask_func(attention_scores, ltor_mask):
 class GPT2Model(MegatronModule):
     """GPT-2 Language model."""

-    def __init__(self,
-                 num_layers,
-                 vocab_size,
-                 hidden_size,
-                 num_attention_heads,
-                 embedding_dropout_prob,
-                 attention_dropout_prob,
-                 output_dropout_prob,
-                 max_sequence_length,
-                 checkpoint_activations,
-                 checkpoint_num_layers=1,
-                 layernorm_epsilon=1.0e-5,
-                 init_method_std=0.02,
-                 num_tokentypes=0,
-                 parallel_output=True,
-                 apply_query_key_layer_scaling=False,
-                 attention_softmax_in_fp32=False):
+    def __init__(self, num_tokentypes=0, parallel_output=True):
         super(GPT2Model, self).__init__()
+        args = get_args()

         self.parallel_output = parallel_output

         self.language_model, self._language_model_key = get_language_model(
-            num_layers=num_layers,
-            vocab_size=vocab_size,
-            hidden_size=hidden_size,
-            num_attention_heads=num_attention_heads,
-            embedding_dropout_prob=embedding_dropout_prob,
-            attention_dropout_prob=attention_dropout_prob,
-            output_dropout_prob=output_dropout_prob,
-            max_sequence_length=max_sequence_length,
+            attention_mask_func=gpt2_attention_mask_func,
             num_tokentypes=num_tokentypes,
             add_pooler=False,
-            attention_mask_func=gpt2_attention_mask_func,
-            checkpoint_activations=checkpoint_activations,
-            checkpoint_num_layers=checkpoint_num_layers,
-            layernorm_epsilon=layernorm_epsilon,
-            init_method=init_method_normal(init_method_std),
-            scaled_init_method=scaled_init_method_normal(init_method_std,
-                                                         num_layers),
-            residual_connection_post_layernorm=False,
-            apply_query_key_layer_scaling=apply_query_key_layer_scaling,
-            attention_softmax_in_fp32=attention_softmax_in_fp32)
+            init_method=init_method_normal(args.init_method_std),
+            scaled_init_method=scaled_init_method_normal(args.init_method_std,
+                                                         args.num_layers))

     def forward(self, input_ids, position_ids, attention_mask,
-                tokentype_ids=None, layer_past=None, get_key_value=False):
+                tokentype_ids=None, layer_past=None, get_key_value=False,
+                forward_method_parallel_output=None):

         # Language model.
         lm_output = self.language_model(input_ids,
@@ -94,10 +66,13 @@ class GPT2Model(MegatronModule):
             lm_output, presents = lm_output

         # Output.
+        parallel_output = self.parallel_output
+        if forward_method_parallel_output is not None:
+            parallel_output = forward_method_parallel_output
         output = parallel_lm_logits(
             lm_output,
             self.language_model.embedding.word_embeddings.weight,
-            self.parallel_output)
+            parallel_output)

         if get_key_value:
             output = [output, presents]
@@ -18,13 +18,13 @@

 import torch
 import torch.nn.functional as F

+from megatron import get_args
 from megatron import mpu
 from megatron.module import MegatronModule
-from .transformer import ParallelTransformer
-from .transformer import TransformerHyperparameters
-from .utils import gelu
-from .utils import get_linear_layer
+from megatron.model.transformer import ParallelTransformer
+from megatron.model.utils import gelu
+from megatron.model.utils import get_linear_layer


 def parallel_lm_logits(input_, word_embeddings_weight, parallel_output,
@@ -40,52 +40,20 @@ def parallel_lm_logits(input_, word_embeddings_weight, parallel_output,
     # Gather if needed.
     if parallel_output:
         return logits_parallel
-    else:
-        return mpu.gather_from_model_parallel_region(logits_parallel)
+
+    return mpu.gather_from_model_parallel_region(logits_parallel)


-def get_language_model(num_layers,
-                       vocab_size,
-                       hidden_size,
-                       num_attention_heads,
-                       embedding_dropout_prob,
-                       attention_dropout_prob,
-                       output_dropout_prob,
-                       max_sequence_length,
-                       num_tokentypes,
-                       attention_mask_func,
-                       add_pooler,
-                       checkpoint_activations,
-                       checkpoint_num_layers,
-                       layernorm_epsilon,
-                       init_method,
-                       scaled_init_method,
-                       residual_connection_post_layernorm,
-                       apply_query_key_layer_scaling,
-                       attention_softmax_in_fp32):
-
-    # Transformer hyperparameters.
-    transformer_hparams = TransformerHyperparameters(
-        hidden_size=hidden_size,
-        num_layers=num_layers,
-        num_attention_heads=num_attention_heads,
-        attention_dropout_prob=attention_dropout_prob,
-        output_dropout_prob=output_dropout_prob,
-        mlp_activation_func=gelu,
-        layernorm_epsilon=layernorm_epsilon,
-        init_method=init_method,
-        output_layer_init_method=scaled_init_method,
-        checkpoint_activations=checkpoint_activations,
-        checkpoint_num_layers=checkpoint_num_layers,
-        apply_residual_connection_post_layernorm=residual_connection_post_layernorm,
-        apply_query_key_layer_scaling=apply_query_key_layer_scaling,
-        attention_softmax_in_fp32=attention_softmax_in_fp32)
+def get_language_model(attention_mask_func, num_tokentypes, add_pooler,
+                       init_method, scaled_init_method):
+    """Build language model and return along with the key to save."""

     # Language model.
     language_model = TransformerLanguageModel(
-        transformer_hparams=transformer_hparams,
         attention_mask_func=attention_mask_func,
-        vocab_size=vocab_size,
-        max_sequence_length=max_sequence_length,
-        embedding_dropout_prob=embedding_dropout_prob,
+        mlp_activation_func=gelu,
+        init_method=init_method,
+        output_layer_init_method=scaled_init_method,
         num_tokentypes=num_tokentypes,
         add_pooler=add_pooler)

     # key used for checkpoints.
@@ -293,33 +261,33 @@ class TransformerLanguageModel(MegatronModule):
         will ignore this embedding
     """

     def __init__(self,
-                 transformer_hparams,
                  attention_mask_func,
-                 vocab_size,
-                 max_sequence_length,
-                 embedding_dropout_prob,
+                 mlp_activation_func,
+                 init_method,
+                 output_layer_init_method,
                  num_tokentypes=0,
                  add_pooler=False):
         super(TransformerLanguageModel, self).__init__()
+        args = get_args()

-        self.hidden_size = transformer_hparams['hidden_size']
+        self.hidden_size = args.hidden_size
         self.num_tokentypes = num_tokentypes
-        self.init_method = transformer_hparams['init_method']
+        self.init_method = init_method
         self.add_pooler = add_pooler

         # Embeddings
         self.embedding = Embedding(self.hidden_size,
-                                   vocab_size,
-                                   max_sequence_length,
-                                   embedding_dropout_prob,
+                                   args.padded_vocab_size,
+                                   args.max_position_embeddings,
+                                   args.hidden_dropout,
                                    self.init_method,
                                    self.num_tokentypes)
         self._embedding_key = 'embedding'

         # Transformer
         self.transformer = ParallelTransformer(
-            transformer_hparams,
-            attention_mask_func)
+            attention_mask_func, mlp_activation_func,
+            self.init_method, output_layer_init_method)
         self._transformer_key = 'transformer'

         # Pooler
# coding=utf-8
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Multiple choice model."""
import torch
from megatron import get_args
from megatron.model.bert_model import bert_attention_mask_func
from megatron.model.bert_model import bert_extended_attention_mask
from megatron.model.bert_model import bert_position_ids
from megatron.model.language_model import get_language_model
from megatron.model.utils import get_linear_layer
from megatron.model.utils import init_method_normal
from megatron.model.utils import scaled_init_method_normal
from megatron.module import MegatronModule
from megatron import print_rank_0
class MultipleChoice(MegatronModule):
def __init__(self, num_tokentypes=2):
super(MultipleChoice, self).__init__()
args = get_args()
init_method = init_method_normal(args.init_method_std)
self.language_model, self._language_model_key = get_language_model(
attention_mask_func=bert_attention_mask_func,
num_tokentypes=num_tokentypes,
add_pooler=True,
init_method=init_method,
scaled_init_method=scaled_init_method_normal(args.init_method_std,
args.num_layers))
# Multi-choice head.
self.multichoice_dropout = torch.nn.Dropout(args.hidden_dropout)
self.multichoice_head = get_linear_layer(args.hidden_size, 1,
init_method)
self._multichoice_head_key = 'multichoice_head'
def forward(self, input_ids, attention_mask, tokentype_ids):
# [batch, choices, sequence] --> [batch * choices, sequence] -->
# transformer --> [batch, choices] --> softmax
# Ensure the shape is [batch-size, choices, sequence]
assert len(input_ids.shape) == 3
assert len(attention_mask.shape) == 3
assert len(tokentype_ids.shape) == 3
# Reshape and treat choice dimension the same as batch.
num_choices = input_ids.shape[1]
input_ids = input_ids.view(-1, input_ids.size(-1))
attention_mask = attention_mask.view(-1, attention_mask.size(-1))
tokentype_ids = tokentype_ids.view(-1, tokentype_ids.size(-1))
extended_attention_mask = bert_extended_attention_mask(
attention_mask, next(self.language_model.parameters()).dtype)
position_ids = bert_position_ids(input_ids)
_, pooled_output = self.language_model(input_ids,
position_ids,
extended_attention_mask,
tokentype_ids=tokentype_ids)
# Output.
multichoice_output = self.multichoice_dropout(pooled_output)
multichoice_logits = self.multichoice_head(multichoice_output)
# Reshape back to separate choices.
multichoice_logits = multichoice_logits.view(-1, num_choices)
return multichoice_logits
def state_dict_for_save_checkpoint(self, destination=None, prefix='',
keep_vars=False):
"""For easy load when model is combined with other heads,
add an extra key."""
state_dict_ = {}
state_dict_[self._language_model_key] \
= self.language_model.state_dict_for_save_checkpoint(
destination, prefix, keep_vars)
state_dict_[self._multichoice_head_key] \
= self.multichoice_head.state_dict(
destination, prefix, keep_vars)
return state_dict_
def load_state_dict(self, state_dict, strict=True):
"""Customized load."""
self.language_model.load_state_dict(
state_dict[self._language_model_key], strict=strict)
if self._multichoice_head_key in state_dict:
self.multichoice_head.load_state_dict(
state_dict[self._multichoice_head_key], strict=strict)
else:
print_rank_0('***WARNING*** could not find {} in the checkpoint, '
'initializing to random'.format(
self._multichoice_head_key))
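
The reshape described in the comment inside MultipleChoice.forward, shown on plain tensors with illustrative sizes: the choices dimension is folded into the batch for the transformer, then restored for the logits.

import torch

batch_size, num_choices, seq_length = 4, 2, 128
input_ids = torch.randint(0, 30000, (batch_size, num_choices, seq_length))
flat = input_ids.view(-1, input_ids.size(-1))      # [batch * choices, seq]
assert flat.shape == (batch_size * num_choices, seq_length)
# ... transformer + pooler + head produce one logit per flattened row ...
logits = torch.randn(batch_size * num_choices, 1)
logits = logits.view(-1, num_choices)              # [batch, choices]
assert logits.shape == (batch_size, num_choices)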
@@ -20,6 +20,7 @@ import math

 import torch
 from apex.normalization.fused_layer_norm import FusedLayerNorm as LayerNorm

+from megatron import get_args
 from megatron import mpu
 from megatron.module import MegatronModule
@@ -45,85 +46,6 @@ from megatron.module import MegatronModule
        unmaksed-attention-scores, attention-mask)
 """

-class TransformerHyperparameters:
-    """Hyperparameters used to build and run the transformer.
-
-    Arguments:
-        hidden_size: hidden size (h)
-        num_layers: number of layers (l)
-        num_attention_heads: number of attention heads (n)
-        attention_dropout_prob: dropout probability for the attention
-                                probabiliies
-        output_dropout_prob: dropout probability for the output
-                             layers (attention output and mlp output)
-        mlp_activation_func: activation function for the mlp layer
-        layernorm_epsilon: tolerance parameters used for layer norm
-                           dividions
-        init_method: init method used for all weights except layer
-                     norm and output weights
-        output_layer_init_method: init method for output weights (
-                                  attention output and mlp output)
-        checkpoint_activations: flag to use activation checkpointing
-        checkpoint_num_layers: number of layers use in each chunk of
-                               activation checkpointing
-        apply_residual_connection_post_layernorm: Take the post layer-norm
-            values for resudual connecton. BERT: True, GPT-2: False
-    """
-
-    def __init__(self,
-                 hidden_size=None,
-                 num_layers=None,
-                 num_attention_heads=None,
-                 attention_dropout_prob=None,
-                 output_dropout_prob=None,
-                 mlp_activation_func=None,
-                 layernorm_epsilon=None,
-                 init_method=None,
-                 output_layer_init_method=None,
-                 checkpoint_activations=None,
-                 checkpoint_num_layers=None,
-                 apply_residual_connection_post_layernorm=None,
-                 apply_query_key_layer_scaling=None,
-                 attention_softmax_in_fp32=None):
-
-        self.params_dict = {}
-        self.params_dict['hidden_size'] = hidden_size
-        self.params_dict['num_layers'] = num_layers
-        self.params_dict['num_attention_heads'] = num_attention_heads
-        self.params_dict['attention_dropout_prob'] = attention_dropout_prob
-        self.params_dict['output_dropout_prob'] = output_dropout_prob
-        self.params_dict['mlp_activation_func'] = mlp_activation_func
-        self.params_dict['layernorm_epsilon'] = layernorm_epsilon
-        self.params_dict['init_method'] = init_method
-        self.params_dict['output_layer_init_method'] = output_layer_init_method
-        self.params_dict['checkpoint_activations'] = checkpoint_activations
-        self.params_dict['checkpoint_num_layers'] = checkpoint_num_layers
-        self.params_dict['apply_residual_connection_post_layernorm'] \
-            = apply_residual_connection_post_layernorm
-        self.params_dict['apply_query_key_layer_scaling'] \
-            = apply_query_key_layer_scaling
-        self.params_dict['attention_softmax_in_fp32'] \
-            = attention_softmax_in_fp32
-
-    def __getitem__(self, key):
-        """Custom retrieval with error checks."""
-        try:
-            value = self.params_dict[key]
-        except KeyError:
-            raise Exception(
-                'could not find {} in transformer hyperparameters'.format(key))
-        except Exception as e:
-            print('unexpected error in transformer hyperparameters:', e)
-            raise Exception()
-        else:
-            assert value is not None, \
-                'parameter value for {} is not set in transformer '\
-                'hyperparameters'.format(key)
-            return value
-        raise Exception('should not be here')
-
 class ParallelMLP(MegatronModule):
     """MLP.
@@ -133,26 +55,28 @@ class ParallelMLP(MegatronModule):
     applied.
     """

-    def __init__(self, hyperparameters):
+    def __init__(self, mlp_activation_func, init_method,
+                 output_layer_init_method):
         super(ParallelMLP, self).__init__()
+        args = get_args()

         # Project to 4h.
         self.dense_h_to_4h = mpu.ColumnParallelLinear(
-            hyperparameters['hidden_size'],
-            4*hyperparameters['hidden_size'],
+            args.hidden_size,
+            4*args.hidden_size,
             gather_output=False,
-            init_method=hyperparameters['init_method'])
+            init_method=init_method)

-        self.activation_func = hyperparameters['mlp_activation_func']
+        self.activation_func = mlp_activation_func

         # Project back to h.
         self.dense_4h_to_h = mpu.RowParallelLinear(
-            4*hyperparameters['hidden_size'],
-            hyperparameters['hidden_size'],
+            4*args.hidden_size,
+            args.hidden_size,
             input_is_parallel=True,
-            init_method=hyperparameters['output_layer_init_method'])
+            init_method=output_layer_init_method)

-        self.dropout = torch.nn.Dropout(hyperparameters['output_dropout_prob'])
+        self.dropout = torch.nn.Dropout(args.hidden_dropout)

     def forward(self, hidden_states):
@@ -174,51 +98,47 @@ class ParallelSelfAttention(MegatronModule):
     Self-attention layer takes input with size [b, s, h]
     and returns output of the same size.
     """

-    def __init__(self, hyperparameters, attention_mask_func, layer_number):
+    def __init__(self, attention_mask_func, init_method,
+                 output_layer_init_method, layer_number):
         super(ParallelSelfAttention, self).__init__()
+        args = get_args()

         self.attention_mask_func = attention_mask_func
-        self.apply_query_key_layer_scaling \
-            = hyperparameters['apply_query_key_layer_scaling']
-        self.attention_softmax_in_fp32 \
-            = hyperparameters['attention_softmax_in_fp32']
+        self.apply_query_key_layer_scaling = args.apply_query_key_layer_scaling
+        self.attention_softmax_in_fp32 = args.attention_softmax_in_fp32
         if self.apply_query_key_layer_scaling:
             self.attention_softmax_in_fp32 = True
         self.layer_number = max(1, layer_number)

         # Per attention head and per partition values.
         world_size = mpu.get_model_parallel_world_size()
-        self.hidden_size_per_partition = mpu.divide(
-            hyperparameters['hidden_size'], world_size)
+        self.hidden_size_per_partition = mpu.divide(args.hidden_size,
+                                                    world_size)
         self.hidden_size_per_attention_head = mpu.divide(
-            hyperparameters['hidden_size'],
-            hyperparameters['num_attention_heads'])
+            args.hidden_size, args.num_attention_heads)
         self.num_attention_heads_per_partition = mpu.divide(
-            hyperparameters['num_attention_heads'], world_size)
+            args.num_attention_heads, world_size)

         # Strided linear layer.
         self.query_key_value = mpu.ColumnParallelLinear(
-            hyperparameters['hidden_size'],
-            3*hyperparameters['hidden_size'],
+            args.hidden_size,
+            3*args.hidden_size,
             stride=3,
             gather_output=False,
-            init_method=hyperparameters['init_method'])
+            init_method=init_method)

         # Dropout. Note that for a single iteration, this layer will generate
         # different outputs on different number of parallel partitions but
         # on average it should not be partition dependent.
-        self.attention_dropout = torch.nn.Dropout(
-            hyperparameters['attention_dropout_prob'])
+        self.attention_dropout = torch.nn.Dropout(args.attention_dropout)

         # Output.
         self.dense = mpu.RowParallelLinear(
-            hyperparameters['hidden_size'],
-            hyperparameters['hidden_size'],
+            args.hidden_size,
+            args.hidden_size,
             input_is_parallel=True,
-            init_method=hyperparameters['output_layer_init_method'])
-        self.output_dropout = torch.nn.Dropout(
-            hyperparameters['output_dropout_prob'])
+            init_method=output_layer_init_method)
+        self.output_dropout = torch.nn.Dropout(args.hidden_dropout)

     def _transpose_for_scores(self, tensor):
@@ -369,30 +289,34 @@ class ParallelTransformerLayer(MegatronModule):
     Transformore layer takes input with size [b, s, h] and returns an
     output of the same size.
     """

-    def __init__(self, hyperparameters, attention_mask_func, layer_number):
+    def __init__(self, attention_mask_func, mlp_activation_func,
+                 init_method, output_layer_init_method, layer_number):
+        args = get_args()

         super(ParallelTransformerLayer, self).__init__()
         self.layer_number = layer_number

         self.apply_residual_connection_post_layernorm \
-            = hyperparameters['apply_residual_connection_post_layernorm']
+            = args.apply_residual_connection_post_layernorm

         # Layernorm on the input data.
         self.input_layernorm = LayerNorm(
-            hyperparameters['hidden_size'],
-            eps=hyperparameters['layernorm_epsilon'])
+            args.hidden_size,
+            eps=args.layernorm_epsilon)

         # Self attention.
-        self.attention = ParallelSelfAttention(
-            hyperparameters, attention_mask_func, layer_number)
+        self.attention = ParallelSelfAttention(attention_mask_func, init_method,
+                                               output_layer_init_method,
+                                               layer_number)

         # Layernorm on the input data.
         self.post_attention_layernorm = LayerNorm(
-            hyperparameters['hidden_size'],
-            eps=hyperparameters['layernorm_epsilon'])
+            args.hidden_size,
+            eps=args.layernorm_epsilon)

         # MLP
-        self.mlp = ParallelMLP(hyperparameters)
+        self.mlp = ParallelMLP(mlp_activation_func, init_method,
+                               output_layer_init_method)

     def forward(self, hidden_states, attention_mask, layer_past=None,
@@ -434,25 +358,28 @@ class ParallelTransformerLayer(MegatronModule):
 class ParallelTransformer(MegatronModule):
     """Transformer class."""

-    def __init__(self, hyperparameters, attention_mask_func):
+    def __init__(self, attention_mask_func, mlp_activation_func,
+                 init_method, output_layer_init_method):
         super(ParallelTransformer, self).__init__()
+        args = get_args()

         # Store activation checkpoiting flag.
-        self.checkpoint_activations = hyperparameters['checkpoint_activations']
-        self.checkpoint_num_layers = hyperparameters['checkpoint_num_layers']
+        self.checkpoint_activations = args.checkpoint_activations
+        self.checkpoint_num_layers = args.checkpoint_num_layers

         def get_layer(layer_number):
             return ParallelTransformerLayer(
-                hyperparameters, attention_mask_func, layer_number)
+                attention_mask_func, mlp_activation_func,
+                init_method, output_layer_init_method, layer_number)

         # Transformer layers.
         self.layers = torch.nn.ModuleList(
-            [get_layer(i+1) for i in range(hyperparameters['num_layers'])])
+            [get_layer(i+1) for i in range(args.num_layers)])

         # Final layer norm before output.
         self.final_layernorm = LayerNorm(
-            hyperparameters['hidden_size'],
-            eps=hyperparameters['layernorm_epsilon'])
+            args.hidden_size,
+            eps=args.layernorm_epsilon)

     def _checkpointed_forward(self, hidden_states, attention_mask):
@@ -13,7 +13,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Megatron Module"""
 import torch
# coding=utf-8
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .tokenizer import build_tokenizer
# coding=utf-8
# Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization classes for OpenAI GPT."""
from __future__ import (absolute_import, division, print_function,
unicode_literals)
import sys
import json
import logging
import os
import regex as re
from io import open
try:
from functools import lru_cache
except ImportError:
# Just a dummy decorator to get the checks to run on python2
# because honestly I don't want to support a byte-level unicode BPE tokenizer on python 2 right now.
def lru_cache():
return lambda func: func
logger = logging.getLogger(__name__)
PRETRAINED_VOCAB_ARCHIVE_MAP = {
'gpt2': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json",
}
PRETRAINED_MERGES_ARCHIVE_MAP = {
'gpt2': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt",
}
PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP = {
'gpt2': 1024,
}
VOCAB_NAME = 'vocab.json'
MERGES_NAME = 'merges.txt'
SPECIAL_TOKENS_NAME = 'special_tokens.txt'
@lru_cache()
def bytes_to_unicode():
"""
Returns list of utf-8 byte and a corresponding list of unicode strings.
The reversible bpe codes work on unicode strings.
This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
    This is a significant percentage of your normal, say, 32K bpe vocab.
To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
And avoids mapping to whitespace/control characters the bpe code barfs on.
"""
_chr = unichr if sys.version_info[0] == 2 else chr
bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
cs = bs[:]
n = 0
for b in range(2**8):
if b not in bs:
bs.append(b)
cs.append(2**8+n)
n += 1
cs = [_chr(n) for n in cs]
return dict(zip(bs, cs))
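
A quick check of the mapping built above: all 256 byte values get printable characters, and the mapping round-trips arbitrary UTF-8 text (the word 'héllo' is just an illustration).

byte_encoder = bytes_to_unicode()
byte_decoder = {v: k for k, v in byte_encoder.items()}
assert len(byte_encoder) == 256
mapped = ''.join(byte_encoder[b] for b in 'héllo'.encode('utf-8'))
restored = bytearray(byte_decoder[c] for c in mapped).decode('utf-8')
assert restored == 'héllo'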
def get_pairs(word):
"""Return set of symbol pairs in a word.
Word is represented as tuple of symbols (symbols being variable-length strings).
"""
pairs = set()
prev_char = word[0]
for char in word[1:]:
pairs.add((prev_char, char))
prev_char = char
return pairs
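
For example, get_pairs over a short symbol tuple yields the adjacent pairs that the BPE loop below ranks and merges:

assert get_pairs(('l', 'o', 'w')) == {('l', 'o'), ('o', 'w')}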
class GPT2Tokenizer(object):
"""
GPT-2 BPE tokenizer. Peculiarities:
- Byte-level BPE
"""
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, *inputs, **kwargs):
"""
Instantiate a PreTrainedBertModel from a pre-trained model file.
Download and cache the pre-trained model file if needed.
"""
if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP:
vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[pretrained_model_name_or_path]
merges_file = PRETRAINED_MERGES_ARCHIVE_MAP[pretrained_model_name_or_path]
special_tokens_file = None
else:
vocab_file = os.path.join(pretrained_model_name_or_path, VOCAB_NAME)
merges_file = os.path.join(pretrained_model_name_or_path, MERGES_NAME)
special_tokens_file = os.path.join(pretrained_model_name_or_path, SPECIAL_TOKENS_NAME)
if not os.path.exists(special_tokens_file):
special_tokens_file = None
else:
logger.info("loading special tokens file {}".format(special_tokens_file))
# redirect to the cache, if necessary
try:
from .file_utils import cached_path
resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir)
resolved_merges_file = cached_path(merges_file, cache_dir=cache_dir)
except EnvironmentError:
logger.error(
"Model name '{}' was not found in model name list ({}). "
"We assumed '{}' was a path or url but couldn't find files {} and {} "
"at this path or url.".format(
pretrained_model_name_or_path,
', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()),
pretrained_model_name_or_path,
vocab_file, merges_file))
return None
if resolved_vocab_file == vocab_file and resolved_merges_file == merges_file:
logger.info("loading vocabulary file {}".format(vocab_file))
logger.info("loading merges file {}".format(merges_file))
else:
logger.info("loading vocabulary file {} from cache at {}".format(
vocab_file, resolved_vocab_file))
logger.info("loading merges file {} from cache at {}".format(
merges_file, resolved_merges_file))
if pretrained_model_name_or_path in PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP:
# if we're using a pretrained model, ensure the tokenizer wont index sequences longer
# than the number of positional embeddings
max_len = PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP[pretrained_model_name_or_path]
kwargs['max_len'] = min(kwargs.get('max_len', int(1e12)), max_len)
# Instantiate tokenizer.
if special_tokens_file and 'special_tokens' not in kwargs:
special_tokens = open(special_tokens_file, encoding='utf-8').read().split('\n')[:-1]
else:
special_tokens = kwargs.pop('special_tokens', [])
tokenizer = cls(resolved_vocab_file, resolved_merges_file, special_tokens=special_tokens, *inputs, **kwargs)
return tokenizer
def __init__(self, vocab_file, merges_file, errors='replace', special_tokens=None, max_len=None):
self.max_len = max_len if max_len is not None else int(1e12)
self.encoder = json.load(open(vocab_file))
self.decoder = {v:k for k,v in self.encoder.items()}
self.errors = errors # how to handle errors in decoding
self.byte_encoder = bytes_to_unicode()
self.byte_decoder = {v:k for k, v in self.byte_encoder.items()}
bpe_data = open(merges_file, encoding='utf-8').read().split('\n')[1:-1]
bpe_merges = [tuple(merge.split()) for merge in bpe_data]
self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
self.cache = {}
        # Should have added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions
self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
self.special_tokens = {}
self.special_tokens_decoder = {}
self.set_special_tokens(special_tokens)
def __len__(self):
return len(self.encoder) + len(self.special_tokens)
def set_special_tokens(self, special_tokens):
""" Add a list of additional tokens to the encoder.
The additional tokens are indexed starting from the last index of the
current vocabulary in the order of the `special_tokens` list.
"""
if not special_tokens:
self.special_tokens = {}
self.special_tokens_decoder = {}
return
self.special_tokens = dict((tok, len(self.encoder) + i) for i, tok in enumerate(special_tokens))
self.special_tokens_decoder = {v:k for k, v in self.special_tokens.items()}
logger.info("Special tokens {}".format(self.special_tokens))
def bpe(self, token):
if token in self.cache:
return self.cache[token]
word = tuple(token)
pairs = get_pairs(word)
if not pairs:
return token
while True:
bigram = min(pairs, key = lambda pair: self.bpe_ranks.get(pair, float('inf')))
if bigram not in self.bpe_ranks:
break
first, second = bigram
new_word = []
i = 0
while i < len(word):
try:
j = word.index(first, i)
new_word.extend(word[i:j])
i = j
except:
new_word.extend(word[i:])
break
if word[i] == first and i < len(word)-1 and word[i+1] == second:
new_word.append(first+second)
i += 2
else:
new_word.append(word[i])
i += 1
new_word = tuple(new_word)
word = new_word
if len(word) == 1:
break
else:
pairs = get_pairs(word)
word = ' '.join(word)
self.cache[token] = word
return word
def tokenize(self, text):
""" Tokenize a string. """
bpe_tokens = []
for token in re.findall(self.pat, text):
if sys.version_info[0] == 2:
token = ''.join(self.byte_encoder[ord(b)] for b in token)
else:
token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8'))
bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(' '))
return bpe_tokens
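    # Note (added for clarity): tokenize() returns BPE token *strings* in the
    # byte-level alphabet produced by bytes_to_unicode(), e.g. a leading space
    # is encoded as the 'Ġ' character; use convert_tokens_to_ids() or encode()
    # to map them to integer ids.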
def convert_tokens_to_ids(self, tokens):
""" Converts a sequence of tokens into ids using the vocab. """
ids = []
if isinstance(tokens, str) or (sys.version_info[0] == 2 and isinstance(tokens, unicode)):
if tokens in self.special_tokens:
return self.special_tokens[tokens]
else:
return self.encoder.get(tokens, 0)
for token in tokens:
if token in self.special_tokens:
ids.append(self.special_tokens[token])
else:
ids.append(self.encoder.get(token, 0))
if len(ids) > self.max_len:
            logger.warning(
                "Token indices sequence length is longer than the specified maximum "
                "sequence length for this OpenAI GPT-2 model ({} > {}). Running this "
                "sequence through the model will result in indexing errors".format(len(ids), self.max_len)
            )
return ids
def convert_ids_to_tokens(self, ids, skip_special_tokens=False):
"""Converts a sequence of ids in BPE tokens using the vocab."""
tokens = []
for i in ids:
if i in self.special_tokens_decoder:
if not skip_special_tokens:
tokens.append(self.special_tokens_decoder[i])
else:
tokens.append(self.decoder[i])
return tokens
def encode(self, text):
return self.convert_tokens_to_ids(self.tokenize(text))
def decode(self, tokens):
text = ''.join([self.decoder[token] for token in tokens])
text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors=self.errors)
return text
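    # Illustrative round trip (assuming standard GPT-2 vocab/merges files):
    #   ids = tokenizer.encode("Hello world")
    #   assert tokenizer.decode(ids) == "Hello world"
    # decode() maps ids back to byte-level symbols via self.decoder and then
    # reverses the byte encoding, so lossless recovery is expected for text
    # covered by the vocabulary; note it does not look up special-token ids.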
def save_vocabulary(self, vocab_path):
"""Save the tokenizer vocabulary and merge files to a directory."""
if not os.path.isdir(vocab_path):
logger.error("Vocabulary path ({}) should be a directory".format(vocab_path))
return
vocab_file = os.path.join(vocab_path, VOCAB_NAME)
merge_file = os.path.join(vocab_path, MERGES_NAME)
special_tokens_file = os.path.join(vocab_path, SPECIAL_TOKENS_NAME)
with open(vocab_file, 'w', encoding='utf-8') as f:
f.write(json.dumps(self.encoder, ensure_ascii=False))
index = 0
with open(merge_file, "w", encoding="utf-8") as writer:
writer.write(u'#version: 0.2\n')
for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
if index != token_index:
logger.warning("Saving vocabulary to {}: BPE merge indices are not consecutive."
" Please check that the tokenizer is not corrupted!".format(merge_file))
index = token_index
writer.write(' '.join(bpe_tokens) + u'\n')
index += 1
index = len(self.encoder)
with open(special_tokens_file, 'w', encoding='utf-8') as writer:
for token, token_index in sorted(self.special_tokens.items(), key=lambda kv: kv[1]):
if index != token_index:
logger.warning("Saving special tokens vocabulary to {}: BPE indices are not consecutive."
" Please check that the tokenizer is not corrupted!".format(special_tokens_file))
index = token_index
writer.write(token + u'\n')
index += 1
return vocab_file, merge_file, special_tokens_file
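# Usage sketch (illustrative only; file paths are hypothetical):
#   tokenizer = GPT2Tokenizer('gpt2-vocab.json', 'gpt2-merges.txt',
#                             special_tokens=['<SEP>'])
#   ids = tokenizer.encode("Hello world")
#   text = tokenizer.decode(ids)
#   tokenizer.save_vocabulary('/tmp/gpt2-tok')  # target directory must already exist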
# coding=utf-8
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Megatron tokenizers."""
from abc import ABC
from abc import abstractmethod
from .bert_tokenization import FullTokenizer as FullBertTokenizer
from .gpt2_tokenization import GPT2Tokenizer
def build_tokenizer(args):
"""Initialize tokenizer."""
if args.rank == 0:
print('> building {} tokenizer ...'.format(args.tokenizer_type),
flush=True)
# Select and instantiate the tokenizer.
assert args.vocab_file is not None
if args.tokenizer_type == 'BertWordPieceLowerCase':
tokenizer = _BertWordPieceTokenizer(vocab_file=args.vocab_file,
lower_case=True)
elif args.tokenizer_type == 'GPT2BPETokenizer':
assert args.merge_file is not None
tokenizer = _GPT2BPETokenizer(args.vocab_file, args.merge_file)
else:
raise NotImplementedError('{} tokenizer is not '
'implemented.'.format(args.tokenizer_type))
# Add vocab size.
args.padded_vocab_size = _vocab_size_with_padding(tokenizer.vocab_size,
args)
return tokenizer
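# Minimal sketch (illustrative): build_tokenizer() only needs an args object
# exposing rank, tokenizer_type, vocab_file, merge_file,
# make_vocab_size_divisible_by and model_parallel_size, e.g. the namespace
# returned by megatron's parse_args(); it also writes args.padded_vocab_size
# back onto args as a side effect.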
def _vocab_size_with_padding(orig_vocab_size, args):
"""Pad vocab size so it is divisible by model parallel size and
still having GPU friendly size."""
after = orig_vocab_size
multiple = args.make_vocab_size_divisible_by * \
args.model_parallel_size
while (after % multiple) != 0:
after += 1
if args.rank == 0:
print(' > padded vocab (size: {}) with {} dummy tokens '
'(new size: {})'.format(
orig_vocab_size, after - orig_vocab_size, after), flush=True)
return after
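# Worked example (illustrative): with the GPT-2 vocabulary (50,257 tokens),
# make_vocab_size_divisible_by=128 and model_parallel_size=8, the multiple is
# 1024 and the vocabulary is padded to 51,200, i.e. 943 dummy tokens are added.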
class AbstractTokenizer(ABC):
"""Abstract class for tokenizer."""
def __init__(self, name):
self.name = name
super().__init__()
@property
@abstractmethod
def vocab_size(self):
pass
@property
@abstractmethod
def vocab(self):
"""Dictionary from vocab text token to id token."""
pass
@property
@abstractmethod
def inv_vocab(self):
"""Dictionary from vocab id token to text token."""
pass
@abstractmethod
def tokenize(self, text):
pass
def detokenize(self, token_ids):
raise NotImplementedError('detokenizer is not implemented for {} '
'tokenizer'.format(self.name))
@property
def cls(self):
raise NotImplementedError('CLS is not provided for {} '
'tokenizer'.format(self.name))
@property
def sep(self):
raise NotImplementedError('SEP is not provided for {} '
'tokenizer'.format(self.name))
@property
def pad(self):
raise NotImplementedError('PAD is not provided for {} '
'tokenizer'.format(self.name))
@property
def eod(self):
raise NotImplementedError('EOD is not provided for {} '
'tokenizer'.format(self.name))
@property
def mask(self):
raise NotImplementedError('MASK is not provided for {} '
'tokenizer'.format(self.name))
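# Note (added for clarity): concrete tokenizers must implement vocab_size,
# vocab, inv_vocab and tokenize; detokenize and the special-token properties
# (cls, sep, pad, eod, mask) are optional and raise NotImplementedError by
# default, so callers should only rely on the ones their tokenizer provides.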
class _BertWordPieceTokenizer(AbstractTokenizer):
"""Original BERT wordpiece tokenizer."""
def __init__(self, vocab_file, lower_case=True):
if lower_case:
name = 'BERT Lower Case'
else:
name = 'BERT Upper Case'
super().__init__(name)
self.tokenizer = FullBertTokenizer(vocab_file, do_lower_case=lower_case)
self.cls_id = self.tokenizer.vocab['[CLS]']
self.sep_id = self.tokenizer.vocab['[SEP]']
self.pad_id = self.tokenizer.vocab['[PAD]']
self.mask_id = self.tokenizer.vocab['[MASK]']
@property
def vocab_size(self):
return self.tokenizer.vocab_size()
@property
def vocab(self):
return self.tokenizer.vocab
@property
def inv_vocab(self):
return self.tokenizer.inv_vocab
def tokenize(self, text):
text_tokens = self.tokenizer.tokenize(text)
return self.tokenizer.convert_tokens_to_ids(text_tokens)
@property
def cls(self):
return self.cls_id
@property
def sep(self):
return self.sep_id
@property
def pad(self):
return self.pad_id
@property
def mask(self):
return self.mask_id
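# Note (added for clarity): unlike GPT2Tokenizer.tokenize(), which returns BPE
# token strings, _BertWordPieceTokenizer.tokenize() already returns integer ids
# (it runs convert_tokens_to_ids internally), so no separate encode step is
# needed downstream.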
class _GPT2BPETokenizer(AbstractTokenizer):
"""Original GPT2 BPE tokenizer."""
def __init__(self, vocab_file, merge_file):
name = 'GPT2 BPE'
super().__init__(name)
self.tokenizer = GPT2Tokenizer(vocab_file, merge_file, errors='replace',
special_tokens=[], max_len=None)
self.eod_id = self.tokenizer.encoder['<|endoftext|>']
@property
def vocab_size(self):
return len(self.tokenizer.encoder)
@property
def vocab(self):
return self.tokenizer.encoder
@property
def inv_vocab(self):
return self.tokenizer.decoder
def tokenize(self, text):
return self.tokenizer.encode(text)
def detokenize(self, token_ids):
return self.tokenizer.decode(token_ids)
@property
def eod(self):
return self.eod_id
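# Usage sketch (illustrative; file names are hypothetical):
#   tokenizer = _GPT2BPETokenizer('gpt2-vocab.json', 'gpt2-merges.txt')
#   ids = tokenizer.tokenize('Hello world')
#   text = tokenizer.detokenize(ids)
#   end_of_document_id = tokenizer.eod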