Commit 6c521eb1 authored by Jared Casper

Merge branch 'staging_args' into 'staging'

Major refactor: args, global variables, tokenizer

See merge request ADLR/megatron-lm!36
parents 37ff534f b07f1d0a
# coding=utf-8
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Megatron global variables."""
import os
import sys
import time
import torch
from megatron.tokenizer import build_tokenizer
from .arguments import parse_args
_GLOBAL_ARGS = None
_GLOBAL_TOKENIZER = None
_GLOBAL_TENSORBOARD_WRITER = None
_GLOBAL_ADLR_AUTORESUME = None
_GLOBAL_TIMERS = None
def get_args():
    """Return arguments."""
    _ensure_var_is_initialized(_GLOBAL_ARGS, 'args')
    return _GLOBAL_ARGS


def get_tokenizer():
    """Return tokenizer."""
    _ensure_var_is_initialized(_GLOBAL_TOKENIZER, 'tokenizer')
    return _GLOBAL_TOKENIZER


def get_tensorboard_writer():
    """Return tensorboard writer. It can be None so no need
    to check if it is initialized."""
    return _GLOBAL_TENSORBOARD_WRITER


def get_adlr_autoresume():
    """ADLR autoresume object. It can be None so no need
    to check if it is initialized."""
    return _GLOBAL_ADLR_AUTORESUME


def get_timers():
    """Return timers."""
    _ensure_var_is_initialized(_GLOBAL_TIMERS, 'timers')
    return _GLOBAL_TIMERS


def set_global_variables(extra_args_provider=None, args_defaults={}):
    """Set args, tokenizer, tensorboard-writer, adlr-autoresume, and timers."""
    args = _parse_args(extra_args_provider=extra_args_provider,
                       defaults=args_defaults)
    _build_tokenizer(args)
    _set_tensorboard_writer(args)
    _set_adlr_autoresume(args)
    _set_timers()


def _parse_args(extra_args_provider=None, defaults={}):
    """Parse entire arguments."""
    global _GLOBAL_ARGS
    _ensure_var_is_not_initialized(_GLOBAL_ARGS, 'args')
    _GLOBAL_ARGS = parse_args(extra_args_provider=extra_args_provider,
                              defaults=defaults)
    return _GLOBAL_ARGS


def _build_tokenizer(args):
    """Initialize tokenizer."""
    global _GLOBAL_TOKENIZER
    _ensure_var_is_not_initialized(_GLOBAL_TOKENIZER, 'tokenizer')
    _GLOBAL_TOKENIZER = build_tokenizer(args)


def _set_tensorboard_writer(args):
    """Set tensorboard writer."""
    global _GLOBAL_TENSORBOARD_WRITER
    _ensure_var_is_not_initialized(_GLOBAL_TENSORBOARD_WRITER,
                                   'tensorboard writer')
    if hasattr(args, 'tensorboard_dir') and \
       args.tensorboard_dir and args.rank == 0:
        try:
            from torch.utils.tensorboard import SummaryWriter
            print('> setting tensorboard ...')
            _GLOBAL_TENSORBOARD_WRITER = SummaryWriter(
                log_dir=args.tensorboard_dir)
        except ModuleNotFoundError:
            print('WARNING: TensorBoard writing requested but is not '
                  'available (are you using PyTorch 1.1.0 or later?), '
                  'no TensorBoard logs will be written.', flush=True)


def _set_adlr_autoresume(args):
    """Initialize ADLR autoresume."""
    global _GLOBAL_ADLR_AUTORESUME
    _ensure_var_is_not_initialized(_GLOBAL_ADLR_AUTORESUME, 'adlr autoresume')

    if args.adlr_autoresume:
        if args.rank == 0:
            print('enabling autoresume ...', flush=True)
        sys.path.append(os.environ.get('SUBMIT_SCRIPTS', '.'))
        try:
            from userlib.auto_resume import AutoResume
        except ImportError:
            print('ADLR autoresume is not available, exiting ...')
            sys.exit()
        _GLOBAL_ADLR_AUTORESUME = AutoResume


def _set_timers():
    """Initialize timers."""
    global _GLOBAL_TIMERS
    _ensure_var_is_not_initialized(_GLOBAL_TIMERS, 'timers')
    _GLOBAL_TIMERS = Timers()


def _ensure_var_is_initialized(var, name):
    """Make sure the input variable is not None."""
    assert var is not None, '{} is not initialized.'.format(name)


def _ensure_var_is_not_initialized(var, name):
    """Make sure the input variable is None (i.e. not yet initialized)."""
    assert var is None, '{} is already initialized.'.format(name)
class _Timer:
    """Timer."""

    def __init__(self, name):
        self.name_ = name
        self.elapsed_ = 0.0
        self.started_ = False
        self.start_time = time.time()

    def start(self):
        """Start the timer."""
        assert not self.started_, 'timer has already been started'
        torch.cuda.synchronize()
        self.start_time = time.time()
        self.started_ = True

    def stop(self):
        """Stop the timer."""
        assert self.started_, 'timer is not started'
        torch.cuda.synchronize()
        self.elapsed_ += (time.time() - self.start_time)
        self.started_ = False

    def reset(self):
        """Reset timer."""
        self.elapsed_ = 0.0
        self.started_ = False

    def elapsed(self, reset=True):
        """Calculate the elapsed time."""
        started_ = self.started_
        # If timing is in progress, end it first.
        if self.started_:
            self.stop()
        # Get the elapsed time.
        elapsed_ = self.elapsed_
        # Reset the elapsed time.
        if reset:
            self.reset()
        # If timing was in progress, restart it.
        if started_:
            self.start()
        return elapsed_


class Timers:
    """Group of timers."""

    def __init__(self):
        self.timers = {}

    def __call__(self, name):
        if name not in self.timers:
            self.timers[name] = _Timer(name)
        return self.timers[name]

    def write(self, names, writer, iteration, normalizer=1.0, reset=False):
        """Write timers to a tensorboard writer."""
        # Currently, torch.utils.tensorboard's add_scalars makes each timer
        # its own run, which pollutes the runs list, so we just add each
        # timer as a scalar.
        assert normalizer > 0.0
        for name in names:
            value = self.timers[name].elapsed(reset=reset) / normalizer
            writer.add_scalar(name + '_time', value, iteration)

    def log(self, names, normalizer=1.0, reset=True):
        """Log a group of timers."""
        assert normalizer > 0.0
        string = 'time (ms)'
        for name in names:
            elapsed_time = self.timers[name].elapsed(
                reset=reset) * 1000.0 / normalizer
            string += ' | {}: {:.2f}'.format(name, elapsed_time)
        if torch.distributed.is_initialized():
            if torch.distributed.get_rank() == 0:
                print(string, flush=True)
        else:
            print(string, flush=True)
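
A minimal usage sketch of the Timers API above, assuming a CUDA-capable environment (since _Timer.start()/stop() call torch.cuda.synchronize()) and that this module is importable as megatron.global_vars:

from megatron.global_vars import Timers

timers = Timers()
timers('data loading').start()
# ... build or fetch a batch here ...
timers('data loading').stop()
timers('forward').start()
# ... run the forward pass here ...
timers('forward').stop()
# Prints e.g. "time (ms) | data loading: 1.23 | forward: 4.56" on rank 0,
# or unconditionally if torch.distributed is not initialized.
timers.log(['data loading', 'forward'], normalizer=1.0)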
# coding=utf-8
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Megatron initialization."""
import random
import os
import numpy as np
import torch
from megatron import get_adlr_autoresume
from megatron import get_args
from megatron import get_tensorboard_writer
from megatron import mpu
from megatron.global_vars import set_global_variables
def initialize_megatron(extra_args_provider=None, args_defaults={}):
    """Set global variables, initialize distributed, and
    set autoresume and random seeds."""
    # Make sure cuda is available.
    assert torch.cuda.is_available(), 'Megatron requires CUDA.'

    # Parse args, build tokenizer, and set adlr-autoresume,
    # tensorboard-writer, and timers.
    set_global_variables(extra_args_provider=extra_args_provider,
                         args_defaults=args_defaults)

    # Pytorch distributed.
    _initialize_distributed()

    # Autoresume.
    _init_autoresume()

    # Random seeds for reproducibility.
    args = get_args()
    if args.rank == 0:
        print('> setting random seeds to {} ...'.format(args.seed))
    _set_random_seed(args.seed)

    # Write arguments to tensorboard.
    _write_args_to_tensorboard()


def _initialize_distributed():
    """Initialize torch.distributed and mpu."""
    args = get_args()

    if torch.distributed.is_initialized():
        if args.rank == 0:
            print('torch distributed is already initialized, '
                  'skipping initialization ...', flush=True)
        args.rank = torch.distributed.get_rank()
        args.world_size = torch.distributed.get_world_size()
        device = torch.cuda.current_device()
        local_rank = args.rank % torch.cuda.device_count()
        assert local_rank == device, \
            'expected local-rank to be the same as rank % device-count.'
    else:
        if args.rank == 0:
            print('> initializing torch distributed ...', flush=True)
        # Manually set the device ids.
        device = args.rank % torch.cuda.device_count()
        if args.local_rank is not None:
            assert args.local_rank == device, \
                'expected local-rank to be the same as rank % device-count.'
        else:
            args.local_rank = device
        torch.cuda.set_device(device)
        # Call the init process.
        init_method = 'tcp://'
        master_ip = os.getenv('MASTER_ADDR', 'localhost')
        master_port = os.getenv('MASTER_PORT', '6000')
        init_method += master_ip + ':' + master_port
        torch.distributed.init_process_group(
            backend=args.distributed_backend,
            world_size=args.world_size, rank=args.rank,
            init_method=init_method)

    # Set the model-parallel / data-parallel communicators.
    mpu.initialize_model_parallel(args.model_parallel_size)


def _init_autoresume():
    """Set autoresume start time."""
    autoresume = get_adlr_autoresume()
    if autoresume:
        torch.distributed.barrier()
        autoresume.init()
        torch.distributed.barrier()


def _set_random_seed(seed):
    """Set random seed for reproducibility."""
    if seed is not None and seed > 0:
        random.seed(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)
        mpu.model_parallel_cuda_manual_seed(seed)
    else:
        raise ValueError('Seed ({}) should be a positive integer.'.format(seed))


def _write_args_to_tensorboard():
    """Write arguments to tensorboard."""
    args = get_args()
    writer = get_tensorboard_writer()
    if writer:
        for arg in vars(args):
            writer.add_text(arg, str(getattr(args, arg)))
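
A sketch of how a training script drives the initialization path above, assuming the usual Megatron command-line flags are on sys.argv, MASTER_ADDR/MASTER_PORT are exported, the module is importable as megatron.initialize, and megatron/__init__.py re-exports the accessors defined in global_vars (the diffs below already import get_args and print_rank_0 from megatron):

from megatron import get_args, get_timers, get_tokenizer
from megatron.initialize import initialize_megatron   # assumed module path

initialize_megatron(args_defaults={'tokenizer_type': 'GPT2BPETokenizer'})

args = get_args()          # parsed once, then shared through the global store
tokenizer = get_tokenizer()
timers = get_timers()
print(args.rank, args.world_size, tokenizer.vocab_size)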
@@ -12,59 +12,68 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Learning rate decay functions."""

import math

from megatron import print_rank_0


class AnnealingLR(object):
    """Anneals the learning rate."""

    def __init__(self, optimizer, start_lr,
                 warmup_iter, total_iters,
                 decay_style, last_iter, min_lr=0.0,
                 use_checkpoint_lr_scheduler=True,
                 override_lr_scheduler=False):

        # Class values.
        self.optimizer = optimizer
        self.start_lr = start_lr
        self.min_lr = min_lr
        self.warmup_iter = warmup_iter
        self.num_iters = last_iter
        self.end_iter = total_iters
        assert self.end_iter > 0
        self.decay_style = decay_style
        self.override_lr_scheduler = override_lr_scheduler
        self.use_checkpoint_lr_scheduler = use_checkpoint_lr_scheduler
        if self.override_lr_scheduler:
            assert not self.use_checkpoint_lr_scheduler, 'both override and '\
                'use-checkpoint are set.'
        # Set the learning rate
        self.step(self.num_iters)

        print_rank_0('> learning rate decay style: {}'.format(self.decay_style))

    def get_lr(self):
        """Learning rate decay functions from:
              https://openreview.net/pdf?id=BJYwwY9ll pg. 4"""

        num_iters_ = min(self.num_iters, self.end_iter - self.warmup_iter)
        # Warmup.
        if self.warmup_iter > 0 and self.num_iters <= self.warmup_iter:
            return float(self.start_lr) * num_iters_ / self.warmup_iter

        num_iters_ = num_iters_ - self.warmup_iter
        if self.decay_style == 'linear':
            lr = self.start_lr * (self.end_iter - num_iters_) / self.end_iter
        elif self.decay_style == 'cosine':
            lr = self.start_lr / 2.0 * (math.cos(
                math.pi * num_iters_ / self.end_iter) + 1)
        elif self.decay_style == 'exponential':
            # exp(-0.693) = 1/2
            lr = self.start_lr * math.exp(-0.693 * num_iters_ / self.end_iter)
        else:
            lr = self.start_lr
        return max(lr, self.min_lr)

    def step(self, step_num=None):
        """Set lr for all parameters groups."""
        if step_num is None:
            step_num = self.num_iters + 1
        self.num_iters = step_num

@@ -72,42 +81,46 @@
        for group in self.optimizer.param_groups:
            group['lr'] = new_lr

    def state_dict(self):
        state_dict = {
            'start_lr': self.start_lr,
            'warmup_iter': self.warmup_iter,
            'num_iters': self.num_iters,
            'decay_style': self.decay_style,
            'end_iter': self.end_iter,
            'min_lr': self.min_lr
        }
        return state_dict

    def _check_and_set(self, cls_value, sd_value, name):
        """Auxiliary function for checking the values in the checkpoint and
        setting them."""
        if self.override_lr_scheduler:
            print_rank_0(' > overriding {} value to {}'.format(name, cls_value))
            return cls_value

        if not self.use_checkpoint_lr_scheduler:
            assert cls_value == sd_value, 'AnnealingLR: class input value' \
                'and checkpoint values for {} do not match'.format(name)
        print_rank_0(' > using checkpoint value {} for {}'.format(sd_value,
                                                                  name))
        return sd_value

    def load_state_dict(self, sd):
        self.start_lr = self._check_and_set(self.start_lr, sd['start_lr'],
                                            'learning rate')
        self.min_lr = self._check_and_set(self.min_lr, sd['min_lr'],
                                          'minimum learning rate')
        self.warmup_iter = self._check_and_set(self.warmup_iter,
                                               sd['warmup_iter'],
                                               'warmup iterations')
        self.end_iter = self._check_and_set(self.end_iter, sd['end_iter'],
                                            'total number of iterations')
        self.decay_style = self._check_and_set(self.decay_style,
                                               sd['decay_style'],
                                               'decay style')
...
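A small self-contained check of the schedule above against a dummy optimizer; the module path is an assumption based on the repository layout, and the expected values follow directly from the warmup and cosine formulas shown:

import torch
from megatron.learning_rates import AnnealingLR   # assumed module path

opt = torch.optim.SGD([torch.nn.Parameter(torch.zeros(1))], lr=1.0)
sched = AnnealingLR(opt, start_lr=1.0, warmup_iter=100, total_iters=1000,
                    decay_style='cosine', last_iter=0)
sched.step(50)     # inside warmup: lr = 1.0 * 50 / 100 = 0.5
sched.step(550)    # past warmup: lr = 0.5 * (cos(pi * 450 / 1000) + 1) ~= 0.58
print(opt.param_groups[0]['lr'])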
@@ -17,6 +17,7 @@

import torch

from megatron import get_args
from megatron.module import MegatronModule

from .language_model import parallel_lm_logits

@@ -106,60 +107,33 @@ class BertLMHead(MegatronModule):

class BertModel(MegatronModule):
    """Bert Language model."""

    def __init__(self, num_tokentypes=2, add_binary_head=True,
                 parallel_output=True):
        super(BertModel, self).__init__()
        args = get_args()

        self.add_binary_head = add_binary_head
        self.parallel_output = parallel_output
        init_method = init_method_normal(args.init_method_std)
        scaled_init_method = scaled_init_method_normal(args.init_method_std,
                                                       args.num_layers)

        self.language_model, self._language_model_key = get_language_model(
            attention_mask_func=bert_attention_mask_func,
            num_tokentypes=num_tokentypes,
            add_pooler=self.add_binary_head,
            init_method=init_method,
            scaled_init_method=scaled_init_method)

        self.lm_head = BertLMHead(
            self.language_model.embedding.word_embeddings.weight.size(0),
            args.hidden_size, init_method, args.layernorm_epsilon,
            parallel_output)
        self._lm_head_key = 'lm_head'

        if self.add_binary_head:
            self.binary_head = get_linear_layer(args.hidden_size, 2,
                                                init_method)
            self._binary_head_key = 'binary_head'
...
@@ -17,6 +17,7 @@

import torch

from megatron import get_args
from megatron.model.bert_model import bert_attention_mask_func
from megatron.model.bert_model import bert_extended_attention_mask
from megatron.model.bert_model import bert_position_ids

@@ -25,59 +26,29 @@ from megatron.model.utils import get_linear_layer
from megatron.model.utils import init_method_normal
from megatron.model.utils import scaled_init_method_normal
from megatron.module import MegatronModule
from megatron import print_rank_0


class Classification(MegatronModule):

    def __init__(self, num_classes, num_tokentypes=2):
        super(Classification, self).__init__()
        args = get_args()

        self.num_classes = num_classes
        init_method = init_method_normal(args.init_method_std)

        self.language_model, self._language_model_key = get_language_model(
            attention_mask_func=bert_attention_mask_func,
            num_tokentypes=num_tokentypes,
            add_pooler=True,
            init_method=init_method,
            scaled_init_method=scaled_init_method_normal(args.init_method_std,
                                                         args.num_layers))

        # Multi-choice head.
        self.classification_dropout = torch.nn.Dropout(args.hidden_dropout)
        self.classification_head = get_linear_layer(args.hidden_size,
                                                    self.num_classes,
                                                    init_method)
        self._classification_head_key = 'classification_head'
...
@@ -17,6 +17,7 @@

import torch

from megatron import get_args
from megatron.module import MegatronModule

from .language_model import parallel_lm_logits

@@ -34,49 +35,19 @@ def gpt2_attention_mask_func(attention_scores, ltor_mask):

class GPT2Model(MegatronModule):
    """GPT-2 Language model."""

    def __init__(self, num_tokentypes=0, parallel_output=True):
        super(GPT2Model, self).__init__()
        args = get_args()

        self.parallel_output = parallel_output

        self.language_model, self._language_model_key = get_language_model(
            attention_mask_func=gpt2_attention_mask_func,
            num_tokentypes=num_tokentypes,
            add_pooler=False,
            init_method=init_method_normal(args.init_method_std),
            scaled_init_method=scaled_init_method_normal(args.init_method_std,
                                                         args.num_layers))

    def forward(self, input_ids, position_ids, attention_mask,
...
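The point of the signature changes above is that model constructors now pull their hyperparameters from the global args store instead of long argument lists. A hedged construction sketch, assuming initialize_megatron() has already run (so get_args() and mpu are set up) and that the module paths match the megatron.model imports shown in this merge:

from megatron import get_args
from megatron.model.bert_model import BertModel     # paths assumed
from megatron.model.gpt2_model import GPT2Model

args = get_args()   # hidden_size, num_layers, hidden_dropout, ... live here
gpt2 = GPT2Model(num_tokentypes=0, parallel_output=True)
bert = BertModel(num_tokentypes=2, add_binary_head=True)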
@@ -18,13 +18,13 @@

import torch
import torch.nn.functional as F

from megatron import get_args
from megatron import mpu
from megatron.module import MegatronModule
from megatron.model.transformer import ParallelTransformer
from megatron.model.utils import gelu
from megatron.model.utils import get_linear_layer


def parallel_lm_logits(input_, word_embeddings_weight, parallel_output,

@@ -40,52 +40,20 @@ def parallel_lm_logits(input_, word_embeddings_weight, parallel_output,
    # Gather if needed.
    if parallel_output:
        return logits_parallel

    return mpu.gather_from_model_parallel_region(logits_parallel)


def get_language_model(attention_mask_func, num_tokentypes, add_pooler,
                       init_method, scaled_init_method):
    """Build language model and return along with the key to save."""

    # Language model.
    language_model = TransformerLanguageModel(
        attention_mask_func=attention_mask_func,
        mlp_activation_func=gelu,
        init_method=init_method,
        output_layer_init_method=scaled_init_method,
        num_tokentypes=num_tokentypes,
        add_pooler=add_pooler)

    # key used for checkpoints.

@@ -293,33 +261,33 @@ class TransformerLanguageModel(MegatronModule):
       will ignore this embedding
    """

    def __init__(self,
                 attention_mask_func,
                 mlp_activation_func,
                 init_method,
                 output_layer_init_method,
                 num_tokentypes=0,
                 add_pooler=False):
        super(TransformerLanguageModel, self).__init__()
        args = get_args()

        self.hidden_size = args.hidden_size
        self.num_tokentypes = num_tokentypes
        self.init_method = init_method
        self.add_pooler = add_pooler

        # Embeddings
        self.embedding = Embedding(self.hidden_size,
                                   args.padded_vocab_size,
                                   args.max_position_embeddings,
                                   args.hidden_dropout,
                                   self.init_method,
                                   self.num_tokentypes)
        self._embedding_key = 'embedding'

        # Transformer
        self.transformer = ParallelTransformer(
            attention_mask_func, mlp_activation_func,
            self.init_method, output_layer_init_method)
        self._transformer_key = 'transformer'

        # Pooler
...
@@ -17,6 +17,7 @@

import torch

from megatron import get_args
from megatron.model.bert_model import bert_attention_mask_func
from megatron.model.bert_model import bert_extended_attention_mask
from megatron.model.bert_model import bert_position_ids

@@ -25,57 +26,29 @@ from megatron.model.utils import get_linear_layer
from megatron.model.utils import init_method_normal
from megatron.model.utils import scaled_init_method_normal
from megatron.module import MegatronModule
from megatron import print_rank_0


class MultipleChoice(MegatronModule):

    def __init__(self, num_tokentypes=2):
        super(MultipleChoice, self).__init__()
        args = get_args()

        init_method = init_method_normal(args.init_method_std)

        self.language_model, self._language_model_key = get_language_model(
            attention_mask_func=bert_attention_mask_func,
            num_tokentypes=num_tokentypes,
            add_pooler=True,
            init_method=init_method,
            scaled_init_method=scaled_init_method_normal(args.init_method_std,
                                                         args.num_layers))

        # Multi-choice head.
        self.multichoice_dropout = torch.nn.Dropout(args.hidden_dropout)
        self.multichoice_head = get_linear_layer(args.hidden_size, 1,
                                                 init_method)
        self._multichoice_head_key = 'multichoice_head'
...
@@ -20,6 +20,7 @@ import math

import torch
from apex.normalization.fused_layer_norm import FusedLayerNorm as LayerNorm

from megatron import get_args
from megatron import mpu
from megatron.module import MegatronModule

@@ -45,85 +46,6 @@ from megatron.module import MegatronModule
       unmaksed-attention-scores, attention-mask)
    """


class ParallelMLP(MegatronModule):
    """MLP.

@@ -133,26 +55,28 @@ class ParallelMLP(MegatronModule):
    applied.
    """

    def __init__(self, mlp_activation_func, init_method,
                 output_layer_init_method):
        super(ParallelMLP, self).__init__()
        args = get_args()

        # Project to 4h.
        self.dense_h_to_4h = mpu.ColumnParallelLinear(
            args.hidden_size,
            4*args.hidden_size,
            gather_output=False,
            init_method=init_method)

        self.activation_func = mlp_activation_func

        # Project back to h.
        self.dense_4h_to_h = mpu.RowParallelLinear(
            4*args.hidden_size,
            args.hidden_size,
            input_is_parallel=True,
            init_method=output_layer_init_method)

        self.dropout = torch.nn.Dropout(args.hidden_dropout)

    def forward(self, hidden_states):

@@ -174,51 +98,47 @@ class ParallelSelfAttention(MegatronModule):
    Self-attention layer takes input with size [b, s, h]
    and returns output of the same size.
    """

    def __init__(self, attention_mask_func, init_method,
                 output_layer_init_method, layer_number):
        super(ParallelSelfAttention, self).__init__()
        args = get_args()

        self.attention_mask_func = attention_mask_func
        self.apply_query_key_layer_scaling = args.apply_query_key_layer_scaling
        self.attention_softmax_in_fp32 = args.attention_softmax_in_fp32
        if self.apply_query_key_layer_scaling:
            self.attention_softmax_in_fp32 = True
        self.layer_number = max(1, layer_number)

        # Per attention head and per partition values.
        world_size = mpu.get_model_parallel_world_size()
        self.hidden_size_per_partition = mpu.divide(args.hidden_size,
                                                    world_size)
        self.hidden_size_per_attention_head = mpu.divide(
            args.hidden_size, args.num_attention_heads)
        self.num_attention_heads_per_partition = mpu.divide(
            args.num_attention_heads, world_size)

        # Strided linear layer.
        self.query_key_value = mpu.ColumnParallelLinear(
            args.hidden_size,
            3*args.hidden_size,
            stride=3,
            gather_output=False,
            init_method=init_method)

        # Dropout. Note that for a single iteration, this layer will generate
        # different outputs on different number of parallel partitions but
        # on average it should not be partition dependent.
        self.attention_dropout = torch.nn.Dropout(args.attention_dropout)

        # Output.
        self.dense = mpu.RowParallelLinear(
            args.hidden_size,
            args.hidden_size,
            input_is_parallel=True,
            init_method=output_layer_init_method)
        self.output_dropout = torch.nn.Dropout(args.hidden_dropout)

    def _transpose_for_scores(self, tensor):

@@ -369,30 +289,34 @@ class ParallelTransformerLayer(MegatronModule):
    Transformore layer takes input with size [b, s, h] and returns an
    output of the same size.
    """

    def __init__(self, attention_mask_func, mlp_activation_func,
                 init_method, output_layer_init_method, layer_number):
        args = get_args()

        super(ParallelTransformerLayer, self).__init__()
        self.layer_number = layer_number

        self.apply_residual_connection_post_layernorm \
            = args.apply_residual_connection_post_layernorm

        # Layernorm on the input data.
        self.input_layernorm = LayerNorm(
            args.hidden_size,
            eps=args.layernorm_epsilon)

        # Self attention.
        self.attention = ParallelSelfAttention(attention_mask_func, init_method,
                                               output_layer_init_method,
                                               layer_number)

        # Layernorm on the input data.
        self.post_attention_layernorm = LayerNorm(
            args.hidden_size,
            eps=args.layernorm_epsilon)

        # MLP
        self.mlp = ParallelMLP(mlp_activation_func, init_method,
                               output_layer_init_method)

    def forward(self, hidden_states, attention_mask, layer_past=None,

@@ -434,25 +358,28 @@ class ParallelTransformerLayer(MegatronModule):

class ParallelTransformer(MegatronModule):
    """Transformer class."""

    def __init__(self, attention_mask_func, mlp_activation_func,
                 init_method, output_layer_init_method):
        super(ParallelTransformer, self).__init__()
        args = get_args()

        # Store activation checkpoiting flag.
        self.checkpoint_activations = args.checkpoint_activations
        self.checkpoint_num_layers = args.checkpoint_num_layers

        def get_layer(layer_number):
            return ParallelTransformerLayer(
                attention_mask_func, mlp_activation_func,
                init_method, output_layer_init_method, layer_number)

        # Transformer layers.
        self.layers = torch.nn.ModuleList(
            [get_layer(i+1) for i in range(args.num_layers)])

        # Final layer norm before output.
        self.final_layernorm = LayerNorm(
            args.hidden_size,
            eps=args.layernorm_epsilon)

    def _checkpointed_forward(self, hidden_states, attention_mask):
...
@@ -13,7 +13,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.

"""Megatron Module"""

import torch
...
# coding=utf-8
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .tokenizer import build_tokenizer
# coding=utf-8
# Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization classes for OpenAI GPT."""
from __future__ import (absolute_import, division, print_function,
unicode_literals)
import sys
import json
import logging
import os
import regex as re
from io import open
try:
    from functools import lru_cache
except ImportError:
    # Just a dummy decorator to get the checks to run on python2
    # because honestly I don't want to support a byte-level unicode BPE
    # tokenizer on python 2 right now.
    def lru_cache():
        return lambda func: func
logger = logging.getLogger(__name__)
PRETRAINED_VOCAB_ARCHIVE_MAP = {
'gpt2': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json",
}
PRETRAINED_MERGES_ARCHIVE_MAP = {
'gpt2': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt",
}
PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP = {
'gpt2': 1024,
}
VOCAB_NAME = 'vocab.json'
MERGES_NAME = 'merges.txt'
SPECIAL_TOKENS_NAME = 'special_tokens.txt'
@lru_cache()
def bytes_to_unicode():
    """
    Returns list of utf-8 byte and a corresponding list of unicode strings.
    The reversible bpe codes work on unicode strings.
    This means you need a large # of unicode characters in your vocab if you
    want to avoid UNKs. When you're at something like a 10B token dataset you
    end up needing around 5K for decent coverage. This is a significant
    percentage of your normal, say, 32K bpe vocab. To avoid that, we want
    lookup tables between utf-8 bytes and unicode strings that avoid mapping
    to whitespace/control characters the bpe code barfs on.
    """
    _chr = unichr if sys.version_info[0] == 2 else chr
    bs = list(range(ord("!"), ord("~")+1)) + \
        list(range(ord("¡"), ord("¬")+1)) + \
        list(range(ord("®"), ord("ÿ")+1))
    cs = bs[:]
    n = 0
    for b in range(2**8):
        if b not in bs:
            bs.append(b)
            cs.append(2**8+n)
            n += 1
    cs = [_chr(n) for n in cs]
    return dict(zip(bs, cs))
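
# A quick concrete check of the mapping built above (illustrative, not part
# of the tokenizer): printable ASCII maps to itself, while bytes the BPE code
# would choke on (e.g. the space byte 0x20) are shifted to unused code points
# starting at 256.
#   b2u = bytes_to_unicode()
#   b2u[ord('A')] == 'A'
#   b2u[ord(' ')] == '\u0120'   # 'Ġ', which is why GPT-2 tokens often start with it
#   len(b2u) == 256 and len(set(b2u.values())) == 256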
def get_pairs(word):
    """Return set of symbol pairs in a word.

    Word is represented as tuple of symbols (symbols being variable-length
    strings).
    """
    pairs = set()
    prev_char = word[0]
    for char in word[1:]:
        pairs.add((prev_char, char))
        prev_char = char
    return pairs
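
# Example of the pairs fed to the BPE merge loop below:
#   get_pairs(('l', 'o', 'w')) -> {('l', 'o'), ('o', 'w')}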
class GPT2Tokenizer(object):
    """
    GPT-2 BPE tokenizer. Peculiarities:
        - Byte-level BPE
    """

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None,
                        *inputs, **kwargs):
        """
        Instantiate a PreTrainedBertModel from a pre-trained model file.
        Download and cache the pre-trained model file if needed.
        """
        if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP:
            vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[pretrained_model_name_or_path]
            merges_file = PRETRAINED_MERGES_ARCHIVE_MAP[pretrained_model_name_or_path]
            special_tokens_file = None
        else:
            vocab_file = os.path.join(pretrained_model_name_or_path, VOCAB_NAME)
            merges_file = os.path.join(pretrained_model_name_or_path, MERGES_NAME)
            special_tokens_file = os.path.join(pretrained_model_name_or_path,
                                               SPECIAL_TOKENS_NAME)
            if not os.path.exists(special_tokens_file):
                special_tokens_file = None
            else:
                logger.info("loading special tokens file {}".format(
                    special_tokens_file))
        # redirect to the cache, if necessary
        try:
            from .file_utils import cached_path
            resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir)
            resolved_merges_file = cached_path(merges_file, cache_dir=cache_dir)
        except EnvironmentError:
            logger.error(
                "Model name '{}' was not found in model name list ({}). "
                "We assumed '{}' was a path or url but couldn't find files {} "
                "and {} at this path or url.".format(
                    pretrained_model_name_or_path,
                    ', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()),
                    pretrained_model_name_or_path,
                    vocab_file, merges_file))
            return None
        if resolved_vocab_file == vocab_file and \
                resolved_merges_file == merges_file:
            logger.info("loading vocabulary file {}".format(vocab_file))
            logger.info("loading merges file {}".format(merges_file))
        else:
            logger.info("loading vocabulary file {} from cache at {}".format(
                vocab_file, resolved_vocab_file))
            logger.info("loading merges file {} from cache at {}".format(
                merges_file, resolved_merges_file))
        if pretrained_model_name_or_path in \
                PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP:
            # if we're using a pretrained model, ensure the tokenizer won't
            # index sequences longer than the number of positional embeddings
            max_len = PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP[
                pretrained_model_name_or_path]
            kwargs['max_len'] = min(kwargs.get('max_len', int(1e12)), max_len)
        # Instantiate tokenizer.
        if special_tokens_file and 'special_tokens' not in kwargs:
            special_tokens = open(special_tokens_file,
                                  encoding='utf-8').read().split('\n')[:-1]
        else:
            special_tokens = kwargs.pop('special_tokens', [])
        tokenizer = cls(resolved_vocab_file, resolved_merges_file,
                        special_tokens=special_tokens, *inputs, **kwargs)
        return tokenizer

    def __init__(self, vocab_file, merges_file, errors='replace',
                 special_tokens=None, max_len=None):
        self.max_len = max_len if max_len is not None else int(1e12)
        self.encoder = json.load(open(vocab_file))
        self.decoder = {v: k for k, v in self.encoder.items()}
        self.errors = errors  # how to handle errors in decoding
        self.byte_encoder = bytes_to_unicode()
        self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
        bpe_data = open(merges_file, encoding='utf-8').read().split('\n')[1:-1]
        bpe_merges = [tuple(merge.split()) for merge in bpe_data]
        self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
        self.cache = {}

        # Should have added re.IGNORECASE so BPE merges can happen for
        # capitalized versions of contractions.
        self.pat = re.compile(
            r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")

        self.special_tokens = {}
        self.special_tokens_decoder = {}
        self.set_special_tokens(special_tokens)

    def __len__(self):
        return len(self.encoder) + len(self.special_tokens)
    def set_special_tokens(self, special_tokens):
        """Add a list of additional tokens to the encoder.
        The additional tokens are indexed starting from the last index of the
        current vocabulary in the order of the `special_tokens` list.
        """
        if not special_tokens:
            self.special_tokens = {}
            self.special_tokens_decoder = {}
            return
        self.special_tokens = dict((tok, len(self.encoder) + i)
                                   for i, tok in enumerate(special_tokens))
        self.special_tokens_decoder = {v: k
                                       for k, v in self.special_tokens.items()}
        logger.info("Special tokens {}".format(self.special_tokens))

    def bpe(self, token):
        if token in self.cache:
            return self.cache[token]
        word = tuple(token)
        pairs = get_pairs(word)

        if not pairs:
            return token

        while True:
            bigram = min(pairs,
                         key=lambda pair: self.bpe_ranks.get(pair, float('inf')))
            if bigram not in self.bpe_ranks:
                break
            first, second = bigram
            new_word = []
            i = 0
            while i < len(word):
                try:
                    j = word.index(first, i)
                    new_word.extend(word[i:j])
                    i = j
                except ValueError:
                    new_word.extend(word[i:])
                    break

                if word[i] == first and i < len(word)-1 and word[i+1] == second:
                    new_word.append(first+second)
                    i += 2
                else:
                    new_word.append(word[i])
                    i += 1
            new_word = tuple(new_word)
            word = new_word
            if len(word) == 1:
                break
            else:
                pairs = get_pairs(word)
        word = ' '.join(word)
        self.cache[token] = word
        return word
def tokenize(self, text):
""" Tokenize a string. """
bpe_tokens = []
for token in re.findall(self.pat, text):
if sys.version_info[0] == 2:
token = ''.join(self.byte_encoder[ord(b)] for b in token)
else:
token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8'))
bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(' '))
return bpe_tokens
def convert_tokens_to_ids(self, tokens):
""" Converts a sequence of tokens into ids using the vocab. """
ids = []
if isinstance(tokens, str) or (sys.version_info[0] == 2 and isinstance(tokens, unicode)):
if tokens in self.special_tokens:
return self.special_tokens[tokens]
else:
return self.encoder.get(tokens, 0)
for token in tokens:
if token in self.special_tokens:
ids.append(self.special_tokens[token])
else:
ids.append(self.encoder.get(token, 0))
if len(ids) > self.max_len:
logger.warning(
"Token indices sequence length is longer than the specified maximum "
" sequence length for this OpenAI GPT model ({} > {}). Running this"
" sequence through the model will result in indexing errors".format(len(ids), self.max_len)
)
return ids
def convert_ids_to_tokens(self, ids, skip_special_tokens=False):
"""Converts a sequence of ids in BPE tokens using the vocab."""
tokens = []
for i in ids:
if i in self.special_tokens_decoder:
if not skip_special_tokens:
tokens.append(self.special_tokens_decoder[i])
else:
tokens.append(self.decoder[i])
return tokens
def encode(self, text):
return self.convert_tokens_to_ids(self.tokenize(text))
def decode(self, tokens):
text = ''.join([self.decoder[token] for token in tokens])
text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors=self.errors)
return text
def save_vocabulary(self, vocab_path):
"""Save the tokenizer vocabulary and merge files to a directory."""
if not os.path.isdir(vocab_path):
logger.error("Vocabulary path ({}) should be a directory".format(vocab_path))
return
vocab_file = os.path.join(vocab_path, VOCAB_NAME)
merge_file = os.path.join(vocab_path, MERGES_NAME)
special_tokens_file = os.path.join(vocab_path, SPECIAL_TOKENS_NAME)
with open(vocab_file, 'w', encoding='utf-8') as f:
f.write(json.dumps(self.encoder, ensure_ascii=False))
index = 0
with open(merge_file, "w", encoding="utf-8") as writer:
writer.write(u'#version: 0.2\n')
for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
if index != token_index:
logger.warning("Saving vocabulary to {}: BPE merge indices are not consecutive."
" Please check that the tokenizer is not corrupted!".format(merge_file))
index = token_index
writer.write(' '.join(bpe_tokens) + u'\n')
index += 1
index = len(self.encoder)
with open(special_tokens_file, 'w', encoding='utf-8') as writer:
for token, token_index in sorted(self.special_tokens.items(), key=lambda kv: kv[1]):
if index != token_index:
logger.warning("Saving special tokens vocabulary to {}: BPE indices are not consecutive."
" Please check that the tokenizer is not corrupted!".format(special_tokens_file))
index = token_index
writer.write(token + u'\n')
index += 1
return vocab_file, merge_file, special_tokens_file
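For reference, a minimal usage sketch of this tokenizer class, exercising the encode/decode round trip. It assumes the module lives at megatron/tokenizer/gpt2_tokenization.py (as imported by the tokenizer module below), and the vocab/merge file names are placeholders for the standard GPT-2 files, which are not part of this change:

# Hypothetical usage sketch; 'gpt2-vocab.json' and 'gpt2-merges.txt' are
# assumed paths to the standard GPT-2 vocabulary and merge files.
from megatron.tokenizer.gpt2_tokenization import GPT2Tokenizer

tokenizer = GPT2Tokenizer('gpt2-vocab.json', 'gpt2-merges.txt',
                          errors='replace', special_tokens=[], max_len=None)
ids = tokenizer.encode('Hello world')   # text -> byte-level BPE ids
text = tokenizer.decode(ids)            # ids -> text, lossless for this input
assert text == 'Hello world'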
# coding=utf-8
"""Megatron tokenizer.""" # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Megatron tokenizers."""
from abc import ABC
from abc import abstractmethod
from .bert_tokenization import FullTokenizer as FullBertTokenizer
from .gpt2_tokenization import GPT2Tokenizer
def build_tokenizer(args):
"""Initialize tokenizer."""
if args.rank == 0:
print('> building {} tokenizer ...'.format(args.tokenizer_type),
flush=True)
# Select and instantiate the tokenizer.
assert args.vocab_file is not None
if args.tokenizer_type == 'BertWordPieceLowerCase':
tokenizer = _BertWordPieceTokenizer(vocab_file=args.vocab_file,
lower_case=True)
elif args.tokenizer_type == 'GPT2BPETokenizer':
assert args.merge_file is not None
tokenizer = _GPT2BPETokenizer(args.vocab_file, args.merge_file)
else:
raise NotImplementedError('{} tokenizer is not '
'implemented.'.format(args.tokenizer_type))
# Add vocab size.
args.padded_vocab_size = _vocab_size_with_padding(tokenizer.vocab_size,
args)
return tokenizer
def _vocab_size_with_padding(orig_vocab_size, args):
"""Pad vocab size so it is divisible by model parallel size and
still having GPU friendly size."""
after = orig_vocab_size
multiple = args.make_vocab_size_divisible_by * \
args.model_parallel_size
while (after % multiple) != 0:
after += 1
if args.rank == 0:
print(' > padded vocab (size: {}) with {} dummy tokens '
'(new size: {})'.format(
orig_vocab_size, after - orig_vocab_size, after), flush=True)
return after
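As a worked illustration of the rounding performed above (the concrete sizes are illustrative, not values taken from this merge request): with the GPT-2 vocabulary of 50257 tokens, make_vocab_size_divisible_by=128 and a model parallel size of 8, the multiple is 1024 and the padded size becomes 51200. A standalone sketch of the same arithmetic:

# Standalone sketch of the padding rule; the sample sizes below are
# hypothetical and only meant to show the arithmetic.
def pad_vocab_size(orig_vocab_size, divisible_by, model_parallel_size):
    multiple = divisible_by * model_parallel_size
    after = orig_vocab_size
    while after % multiple != 0:
        after += 1
    return after

assert pad_vocab_size(50257, 128, 8) == 51200   # rounded up to 50 * 1024
assert pad_vocab_size(50257, 128, 1) == 50304   # rounded up to 393 * 128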
class AbstractTokenizer(ABC):
...@@ -39,6 +75,18 @@ class AbstractTokenizer(ABC):
def vocab_size(self):
pass
@property
@abstractmethod
def vocab(self):
"""Dictionary from vocab text token to id token."""
pass
@property
@abstractmethod
def inv_vocab(self):
"""Dictionary from vocab id token to text token."""
pass
@abstractmethod
def tokenize(self, text):
pass
...@@ -63,6 +111,10 @@ class AbstractTokenizer(ABC):
raise NotImplementedError('EOD is not provided for {} '
'tokenizer'.format(self.name))
@property
def mask(self):
raise NotImplementedError('MASK is not provided for {} '
'tokenizer'.format(self.name))
class _BertWordPieceTokenizer(AbstractTokenizer):
...@@ -78,11 +130,20 @@ class _BertWordPieceTokenizer(AbstractTokenizer):
self.cls_id = self.tokenizer.vocab['[CLS]']
self.sep_id = self.tokenizer.vocab['[SEP]']
self.pad_id = self.tokenizer.vocab['[PAD]']
self.mask_id = self.tokenizer.vocab['[MASK]']
@property
def vocab_size(self):
return self.tokenizer.vocab_size()
@property
def vocab(self):
return self.tokenizer.vocab
@property
def inv_vocab(self):
return self.tokenizer.inv_vocab
def tokenize(self, text):
text_tokens = self.tokenizer.tokenize(text)
return self.tokenizer.convert_tokens_to_ids(text_tokens)
...@@ -98,3 +159,37 @@ class _BertWordPieceTokenizer(AbstractTokenizer):
@property
def pad(self):
return self.pad_id
@property
def mask(self):
return self.mask_id
class _GPT2BPETokenizer(AbstractTokenizer):
"""Original GPT2 BPE tokenizer."""
def __init__(self, vocab_file, merge_file):
name = 'GPT2 BPE'
super().__init__(name)
self.tokenizer = GPT2Tokenizer(vocab_file, merge_file, errors='replace',
special_tokens=[], max_len=None)
self.eod_id = self.tokenizer.encoder['<|endoftext|>']
@property
def vocab_size(self):
return len(self.tokenizer.encoder)
@property
def vocab(self):
return self.tokenizer.encoder
@property
def inv_vocab(self):
return self.tokenizer.decoder
def tokenize(self, text):
return self.tokenizer.encode(text)
@property
def eod(self):
return self.eod_id
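To make the new entry point concrete, here is a minimal, hypothetical sketch of driving build_tokenizer outside the normal argument-parsing path. It assumes build_tokenizer is exported from the megatron.tokenizer package; the Namespace fields mirror the attributes read above, and the file paths and model-parallel settings are placeholders:

# Hypothetical usage sketch; paths and sizes are assumptions,
# not values taken from this merge request.
from argparse import Namespace

from megatron.tokenizer import build_tokenizer

args = Namespace(rank=0,
                 tokenizer_type='GPT2BPETokenizer',
                 vocab_file='gpt2-vocab.json',
                 merge_file='gpt2-merges.txt',
                 make_vocab_size_divisible_by=128,
                 model_parallel_size=1)
tokenizer = build_tokenizer(args)
ids = tokenizer.tokenize('Megatron-LM')          # list of BPE ids
print(tokenizer.vocab_size, args.padded_vocab_size)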
...@@ -18,55 +18,42 @@
import torch
import torch.nn.functional as F
from megatron import get_args
from megatron import get_timers
from megatron import mpu
from megatron import print_rank_0
from megatron.data.bert_dataset import build_train_valid_test_datasets
from megatron.model import BertModel
from megatron.training import pretrain
from megatron.utils import make_data_loader
from megatron.utils import reduce_losses
def model_provider():
"""Build the model."""
args = get_args()
print_rank_0('building BERT model ...')
model = BertModel(
num_tokentypes=2,
add_binary_head=True,
parallel_output=True)
return model
def get_batch(data_iterator):
# Items and their type.
keys = ['text', 'types', 'labels', 'is_random', 'loss_mask', 'padding_mask']
datatype = torch.int64
# Broadcast data.
if data_iterator is not None:
data = next(data_iterator)
else:
data = None
data_b = mpu.broadcast_data(keys, data, datatype)
# Unpack.
...@@ -80,13 +67,14 @@ def get_batch(data_iterator, timers):
return tokens, types, sentence_order, loss_mask, lm_labels, padding_mask
def forward_step(data_iterator, model):
"""Forward step."""
timers = get_timers()
# Get the batch.
timers('batch generator').start()
tokens, types, sentence_order, loss_mask, lm_labels, padding_mask \
= get_batch(data_iterator)
timers('batch generator').stop()
# Forward model.
...@@ -108,8 +96,9 @@ def forward_step(data_iterator, model, args, timers):
return loss, {'lm loss': reduced_losses[0], 'sop loss': reduced_losses[1]}
def get_train_val_test_data():
"""Load the data on rank zero and broadcast number of tokens to all GPUs."""
args = get_args()
(train_data, valid_data, test_data) = (None, None, None)
...@@ -118,17 +107,6 @@ def get_train_val_test_data(args):
print_rank_0('> building train, validation, and test datasets '
'for BERT ...')
data_parallel_size = mpu.get_data_parallel_world_size()
data_parallel_rank = mpu.get_data_parallel_rank()
global_batch_size = args.batch_size * data_parallel_size
...@@ -137,7 +115,7 @@ def get_train_val_test_data(args):
train_iters = args.train_iters
eval_iters = (train_iters // args.eval_interval + 1) * args.eval_iters
test_iters = args.eval_iters
train_val_test_num_samples = [train_iters * global_batch_size,
eval_iters * global_batch_size,
test_iters * global_batch_size]
print_rank_0(' > datasets target sizes (minimum size):')
...@@ -145,10 +123,8 @@ def get_train_val_test_data(args):
print_rank_0(' validation: {}'.format(train_val_test_num_samples[1]))
print_rank_0(' test: {}'.format(train_val_test_num_samples[2]))
train_ds, valid_ds, test_ds = build_train_valid_test_datasets(
data_prefix=args.data_path,
data_impl=args.data_impl,
splits_string=args.split,
train_valid_test_num_samples=train_val_test_num_samples,
...@@ -156,57 +132,34 @@
masked_lm_prob=args.mask_prob,
short_seq_prob=args.short_seq_prob,
seed=args.seed,
skip_warmup=(not args.mmap_warmup))
print_rank_0("> finished creating BERT datasets ...")
train_data = make_data_loader(train_ds)
valid_data = make_data_loader(valid_ds)
test_data = make_data_loader(test_ds)
do_train = train_data is not None and args.train_iters > 0
do_valid = valid_data is not None and args.eval_iters > 0
do_test = test_data is not None and args.eval_iters > 0
# Need to broadcast num_tokens and num_type_tokens.
flags = torch.cuda.LongTensor(
[int(do_train), int(do_valid), int(do_test)])
else:
flags = torch.cuda.LongTensor([0, 0, 0])
# Broadcast num tokens.
torch.distributed.broadcast(flags,
mpu.get_model_parallel_src_rank(),
group=mpu.get_model_parallel_group())
args.do_train = flags[0].item()
args.do_valid = flags[1].item()
args.do_test = flags[2].item()
return train_data, valid_data, test_data
if __name__ == "__main__":
pretrain(get_train_val_test_data, model_provider, forward_step,
args_defaults={'tokenizer_type': 'BertWordPieceLowerCase'})
...@@ -15,55 +15,46 @@
"""Pretrain GPT2"""
import os
import torch
from megatron import get_args
from megatron import get_timers
from megatron import get_tokenizer
from megatron import mpu
from megatron import print_rank_0
from megatron.data.gpt2_dataset import GPT2Dataset
from megatron.model import GPT2Model
from megatron.training import pretrain
from megatron.utils import get_ltor_masks_and_position_ids
from megatron.utils import make_data_loader
from megatron.utils import reduce_losses
def model_provider():
"""Build the model."""
args = get_args()
print_rank_0('building GPT2 model ...')
model = GPT2Model(num_tokentypes=0, parallel_output=True)
return model
def get_batch(data_iterator):
"""Generate a batch"""
args = get_args()
# Items and their type.
keys = ['text']
datatype = torch.int64
# Broadcast data.
if data_iterator is not None:
data = next(data_iterator)
else:
data = None
data_b = mpu.broadcast_data(keys, data, datatype)
# Unpack.
...@@ -85,13 +76,14 @@ def get_batch(data_iterator, args, timers):
return tokens, labels, loss_mask, attention_mask, position_ids
def forward_step(data_iterator, model):
"""Forward step."""
timers = get_timers()
# Get the batch.
timers('batch generator').start()
tokens, labels, loss_mask, attention_mask, position_ids = get_batch(
data_iterator)
timers('batch generator').stop()
# Forward model.
...@@ -107,60 +99,74 @@ def forward_step(data_iterator, model, args, timers):
return loss, {'lm loss': reduced_loss[0]}
def make_gpt2_dataloaders():
"""Build gpt2 dataloaders."""
args = get_args()
# Input parameters.
input_data_sizes_file = args.input_data_sizes_file
seq_length = args.seq_length
initial_seed = args.seed
# Build the datasets.
def _build_dataset(name):
return GPT2Dataset(os.path.join(args.data_path, name),
args.input_data_sizes_file,
args.seq_length, args.seed)
train_ds = _build_dataset('train')
valid_ds = _build_dataset('valid')
test_ds = _build_dataset('test')
# Dataloaders
train = make_data_loader(train_ds)
valid = make_data_loader(valid_ds)
test = make_data_loader(test_ds)
args.do_train = False
args.do_valid = False
args.do_test = False
if train is not None:
args.do_train = True
if valid is not None:
args.do_valid = True
if test is not None:
args.do_test = True
return (train, valid, test)
def get_train_val_test_data():
"""Load the data on rank zero and boradcast number of tokens to all GPUS.""" """Load the data on rank zero and boradcast number of tokens to all GPUS."""
args = get_args()
(train_data, val_data, test_data) = (None, None, None) (train_data, val_data, test_data) = (None, None, None)
# Data loader only on rank 0 of each model parallel group. # Data loader only on rank 0 of each model parallel group.
if mpu.get_model_parallel_rank() == 0: if mpu.get_model_parallel_rank() == 0:
if args.data_loader == 'numpy':
assert len(args.train_data) == 1 (train_data, val_data, test_data) = make_gpt2_dataloaders()
args.train_data = args.train_data[0] flags = torch.cuda.LongTensor([int(args.do_train),
assert len(args.valid_data) == 1 int(args.do_valid),
args.valid_data = args.valid_data[0] int(args.do_test)])
assert len(args.test_data) == 1
args.test_data = args.test_data[0]
(train_data, val_data, test_data), num_tokens, \
eod_token = make_gpt2_dataloaders(args)
elif args.data_loader == 'raw' or args.data_loader == 'lazy':
data_config = configure_data()
data_config.set_defaults(data_set_type='GPT2', transpose=False)
(train_data, val_data, test_data), tokenizer = data_config.apply(
args)
num_tokens = tokenizer.num_tokens
eod_token = tokenizer.get_command('eos').Id
assert eod_token == tokenizer.get_command('pad').Id
else:
print("Unsupported data loader for GPT2.")
exit(1)
# pad.
num_tokens = vocab_size_with_padding(num_tokens, args)
print_rank_0('> found end-of-document token: {}'.format(eod_token))
token_counts = torch.cuda.LongTensor([num_tokens, eod_token,
int(args.do_train),
int(args.do_valid),
int(args.do_test)])
else:
flags = torch.cuda.LongTensor([0, 0, 0])
# Broadcast num tokens.
torch.distributed.broadcast(flags,
mpu.get_model_parallel_src_rank(),
group=mpu.get_model_parallel_group())
args.do_train = flags[0].item()
args.do_valid = flags[1].item()
args.do_test = flags[2].item()
tokenizer = get_tokenizer()
args.eod_token = tokenizer.eod_id
return train_data, val_data, test_data
if __name__ == "__main__":
pretrain(get_train_val_test_data, model_provider, forward_step,
args_defaults={'tokenizer_type': 'GPT2BPETokenizer'})
import os
import os.path as osp
import pathlib
import subprocess
def recursively_lint_files():
"""Recursively lint all python files in chosen subdirectories of megatron-lm"""
try:
import autopep8
except ModuleNotFoundError:
print("Please first install autopep8 via `pip install autopep8`")
return
# get all python file paths from top level directory
file_dir = str(pathlib.Path(__file__).parent.absolute())
working_dir = osp.join(file_dir, os.pardir)
all_py_paths = set(os.path.join(working_dir, fname)
for fname in os.listdir(working_dir) if fname.endswith('.py'))
# get all python file paths from chosen subdirectories
check_dirs = ['docker', 'megatron', 'openwebtext', 'scripts', 'tasks']
for sub_dir in check_dirs:
for path, _, fnames in os.walk(osp.join(working_dir, sub_dir)):
all_py_paths.update(set(osp.join(path, fname) for fname in fnames if fname.endswith('.py')))
print("Linting the following: ")
for py_path in all_py_paths:
print(py_path)
command = 'autopep8 --max-line-length 100 --aggressive --in-place {}'.format(py_path)
# Pass an argument list since check_call is not invoked with shell=True.
subprocess.check_call(command.split())
if __name__ == "__main__":
recursively_lint_files()
...@@ -20,26 +20,28 @@ import time
import torch
from megatron import get_args
from megatron import mpu
from megatron import print_rank_0
from tasks.finetune_utils import build_data_loader
from tasks.finetune_utils import process_batch
def accuracy_func_provider(single_dataset_provider):
"""Provide function that calculates accuracies."""
args = get_args()
# Build dataloaders.
datapaths = args.valid_data
dataloaders = []
for datapath in datapaths:
dataset = single_dataset_provider(datapath)
dataloader = build_data_loader(
dataset, args.batch_size, num_workers=args.num_workers,
drop_last=(mpu.get_data_parallel_world_size() > 1))
dataloaders.append((dataset.dataset_name, dataloader))
def metrics_func(model, epoch, output_predictions=False):
print_rank_0('calculating metrics ...')
correct = 0
total = 0
...@@ -48,7 +50,7 @@ def accuracy_func_provider(args, single_dataset_provider):
named_predictions = []
names = 'predictions'
for name, dataloader in dataloaders:
output = calculate_correct_answers(name, model, dataloader,
epoch, output_predictions)
if not output_predictions:
correct_ans, total_count = output
...@@ -70,7 +72,7 @@ def accuracy_func_provider(args, single_dataset_provider):
return metrics_func
def calculate_correct_answers(name, model, dataloader,
epoch, output_predictions):
"""Calculate correct over total answers and return prediction if the
`output_predictions` is true."""
...@@ -89,7 +91,7 @@ def calculate_correct_answers(name, model, dataloader, args,
ids = []
for _, batch in enumerate(dataloader):
# Run the model forward.
tokens, types, labels_, attention_mask = process_batch(batch)
logits = model(tokens, attention_mask, types)
# Add output predictions.
if output_predictions:
...