Merge branch 'mo_albert_data_loader' into 'master'

New data loader. See merge request ADLR/megatron-lm!16

Merge branch 'mo_albert_data_loader' into 'master'
New data loader. See merge request ADLR/megatron-lm!16
2d76d065 · Jared Casper · 1b290993 · f86bb671 · 2d76d065 · 2d76d065
Commit 2d76d065 authored Jan 06, 2020 by Jared Casper
20 changed files
--- a/arguments.py
+++ b/arguments.py
@@ -267,23 +267,52 @@ def add_data_args(parser):
    group.add_argument('--shuffle', action='store_true',
                       help='Shuffle data. Shuffling is deterministic '
                       'based on seed and current epoch.')
+    group.add_argument('--data-loader', type=str, default=None,
+                       choices=['raw', 'lazy', 'tfrecords', 'numpy', 'binary'],
+                       help='Which data loader to use. Default varies by model.')
    group.add_argument('--train-data', nargs='+', default=None,
-                       help='Whitespace separated filenames or corpora names '
+                       help='Whitespace separated paths or corpora names '
                       'for training.')
+    group.add_argument('--valid-data', nargs='*', default=None,
+                       help='path(s) to the validation data.')
+    group.add_argument('--test-data', nargs='*', default=None,
+                       help='path(s) to the testing data.')
+    group.add_argument('--data-path', nargs='+', default=None,
+                       help='path to combined dataset to split')
+    group.add_argument('--split', default='1000,1,1',
+                       help='comma-separated list of proportions for training,'
+                       ' validation, and test split')
-    group.add_argument('--use-npy-data-loader', action='store_true',
+    group.add_argument('--seq-length', type=int, default=512,
-                       help='Use the numpy data loader. If set, then'
+                       help="Maximum sequence length to process")
-                       'train-data-path, val-data-path, and test-data-path'
+    group.add_argument('--max-preds-per-seq', type=int, default=None,
-                       'should also be provided.')
+                       help='Maximum number of predictions to use per sequence.'
-    group.add_argument('--train-data-path', type=str, default='',
+                       'Defaults to math.ceil(`--seq-length`*.15/10)*10.'
-                       help='path to the training data')
+                       'MUST BE SPECIFIED IF `--data-loader tfrecords`.')
-    group.add_argument('--val-data-path', type=str, default='',
-                       help='path to the validation data')
+    # arguments for binary data loader
-    group.add_argument('--test-data-path', type=str, default='',
+    parser.add_argument('--vocab', type=str, default='vocab.txt',
-                       help='path to the test data')
+                        help='path to vocab file')
+    parser.add_argument('--data-impl', type=str, default='infer',
+                        help='implementation of indexed datasets',
+                        choices=['lazy', 'cached', 'mmap', 'infer'])
+    parser.add_argument('--max-num-samples', type=int, default=None,
+                        help='Maximum number of samples to plan for, defaults to total iters * batch-size.')
+    parser.add_argument('--data-epochs', type=int, default=None,
+                        help='Number of epochs to plan for, defaults to using --max-num-samples')
+    parser.add_argument('--mask-prob', default=0.15, type=float,
+                        help='probability of replacing a token with mask')
+    parser.add_argument('--short-seq-prob', default=0.1, type=float,
+                        help='probability of producing a short sequence')
+    parser.add_argument('--skip-mmap-warmup', action='store_true',
+                        help='skip warming up mmap files')
+    # arguments for numpy data loader
    group.add_argument('--input-data-sizes-file', type=str, default='sizes.txt',
-                       help='the filename containing all the shards sizes')
+                       help='the filename containing all the shards sizes for numpy data loader')
+    # arguments for raw/tfrecords data loader
    group.add_argument('--delim', default=',',
                       help='delimiter used to parse csv data files')
    group.add_argument('--text-key', default='sentence',
@@ -291,16 +320,6 @@ def add_data_args(parser):
    group.add_argument('--eval-text-key', default=None,
                       help='key to use to extract text from '
                       'json/csv evaluation datasets')
-    group.add_argument('--valid-data', nargs='*', default=None,
-                       help="""Filename for validation data.""")
-    group.add_argument('--split', default='1000,1,1',
-                       help='comma-separated list of proportions for training,'
-                       ' validation, and test split')
-    group.add_argument('--test-data', nargs='*', default=None,
-                       help="""Filename for testing""")
-    group.add_argument('--lazy-loader', action='store_true',
-                       help='whether to lazy read the data set')
    group.add_argument('--loose-json', action='store_true',
                       help='Use loose json (one json-formatted string per '
                       'newline), instead of tight json (data file is one '
@@ -308,6 +327,7 @@ def add_data_args(parser):
    group.add_argument('--presplit-sentences', action='store_true',
                       help='Dataset content consists of documents where '
                       'each document consists of newline separated sentences')
    group.add_argument('--num-workers', type=int, default=2,
                       help="""Number of workers to use for dataloading""")
    group.add_argument('--tokenizer-model-type', type=str,
@@ -328,16 +348,6 @@ def add_data_args(parser):
                       help='what type of tokenizer to use')
    group.add_argument("--cache-dir", default=None, type=str,
                       help="Where to store pre-trained BERT downloads")
-    group.add_argument('--use-tfrecords', action='store_true',
-                       help='load `--train-data`, `--valid-data`, '
-                       '`--test-data` from BERT tf records instead of '
-                       'normal data pipeline')
-    group.add_argument('--seq-length', type=int, default=512,
-                       help="Maximum sequence length to process")
-    group.add_argument('--max-preds-per-seq', type=int, default=None,
-                       help='Maximum number of predictions to use per sequence.'
-                       'Defaults to math.ceil(`--seq-length`*.15/10)*10.'
-                       'MUST BE SPECIFIED IF `--use-tfrecords` is True.')
    return parser
@@ -355,7 +365,7 @@ def get_args():
    args = parser.parse_args()
-    if not args.train_data and not args.train_data_path:
+    if not args.train_data and not args.data_path:
        print('WARNING: No training data specified')
    args.cuda = torch.cuda.is_available()

--- a/configure_data.py
+++ b/configure_data.py
@@ -116,7 +116,7 @@ def make_tfrecord_loaders(args):
 def make_loaders(args):
    """makes training/val/test"""
-    if args.use_tfrecords:
+    if args.data_loader == 'tfrecords':
        return make_tfrecord_loaders(args)
    world_size = torch.distributed.get_world_size(
        group=mpu.get_data_parallel_group())
@@ -131,10 +131,12 @@ def make_loaders(args):
    if eval_seq_length is not None and eval_seq_length < 0:
        eval_seq_length = eval_seq_length * world_size
    split = get_split(args)
+    if args.data_path is not None:
+        args.train_data = args.data_path
    data_set_args = {
        'path': args.train_data,
        'seq_length': seq_length,
-        'lazy': args.lazy_loader,
+        'lazy': args.data_loader == 'lazy',
        'delim': args.delim,
        'text_key': args.text_key,
        'label_key': 'label',

--- a/gpt2_data_loader.py
+++ b/gpt2_data_loader.py
@@ -56,9 +56,9 @@ def make_gpt2_dataloaders(args):
                                           num_workers=num_workers,
                                           pin_memory=True)
-    train = make_data_loader_(args.train_data_path)
+    train = make_data_loader_(args.train_data)
-    valid = make_data_loader_(args.val_data_path)
+    valid = make_data_loader_(args.valid_data)
-    test = make_data_loader_(args.test_data_path)
+    test = make_data_loader_(args.test_data)
    args.do_train = False
    args.do_valid = False

--- a/megatron/data/Makefile
+++ b/megatron/data/Makefile
+CXXFLAGS += -O3 -Wall -shared -std=c++11 -fPIC -fdiagnostics-color
+CPPFLAGS += $(shell python3 -m pybind11 --includes)
+LIBNAME = helpers
+LIBEXT = $(shell python3-config --extension-suffix)
+default: $(LIBNAME)$(LIBEXT)
+%$(LIBEXT): %.cpp
+	$(CXX) $(CXXFLAGS) $(CPPFLAGS) $< -o $@
--- a/megatron/data/__init__.py
+++ b/megatron/data/__init__.py
+from . import indexed_dataset
+from .bert_tokenization import FullTokenizer as FullBertTokenizer
+from .albert_dataset import AlbertDataset
--- a/megatron/data/albert_dataset.py
+++ b/megatron/data/albert_dataset.py
+# coding=utf-8
+# Copyright (c) 2019, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""ALBERT Style dataset."""
+import os
+import time
+import numpy as np
+import torch
+from torch.utils.data import Dataset
+from megatron import mpu
+from megatron.data import helpers
+from megatron.data import FullBertTokenizer
+from megatron.data.dataset_utils import build_training_sample
+from megatron.data.indexed_dataset import make_dataset as make_indexed_dataset
+from megatron.utils import print_rank_0
+def build_train_valid_test_datasets(vocab_file, data_prefix, data_impl,
+                                    splits_string, train_valid_test_num_samples,
+                                    max_seq_length, masked_lm_prob,
+                                    short_seq_prob, seed, skip_warmup):
+    # Tokenizer is the same
+    tokenizer = FullBertTokenizer(vocab_file, do_lower_case=True)
+    print_rank_0(' > using full BERT tokenizer with vocabulary size: {}'.format(
+        tokenizer.vocab_size()))
+    # Indexed dataset.
+    indexed_dataset = get_indexed_dataset_(data_prefix,
+                                           data_impl,
+                                           skip_warmup)
+    # Get start and end indices of train/valid/test into doc-idx.
+    # Note that doc-idx is designed to be num-docs + 1 so we can
+    # easily iterate over it.
+    total_num_of_documents = indexed_dataset.doc_idx.shape[0] - 1
+    splits = get_train_valid_test_split_(splits_string, total_num_of_documents)
+    # Print stats about the splits.
+    print_rank_0(' > dataset split:')
+    def print_split_stats(name, index):
+        print_rank_0('    {}:'.format(name))
+        print_rank_0('     document indices in [{}, {}) total of {} '
+                     'documents'.format(splits[index], splits[index + 1],
+                                        splits[index + 1] - splits[index]))
+        start_index = indexed_dataset.doc_idx[splits[index]]
+        end_index = indexed_dataset.doc_idx[splits[index + 1]]
+        print_rank_0('     sentence indices in [{}, {}) total of {} '
+                     'sentences'.format(start_index, end_index,
+                                        end_index - start_index))
+    print_split_stats('train', 0)
+    print_split_stats('validation', 1)
+    print_split_stats('test', 2)
+    def build_dataset(index, name):
+        dataset = None
+        if splits[index + 1] > splits[index]:
+            # Get the pointer to the original doc-idx so we can set it later.
+            doc_idx_ptr = indexed_dataset.get_doc_idx()
+            # Slice the doc-idx
+            start_index = splits[index]
+            # Add +1 so we can index into the dataset to get the upper bound.
+            end_index = splits[index + 1] + 1
+            # New doc_idx view.
+            indexed_dataset.set_doc_idx(doc_idx_ptr[start_index:end_index])
+            # Build the dataset accordingly.
+            dataset = AlbertDataset(
+                name=name,
+                indexed_dataset=indexed_dataset,
+                tokenizer=tokenizer,
+                data_prefix=data_prefix,
+                num_epochs=None,
+                max_num_samples=train_valid_test_num_samples[index],
+                masked_lm_prob=masked_lm_prob,
+                max_seq_length=max_seq_length,
+                short_seq_prob=short_seq_prob,
+                seed=seed)
+            # Set the original pointer so dataset remains the main dataset.
+            indexed_dataset.set_doc_idx(doc_idx_ptr)
+            # Checks.
+            assert indexed_dataset.doc_idx[0] == 0
+            assert indexed_dataset.doc_idx.shape[0] == \
+                (total_num_of_documents + 1)
+        return dataset
+    train_dataset = build_dataset(0, 'train')
+    valid_dataset = build_dataset(1, 'valid')
+    test_dataset = build_dataset(2, 'test')
+    return (train_dataset, valid_dataset, test_dataset)
+class AlbertDataset(Dataset):
+    def __init__(self, name, indexed_dataset, tokenizer, data_prefix,
+                 num_epochs, max_num_samples, masked_lm_prob,
+                 max_seq_length, short_seq_prob, seed):
+        # Params to store.
+        self.name = name
+        self.seed = seed
+        self.masked_lm_prob = masked_lm_prob
+        self.max_seq_length = max_seq_length
+        # Tokenizer and dataset.
+        self.tokenizer = tokenizer
+        self.indexed_dataset = indexed_dataset
+        # Build the samples mapping.
+        self.samples_mapping = get_samples_mapping_(self.indexed_dataset,
+                                                    data_prefix,
+                                                    num_epochs,
+                                                    max_num_samples,
+                                                    self.max_seq_length,
+                                                    short_seq_prob,
+                                                    self.seed,
+                                                    self.name)
+        # Vocab stuff.
+        self.vocab_id_list = list(self.tokenizer.inv_vocab.keys())
+        self.vocab_id_to_token_dict = self.tokenizer.inv_vocab
+        self.cls_id = self.tokenizer.vocab['[CLS]']
+        self.sep_id = self.tokenizer.vocab['[SEP]']
+        self.mask_id = self.tokenizer.vocab['[MASK]']
+        self.pad_id = self.tokenizer.vocab['[PAD]']
+    def num_tokens(self):
+        return self.tokenizer.vocab_size()
+    def __len__(self):
+        return self.samples_mapping.shape[0]
+    def __getitem__(self, idx):
+        start_index, end_index, seq_length = self.samples_mapping[idx]
+        sample = []
+        for index in range(start_index, end_index):
+            sample.append(self.indexed_dataset[index])
+        # Note that this rng state should be numpy and not python since
+        # python randint is inclusive whereas the numpy one is exclusive.
+        np_rng = np.random.RandomState(seed=(self.seed + idx))
+        return build_training_sample(sample, seq_length,
+                                     self.max_seq_length, # needed for padding
+                                     self.vocab_id_list,
+                                     self.vocab_id_to_token_dict,
+                                     self.cls_id, self.sep_id,
+                                     self.mask_id, self.pad_id,
+                                     self.masked_lm_prob, np_rng)
+def get_indexed_dataset_(data_prefix, data_impl, skip_warmup):
+    print_rank_0(' > building dataset index ...')
+    start_time = time.time()
+    indexed_dataset = make_indexed_dataset(data_prefix,
+                                           data_impl,
+                                           skip_warmup)
+    assert indexed_dataset.sizes.shape[0] == indexed_dataset.doc_idx[-1]
+    print_rank_0(' > finished creating indexed dataset in {:4f} '
+                 'seconds'.format(time.time() - start_time))
+    print_rank_0(' > indexed dataset stats:')
+    print_rank_0('    number of documents: {}'.format(
+        indexed_dataset.doc_idx.shape[0] - 1))
+    print_rank_0('    number of sentences: {}'.format(
+        indexed_dataset.sizes.shape[0]))
+    return indexed_dataset
+def get_train_valid_test_split_(splits_string, size):
+    """ Get dataset splits from comma or '/' separated string list."""
+    splits = []
+    if splits_string.find(',') != -1:
+        splits = [float(s) for s in splits_string.split(',')]
+    elif splits_string.find('/') != -1:
+        splits = [float(s) for s in splits_string.split('/')]
+    else:
+        splits = [float(splits_string)]
+    while len(splits) < 3:
+        splits.append(0.)
+    splits = splits[:3]
+    splits_sum = sum(splits)
+    assert splits_sum > 0.0
+    splits = [split/splits_sum for split in splits]
+    splits_index = [0]
+    for index, split in enumerate(splits):
+        splits_index.append(splits_index[index] +
+                            int(round(split * float(size))))
+    diff = splits_index[-1] - size
+    for index in range(1, len(splits_index)):
+        splits_index[index] -= diff
+    assert len(splits_index) == 4
+    assert splits_index[-1] == size
+    return splits_index
+def get_samples_mapping_(indexed_dataset,
+                         data_prefix,
+                         num_epochs,
+                         max_num_samples,
+                         max_seq_length,
+                         short_seq_prob,
+                         seed,
+                         name):
+    if not num_epochs:
+        if not max_num_samples:
+            raise ValueError("Need to specify either max_num_samples "
+                             "or num_epochs")
+        num_epochs = np.iinfo(np.int32).max - 1
+    if not max_num_samples:
+        max_num_samples = np.iinfo(np.int64).max - 1
+    # Filename of the index mapping
+    indexmap_filename = data_prefix
+    indexmap_filename += '_{}_indexmap'.format(name)
+    if num_epochs != (np.iinfo(np.int32).max - 1):
+        indexmap_filename += '_{}ep'.format(num_epochs)
+    if max_num_samples != (np.iinfo(np.int64).max - 1):
+        indexmap_filename += '_{}mns'.format(max_num_samples)
+    indexmap_filename += '_{}msl'.format(max_seq_length)
+    indexmap_filename += '_{:0.2f}ssp'.format(short_seq_prob)
+    indexmap_filename += '_{}s'.format(seed)
+    indexmap_filename += '.npy'
+    # Build the indexed mapping if not exist.
+    if torch.distributed.get_rank() == 0 and \
+       not os.path.isfile(indexmap_filename):
+        print(' > WARNING: could not find index map file {}, building '
+              'the indices on rank 0 ...'.format(indexmap_filename))
+        # Make sure the types match the helpers input types.
+        assert indexed_dataset.doc_idx.dtype == np.int64
+        assert indexed_dataset.sizes.dtype == np.int32
+        # Build samples mapping
+        verbose = torch.distributed.get_rank() == 0
+        start_time = time.time()
+        print_rank_0(' > building sapmles index mapping for {} ...'.format(
+            name))
+        samples_mapping = helpers.build_mapping(
+            indexed_dataset.doc_idx,
+            indexed_dataset.sizes,
+            num_epochs,
+            max_num_samples,
+            max_seq_length-3, # account for added tokens
+            short_seq_prob,
+            seed,
+            verbose)
+        print_rank_0(' > done building sapmles index maping')
+        np.save(indexmap_filename, samples_mapping, allow_pickle=True)
+        print_rank_0(' > saved the index mapping in {}'.format(
+            indexmap_filename))
+        # Make sure all the ranks have built the mapping
+        print_rank_0(' > elasped time to build and save samples mapping '
+                     '(seconds): {:4f}'.format(
+                         time.time() - start_time))
+    # This should be a barrier, but the nccl barrier assumes
+    # device_index=rank, which is not the case in the
+    # model-parallel case, so an all_reduce is used instead.
+    counts = torch.cuda.LongTensor([1])
+    torch.distributed.all_reduce(counts, group=mpu.get_data_parallel_group())
+    assert counts[0].item() == torch.distributed.get_world_size(
+        group=mpu.get_data_parallel_group())
+    # Load indexed dataset.
+    print_rank_0(' > loading indexed mapping from {}'.format(
+        indexmap_filename))
+    start_time = time.time()
+    samples_mapping = np.load(indexmap_filename, allow_pickle=True)
+    print_rank_0('    loaded indexed file in {:3.3f} seconds'.format(
+        time.time() - start_time))
+    print_rank_0('    total number of samples: {}'.format(
+        samples_mapping.shape[0]))
+    return samples_mapping
--- a/megatron/data/bert_tokenization.py
+++ b/megatron/data/bert_tokenization.py
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tokenization classes."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import collections
+import re
+import unicodedata
+import six
+def validate_case_matches_checkpoint(do_lower_case, init_checkpoint):
+  """Checks whether the casing config is consistent with the checkpoint name."""
+  # The casing has to be passed in by the user and there is no explicit check
+  # as to whether it matches the checkpoint. The casing information probably
+  # should have been stored in the bert_config.json file, but it's not, so
+  # we have to heuristically detect it to validate.
+  if not init_checkpoint:
+    return
+  m = re.match("^.*?([A-Za-z0-9_-]+)/bert_model.ckpt", init_checkpoint)
+  if m is None:
+    return
+  model_name = m.group(1)
+  lower_models = [
+      "uncased_L-24_H-1024_A-16", "uncased_L-12_H-768_A-12",
+      "multilingual_L-12_H-768_A-12", "chinese_L-12_H-768_A-12"
+  ]
+  cased_models = [
+      "cased_L-12_H-768_A-12", "cased_L-24_H-1024_A-16",
+      "multi_cased_L-12_H-768_A-12"
+  ]
+  is_bad_config = False
+  if model_name in lower_models and not do_lower_case:
+    is_bad_config = True
+    actual_flag = "False"
+    case_name = "lowercased"
+    opposite_flag = "True"
+  if model_name in cased_models and do_lower_case:
+    is_bad_config = True
+    actual_flag = "True"
+    case_name = "cased"
+    opposite_flag = "False"
+  if is_bad_config:
+    raise ValueError(
+        "You passed in `--do_lower_case=%s` with `--init_checkpoint=%s`. "
+        "However, `%s` seems to be a %s model, so you "
+        "should pass in `--do_lower_case=%s` so that the fine-tuning matches "
+        "how the model was pre-training. If this error is wrong, please "
+        "just comment out this check." % (actual_flag, init_checkpoint,
+                                          model_name, case_name, opposite_flag))
+def convert_to_unicode(text):
+  """Converts `text` to Unicode (if it's not already), assuming utf-8 input."""
+  if six.PY3:
+    if isinstance(text, str):
+      return text
+    elif isinstance(text, bytes):
+      return text.decode("utf-8", "ignore")
+    else:
+      raise ValueError("Unsupported string type: %s" % (type(text)))
+  elif six.PY2:
+    if isinstance(text, str):
+      return text.decode("utf-8", "ignore")
+    elif isinstance(text, unicode):
+      return text
+    else:
+      raise ValueError("Unsupported string type: %s" % (type(text)))
+  else:
+    raise ValueError("Not running on Python2 or Python 3?")
+def printable_text(text):
+  """Returns text encoded in a way suitable for print or `tf.logging`."""
+  # These functions want `str` for both Python2 and Python3, but in one case
+  # it's a Unicode string and in the other it's a byte string.
+  if six.PY3:
+    if isinstance(text, str):
+      return text
+    elif isinstance(text, bytes):
+      return text.decode("utf-8", "ignore")
+    else:
+      raise ValueError("Unsupported string type: %s" % (type(text)))
+  elif six.PY2:
+    if isinstance(text, str):
+      return text
+    elif isinstance(text, unicode):
+      return text.encode("utf-8")
+    else:
+      raise ValueError("Unsupported string type: %s" % (type(text)))
+  else:
+    raise ValueError("Not running on Python2 or Python 3?")
+def load_vocab(vocab_file):
+  """Loads a vocabulary file into a dictionary."""
+  vocab = collections.OrderedDict()
+  index = 0
+  with open(vocab_file, "r") as reader:
+    while True:
+      token = convert_to_unicode(reader.readline())
+      if not token:
+        break
+      token = token.strip()
+      vocab[token] = index
+      index += 1
+  return vocab
+def convert_by_vocab(vocab, items):
+  """Converts a sequence of [tokens|ids] using the vocab."""
+  output = []
+  for item in items:
+    output.append(vocab[item])
+  return output
+def convert_tokens_to_ids(vocab, tokens):
+  return convert_by_vocab(vocab, tokens)
+def convert_ids_to_tokens(inv_vocab, ids):
+  return convert_by_vocab(inv_vocab, ids)
+def whitespace_tokenize(text):
+  """Runs basic whitespace cleaning and splitting on a piece of text."""
+  text = text.strip()
+  if not text:
+    return []
+  tokens = text.split()
+  return tokens
+class FullTokenizer(object):
+  """Runs end-to-end tokenization."""
+  def __init__(self, vocab_file, do_lower_case=True):
+    self.vocab = load_vocab(vocab_file)
+    self.inv_vocab = {v: k for k, v in self.vocab.items()}
+    self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
+    self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)
+  def tokenize(self, text):
+    split_tokens = []
+    for token in self.basic_tokenizer.tokenize(text):
+      for sub_token in self.wordpiece_tokenizer.tokenize(token):
+        split_tokens.append(sub_token)
+    return split_tokens
+  def convert_tokens_to_ids(self, tokens):
+    return convert_by_vocab(self.vocab, tokens)
+  def convert_ids_to_tokens(self, ids):
+    return convert_by_vocab(self.inv_vocab, ids)
+  def vocab_size(self):
+    return len(self.vocab)
+class BasicTokenizer(object):
+  """Runs basic tokenization (punctuation splitting, lower casing, etc.)."""
+  def __init__(self, do_lower_case=True):
+    """Constructs a BasicTokenizer.
+    Args:
+      do_lower_case: Whether to lower case the input.
+    """
+    self.do_lower_case = do_lower_case
+  def tokenize(self, text):
+    """Tokenizes a piece of text."""
+    text = convert_to_unicode(text)
+    text = self._clean_text(text)
+    # This was added on November 1st, 2018 for the multilingual and Chinese
+    # models. This is also applied to the English models now, but it doesn't
+    # matter since the English models were not trained on any Chinese data
+    # and generally don't have any Chinese data in them (there are Chinese
+    # characters in the vocabulary because Wikipedia does have some Chinese
+    # words in the English Wikipedia.).
+    text = self._tokenize_chinese_chars(text)
+    orig_tokens = whitespace_tokenize(text)
+    split_tokens = []
+    for token in orig_tokens:
+      if self.do_lower_case:
+        token = token.lower()
+        token = self._run_strip_accents(token)
+      split_tokens.extend(self._run_split_on_punc(token))
+    output_tokens = whitespace_tokenize(" ".join(split_tokens))
+    return output_tokens
+  def _run_strip_accents(self, text):
+    """Strips accents from a piece of text."""
+    text = unicodedata.normalize("NFD", text)
+    output = []
+    for char in text:
+      cat = unicodedata.category(char)
+      if cat == "Mn":
+        continue
+      output.append(char)
+    return "".join(output)
+  def _run_split_on_punc(self, text):
+    """Splits punctuation on a piece of text."""
+    chars = list(text)
+    i = 0
+    start_new_word = True
+    output = []
+    while i < len(chars):
+      char = chars[i]
+      if _is_punctuation(char):
+        output.append([char])
+        start_new_word = True
+      else:
+        if start_new_word:
+          output.append([])
+        start_new_word = False
+        output[-1].append(char)
+      i += 1
+    return ["".join(x) for x in output]
+  def _tokenize_chinese_chars(self, text):
+    """Adds whitespace around any CJK character."""
+    output = []
+    for char in text:
+      cp = ord(char)
+      if self._is_chinese_char(cp):
+        output.append(" ")
+        output.append(char)
+        output.append(" ")
+      else:
+        output.append(char)
+    return "".join(output)
+  def _is_chinese_char(self, cp):
+    """Checks whether CP is the codepoint of a CJK character."""
+    # This defines a "chinese character" as anything in the CJK Unicode block:
+    #   https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
+    #
+    # Note that the CJK Unicode block is NOT all Japanese and Korean characters,
+    # despite its name. The modern Korean Hangul alphabet is a different block,
+    # as is Japanese Hiragana and Katakana. Those alphabets are used to write
+    # space-separated words, so they are not treated specially and handled
+    # like all of the other languages.
+    if ((cp >= 0x4E00 and cp <= 0x9FFF) or  #
+        (cp >= 0x3400 and cp <= 0x4DBF) or  #
+        (cp >= 0x20000 and cp <= 0x2A6DF) or  #
+        (cp >= 0x2A700 and cp <= 0x2B73F) or  #
+        (cp >= 0x2B740 and cp <= 0x2B81F) or  #
+        (cp >= 0x2B820 and cp <= 0x2CEAF) or
+        (cp >= 0xF900 and cp <= 0xFAFF) or  #
+        (cp >= 0x2F800 and cp <= 0x2FA1F)):  #
+      return True
+    return False
+  def _clean_text(self, text):
+    """Performs invalid character removal and whitespace cleanup on text."""
+    output = []
+    for char in text:
+      cp = ord(char)
+      if cp == 0 or cp == 0xfffd or _is_control(char):
+        continue
+      if _is_whitespace(char):
+        output.append(" ")
+      else:
+        output.append(char)
+    return "".join(output)
+class WordpieceTokenizer(object):
+  """Runs WordPiece tokenization."""
+  def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=200):
+    self.vocab = vocab
+    self.unk_token = unk_token
+    self.max_input_chars_per_word = max_input_chars_per_word
+  def tokenize(self, text):
+    """Tokenizes a piece of text into its word pieces.
+    This uses a greedy longest-match-first algorithm to perform tokenization
+    using the given vocabulary.
+    For example:
+      input = "unaffable"
+      output = ["un", "##aff", "##able"]
+    Args:
+      text: A single token or whitespace separated tokens. This should have
+        already been passed through `BasicTokenizer`.
+    Returns:
+      A list of wordpiece tokens.
+    """
+    text = convert_to_unicode(text)
+    output_tokens = []
+    for token in whitespace_tokenize(text):
+      chars = list(token)
+      if len(chars) > self.max_input_chars_per_word:
+        output_tokens.append(self.unk_token)
+        continue
+      is_bad = False
+      start = 0
+      sub_tokens = []
+      while start < len(chars):
+        end = len(chars)
+        cur_substr = None
+        while start < end:
+          substr = "".join(chars[start:end])
+          if start > 0:
+            substr = "##" + substr
+          if substr in self.vocab:
+            cur_substr = substr
+            break
+          end -= 1
+        if cur_substr is None:
+          is_bad = True
+          break
+        sub_tokens.append(cur_substr)
+        start = end
+      if is_bad:
+        output_tokens.append(self.unk_token)
+      else:
+        output_tokens.extend(sub_tokens)
+    return output_tokens
+def _is_whitespace(char):
+  """Checks whether `char` is a whitespace character."""
+  # \t, \n, and \r are technically control characters but we treat them
+  # as whitespace since they are generally considered as such.
+  if char in (" ", "\t", "\n", "\r"):
+    return True
+  # Beyond those, only the Unicode "space separator" category counts.
+  return unicodedata.category(char) == "Zs"
+def _is_control(char):
+  """Checks whether `char` is a control character."""
+  # Tab, newline and carriage return are technically control characters,
+  # but they are counted as whitespace instead.
+  if char in ("\t", "\n", "\r"):
+    return False
+  # Only the Unicode categories "Cc" and "Cf" are treated as control.
+  return unicodedata.category(char) in ("Cc", "Cf")
+def _is_punctuation(char):
+  """Checks whether `char` is a punctuation character."""
+  cp = ord(char)
+  # Every ASCII character that is neither a letter nor a digit is treated
+  # as punctuation. Characters such as "^", "$", and "`" are not in the
+  # Unicode Punctuation class, but they are included for consistency.
+  is_ascii_symbol = (33 <= cp <= 47 or 58 <= cp <= 64 or
+                     91 <= cp <= 96 or 123 <= cp <= 126)
+  if is_ascii_symbol:
+    return True
+  # Otherwise defer to the Unicode general category (all "P*" classes).
+  return unicodedata.category(char).startswith("P")
--- a/megatron/data/dataset_utils.py
+++ b/megatron/data/dataset_utils.py
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors, and NVIDIA.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import collections
+import numpy as np
+def build_training_sample(sample,
+                          target_seq_length, max_seq_length,
+                          vocab_id_list, vocab_id_to_token_dict,
+                          cls_id, sep_id, mask_id, pad_id,
+                          masked_lm_prob, np_rng):
+    """Build a training sample.
+    The sentences are split into segments A and B, truncated, joined with
+    [CLS]/[SEP], masked, and finally padded to `max_seq_length`.
+    Arguments:
+        sample: A list of sentences in which each sentence is a list of
+            token ids.
+        target_seq_length: Desired sequence length; used as the token budget
+            for the two segments together.
+        max_seq_length: Maximum length of the sequence. All values are padded
+            to this length.
+        vocab_id_list: List of vocabulary ids. Used to pick a random id.
+        vocab_id_to_token_dict: A dictionary from vocab ids to text tokens.
+        cls_id: Start of example id.
+        sep_id: Separator id.
+        mask_id: Mask token id.
+        pad_id: Padding token id.
+        masked_lm_prob: Probability to mask tokens.
+        np_rng: Random number generator. Note that this rng state should be
+              numpy and not python since python randint is inclusive for
+              the upper bound whereas the numpy one is exclusive.
+    Returns:
+        A dict with the keys 'text', 'types', 'labels', 'is_random',
+        'loss_mask', 'padding_mask' and 'truncated'.
+    NOTE: the truncation budget does not include the three special tokens
+    added afterwards, so the padding step asserts unless the truncated
+    segments plus three tokens fit into `max_seq_length`.
+    """
+    # Both segments must be non-empty, which needs at least two sentences.
+    assert len(sample) > 1
+    assert target_seq_length <= max_seq_length
+    # Split the sentences into segment A and segment B.
+    segment_a, segment_b, is_next_random = get_a_and_b_segments(sample, np_rng)
+    # Trim both segments in place down to the target token budget.
+    token_budget = target_seq_length
+    truncated = truncate_segments(segment_a, segment_b,
+                                  len(segment_a), len(segment_b),
+                                  token_budget, np_rng)
+    # Join the segments with [CLS]/[SEP] and derive the token types.
+    token_ids, token_type_ids = create_tokens_and_tokentypes(
+        segment_a, segment_b, cls_id, sep_id)
+    # Masking.
+    prediction_cap = masked_lm_prob * token_budget
+    token_ids, positions, labels, _ = create_masked_lm_predictions(
+        token_ids, vocab_id_list, vocab_id_to_token_dict, masked_lm_prob,
+        cls_id, sep_id, mask_id, prediction_cap, np_rng)
+    # Padding.
+    padded = pad_and_convert_to_numpy(token_ids, token_type_ids, positions,
+                                      labels, pad_id, max_seq_length)
+    tokens_np, tokentypes_np, labels_np, padding_mask_np, loss_mask_np = padded
+    return {
+        'text': tokens_np,
+        'types': tokentypes_np,
+        'labels': labels_np,
+        'is_random': int(is_next_random),
+        'loss_mask': loss_mask_np,
+        'padding_mask': padding_mask_np,
+        'truncated': int(truncated)}
+def get_a_and_b_segments(sample, np_rng):
+    """Divide the sentences of a sample into two token segments.
+    Arguments:
+        sample: A list of at least two sentences, each a list of token ids.
+        np_rng: Numpy random number generator; it supplies the split point
+            and the decision whether to swap the segments.
+    Returns:
+        (tokens_a, tokens_b, is_next_random). `is_next_random` is True when
+        the two segments were swapped.
+    """
+    sentence_count = len(sample)
+    # Both segments need at least one sentence.
+    assert sentence_count > 1, 'make sure each sample has at least two sentences.'
+    # With exactly two sentences the split point is forced. Otherwise it is
+    # drawn from [1, sentence_count): numpy's randint excludes the upper
+    # bound, so segment B always receives at least one sentence.
+    split = np_rng.randint(1, sentence_count) if sentence_count >= 3 else 1
+    tokens_a = [tok for j in range(split) for tok in sample[j]]
+    tokens_b = [tok for j in range(split, sentence_count) for tok in sample[j]]
+    # Swap the two segments half of the time.
+    is_next_random = bool(np_rng.random() < 0.5)
+    if is_next_random:
+        tokens_a, tokens_b = tokens_b, tokens_a
+    return tokens_a, tokens_b, is_next_random
+def truncate_segments(tokens_a, tokens_b, len_a, len_b, max_num_tokens, np_rng):
+    """Trim a pair of token lists in place until they fit a token budget.
+    Arguments:
+        tokens_a, tokens_b: Token lists; they are modified in place.
+        len_a, len_b: Current lengths of the two lists; both must be positive.
+        max_num_tokens: Maximum combined length allowed.
+        np_rng: Numpy random number generator used to pick which end of a
+            list loses a token.
+    Returns:
+        False if the pair already fit, True if any token was removed.
+    """
+    assert len_a > 0
+    assert len_b > 0
+    if len_a + len_b <= max_num_tokens:
+        return False
+    while len_a + len_b > max_num_tokens:
+        # Always shorten the currently longer segment (B on ties).
+        if len_a > len_b:
+            len_a -= 1
+            victim = tokens_a
+        else:
+            len_b -= 1
+            victim = tokens_b
+        # Drop from the front or from the back with equal probability.
+        drop_at = 0 if np_rng.random() < 0.5 else -1
+        del victim[drop_at]
+    return True
+def create_tokens_and_tokentypes(tokens_a, tokens_b, cls_id, sep_id):
+    """Merge segments A and B, add [CLS] and [SEP] and build tokentypes.
+    Returns:
+        (tokens, tokentypes), two lists of equal length laid out as
+        [CLS] A... [SEP] B... [SEP]. The token type is 0 for [CLS], segment A
+        and the first [SEP], and 1 for segment B and the final [SEP].
+    """
+    segment_a = list(tokens_a)
+    segment_b = list(tokens_b)
+    tokens = [cls_id] + segment_a + [sep_id] + segment_b + [sep_id]
+    # +2 covers [CLS] and the first [SEP]; +1 covers the closing [SEP].
+    tokentypes = [0] * (len(segment_a) + 2) + [1] * (len(segment_b) + 1)
+    return tokens, tokentypes
+# One masked position: `index` is the position in the token sequence and
+# `label` is the original token id stored for that position.
+MaskedLmInstance = collections.namedtuple("MaskedLmInstance", "index label")
+def is_start_piece(piece):
+  """Check if the current word piece is the starting piece (BERT)."""
+  # When a word has been split into WordPieces, only the first piece is
+  # unmarked; every subsequent piece carries a "##" prefix. A piece with
+  # that prefix therefore belongs to the previous set of word indexes.
+  is_continuation = piece.startswith("##")
+  return not is_continuation
+def create_masked_lm_predictions(tokens,
+                                 vocab_id_list, vocab_id_to_token_dict,
+                                 masked_lm_prob,
+                                 cls_id, sep_id, mask_id,
+                                 max_predictions_per_seq,
+                                 np_rng,
+                                 max_ngrams=3,
+                                 do_whole_word_mask=True,
+                                 favor_longer_ngram=False,
+                                 do_permutation=False):
+  """Creates the predictions for the masked LM objective.
+  Note: Tokens here are vocab ids and not text tokens.
+  Arguments:
+    tokens: Sequence of vocab ids, including the [CLS]/[SEP] ids.
+    vocab_id_list: List of vocab ids from which random replacements are drawn.
+    vocab_id_to_token_dict: Maps a vocab id to its text token; used to detect
+      "##" continuation pieces.
+    masked_lm_prob: Fraction of the tokens to predict. 0 disables masking.
+    cls_id, sep_id: Ids that are never masked.
+    mask_id: Id written in place of a masked token.
+    max_predictions_per_seq: Upper bound on the number of predictions.
+    np_rng: Numpy random state. All random decisions are drawn from it, so
+      the result is reproducible for a given state.
+    max_ngrams: Largest number of consecutive candidates masked together.
+    do_whole_word_mask: Group continuation pieces with the first piece of
+      their word so that a word is masked as a whole.
+    favor_longer_ngram: Reverse the n-gram length distribution, which by
+      default favors shorter n-grams.
+    do_permutation: Additionally select positions and permute their tokens.
+  Returns:
+    (output_tokens, masked_lm_positions, masked_lm_labels, token_boundary).
+    Positions and labels are sorted by position. token_boundary holds 1 for
+    [CLS], [SEP] and the start piece of each word, and 0 elsewhere.
+  """
+  cand_indexes = []
+  # Note(mingdachen): We create a list for recording if the piece is
+  # the starting piece of current token, where 1 means true, so that
+  # on-the-fly whole word masking is possible.
+  token_boundary = [0] * len(tokens)
+  for (i, token) in enumerate(tokens):
+    if token == cls_id or token == sep_id:
+      token_boundary[i] = 1
+      continue
+    # Whole Word Masking means that we mask all of the wordpieces
+    # corresponding to an original word.
+    #
+    # Note that Whole Word Masking does *not* change the training code
+    # at all -- we still predict each WordPiece independently, softmaxed
+    # over the entire vocabulary.
+    if (do_whole_word_mask and len(cand_indexes) >= 1 and
+        not is_start_piece(vocab_id_to_token_dict[token])):
+      cand_indexes[-1].append(i)
+    else:
+      cand_indexes.append([i])
+      if is_start_piece(vocab_id_to_token_dict[token]):
+        token_boundary[i] = 1
+  output_tokens = list(tokens)
+  masked_lm_positions = []
+  masked_lm_labels = []
+  if masked_lm_prob == 0:
+    return (output_tokens, masked_lm_positions,
+            masked_lm_labels, token_boundary)
+  num_to_predict = min(max_predictions_per_seq,
+                       max(1, int(round(len(tokens) * masked_lm_prob))))
+  # Note(mingdachen):
+  # By default, we set the probabilities to favor shorter ngram sequences.
+  ngrams = np.arange(1, max_ngrams + 1, dtype=np.int64)
+  pvals = 1. / np.arange(1, max_ngrams + 1)
+  pvals /= pvals.sum(keepdims=True)
+  if favor_longer_ngram:
+    pvals = pvals[::-1]
+  # ngram_indexes[idx][n - 1] holds the (up to) n consecutive candidates that
+  # start at candidate idx.
+  ngram_indexes = [[cand_indexes[idx:idx + n] for n in ngrams]
+                   for idx in range(len(cand_indexes))]
+  np_rng.shuffle(ngram_indexes)
+  masked_lms = []
+  covered_indexes = set()
+  for cand_index_set in ngram_indexes:
+    if len(masked_lms) >= num_to_predict:
+      break
+    if not cand_index_set:
+      continue
+    # The "skip if already covered" pre-check that used to sit here never
+    # skipped anything (its `continue` only affected the inner loop), so it
+    # was removed. Coverage is enforced below, after the n-gram is chosen.
+    n = np_rng.choice(ngrams[:len(cand_index_set)],
+                      p=pvals[:len(cand_index_set)] /
+                      pvals[:len(cand_index_set)].sum(keepdims=True))
+    index_set = sum(cand_index_set[n - 1], [])
+    n -= 1
+    # Note(mingdachen):
+    # Repeatedly looking for a candidate that does not exceed the
+    # maximum number of predictions by trying shorter ngrams.
+    while len(masked_lms) + len(index_set) > num_to_predict:
+      if n == 0:
+        break
+      index_set = sum(cand_index_set[n - 1], [])
+      n -= 1
+    # If adding a whole-word mask would exceed the maximum number of
+    # predictions, then just skip this candidate.
+    if len(masked_lms) + len(index_set) > num_to_predict:
+      continue
+    if any(index in covered_indexes for index in index_set):
+      continue
+    for index in index_set:
+      covered_indexes.add(index)
+      masked_token = None
+      # 80% of the time, replace with [MASK]
+      if np_rng.random() < 0.8:
+        masked_token = mask_id
+      else:
+        # 10% of the time, keep original
+        if np_rng.random() < 0.5:
+          masked_token = tokens[index]
+        # 10% of the time, replace with random word
+        else:
+          masked_token = vocab_id_list[np_rng.randint(0, len(vocab_id_list))]
+      output_tokens[index] = masked_token
+      masked_lms.append(MaskedLmInstance(index=index, label=tokens[index]))
+  assert len(masked_lms) <= num_to_predict
+  # This shuffle runs even without permutation; it is kept so that the state
+  # of np_rng after the call stays the same as before this change.
+  np_rng.shuffle(ngram_indexes)
+  select_indexes = set()
+  if do_permutation:
+    for cand_index_set in ngram_indexes:
+      if len(select_indexes) >= num_to_predict:
+        break
+      if not cand_index_set:
+        continue
+      # Draw from np_rng rather than the global numpy RNG, so that the
+      # permutation is reproducible from the caller's random state.
+      n = np_rng.choice(ngrams[:len(cand_index_set)],
+                        p=pvals[:len(cand_index_set)] /
+                        pvals[:len(cand_index_set)].sum(keepdims=True))
+      index_set = sum(cand_index_set[n - 1], [])
+      n -= 1
+      while len(select_indexes) + len(index_set) > num_to_predict:
+        if n == 0:
+          break
+        index_set = sum(cand_index_set[n - 1], [])
+        n -= 1
+      # If adding a whole-word mask would exceed the maximum number of
+      # predictions, then just skip this candidate.
+      if len(select_indexes) + len(index_set) > num_to_predict:
+        continue
+      # Skip n-grams that overlap a masked or already selected position.
+      if any(index in covered_indexes or index in select_indexes
+             for index in index_set):
+        continue
+      for index in index_set:
+        select_indexes.add(index)
+    assert len(select_indexes) <= num_to_predict
+    select_indexes = sorted(select_indexes)
+    permute_indexes = list(select_indexes)
+    np_rng.shuffle(permute_indexes)
+    orig_token = list(output_tokens)
+    for src_i, tgt_i in zip(select_indexes, permute_indexes):
+      output_tokens[src_i] = orig_token[tgt_i]
+      masked_lms.append(MaskedLmInstance(index=src_i, label=orig_token[src_i]))
+  masked_lms = sorted(masked_lms, key=lambda x: x.index)
+  for p in masked_lms:
+    masked_lm_positions.append(p.index)
+    masked_lm_labels.append(p.label)
+  return (output_tokens, masked_lm_positions, masked_lm_labels, token_boundary)
+def pad_and_convert_to_numpy(tokens, tokentypes, masked_positions,
+                             masked_labels, pad_id, max_seq_length):
+    """Pad sequences and convert them to numpy.
+    Arguments:
+        tokens: List of token ids.
+        tokentypes: List of token types, same length as `tokens`.
+        masked_positions: Positions in `tokens` that carry a label.
+        masked_labels: Label for each entry of `masked_positions`.
+        pad_id: Value appended to both tokens and token types as padding.
+        max_seq_length: Length of every returned array.
+    Returns:
+        (tokens, tokentypes, labels, padding_mask, loss_mask) as int64 numpy
+        arrays. Labels are -1 and the loss mask is 0 except at the masked
+        positions; the padding mask is 1 for real tokens and 0 for padding.
+    """
+    # Some checks.
+    num_tokens = len(tokens)
+    padding_length = max_seq_length - num_tokens
+    assert padding_length >= 0
+    assert len(tokentypes) == num_tokens
+    assert len(masked_positions) == len(masked_labels)
+    # Tokens and token types share the same padding values.
+    pad = [pad_id] * padding_length
+    tokens_np = np.array(tokens + pad, dtype=np.int64)
+    tokentypes_np = np.array(tokentypes + pad, dtype=np.int64)
+    # Padding mask: ones over the real tokens, zeros over the padding.
+    padding_mask_np = np.zeros(max_seq_length, dtype=np.int64)
+    padding_mask_np[:num_tokens] = 1
+    # Labels and loss mask.
+    labels = [-1] * max_seq_length
+    loss_mask = [0] * max_seq_length
+    for position, label in zip(masked_positions, masked_labels):
+        assert position < num_tokens
+        labels[position] = label
+        loss_mask[position] = 1
+    labels_np = np.array(labels, dtype=np.int64)
+    loss_mask_np = np.array(loss_mask, dtype=np.int64)
+    return tokens_np, tokentypes_np, labels_np, padding_mask_np, loss_mask_np
--- a/megatron/data/helpers.cpp
+++ b/megatron/data/helpers.cpp
+/*
+ coding=utf-8
+ Copyright (c) 2019, NVIDIA CORPORATION.  All rights reserved.
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+     http://www.apache.org/licenses/LICENSE-2.0
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ */
+/* Helper methods for fast index mapping builds */
+#include <algorithm>
+#include <iostream>
+#include <limits>
+#include <math.h>
+#include <stdexcept>
+#include <pybind11/pybind11.h>
+#include <pybind11/numpy.h>
+#include <random>
+namespace py = pybind11;
+using namespace std;
+const int32_t LONG_SENTENCE_LEN = 512;
+inline int32_t get_target_sample_len(const int32_t short_seq_ratio,
+                                     const int32_t max_length,
+                                     std::mt19937& rand32_gen) {
+    /* Training sample length.
+       Roughly one call out of `short_seq_ratio` returns a short length in
+       [2, max_length]; every other call returns `max_length`.
+       Exactly one value is drawn from `rand32_gen` per call, except for the
+       degenerate arguments handled first, which draw nothing.
+    */
+    // Guard the two modulo operations below against a zero divisor.
+    if ((short_seq_ratio <= 0) || (max_length < 2)) {
+        return max_length;
+    }
+    const uint32_t random_number = static_cast<uint32_t>(rand32_gen());
+    const uint32_t ratio = static_cast<uint32_t>(short_seq_ratio);
+    if ((random_number % ratio) == 0) {
+        // Derive the length from the quotient, not from the value whose
+        // remainder was just tested. Reusing that value would restrict the
+        // short lengths (minus 2) to multiples of gcd(ratio, max_length - 1).
+        const uint32_t span = static_cast<uint32_t>(max_length - 1);
+        return 2 + static_cast<int32_t>((random_number / ratio) % span);
+    }
+    return max_length;
+}
+template<typename DocIdx>
+py::array build_mapping_impl(const py::array_t<int64_t>& docs_,
+                             const py::array_t<int32_t>& sizes_,
+                             const int32_t num_epochs,
+                             const uint64_t max_num_samples,
+                             const int32_t max_seq_length,
+                             const double short_seq_prob,
+                             const int32_t seed,
+			     const bool verbose) {
+    /* Build a mapping of (start-index, end-index, sequence-length) where
+       start and end index are the indices of the sentences in the sample
+       and sequence-length is the target sequence length.
+
+       docs_:   entries `doc` and `doc + 1` delimit the sentence range
+                [first, last) of document `doc`, so the array holds one more
+                entry than there are documents.
+       sizes_:  size of each sentence, indexed by sentence index
+                (presumably a token count -- confirm against the caller).
+       num_epochs: maximum number of passes over all documents.
+       max_num_samples: no new epoch is started once this many samples
+                exist. The check is made only at the start of an epoch, so
+                the result can hold more samples than this.
+       max_seq_length: target length of a sample that is not short.
+       short_seq_prob: converted to an integer ratio round(1 / prob) and
+                passed to get_target_sample_len.
+       seed:    seeds the generator used to build the samples; seed + 1
+                seeds the final shuffle.
+       verbose: print statistics to stdout.
+
+       Documents with fewer than two sentences, or with a sentence longer
+       than LONG_SENTENCE_LEN, produce no samples.
+
+       Returns a (num_samples, 3) array of DocIdx with shuffled rows. The
+       array owns its buffer through a capsule.
+    */
+    // Consistency checks.
+    assert(num_epochs > 0);
+    assert(max_seq_length > 1);
+    assert(short_seq_prob > 0.0);
+    assert(short_seq_prob <= 1.0);
+    assert(seed > 0);
+    // Remove bound checks.
+    auto docs = docs_.unchecked<1>();
+    auto sizes = sizes_.unchecked<1>();
+    // For efficiency, convert probability to ratio. Note: rand() generates int.
+    const auto short_seq_ratio = static_cast<int32_t>(round(1.0 / short_seq_prob));
+    if (verbose) {
+        const auto sent_start_index = docs[0];
+	const auto sent_end_index = docs[docs_.shape(0) - 1];
+	const auto num_sentences = sent_end_index - sent_start_index;
+	cout << "    using:" << endl << std::flush;
+	cout << "     number of documents:            " << docs_.shape(0) - 1 <<
+	  endl << std::flush;
+	cout << "     sentences range:                [" << sent_start_index <<
+	", " << sent_end_index << ")" << endl << std::flush;
+	cout << "     total number of sentences:      " << num_sentences <<
+	  endl << std::flush;
+	cout << "     number of epochs:               " << num_epochs <<
+	  endl << std::flush;
+	cout << "     maximum number of samples:      " << max_num_samples <<
+	  endl << std::flush;
+	cout << "     maximum sequence length:        " << max_seq_length <<
+	  endl << std::flush;
+	cout << "     short sequence probability:     " << short_seq_prob <<
+	endl << std::flush;
+	// NOTE(review): "ration" in the message below is a typo for "ratio".
+	cout << "     short sequence ration (1/prob): " << short_seq_ratio <<
+	  endl << std::flush;
+	cout << "     seed:                           " << seed << endl <<
+	  std::flush;
+    }
+    // Mapping and its length (1D).
+    int64_t num_samples = -1;
+    DocIdx* maps = NULL;
+    // Perform two iterations, in the first iteration get the size
+    // and allocate memory and in the second iteration populate the map.
+    bool second = false;
+    for (int32_t iteration=0; iteration<2; ++iteration) {
+        // Set the seed so both iterations produce the same results.
+        std::mt19937 rand32_gen(seed);
+        // Set the flag on second iteration.
+        second = (iteration == 1);
+        // Counters (only updated during the first epoch of the first pass):
+        uint64_t empty_docs = 0;
+        uint64_t one_sent_docs = 0;
+	uint64_t long_sent_docs = 0;
+        // Current map index.
+        uint64_t map_index = 0;
+        // For each epoch:
+        for (int32_t epoch=0; epoch<num_epochs; ++epoch) {
+            // Stop starting new epochs once enough samples exist.
+            if (map_index >= max_num_samples) {
+	        if (verbose && (!second)) {
+		  cout << "    reached " << max_num_samples << " samples after "
+		       << epoch << " epochs ..." << endl << std::flush;
+		}
+                break;
+            }
+            // For each document:
+            for (int32_t doc=0; doc<(docs.shape(0) - 1); ++doc) {
+                // Document sentences are in [sent_index_first, sent_index_last)
+                const auto sent_index_first = docs[doc];
+                const auto sent_index_last = docs[doc + 1];
+                // At the beginning of the document previous index is the
+		// start index.
+                auto prev_start_index = sent_index_first;
+                // Number of sentences of this document not yet consumed.
+                auto num_remain_sent = sent_index_last - sent_index_first;
+                // Some bookkeeping
+                if ((epoch == 0) && (!second)) {
+                    if (num_remain_sent == 0) {
+		        ++empty_docs;
+                    }
+                    if (num_remain_sent == 1) {
+		        ++one_sent_docs;
+                    }
+                }
+		// Detect documents with long sentences.
+		bool contains_long_sentence = false;
+		if (num_remain_sent > 1) {
+		    for (auto sent_index=sent_index_first;
+			 sent_index < sent_index_last; ++sent_index) {
+		        if (sizes[sent_index] > LONG_SENTENCE_LEN){
+			    if ((epoch == 0) && (!second)) {
+			        ++long_sent_docs;
+			    }
+			    contains_long_sentence = true;
+			    break;
+			}
+		    }
+		}
+                // If the document has at least two sentences and none of
+                // them is too long.
+                if ((num_remain_sent > 1) && (!contains_long_sentence)) {
+                    // Set values.
+                    auto seq_len = int32_t{0};
+                    auto num_sent = int32_t{0};
+                    auto target_seq_len = get_target_sample_len(short_seq_ratio,
+								max_seq_length,
+								rand32_gen);
+                    // Loop through sentences.
+                    for (auto sent_index=sent_index_first;
+                         sent_index < sent_index_last; ++sent_index) {
+		        // Add the size and number of sentences.
+		        seq_len += sizes[sent_index];
+		        ++num_sent;
+			--num_remain_sent;
+			// Close the current sample when either
+			//  - the target length has been reached, more than
+			//    one sentence is left in the document and the
+			//    sample has at least two sentences, or
+			//  - the end of the document has been reached.
+			if (((seq_len >= target_seq_len) &&
+			     (num_remain_sent > 1) &&
+			     (num_sent > 1) ) || (num_remain_sent == 0)) {
+			    // Check for overflow.
+			    if ((3 * map_index + 2) >
+				std::numeric_limits<int64_t>::max()) {
+			        cout << "number of samples exceeded maximum "
+				     << "allowed by type int64: "
+				     << std::numeric_limits<int64_t>::max()
+				     << endl;
+				throw std::overflow_error("Number of samples");
+			    }
+			    // Populate the map (second pass only; the first
+			    // pass just counts).
+			    if (second) {
+			        const auto map_index_0 = 3 * map_index;
+				maps[map_index_0] = static_cast<DocIdx>(prev_start_index);
+				maps[map_index_0 + 1] = static_cast<DocIdx>(sent_index + 1);
+				maps[map_index_0 + 2] = static_cast<DocIdx>(target_seq_len);
+			    }
+			    // Update indices / counters. A fresh target
+			    // length is drawn in both passes so that the
+			    // random stream stays aligned between them.
+			    ++map_index;
+			    prev_start_index = sent_index + 1;
+			    target_seq_len = get_target_sample_len(short_seq_ratio,
+								   max_seq_length,
+								   rand32_gen);
+			    seq_len = 0;
+			    num_sent = 0;
+			}
+                    } // for (auto sent_index=sent_index_first; ...
+                } // if (num_remain_sent > 1) {
+            } // for (int doc=0; doc < num_docs; ++doc) {
+        } // for (int epoch=0; epoch < num_epochs; ++epoch) {
+        if (!second) {
+	    if (verbose) {
+	        cout << "   number of empty documents: " << empty_docs <<
+		  endl << std::flush;
+		cout << "   number of documents with one sentence: " <<
+		  one_sent_docs << endl << std::flush;
+		cout << "   number of documents with long sentences: " <<
+		  long_sent_docs << endl << std::flush;
+		cout << "   will create mapping for " << map_index <<
+		  " samples" << endl << std::flush;
+	    }
+	    // End of the first pass: the sample count is known, so the
+	    // buffer (three entries per sample) can be allocated.
+	    assert(maps == NULL);
+	    assert(num_samples < 0);
+            maps = new DocIdx[3*map_index];
+            num_samples = static_cast<int64_t>(map_index);
+        }
+    } // for (int iteration=0; iteration < 2; ++iteration) {
+    // Shuffle.
+    // We need a 64 bit random number generator as we might have more
+    // than 2 billion samples.
+    std::mt19937_64 rand64_gen(seed + 1);
+    // Walk from the last row down, swapping row i with a random row j <= i.
+    for (auto i=(num_samples - 1); i > 0; --i) {
+      const auto j = static_cast<int64_t>(rand64_gen() % (i + 1));
+      const auto i0 = 3 * i;
+      const auto j0 = 3 * j;
+      // Swap values.
+      swap(maps[i0], maps[j0]);
+      swap(maps[i0 + 1], maps[j0 + 1]);
+      swap(maps[i0 + 2], maps[j0 + 2]);
+    }
+    // Method to deallocate memory.
+    py::capsule free_when_done(maps, [](void *mem_) {
+            DocIdx *mem = reinterpret_cast<DocIdx*>(mem_);
+	    delete[] mem;
+        });
+    // Return the numpy array.
+    const auto byte_size = sizeof(DocIdx);
+    return py::array(std::vector<int64_t>{num_samples, 3}, // shape
+                     {3*byte_size, byte_size}, // C-style contiguous strides
+                     maps, // the data pointer
+                     free_when_done); // numpy array references
+}
+py::array build_mapping(const py::array_t<int64_t>& docs_,
+                        const py::array_t<int>& sizes_,
+                        const int num_epochs,
+                        const uint64_t max_num_samples,
+                        const int max_seq_length,
+                        const double short_seq_prob,
+                        const int seed,
+                        const bool verbose) {
+    /* Choose the element type of the mapping from the number of sentences
+       and forward everything to build_mapping_impl. The 64 bit variant is
+       used only when the number of entries in `sizes_` exceeds the range
+       of uint32. */
+    const bool needs_64bit_index =
+        sizes_.size() > std::numeric_limits<uint32_t>::max();
+    if (verbose) {
+        cout << (needs_64bit_index ? "    using uint64 for data mapping..."
+                                   : "    using uint32 for data mapping...")
+             << endl << std::flush;
+    }
+    if (needs_64bit_index) {
+        return build_mapping_impl<uint64_t>(docs_, sizes_, num_epochs,
+                                            max_num_samples, max_seq_length,
+                                            short_seq_prob, seed, verbose);
+    }
+    return build_mapping_impl<uint32_t>(docs_, sizes_, num_epochs,
+                                        max_num_samples, max_seq_length,
+                                        short_seq_prob, seed, verbose);
+}
+// Python extension module `helpers`; it exposes build_mapping under the
+// same name.
+PYBIND11_MODULE(helpers, m) {
+    m.def("build_mapping", &build_mapping);
+}
--- a/megatron/data/indexed_dataset.py
+++ b/megatron/data/indexed_dataset.py
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+# copied from fairseq/fairseq/data/indexed_dataset.py
+# Removed IndexedRawTextDataset since it relied on Fairseq dictionary
+# other slight modifications to remove fairseq dependencies
+# Added document index to index file and made it accessible.
+#    An empty sentence no longer separates documents.
+from functools import lru_cache
+import os
+import shutil
+import struct
+from itertools import accumulate
+import numpy as np
+import torch
+from megatron.utils import print_rank_0
+def __best_fitting_dtype(vocab_size=None):
+    """Pick the numpy dtype for a given vocabulary size.
+    Returns np.uint16 when `vocab_size` is given and below 65500, and
+    np.int32 otherwise (including when `vocab_size` is None).
+    """
+    fits_in_uint16 = vocab_size is not None and vocab_size < 65500
+    return np.uint16 if fits_in_uint16 else np.int32
+def get_available_dataset_impl():
+    """Return the names of the available dataset implementations."""
+    available = ['lazy', 'cached', 'mmap']
+    return available
+def infer_dataset_impl(path):
+    """Guess the dataset implementation from the header of its index file.
+    Returns 'cached' or 'mmap' according to the magic found in the first 8
+    bytes of the index file, or None when the dataset files do not exist or
+    the magic is not recognized.
+    """
+    if not IndexedDataset.exists(path):
+        return None
+    with open(index_file_path(path), 'rb') as index_file:
+        header = index_file.read(8)
+    if header == IndexedDataset._HDR_MAGIC:
+        return 'cached'
+    if header == MMapIndexedDataset.Index._HDR_MAGIC[:8]:
+        return 'mmap'
+    return None
+def make_builder(out_file, impl, vocab_size=None):
+    """Create the dataset builder that writes to `out_file`.
+    'mmap' selects MMapIndexedDatasetBuilder, with a dtype chosen from
+    `vocab_size`; any other value selects IndexedDatasetBuilder.
+    """
+    if impl != 'mmap':
+        return IndexedDatasetBuilder(out_file)
+    token_dtype = __best_fitting_dtype(vocab_size)
+    return MMapIndexedDatasetBuilder(out_file, dtype=token_dtype)
+def make_dataset(path, impl, skip_warmup=False):
+    """Open the dataset at `path` with the requested implementation.
+    `impl` is 'lazy', 'cached', 'mmap', or 'infer' to detect the format from
+    the index file. Returns None when the implementation is unknown or the
+    dataset files do not exist.
+    """
+    if impl == 'infer':
+        impl = infer_dataset_impl(path)
+    if impl in ('lazy', 'cached'):
+        if not IndexedDataset.exists(path):
+            return None
+        if impl == 'lazy':
+            return IndexedDataset(path)
+        return IndexedCachedDataset(path)
+    if impl == 'mmap' and MMapIndexedDataset.exists(path):
+        return MMapIndexedDataset(path, skip_warmup)
+    return None
+def dataset_exists(path, impl):
+    """Check whether the files of the dataset at `path` exist.
+    'mmap' is checked through MMapIndexedDataset; every other value of
+    `impl` is checked through IndexedDataset.
+    """
+    dataset_cls = MMapIndexedDataset if impl == 'mmap' else IndexedDataset
+    return dataset_cls.exists(path)
+def read_longs(f, n):
+    """Read `n` int64 values from the binary file object `f`.
+    The values are read straight into a new numpy array with `readinto`.
+    """
+    values = np.empty(shape=n, dtype=np.int64)
+    f.readinto(values)
+    return values
+def write_longs(f, a):
+    """Write the values of `a` to the binary file object `f` as int64."""
+    as_int64 = np.array(a, dtype=np.int64)
+    f.write(as_int64)
+# Element type for each dtype code stored in an index file header.
+dtypes = {
+    1: np.uint8,
+    2: np.int8,
+    3: np.int16,
+    4: np.int32,
+    5: np.int64,
+    # `np.float` was only an alias of the builtin `float` and was removed in
+    # NumPy 1.24. The builtin keeps the same object under code 6, so existing
+    # code <-> dtype lookups are unchanged.
+    6: float,
+    7: np.double,
+    8: np.uint16
+}
+def code(dtype):
+    """Return the integer code under which `dtype` is listed in `dtypes`.
+    Raises:
+        ValueError: if `dtype` is not one of the listed element types.
+    """
+    for key, candidate in dtypes.items():
+        if candidate == dtype:
+            return key
+    raise ValueError(dtype)
+def index_file_path(prefix_path):
+    """Return the index file name for a dataset prefix: '<prefix>.idx'."""
+    suffix = '.idx'
+    return prefix_path + suffix
+def data_file_path(prefix_path):
+    """Return the data file name for a dataset prefix: '<prefix>.bin'."""
+    suffix = '.bin'
+    return prefix_path + suffix
+def create_doc_idx(sizes):
+    """Build a document index from a sequence of sizes.
+    The result starts with 0 and then contains `i + 1` for every position
+    `i` whose size is 0.
+    """
+    after_empty = [pos + 1 for pos, size in enumerate(sizes) if size == 0]
+    return [0] + after_empty
+class IndexedDataset(torch.utils.data.Dataset):
+    """Loader for IndexedDataset"""
+    _HDR_MAGIC = b'TNTIDX\x00\x00'
+    def __init__(self, path):
+        super().__init__()
+        self.path = path
+        self.data_file = None
+        self.read_index(path)
+    def read_index(self, path):
+        with open(index_file_path(path), 'rb') as f:
+            magic = f.read(8)
+            assert magic == self._HDR_MAGIC, (
+                'Index file doesn\'t match expected format. '
+                'Make sure that --dataset-impl is configured properly.'
+            )
+            version = f.read(8)
+            assert struct.unpack('<Q', version) == (1,)
+            code, self.element_size = struct.unpack('<QQ', f.read(16))
+            self.dtype = dtypes[code]
+            self._len, self.s = struct.unpack('<QQ', f.read(16))
+            self.doc_count = struct.unpack('<Q', f.read(8))
+            self.dim_offsets = read_longs(f, self._len + 1)
+            self.data_offsets = read_longs(f, self._len + 1)
+            self.sizes = read_longs(f, self.s)
+            self.doc_idx = read_longs(f, self.doc_count)
+    def read_data(self, path):
+        self.data_file = open(data_file_path(path), 'rb', buffering=0)
+    def check_index(self, i):
+        if i < 0 or i >= self._len:
+            raise IndexError('index out of range')
+    def __del__(self):
+        if self.data_file:
+            self.data_file.close()
+    #@lru_cache(maxsize=8)
+    def __getitem__(self, idx):
+        """Read one item (int index) or a list of items (contiguous slice).
+
+        The data file is opened on first use. For an int, the item is read
+        into a fresh array shaped by its entries in ``sizes``. For a slice,
+        all covered items are read in one call and split at the cumulative
+        sizes.
+
+        Raises:
+            IndexError: if an int index is out of range.
+            ValueError: if a slice has a step other than 1.
+        """
+        if not self.data_file:
+            # Lazily open the data file on first access.
+            self.read_data(self.path)
+        if isinstance(idx, int):
+            self.check_index(idx)
+            shape = self.sizes[self.dim_offsets[idx]:self.dim_offsets[idx + 1]]
+            item = np.empty(shape, dtype=self.dtype)
+            # data_offsets are counted in elements, so scale to bytes.
+            self.data_file.seek(self.data_offsets[idx] * self.element_size)
+            self.data_file.readinto(item)
+            return item
+        elif isinstance(idx, slice):
+            first, last, stride = idx.indices(len(self))
+            if stride != 1:
+                raise ValueError("Slices into indexed_dataset must be contiguous")
+            lengths = self.sizes[self.dim_offsets[first]:self.dim_offsets[last]]
+            flat = np.empty(sum(lengths), dtype=self.dtype)
+            self.data_file.seek(self.data_offsets[first] * self.element_size)
+            self.data_file.readinto(flat)
+            # Cut the flat buffer at every item boundary except the final one.
+            boundaries = list(accumulate(lengths))
+            return np.split(flat, boundaries[:-1])
+    def __len__(self):
+        """Return the number of items recorded in the index header."""
+        count = self._len
+        return count
+    def num_tokens(self, index):
+        """Return the ``sizes`` entry stored at position ``index``."""
+        entry = self.sizes[index]
+        return entry
+    def size(self, index):
+        """Return the ``sizes`` entry stored at position ``index``."""
+        entry = self.sizes[index]
+        return entry
+    @staticmethod
+    def exists(path):
+        """Return True when both the index file and the data file for ``path`` exist."""
+        has_index = os.path.exists(index_file_path(path))
+        return has_index and os.path.exists(data_file_path(path))
+    @property
+    def supports_prefetch(self):
+        """Always False: this dataset does not prefetch, to save memory."""
+        return False
+class IndexedCachedDataset(IndexedDataset):
+    """IndexedDataset that serves items from a buffer filled by ``prefetch``.
+
+    ``prefetch`` must be called for an index before it is read with
+    ``__getitem__``; reading an index that was not prefetched raises KeyError.
+    """
+    def __init__(self, path):
+        super().__init__(path)
+        # Flat buffer holding every prefetched item back to back.
+        self.cache = None
+        # Maps item index -> start position of that item inside self.cache.
+        self.cache_index = {}
+    @property
+    def supports_prefetch(self):
+        """Always True: items are read from the prefetched cache."""
+        return True
+    def prefetch(self, indices):
+        """Load the items named by ``indices`` into the in-memory cache.
+
+        Does nothing when every requested index is already cached. Otherwise
+        the cache is rebuilt from scratch for the de-duplicated, sorted
+        indices, and the data file is closed afterwards.
+        """
+        already_cached = all(i in self.cache_index for i in indices)
+        if already_cached:
+            return
+        if not self.data_file:
+            self.read_data(self.path)
+        wanted = sorted(set(indices))
+        total_size = sum(
+            self.data_offsets[i + 1] - self.data_offsets[i] for i in wanted)
+        self.cache = np.empty(total_size, dtype=self.dtype)
+        self.cache_index.clear()
+        cursor = 0
+        for i in wanted:
+            self.cache_index[i] = cursor
+            length = self.data_offsets[i + 1] - self.data_offsets[i]
+            # A slice of the cache is a view, so readinto fills the cache itself.
+            target = self.cache[cursor: cursor + length]
+            self.data_file.seek(self.data_offsets[i] * self.element_size)
+            self.data_file.readinto(target)
+            cursor += length
+        if self.data_file:
+            # close and delete data file after prefetch so we can pickle
+            self.data_file.close()
+            self.data_file = None
+    #@lru_cache(maxsize=8)
+    def __getitem__(self, idx):
+        """Return a copy of one cached item (int) or a list of them (slice)."""
+        if isinstance(idx, int):
+            self.check_index(idx)
+            shape = self.sizes[self.dim_offsets[idx]:self.dim_offsets[idx + 1]]
+            item = np.empty(shape, dtype=self.dtype)
+            start = self.cache_index[idx]
+            np.copyto(item, self.cache[start: start + item.size])
+            return item
+        elif isinstance(idx, slice):
+            # Simple per-item lookup; can be optimized later if necessary.
+            return [self[i] for i in range(*idx.indices(len(self)))]
+class IndexedDatasetBuilder(object):
+    """Incrementally writes a data file and, on ``finalize``, its index file.
+
+    Items are appended with ``add_item``; ``end_document`` marks a document
+    boundary; ``merge_file_`` appends another already-built dataset.
+    """
+    # Bytes per element for the supported dtypes. The explicit float32/float64
+    # keys replace the ``np.float`` alias, which is really an 8-byte Python
+    # float and no longer exists in recent NumPy releases.
+    element_sizes = {
+        np.uint8: 1,
+        np.int8: 1,
+        np.int16: 2,
+        np.int32: 4,
+        np.int64: 8,
+        np.float32: 4,
+        np.float64: 8
+    }
+    def __init__(self, out_file, dtype=np.int32):
+        """Open ``out_file`` for binary writing and start empty index tables."""
+        self.out_file = open(out_file, 'wb')
+        self.dtype = dtype
+        self.data_offsets = [0]
+        self.dim_offsets = [0]
+        self.sizes = []
+        # Fall back to NumPy's own item size for dtypes missing from the table
+        # (for example the builtin ``float``).
+        self.element_size = self.element_sizes.get(
+            self.dtype, np.dtype(self.dtype).itemsize)
+        self.doc_idx = [0]
+    def add_item(self, tensor):
+        """Append ``tensor`` to the data file and record its offsets and shape."""
+        n_bytes = self.out_file.write(np.array(tensor.numpy(), dtype=self.dtype))
+        # Floor division keeps the element offsets as exact integers.
+        self.data_offsets.append(
+            self.data_offsets[-1] + n_bytes // self.element_size)
+        self.sizes.extend(tensor.size())
+        self.dim_offsets.append(self.dim_offsets[-1] + len(tensor.size()))
+    def end_document(self):
+        """Mark a document boundary at the current length of ``sizes``."""
+        self.doc_idx.append(len(self.sizes))
+    def merge_file_(self, another_file):
+        """Append the dataset stored under prefix ``another_file`` to this one.
+
+        Offsets and document boundaries of the other dataset are shifted so
+        they continue from the current end of this builder.
+        """
+        index = IndexedDataset(another_file)
+        assert index.dtype == self.dtype
+        # Document boundaries index into ``sizes``; remember where the merged
+        # entries will start before ``sizes`` grows.
+        doc_offset = len(self.sizes)
+        begin = self.data_offsets[-1]
+        for offset in index.data_offsets[1:]:
+            self.data_offsets.append(begin + offset)
+        self.sizes.extend(index.sizes)
+        begin = self.dim_offsets[-1]
+        for dim_offset in index.dim_offsets[1:]:
+            self.dim_offsets.append(begin + dim_offset)
+        # Skip the other file's leading entry; this builder already holds the
+        # boundary at which the merged data starts.
+        self.doc_idx.extend(doc_offset + int(d) for d in index.doc_idx[1:])
+        with open(data_file_path(another_file), 'rb') as f:
+            shutil.copyfileobj(f, self.out_file)
+    def finalize(self, index_file):
+        """Close the data file and write the index to ``index_file``."""
+        self.out_file.close()
+        # The context manager closes the index file even if a write fails.
+        with open(index_file, 'wb') as index:
+            index.write(b'TNTIDX\x00\x00')
+            index.write(struct.pack('<Q', 1))
+            index.write(struct.pack('<QQ', code(self.dtype), self.element_size))
+            index.write(struct.pack('<QQ', len(self.data_offsets) - 1, len(self.sizes)))
+            index.write(struct.pack('<Q', len(self.doc_idx)))
+            write_longs(index, self.dim_offsets)
+            write_longs(index, self.data_offsets)
+            write_longs(index, self.sizes)
+            write_longs(index, self.doc_idx)
+def _warmup_mmap_file(path):
+    """Read the file at ``path`` from start to end in 100 MiB chunks, discarding the data."""
+    chunk_size = 100 * 1024 * 1024
+    with open(path, 'rb') as stream:
+        chunk = stream.read(chunk_size)
+        while chunk:
+            chunk = stream.read(chunk_size)
+class MMapIndexedDataset(torch.utils.data.Dataset):
+    """Dataset whose index and data files are both accessed through ``np.memmap``.
+
+    Instances pickle as just their path prefix and re-open the memory maps
+    when unpickled.
+    """
+    class Index(object):
+        """Reader (and, via ``writer``, writer) for the index file."""
+        _HDR_MAGIC = b'MMIDIDX\x00\x00'
+        @classmethod
+        def writer(cls, path, dtype):
+            """Return a context manager that writes an index file to ``path``."""
+            class _Writer(object):
+                def __enter__(self):
+                    # Header: magic, format version, then a one-byte dtype code.
+                    self._file = open(path, 'wb')
+                    self._file.write(cls._HDR_MAGIC)
+                    self._file.write(struct.pack('<Q', 1))
+                    self._file.write(struct.pack('<B', code(dtype)))
+                    return self
+                @staticmethod
+                def _get_pointers(sizes):
+                    """Return the byte offset at which each item starts."""
+                    dtype_size = dtype().itemsize
+                    address = 0
+                    pointers = []
+                    for size in sizes:
+                        pointers.append(address)
+                        address += size * dtype_size
+                    return pointers
+                def write(self, sizes, doc_idx):
+                    """Write the counts, then sizes, pointers and document index."""
+                    pointers = self._get_pointers(sizes)
+                    self._file.write(struct.pack('<Q', len(sizes)))
+                    self._file.write(struct.pack('<Q', len(doc_idx)))
+                    sizes = np.array(sizes, dtype=np.int32)
+                    self._file.write(sizes.tobytes(order='C'))
+                    del sizes
+                    pointers = np.array(pointers, dtype=np.int64)
+                    self._file.write(pointers.tobytes(order='C'))
+                    del pointers
+                    doc_idx = np.array(doc_idx, dtype=np.int64)
+                    self._file.write(doc_idx.tobytes(order='C'))
+                def __exit__(self, exc_type, exc_val, exc_tb):
+                    self._file.close()
+            return _Writer()
+        def __init__(self, path, skip_warmup=False):
+            """Parse the header of the index at ``path`` and map its tables.
+
+            Unless ``skip_warmup`` is set, the whole file is read once before
+            it is memory-mapped.
+
+            Raises:
+                AssertionError: if the magic bytes or version do not match.
+            """
+            with open(path, 'rb') as stream:
+                magic_test = stream.read(9)
+                assert self._HDR_MAGIC == magic_test, (
+                    'Index file doesn\'t match expected format. '
+                    'Make sure that --dataset-impl is configured properly.'
+                )
+                version = struct.unpack('<Q', stream.read(8))
+                assert (1,) == version
+                dtype_code, = struct.unpack('<B', stream.read(1))
+                self._dtype = dtypes[dtype_code]
+                self._dtype_size = self._dtype().itemsize
+                self._len = struct.unpack('<Q', stream.read(8))[0]
+                self._doc_count = struct.unpack('<Q', stream.read(8))[0]
+                # Everything after the header is the three tables.
+                offset = stream.tell()
+            if not skip_warmup:
+                print_rank_0("    warming up index mmap file...")
+                _warmup_mmap_file(path)
+            self._bin_buffer_mmap = np.memmap(path, mode='r', order='C')
+            self._bin_buffer = memoryview(self._bin_buffer_mmap)
+            print_rank_0("    reading sizes...")
+            self._sizes = np.frombuffer(self._bin_buffer, dtype=np.int32, count=self._len, offset=offset)
+            print_rank_0("    reading pointers...")
+            self._pointers = np.frombuffer(self._bin_buffer, dtype=np.int64, count=self._len,
+                                           offset=offset + self._sizes.nbytes)
+            print_rank_0("    reading document index...")
+            self._doc_idx = np.frombuffer(self._bin_buffer, dtype=np.int64, count=self._doc_count,
+                                          offset=offset + self._sizes.nbytes + self._pointers.nbytes)
+        def __del__(self):
+            self._bin_buffer_mmap._mmap.close()
+            del self._bin_buffer_mmap
+        @property
+        def dtype(self):
+            return self._dtype
+        @property
+        def sizes(self):
+            return self._sizes
+        @property
+        def doc_idx(self):
+            return self._doc_idx
+        @lru_cache(maxsize=8)
+        def __getitem__(self, i):
+            """Return ``(byte pointer, size)`` for item ``i``."""
+            return self._pointers[i], self._sizes[i]
+        def __len__(self):
+            return self._len
+    def __init__(self, path, skip_warmup=False):
+        """Open the dataset stored under the path prefix ``path``."""
+        super().__init__()
+        self._path = None
+        self._index = None
+        self._bin_buffer = None
+        self._do_init(path, skip_warmup)
+    def __getstate__(self):
+        # Only the path prefix is pickled; the maps are rebuilt on unpickle.
+        return self._path
+    def __setstate__(self, state):
+        # _do_init requires skip_warmup; omitting it made unpickling raise
+        # TypeError. The warm-up pass only reads and discards the files, so
+        # skipping it here does not change what the dataset returns.
+        self._do_init(state, skip_warmup=True)
+    def _do_init(self, path, skip_warmup):
+        """Load the index and memory-map the data file for ``path``."""
+        self._path = path
+        self._index = self.Index(index_file_path(self._path), skip_warmup)
+        if not skip_warmup:
+            print_rank_0("    warming up data mmap file...")
+            _warmup_mmap_file(data_file_path(self._path))
+        print_rank_0("    creating numpy buffer of mmap...")
+        self._bin_buffer_mmap = np.memmap(data_file_path(self._path), mode='r', order='C')
+        print_rank_0("    creating memory view of numpy buffer...")
+        self._bin_buffer = memoryview(self._bin_buffer_mmap)
+    def __del__(self):
+        self._bin_buffer_mmap._mmap.close()
+        del self._bin_buffer_mmap
+        del self._index
+    def __len__(self):
+        return len(self._index)
+    #@lru_cache(maxsize=8)
+    def __getitem__(self, idx):
+        """Return one item (int index) or a list of items (contiguous slice).
+
+        A single item is converted to ``np.int64`` when the stored dtype
+        differs; slice results keep the stored dtype.
+
+        Raises:
+            ValueError: if a slice has a step other than 1.
+        """
+        if isinstance(idx, int):
+            ptr, size = self._index[idx]
+            np_array = np.frombuffer(self._bin_buffer, dtype=self._index.dtype, count=size, offset=ptr)
+            if self._index.dtype != np.int64:
+                np_array = np_array.astype(np.int64)
+            return np_array
+        elif isinstance(idx, slice):
+            start, stop, step = idx.indices(len(self))
+            if step != 1:
+                raise ValueError("Slices into indexed_dataset must be contiguous")
+            ptr = self._index._pointers[start]
+            sizes = self._index._sizes[idx]
+            offsets = list(accumulate(sizes))
+            total_size = sum(sizes)
+            np_array = np.frombuffer(self._bin_buffer, dtype=self._index.dtype, count=total_size, offset=ptr)
+            # Cut the flat buffer at every item boundary except the final one.
+            sents = np.split(np_array, offsets[:-1])
+            return sents
+    @property
+    def sizes(self):
+        return self._index.sizes
+    @property
+    def doc_idx(self):
+        return self._index.doc_idx
+    def get_doc_idx(self):
+        return self._index._doc_idx
+    def set_doc_idx(self, doc_idx_):
+        self._index._doc_idx = doc_idx_
+    @property
+    def supports_prefetch(self):
+        return False
+    @staticmethod
+    def exists(path):
+        """Return True when both the index file and the data file exist."""
+        return (
+            os.path.exists(index_file_path(path)) and os.path.exists(data_file_path(path))
+        )
+class MMapIndexedDatasetBuilder(object):
+    """Incrementally writes the data file and, on ``finalize``, the mmap index."""
+    def __init__(self, out_file, dtype=np.int64):
+        """Open ``out_file`` for binary writing and start empty tables."""
+        self._data_file = open(out_file, 'wb')
+        self._dtype = dtype
+        self._sizes = []
+        self._doc_idx = [0]
+    def add_item(self, tensor):
+        """Append ``tensor`` (converted to the builder dtype) to the data file."""
+        np_array = np.array(tensor.numpy(), dtype=self._dtype)
+        self._data_file.write(np_array.tobytes(order='C'))
+        self._sizes.append(np_array.size)
+    def end_document(self):
+        """Mark a document boundary at the current number of items."""
+        self._doc_idx.append(len(self._sizes))
+    def merge_file_(self, another_file):
+        """Append the dataset stored under prefix ``another_file`` to this one.
+
+        Item sizes, document boundaries and raw data are all carried over;
+        document boundaries are shifted by the number of items already held.
+        """
+        # Concatenate index
+        index = MMapIndexedDataset.Index(index_file_path(another_file))
+        assert index.dtype == self._dtype
+        # Boundaries count items, so shift them by the items already present.
+        doc_offset = len(self._sizes)
+        self._sizes.extend(index.sizes)
+        # Skip the other file's leading entry; this builder already holds the
+        # boundary at which the merged data starts.
+        self._doc_idx.extend(doc_offset + int(d) for d in index.doc_idx[1:])
+        # Concatenate data
+        with open(data_file_path(another_file), 'rb') as f:
+            shutil.copyfileobj(f, self._data_file)
+    def finalize(self, index_file):
+        """Close the data file and write the index to ``index_file``."""
+        self._data_file.close()
+        with MMapIndexedDataset.Index.writer(index_file, self._dtype) as index:
+            index.write(self._sizes, self._doc_idx)
--- a/megatron/data/preprocess_data.py
+++ b/megatron/data/preprocess_data.py
+import argparse
+import json
+import multiprocessing
+import nltk
+import sys
+import time
+import torch
+from bert_tokenization import FullTokenizer
+import indexed_dataset
+class CustomLanguageVars(nltk.tokenize.punkt.PunktLanguageVars):
+    """PunktLanguageVars with a modified ``_period_context_fmt`` pattern.
+
+    The two places where the pattern was changed are marked by the inline
+    comments inside the regex: whitespace after a potential sentence ending
+    is matched with ``\\s*``, and the following token is matched with ``\\S+``.
+    """
+    _period_context_fmt = r"""
+        \S*                          # some word material
+        %(SentEndChars)s             # a potential sentence ending
+        \s*                       #  <-- THIS is what I changed
+        (?=(?P<after_tok>
+            %(NonWord)s              # either other punctuation
+            |
+            (?P<next_tok>\S+)     #  <-- Normally you would have \s+ here
+        ))"""
+class Encoder(object):
+    """Tokenizes JSON lines into per-sentence token ids inside pool workers.
+
+    The tokenizer and sentence splitter are stored as class attributes by
+    ``initializer`` so that each worker process builds them once.
+    """
+    def __init__(self, args):
+        self.args = args
+    def initializer(self):
+        """Create the tokenizer and sentence splitter for this process."""
+        # Use Encoder class as a container for global data
+        Encoder.tokenizer = FullTokenizer(self.args.vocab, do_lower_case=True)
+        splitter = nltk.load("tokenizers/punkt/english.pickle")
+        if self.args.keep_newlines:
+            # this prevents punkt from eating newlines after sentences
+            # (stored as ``splitter``, the attribute that ``encode`` reads)
+            Encoder.splitter = nltk.tokenize.punkt.PunktSentenceTokenizer(
+                train_text=splitter._params,
+                lang_vars=CustomLanguageVars())
+        else:
+            Encoder.splitter = splitter
+    def encode(self, json_line):
+        """Return ``(doc_ids, len(json_line))`` for one JSON line.
+
+        ``doc_ids`` holds one list of token ids per sentence; sentences that
+        produce no ids are dropped.
+        """
+        text = json.loads(json_line)[self.args.json_key]
+        doc_ids = []
+        for sentence in Encoder.splitter.tokenize(text):
+            tokens = Encoder.tokenizer.tokenize(sentence)
+            ids = Encoder.tokenizer.convert_tokens_to_ids(tokens)
+            if len(ids) > 0:
+                doc_ids.append(ids)
+        return doc_ids, len(json_line)
+def main():
+    """Tokenize a JSON-lines corpus and write it as an indexed dataset.
+
+    Each input line is one JSON document. Sentences are added to the builder
+    one by one and every input line closes a document. Progress is reported
+    on stderr every ``--log-interval`` documents.
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--input', type=str, help='Path to input JSON')
+    parser.add_argument('--vocab', type=str, help='Path to vocab.txt')
+    parser.add_argument('--json-key', type=str, default='text',
+                        help='Key to extract from json')
+    parser.add_argument('--output-prefix', type=str, help='Path to binary output file without suffix')
+    parser.add_argument('--workers', type=int, default=20,
+                        help='Number of worker processes to launch')
+    parser.add_argument('--log-interval', type=int, default=100,
+                        help='Interval between progress updates')
+    parser.add_argument('--keep-newlines', action='store_true',
+                        help='Keep newlines between sentences.')
+    parser.add_argument('--dataset-impl', type=str, default='mmap',
+                        choices=['lazy', 'cached', 'mmap'])
+    args = parser.parse_args()
+    args.keep_empty = False
+    startup_start = time.time()
+    print("Opening", args.input)
+    # Context managers make sure the input file and the worker pool are
+    # released even if encoding or writing fails part-way through.
+    with open(args.input, 'r', encoding='utf-8') as fin:
+        nltk.download("punkt", quiet=True)
+        encoder = Encoder(args)
+        # This tokenizer is only used in the parent to report the vocab size.
+        tokenizer = FullTokenizer(args.vocab, do_lower_case=True)
+        with multiprocessing.Pool(args.workers,
+                                  initializer=encoder.initializer) as pool:
+            encoded_docs = pool.imap(encoder.encode, fin, 25)
+            print(f"Vocab size: {tokenizer.vocab_size()}")
+            output_bin_file = "{}.bin".format(args.output_prefix)
+            output_idx_file = "{}.idx".format(args.output_prefix)
+            builder = indexed_dataset.make_builder(output_bin_file,
+                                              impl=args.dataset_impl,
+                                              vocab_size=tokenizer.vocab_size())
+            startup_end = time.time()
+            proc_start = time.time()
+            total_bytes_processed = 0
+            print("Time to startup:", startup_end - startup_start)
+            for i, (doc, bytes_processed) in enumerate(encoded_docs, start=1):
+                total_bytes_processed += bytes_processed
+                for sentence in doc:
+                    builder.add_item(torch.IntTensor(sentence))
+                builder.end_document()
+                if i % args.log_interval == 0:
+                    current = time.time()
+                    elapsed = current - proc_start
+                    mbs = total_bytes_processed/elapsed/1024/1024
+                    print(f"Processed {i} documents",
+                          f"({i/elapsed} docs/s, {mbs} MB/s).",
+                          file=sys.stderr)
+    builder.finalize(output_idx_file)
+if __name__ == '__main__':
+    main()
--- a/megatron/data/test/test_indexed_dataset.py
+++ b/megatron/data/test/test_indexed_dataset.py
+import argparse
+import os
+import sys
+import torch
+script_dir = os.path.dirname(os.path.realpath(__file__))
+sys.path.append(os.path.join(script_dir, "../../../"))
+from megatron.data import indexed_dataset, FullBertTokenizer, AlbertDataset
+def test_indexed_dataset(args):
+    """Walk every document of the dataset and check each sentence is non-empty.
+
+    Prints the document-index length, the dataset length and the last
+    document-index entry, then the number of each document as it is checked.
+    A notice is printed for every token that contains a newline.
+    """
+    ds = indexed_dataset.make_dataset(args.data, args.dataset_impl)
+    tokenizer = FullBertTokenizer(args.vocab, do_lower_case=True)
+    print(len(ds.doc_idx))
+    print(len(ds))
+    print(ds.doc_idx[-1])
+    if ds.supports_prefetch:
+        # just prefetch the whole thing in test (so assume it is small)
+        ds.prefetch(range(len(ds)))
+    num_docs = len(ds.doc_idx) - 1
+    for doc in range(num_docs):
+        first, last = ds.doc_idx[doc], ds.doc_idx[doc + 1]
+        for sentence in ds[first:last]:
+            assert len(sentence) > 0
+            token_ids = sentence.data.tolist()
+            for token in tokenizer.convert_ids_to_tokens(token_ids):
+                if '\n' in token:
+                    print("Newline in string!")
+        print(doc)
+def test_albert_dataset(args):
+    """Build an AlbertDataset, print the tokens of its first sample and exit.
+
+    The process exits right after the first sample is printed; if the dataset
+    yields no samples the function simply returns.
+    """
+    ds = AlbertDataset.from_paths(args.vocab, args.data, args.dataset_impl,
+                                  args.epochs, args.max_num_samples,
+                                  args.masked_lm_prob, args.seq_length,
+                                  args.short_seq_prob, args.seed)
+    for s in ds:
+        ids = s['text']
+        tokens = ds.tokenizer.convert_ids_to_tokens(ids)
+        print(tokens)
+        # sys.exit is always available, unlike the site-provided exit().
+        sys.exit()
+def main():
+    """Parse the command line, resolve the dataset implementation and run the ALBERT dataset check."""
+    cli = argparse.ArgumentParser()
+    cli.add_argument('--data', type=str, help='prefix to data files')
+    cli.add_argument('--vocab', type=str, help='Path to vocab.txt')
+    cli.add_argument('--dataset-impl', type=str, default='infer',
+                     choices=['lazy', 'cached', 'mmap', 'infer'])
+    cli.add_argument('--epochs', type=int, default=5,
+                     help='Number of epochs to plan for')
+    cli.add_argument('--max-num-samples', type=int, default=None,
+                     help='Maximum number of samples to plan for')
+    cli.add_argument('--masked-lm-prob', type=float, default=0.15,
+                     help='probability of masking tokens')
+    cli.add_argument('--seq-length', type=int, default=512,
+                     help='maximum sequence length')
+    cli.add_argument('--short-seq-prob', type=float, default=0.1,
+                     help='probability of creating a short sequence')
+    cli.add_argument('--seed', type=int, default=1234,
+                     help='random seed')
+    args = cli.parse_args()
+    # 'infer' is a placeholder: replace it with the implementation detected
+    # from the data files themselves.
+    if args.dataset_impl == "infer":
+        args.dataset_impl = indexed_dataset.infer_dataset_impl(args.data)
+    test_albert_dataset(args)
+#    test_indexed_dataset(args)
+if __name__ == "__main__":
+    main()
--- a/megatron/data/test/test_preprocess_data.sh
+++ b/megatron/data/test/test_preprocess_data.sh
+#!/bin/bash
+# Run preprocess_data.py on the bundled sample file with a single worker.
+# The chosen dataset implementation is also used as the output-prefix suffix.
+DATASET_IMPL=cached
+python ../preprocess_data.py \
+       --input test_samples.json \
+       --vocab vocab.txt \
+       --dataset-impl "${DATASET_IMPL}" \
+       --output-prefix "test_samples_${DATASET_IMPL}" \
+       --workers 1 \
+       --log-interval 2
--- a/megatron/model/bert_model.py
+++ b/megatron/model/bert_model.py
@@ -145,7 +145,7 @@ class BertModel(MegatronModule):
            init_method=init_method,
            scaled_init_method=scaled_init_method_normal(init_method_std,
                                                         num_layers),
-            residual_connection_post_layernorm=True)
+            residual_connection_post_layernorm=False)
        self.lm_head = BertLMHead(
            self.language_model.embedding.word_embeddings.weight.size(0),

--- a/megatron/training.py
+++ b/megatron/training.py
@@ -381,7 +381,6 @@ def train(forward_step_func, model, optimizer, lr_scheduler,
    timers('interval time').start()
    report_memory_flag = True
    while iteration < args.train_iters:
        loss_dict, skipped_iter = train_step(forward_step_func,
                                             train_data_iterator,
                                             model,

--- a/pretrain_albert.py
+++ b/pretrain_albert.py
+# coding=utf-8
+# Copyright (c) 2019, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Pretrain ALBERT"""
+import torch
+import torch.nn.functional as F
+from megatron import mpu
+from megatron.model import BertModel
+from megatron.utils import print_rank_0
+from megatron.utils import reduce_losses
+from megatron.utils import vocab_size_with_padding
+from megatron.training import run
+from megatron.data.albert_dataset import build_train_valid_test_datasets
+from megatron.data_utils.samplers import DistributedBatchSampler
+def model_provider(args):
+    """Construct and return the BertModel configured from ``args``."""
+    print_rank_0('building BERT model ...')
+    # Embedding and output dropout both use the hidden dropout probability.
+    return BertModel(
+        num_layers=args.num_layers,
+        vocab_size=args.vocab_size,
+        hidden_size=args.hidden_size,
+        num_attention_heads=args.num_attention_heads,
+        embedding_dropout_prob=args.hidden_dropout,
+        attention_dropout_prob=args.attention_dropout,
+        output_dropout_prob=args.hidden_dropout,
+        max_sequence_length=args.max_position_embeddings,
+        checkpoint_activations=args.checkpoint_activations,
+        checkpoint_num_layers=args.checkpoint_num_layers,
+        add_binary_head=True,
+        layernorm_epsilon=args.layernorm_epsilon,
+        num_tokentypes=args.tokentype_size,
+        parallel_output=True)
+def get_batch(data_iterator, timers):
+    """Fetch the next batch, broadcast it, and return its unpacked tensors.
+
+    Returns:
+        Tuple ``(tokens, types, sentence_order, loss_mask, lm_labels,
+        padding_mask)``; ``loss_mask`` is float, all others are long.
+    """
+    # Items and their type.
+    keys = ['text', 'types', 'labels', 'is_random', 'loss_mask', 'padding_mask']
+    datatype = torch.int64
+    # Only callers that hold an iterator read data; the rest pass None on.
+    timers('data loader').start()
+    data = next(data_iterator) if data_iterator is not None else None
+    timers('data loader').stop()
+    # Broadcast data.
+    data_b = mpu.broadcast_data(keys, data, datatype)
+    # Unpack.
+    tokens = data_b['text'].long()
+    types = data_b['types'].long()
+    sentence_order = data_b['is_random'].long()
+    loss_mask = data_b['loss_mask'].float()
+    lm_labels = data_b['labels'].long()
+    padding_mask = data_b['padding_mask'].long()
+    batch = (tokens, types, sentence_order, loss_mask, lm_labels, padding_mask)
+    return batch
+def forward_step(data_iterator, model, args, timers):
+    """Run one forward pass and return the total loss plus reduced loss terms.
+
+    Returns:
+        ``(loss, {'lm loss': ..., 'sop loss': ...})`` where ``loss`` is the
+        sum of the masked LM loss and the sentence-order loss.
+    """
+    # Get the batch.
+    timers('batch generator').start()
+    batch = get_batch(data_iterator, timers)
+    tokens, types, sentence_order, loss_mask, lm_labels, padding_mask = batch
+    timers('batch generator').stop()
+    # Forward model.
+    lm_logits, sop_logits = model(tokens, padding_mask, tokentype_ids=types)
+    # Sentence-order loss over two classes; labels equal to -1 are ignored.
+    sop_loss = F.cross_entropy(sop_logits.view(-1, 2).contiguous().float(),
+                               sentence_order.view(-1).contiguous(),
+                               ignore_index=-1)
+    # Per-token LM loss, averaged over the positions selected by loss_mask.
+    per_token_loss = mpu.vocab_parallel_cross_entropy(
+        lm_logits.contiguous().float(), lm_labels.contiguous())
+    masked_total = torch.sum(per_token_loss.view(-1) * loss_mask.reshape(-1))
+    lm_loss = masked_total / loss_mask.sum()
+    loss = lm_loss + sop_loss
+    reduced_losses = reduce_losses([lm_loss, sop_loss])
+    stats = {'lm loss': reduced_losses[0], 'sop loss': reduced_losses[1]}
+    return loss, stats
+def get_train_val_test_data(args):
+    """Load the data on rank zero and broadcast number of tokens to all GPUs.
+
+    On model-parallel rank 0 this builds the train/validation/test datasets
+    and their data loaders. It then broadcasts the padded vocab size, the
+    number of token types and the do_train/do_valid/do_test flags within the
+    model-parallel group and stores them on ``args``.
+
+    Returns:
+        ``(train_data, valid_data, test_data)`` data loaders; all three are
+        None on ranks other than model-parallel rank 0.
+    """
+    (train_data, valid_data, test_data) = (None, None, None)
+    # Data loader only on rank 0 of each model parallel group.
+    if mpu.get_model_parallel_rank() == 0:
+        print_rank_0('> building train, validation, and test datasets '
+                     'for ALBERT ...')
+        # ALBERT defaults to, and only supports, the 'binary' data loader.
+        if args.data_loader is None:
+            args.data_loader = 'binary'
+        if args.data_loader != 'binary':
+            print('Unsupported {} data loader for ALBERT.'.format(
+                args.data_loader))
+            exit(1)
+        if not args.data_path:
+            print('ALBERT only supports a unified dataset specified '
+                  'with --data-path')
+            exit(1)
+        data_parallel_size = mpu.get_data_parallel_world_size()
+        data_parallel_rank = mpu.get_data_parallel_rank()
+        global_batch_size = args.batch_size * data_parallel_size
+        # Number of train/valid/test samples.
+        train_iters = args.train_iters
+        # One evaluation round per eval_interval, plus one extra round.
+        eval_iters = (train_iters // args.eval_interval + 1) * args.eval_iters
+        test_iters = args.eval_iters
+        train_val_test_num_samples = [args.train_iters * global_batch_size,
+                                      eval_iters * global_batch_size,
+                                      test_iters * global_batch_size]
+        print_rank_0(' > datasets target sizes (minimum size):')
+        print_rank_0('    train:      {}'.format(train_val_test_num_samples[0]))
+        print_rank_0('    validation: {}'.format(train_val_test_num_samples[1]))
+        print_rank_0('    test:       {}'.format(train_val_test_num_samples[2]))
+        # Exactly one combined dataset prefix is expected.
+        assert len(args.data_path) == 1
+        train_ds, valid_ds, test_ds = build_train_valid_test_datasets(
+            vocab_file=args.vocab,
+            data_prefix=args.data_path[0],
+            data_impl=args.data_impl,
+            splits_string=args.split,
+            train_valid_test_num_samples=train_val_test_num_samples,
+            max_seq_length=args.seq_length,
+            masked_lm_prob=args.mask_prob,
+            short_seq_prob=args.short_seq_prob,
+            seed=args.seed,
+            skip_warmup=args.skip_mmap_warmup)
+        print_rank_0("> finished creating ALBERT datasets ...")
+        def make_data_loader_(dataset):
+            # A falsy dataset yields no loader.
+            if not dataset:
+                return None
+            # Use a simple sampler with distributed batch sampler.
+            sampler = torch.utils.data.SequentialSampler(dataset)
+            batch_sampler = DistributedBatchSampler(
+                sampler=sampler,
+                batch_size=global_batch_size,
+                drop_last=True,
+                rank=data_parallel_rank,
+                world_size=data_parallel_size)
+            # Torch dataloader.
+            return torch.utils.data.DataLoader(dataset,
+                                               batch_sampler=batch_sampler,
+                                               num_workers=args.num_workers,
+                                               pin_memory=True)
+        train_data = make_data_loader_(train_ds)
+        valid_data = make_data_loader_(valid_ds)
+        test_data = make_data_loader_(test_ds)
+        do_train = train_data is not None and args.train_iters > 0
+        do_valid = valid_data is not None and args.eval_iters > 0
+        do_test = test_data is not None and args.eval_iters > 0
+        # Need to broadcast num_tokens and num_type_tokens.
+        # NOTE(review): train_ds is dereferenced unconditionally here, so this
+        # presumably assumes a training split always exists — confirm.
+        num_tokens = vocab_size_with_padding(train_ds.num_tokens(), args)
+        token_counts = torch.cuda.LongTensor([num_tokens,
+                                              2, # hard coded num_type_tokens
+                                              int(do_train),
+                                              int(do_valid),
+                                              int(do_test)])
+    else:
+        # Placeholder that is overwritten by the broadcast below.
+        token_counts = torch.cuda.LongTensor([0, 0, 0, 0, 0])
+    # Broadcast num tokens.
+    torch.distributed.broadcast(token_counts,
+                                mpu.get_model_parallel_src_rank(),
+                                group=mpu.get_model_parallel_group())
+    args.vocab_size = token_counts[0].item()
+    args.tokentype_size = token_counts[1].item()
+    args.do_train = token_counts[2].item()
+    args.do_valid = token_counts[3].item()
+    args.do_test = token_counts[4].item()
+    return train_data, valid_data, test_data
+# Script entry point: hand the data, model and forward-step providers to run().
+# NOTE(review): the description string says "BERT" although this script
+# pretrains ALBERT — confirm whether that is intentional.
+if __name__ == "__main__":
+    run('Pretrain BERT model', get_train_val_test_data,
+        model_provider, forward_step)
--- a/pretrain_bert.py
+++ b/pretrain_bert.py
@@ -72,7 +72,7 @@ def get_batch(data_iterator, timers):
    next_sentence = data_b['is_random'].long()
    loss_mask = data_b['mask'].float()
    lm_labels = data_b['mask_labels'].long()
-    padding_mask = data_b['pad_mask'].byte()
+    padding_mask = data_b['pad_mask'].long()
    return tokens, types, next_sentence, loss_mask, lm_labels, padding_mask
@@ -112,17 +112,23 @@ def get_train_val_test_data(args):
    # Data loader only on rank 0 of each model parallel group.
    if mpu.get_model_parallel_rank() == 0:
-        data_config = configure_data()
+        if (args.data_loader == 'raw'
-        ds_type = 'BERT'
+            or args.data_loader == 'lazy'
-        data_config.set_defaults(data_set_type=ds_type, transpose=False)
+            or args.data_loader == 'tfrecords'):
-        (train_data, val_data, test_data), tokenizer = data_config.apply(args)
+            data_config = configure_data()
-        num_tokens = vocab_size_with_padding(tokenizer.num_tokens, args)
+            ds_type = 'BERT'
-        # Need to broadcast num_tokens and num_type_tokens.
+            data_config.set_defaults(data_set_type=ds_type, transpose=False)
-        token_counts = torch.cuda.LongTensor([num_tokens,
+            (train_data, val_data, test_data), tokenizer = data_config.apply(args)
-                                              tokenizer.num_type_tokens,
+            num_tokens = vocab_size_with_padding(tokenizer.num_tokens, args)
-                                              int(args.do_train),
+            # Need to broadcast num_tokens and num_type_tokens.
-                                              int(args.do_valid),
+            token_counts = torch.cuda.LongTensor([num_tokens,
-                                              int(args.do_test)])
+                                                  tokenizer.num_type_tokens,
+                                                  int(args.do_train),
+                                                  int(args.do_valid),
+                                                  int(args.do_test)])
+        else:
+            print("Unsupported data loader for BERT.")
+            exit(1)
    else:
        token_counts = torch.cuda.LongTensor([0, 0, 0, 0, 0])

--- a/pretrain_gpt2.py
+++ b/pretrain_gpt2.py
@@ -112,10 +112,16 @@ def get_train_val_test_data(args):
    # Data loader only on rank 0 of each model parallel group.
    if mpu.get_model_parallel_rank() == 0:
-        if args.use_npy_data_loader:
+        if args.data_loader == 'numpy':
+            assert len(args.train_data) == 1
+            args.train_data = args.train_data[0]
+            assert len(args.valid_data) == 1
+            args.valid_data = args.valid_data[0]
+            assert len(args.test_data) == 1
+            args.test_data = args.test_data[0]
            (train_data, val_data, test_data), num_tokens, \
                eod_token = make_gpt2_dataloaders(args)
-        else:
+        elif args.data_loader == 'raw' or args.data_loader == 'lazy':
            data_config = configure_data()
            data_config.set_defaults(data_set_type='GPT2', transpose=False)
            (train_data, val_data, test_data), tokenizer = data_config.apply(
@@ -123,6 +129,9 @@ def get_train_val_test_data(args):
            num_tokens = tokenizer.num_tokens
            eod_token = tokenizer.get_command('eos').Id
            assert eod_token == tokenizer.get_command('pad').Id
+        else:
+            print("Unsupported data loader for GPT2.")
+            exit(1)
        # pad.
        num_tokens = vocab_size_with_padding(num_tokens, args)
        print_rank_0('> found end-of-document token: {}'.format(eod_token))

--- a/scripts/pretrain_albert.sh
+++ b/scripts/pretrain_albert.sh
+#!/bin/bash
+#
+# Launch ALBERT pretraining as a single process (rank 0 of a world of 1).
+#
+# RANK and WORLD_SIZE are exported so that the Python process started below
+# can actually see them; as plain, unexported shell variables they were
+# never passed on to any child process.
+# NOTE(review): presumably pretrain_albert.py reads both values from the
+# environment -- confirm against its initialization code.
+export RANK=0
+export WORLD_SIZE=1
+
+# NOTE(review): --lr-decay-iters (990000) is much larger than --train-iters
+# (10000) -- confirm this mismatch is intended.
+python pretrain_albert.py \
+       --num-layers 12 \
+       --hidden-size 768 \
+       --num-attention-heads 12 \
+       --batch-size 4 \
+       --seq-length 512 \
+       --max-preds-per-seq 80 \
+       --max-position-embeddings 512 \
+       --train-iters 10000 \
+       --save checkpoints/albert_117m \
+       --load checkpoints/albert_117m \
+       --resume-dataloader \
+       --data-path data/megatron/bc_rn_owt_sto_wiki_dedup_shuf_cleaned_0.7_mmap \
+       --vocab data/megatron/vocab.txt \
+       --split 949,50,1 \
+       --distributed-backend nccl \
+       --lr 0.0001 \
+       --lr-decay-style linear \
+       --lr-decay-iters 990000 \
+       --weight-decay 1e-2 \
+       --clip-grad 1.0 \
+       --warmup .01 \
+       --fp16 \
+       --fp32-layernorm \
+       --fp32-embedding \
+       --skip-mmap-warmup \
+       --num-workers 0
--- a/scripts/pretrain_albert_distributed.sh
+++ b/scripts/pretrain_albert_distributed.sh
+#!/bin/bash
+#
+# Launch ALBERT pretraining through torch.distributed.launch, starting
+# GPUS_PER_NODE processes on this node.
+
+GPUS_PER_NODE=2
+# Change for multinode config
+MASTER_ADDR=localhost
+MASTER_PORT=6000
+NNODES=1
+NODE_RANK=0
+# Total number of processes across all nodes. Not referenced anywhere below;
+# kept for reference only.
+# NOTE(review): the launcher presumably derives the same value from
+# --nproc_per_node and --nnodes -- confirm before relying on it.
+WORLD_SIZE=$((GPUS_PER_NODE * NNODES))
+
+# Launcher options live in an array so every option and value is passed as
+# exactly one word, instead of depending on word splitting of an unquoted
+# string expansion.
+DISTRIBUTED_ARGS=(
+    --nproc_per_node "$GPUS_PER_NODE"
+    --nnodes "$NNODES"
+    --node_rank "$NODE_RANK"
+    --master_addr "$MASTER_ADDR"
+    --master_port "$MASTER_PORT"
+)
+
+# NOTE(review): --lr-decay-iters (990000) is much larger than --train-iters
+# (10000) -- confirm this mismatch is intended.
+python -m torch.distributed.launch "${DISTRIBUTED_ARGS[@]}" \
+       pretrain_albert.py \
+       --num-layers 12 \
+       --hidden-size 768 \
+       --num-attention-heads 12 \
+       --batch-size 4 \
+       --seq-length 512 \
+       --max-preds-per-seq 80 \
+       --max-position-embeddings 512 \
+       --train-iters 10000 \
+       --save checkpoints/albert_117m \
+       --load checkpoints/albert_117m \
+       --resume-dataloader \
+       --data-path data/megatron/bc_rn_owt_sto_wiki_dedup_shuf_cleaned_0.7_mmap \
+       --vocab data/megatron/vocab.txt \
+       --split 949,50,1 \
+       --distributed-backend nccl \
+       --lr 0.0001 \
+       --lr-decay-style linear \
+       --lr-decay-iters 990000 \
+       --weight-decay 1e-2 \
+       --clip-grad 1.0 \
+       --warmup .01 \
+       --fp16 \
+       --fp32-layernorm \
+       --fp32-embedding \
+       --skip-mmap-warmup \
+       --num-workers 0