Commit 75bd9b54 authored by Jared Casper

Merge branch 'megatron_sampler' into 'main'

Simplified sampler (will be needed later for batch size increase) and removed deprecated data stuff

See merge request ADLR/megatron-lm!177
parents ea81d62f fac6718a
@@ -72,6 +72,9 @@ def parse_args(extra_args_provider=None, defaults={},
        print('using {} for parameters ...'.format(args.params_dtype),
              flush=True)
    # Consumed tokens.
    args.consumed_train_samples = 0
    args.consumed_valid_samples = 0
    # Set input defaults.
    for key in defaults:
@@ -133,14 +136,16 @@ def parse_args(extra_args_provider=None, defaults={},
def _print_args(args):
    """Print arguments."""
    if args.rank == 0:
        print('------------------------ arguments ------------------------',
              flush=True)
        str_list = []
        for arg in vars(args):
            dots = '.' * (48 - len(arg))
            str_list.append(' {} {} {}'.format(arg, dots, getattr(args, arg)))
        for arg in sorted(str_list, key=lambda x: x.lower()):
            print(arg, flush=True)
        print('-------------------- end of arguments ---------------------',
              flush=True)
def _check_arg_is_not_none(args, arg):
@@ -275,7 +280,7 @@ def _add_learning_rate_args(parser):
                            'and initial warmup, the learing rate at each '
                            'iteration would be different.')
    group.add_argument('--lr-decay-style', type=str, default='linear',
                       choices=['constant', 'linear', 'cosine'],
                       help='Learning rate decay function.')
    group.add_argument('--lr-decay-iters', type=int, default=None,
                       help='number of iterations to decay learning rate over,'
@@ -397,8 +402,11 @@ def _add_validation_args(parser):
def _add_data_args(parser):
    group = parser.add_argument_group(title='data and dataloader')
    group.add_argument('--data-path', nargs='*', default=None,
                       help='Path to the training dataset. Accepted format:'
                       '1) a single data path, 2) multiple datasets in the'
                       'form: dataset1-weight dataset1-path dataset2-weight '
                       'dataset2-path ...')
    group.add_argument('--split', type=str, default='969, 30, 1',
                       help='Comma-separated list of proportions for training,'
                       ' validation, and test split. For example the split '
...
@@ -214,11 +214,14 @@ def load_checkpoint(model, optimizer, lr_scheduler, load_arg='load'):
            checkpoint_name))
        sys.exit()
    # Check arguments.
    assert args.consumed_train_samples == 0
    assert args.consumed_valid_samples == 0
    if 'args' in state_dict:
        checkpoint_args = state_dict['args']
        check_checkpoint_args(checkpoint_args)
        args.consumed_train_samples = getattr(checkpoint_args, 'consumed_train_samples', 0)
        args.consumed_valid_samples = getattr(checkpoint_args, 'consumed_valid_samples', 0)
    else:
        print_rank_0('could not find arguments in the checkpoint ...')
...
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Blendable dataset."""
import time
import numpy as np
import torch
from megatron import print_rank_0
from megatron import mpu
class BlendableDataset(torch.utils.data.Dataset):
def __init__(self, datasets, weights):
self.datasets = datasets
num_datasets = len(datasets)
assert num_datasets == len(weights)
self.size = 0
for dataset in self.datasets:
self.size += len(dataset)
# Normalize weights.
weights = np.array(weights, dtype=np.float64)
sum_weights = np.sum(weights)
assert sum_weights > 0.0
weights /= sum_weights
# Build indices.
start_time = time.time()
assert num_datasets < 255
self.dataset_index = np.zeros(self.size, dtype=np.uint8)
self.dataset_sample_index = np.zeros(self.size, dtype=np.int64)
if torch.distributed.get_rank() == 0:
from megatron.data.dataset_utils import compile_helper
compile_helper()
# Simple barrier
tmp = torch.cuda.LongTensor([1])
torch.distributed.all_reduce(tmp, group=mpu.get_data_parallel_group())
from megatron.data import helpers
helpers.build_blending_indices(self.dataset_index,
self.dataset_sample_index,
weights, num_datasets, self.size,
torch.distributed.get_rank() == 0)
print_rank_0('> elapsed time for building blendable dataset indices: '
'{:.2f} (sec)'.format(time.time() - start_time))
def __len__(self):
return self.size
def __getitem__(self, idx):
dataset_idx = self.dataset_index[idx]
sample_idx = self.dataset_sample_index[idx]
return self.datasets[dataset_idx][sample_idx]
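For orientation, a minimal self-contained sketch of what __getitem__ above amounts to once the index arrays exist; the toy datasets and hand-written index arrays below are purely illustrative, not output of the compiled helpers extension.

import numpy as np

# Two toy in-memory "datasets" standing in for real indexed datasets.
dataset_a = ['a0', 'a1', 'a2', 'a3']
dataset_b = ['b0', 'b1']
datasets = [dataset_a, dataset_b]

# Index arrays of the kind helpers.build_blending_indices produces: for each
# blended sample, which dataset to read and which sample within it to take.
# These particular values are made up for the example.
dataset_index = np.array([0, 1, 0, 0, 1, 0], dtype=np.uint8)
dataset_sample_index = np.array([0, 0, 1, 2, 1, 3], dtype=np.int64)

# __getitem__ above reduces to this double lookup.
idx = 4
print(datasets[dataset_index[idx]][dataset_sample_index[idx]])  # -> 'b1'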
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Dataloaders."""
import torch
from megatron import get_args
from megatron import mpu
def build_pretraining_data_loader(dataset, consumed_samples):
"""Buld dataloader given an input dataset."""
if dataset is None:
return None
args = get_args()
world_size = mpu.get_data_parallel_world_size()
global_batch_size = args.batch_size * world_size
# Megatron sampler
batch_sampler = MegatronPretrainingSampler(
total_samples=len(dataset),
consumed_samples=consumed_samples,
global_batch_size=global_batch_size,
rank=mpu.get_data_parallel_rank(),
world_size=world_size)
# Torch dataloader.
return torch.utils.data.DataLoader(dataset,
batch_sampler=batch_sampler,
num_workers=args.num_workers,
pin_memory=True)
class MegatronPretrainingSampler:
def __init__(self, total_samples, consumed_samples,
global_batch_size, rank, world_size):
# Keep a copy of input params for later use.
self.total_samples = total_samples
self.consumed_samples = consumed_samples
self.global_batch_size = global_batch_size
self.rank = rank
# Sanity checks.
assert self.total_samples > 0, \
'no sample to consume: {}'.format(self.total_samples)
assert self.consumed_samples < self.total_samples, \
'no samples left to consume: {}, {}'.format(self.consumed_samples,
self.total_samples)
assert self.global_batch_size > 0, \
'Unexpected global batch size: {}'.format(self.global_batch_size)
assert world_size > 0,\
'non zero world size is expected: {}'.format(world_size)
assert self.rank < world_size,\
'rank should be smaller than world size: {}, {}'.format(
self.rank, world_size)
# Batch size per rank.
assert self.global_batch_size % world_size == 0,\
'global batch size must be divisible by world size: {}, {}'.format(
self.global_batch_size, world_size)
self.batch_size_per_rank = self.global_batch_size // world_size
def __len__(self):
return self.total_samples
def __iter__(self):
batch = []
# Last batch if not complete will be dropped.
for idx in range(self.consumed_samples, self.total_samples):
batch.append(idx)
if len(batch) == self.global_batch_size:
start_idx = self.rank * self.batch_size_per_rank
end_idx = start_idx + self.batch_size_per_rank
yield batch[start_idx:end_idx]
batch = []
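To make the slicing above concrete, here is a small self-contained sketch (not the class itself) of the first batch each data-parallel rank would receive when resuming; the sizes and the consumed-sample count are invented for illustration.

# Hypothetical settings: global batch of 8 split across 2 data-parallel ranks,
# resuming after 16 samples have already been consumed.
global_batch_size = 8
world_size = 2
batch_size_per_rank = global_batch_size // world_size
consumed_samples = 16

for rank in range(world_size):
    # The sampler fills one global batch starting at consumed_samples and
    # hands each rank a contiguous slice of it.
    batch = list(range(consumed_samples, consumed_samples + global_batch_size))
    start_idx = rank * batch_size_per_rank
    end_idx = start_idx + batch_size_per_rank
    print(rank, batch[start_idx:end_idx])
# 0 [16, 17, 18, 19]
# 1 [20, 21, 22, 23]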
@@ -18,11 +18,13 @@
# https://github.com/google-research/albert/blob/master/create_pretraining_data.py
# with some modifications.
import math
import time
import collections
import numpy as np
from megatron import get_args, print_rank_0
from megatron.data.blendable_dataset import BlendableDataset
from megatron.data.indexed_dataset import make_dataset as make_indexed_dataset
DSET_TYPE_STD = 'standard_bert'
@@ -31,6 +33,38 @@ DSET_TYPE_ICT = 'ict'
DSET_TYPES = [DSET_TYPE_ICT, DSET_TYPE_STD]
def get_datasets_weights_and_num_samples(data_prefix,
train_valid_test_num_samples):
# The data prefix should be in the format of:
# weight-1, data-prefix-1, weight-2, data-prefix-2, ..
assert len(data_prefix) % 2 == 0
num_datasets = len(data_prefix) // 2
weights = [0]*num_datasets
prefixes = [0]*num_datasets
for i in range(num_datasets):
weights[i] = float(data_prefix[2*i])
prefixes[i] = (data_prefix[2*i+1]).strip()
# Normalize weights
weight_sum = 0.0
for weight in weights:
weight_sum += weight
assert weight_sum > 0.0
weights = [weight / weight_sum for weight in weights]
# Add 0.5% (the 1.005 factor) so in case the blending dataset does
# not uniformly distribute the number of samples, we still have
# samples left to feed to the network.
datasets_train_valid_test_num_samples = []
for weight in weights:
datasets_train_valid_test_num_samples.append(
[int(math.ceil(val * weight * 1.005))
for val in train_valid_test_num_samples])
return prefixes, weights, datasets_train_valid_test_num_samples
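A hedged example of how the helper above behaves, assuming it is in scope; the prefixes, weights, and sample counts are invented. With --data-path parsed via nargs='*', the weights arrive as strings, which the float() conversion above handles.

data_prefix = ['0.3', 'dataset-a', '0.7', 'dataset-b']
train_valid_test_num_samples = [1000, 100, 10]

prefixes, weights, per_dataset_samples = get_datasets_weights_and_num_samples(
    data_prefix, train_valid_test_num_samples)

print(prefixes)             # ['dataset-a', 'dataset-b']
print(weights)              # [0.3, 0.7]
print(per_dataset_samples)  # [[302, 31, 4], [704, 71, 8]]  (1.005x oversampling, rounded up)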
def compile_helper():
    """Compile helper function at runtime. Make sure this
    is invoked on a single process."""
@@ -360,6 +394,46 @@ def build_train_valid_test_datasets(data_prefix, data_impl, splits_string,
                                    short_seq_prob, seed, skip_warmup,
                                    dataset_type='standard_bert'):
if len(data_prefix) == 1:
return _build_train_valid_test_datasets(data_prefix[0],
data_impl, splits_string,
train_valid_test_num_samples,
max_seq_length, masked_lm_prob,
short_seq_prob, seed,
skip_warmup,
dataset_type=dataset_type)
# Blending dataset.
# Parse the values.
output = get_datasets_weights_and_num_samples(data_prefix,
train_valid_test_num_samples)
prefixes, weights, datasets_train_valid_test_num_samples = output
# Build individual datasets.
train_datasets = []
valid_datasets = []
test_datasets = []
for i in range(len(prefixes)):
train_ds, valid_ds, test_ds = _build_train_valid_test_datasets(
prefixes[i], data_impl, splits_string,
datasets_train_valid_test_num_samples[i],
max_seq_length, masked_lm_prob, short_seq_prob,
            seed, skip_warmup, dataset_type=dataset_type)
        train_datasets.append(train_ds)
        valid_datasets.append(valid_ds)
        test_datasets.append(test_ds)
    # Blend.
blending_train_dataset = BlendableDataset(train_datasets, weights)
blending_valid_dataset = BlendableDataset(valid_datasets, weights)
blending_test_dataset = BlendableDataset(test_datasets, weights)
return (blending_train_dataset, blending_valid_dataset,
blending_test_dataset)
def _build_train_valid_test_datasets(data_prefix, data_impl, splits_string,
train_valid_test_num_samples,
max_seq_length, masked_lm_prob,
short_seq_prob, seed, skip_warmup,
dataset_type='standard_bert'):
    if dataset_type not in DSET_TYPES:
        raise ValueError("Invalid dataset_type: ", dataset_type)
...
@@ -22,6 +22,8 @@ import numpy as np
import torch
from megatron import mpu, print_rank_0
from megatron.data.blendable_dataset import BlendableDataset
from megatron.data.dataset_utils import get_datasets_weights_and_num_samples
from megatron.data.dataset_utils import get_train_valid_test_split_
from megatron.data.indexed_dataset import make_dataset as make_indexed_dataset
@@ -31,6 +33,46 @@ def build_train_valid_test_datasets(data_prefix, data_impl, splits_string,
                                    seq_length, seed, skip_warmup):
    """Build train, valid, and test datasets."""
# Single dataset.
if len(data_prefix) == 1:
return _build_train_valid_test_datasets(data_prefix[0],
data_impl, splits_string,
train_valid_test_num_samples,
seq_length, seed, skip_warmup)
# Blending dataset.
# Parse the values.
output = get_datasets_weights_and_num_samples(data_prefix,
train_valid_test_num_samples)
prefixes, weights, datasets_train_valid_test_num_samples = output
# Build individual datasets.
train_datasets = []
valid_datasets = []
test_datasets = []
for i in range(len(prefixes)):
train_ds, valid_ds, test_ds = _build_train_valid_test_datasets(
prefixes[i], data_impl, splits_string,
datasets_train_valid_test_num_samples[i],
seq_length, seed, skip_warmup)
train_datasets.append(train_ds)
valid_datasets.append(valid_ds)
test_datasets.append(test_ds)
# Blend.
blending_train_dataset = BlendableDataset(train_datasets, weights)
blending_valid_dataset = BlendableDataset(valid_datasets, weights)
blending_test_dataset = BlendableDataset(test_datasets, weights)
return (blending_train_dataset, blending_valid_dataset,
blending_test_dataset)
def _build_train_valid_test_datasets(data_prefix, data_impl, splits_string,
train_valid_test_num_samples,
seq_length, seed, skip_warmup):
"""Build train, valid, and test datasets."""
    # Indexed dataset.
    indexed_dataset = get_indexed_dataset_(data_prefix,
                                           data_impl,
...
@@ -33,6 +33,69 @@ using namespace std;
const int32_t LONG_SENTENCE_LEN = 512;
void build_blending_indices(py::array_t<uint8_t>& dataset_index,
py::array_t<int64_t>& dataset_sample_index,
const py::array_t<double>& weights,
const int32_t num_datasets,
const int64_t size, const bool verbose) {
/* Given multiple datasets and a weighting array, build samples
     such that it follows those weights.*/
if (verbose) {
std::cout << "> building indices for blendable datasets ..." << std::endl;
}
// Get the pointer access without the checks.
auto dataset_index_ptr = dataset_index.mutable_unchecked<1>();
auto dataset_sample_index_ptr = dataset_sample_index.mutable_unchecked<1>();
auto weights_ptr = weights.unchecked<1>();
// Initialize buffer for number of samples used for each dataset.
int64_t current_samples[num_datasets];
for(int64_t i = 0; i < num_datasets; ++i) {
current_samples[i] = 0;
}
// For each sample:
for(int64_t sample_idx = 0; sample_idx < size; ++sample_idx) {
// Determine where the max error in sampling is happening.
auto sample_idx_double = std::max(static_cast<double>(sample_idx), 1.0);
int64_t max_error_index = 0;
double max_error = weights_ptr[0] * sample_idx_double -
static_cast<double>(current_samples[0]);
for (int64_t dataset_idx = 1; dataset_idx < num_datasets; ++dataset_idx) {
double error = weights_ptr[dataset_idx] * sample_idx_double -
static_cast<double>(current_samples[dataset_idx]);
if (error > max_error) {
max_error = error;
max_error_index = dataset_idx;
}
}
// Populate the indices.
dataset_index_ptr[sample_idx] = static_cast<uint8_t>(max_error_index);
dataset_sample_index_ptr[sample_idx] = current_samples[max_error_index];
// Update the total samples.
current_samples[max_error_index] += 1;
}
// print info
if (verbose) {
std::cout << " > sample ratios:" << std::endl;
for (int64_t dataset_idx = 0; dataset_idx < num_datasets; ++dataset_idx) {
auto ratio = static_cast<double>(current_samples[dataset_idx]) /
static_cast<double>(size);
std::cout << " dataset " << dataset_idx << ", input: " <<
weights_ptr[dataset_idx] << ", achieved: " << ratio << std::endl;
}
}
}
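For readers more comfortable with Python, a rough NumPy restatement of the greedy rule the C++ routine above implements (at each step, pick the dataset whose achieved count lags furthest behind weight times samples-so-far); this sketch is for illustration only, the compiled helper remains the one actually used.

import numpy as np

def build_blending_indices_py(weights, size):
    weights = np.asarray(weights, dtype=np.float64)
    dataset_index = np.zeros(size, dtype=np.uint8)
    dataset_sample_index = np.zeros(size, dtype=np.int64)
    current_samples = np.zeros(len(weights), dtype=np.int64)
    for sample_idx in range(size):
        # Error of each dataset relative to its target share so far.
        errors = weights * max(sample_idx, 1) - current_samples
        chosen = int(np.argmax(errors))
        dataset_index[sample_idx] = chosen
        dataset_sample_index[sample_idx] = current_samples[chosen]
        current_samples[chosen] += 1
    return dataset_index, dataset_sample_index

idx, sample_idx = build_blending_indices_py([0.7, 0.3], 10)
print(idx.tolist())         # e.g. [0, 1, 0, 0, 1, 0, 0, 1, 0, 0]
print(sample_idx.tolist())  # e.g. [0, 0, 1, 2, 1, 3, 4, 2, 5, 6]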
py::array build_sample_idx(const py::array_t<int32_t>& sizes_,
                           const py::array_t<int32_t>& doc_idx_,
                           const int32_t seq_length,
@@ -640,4 +703,5 @@ PYBIND11_MODULE(helpers, m) {
  m.def("build_mapping", &build_mapping);
  m.def("build_blocks_mapping", &build_blocks_mapping);
  m.def("build_sample_idx", &build_sample_idx);
  m.def("build_blending_indices", &build_blending_indices);
}
@@ -6,7 +6,6 @@ import torch
from megatron import mpu, print_rank_0
from megatron.data.dataset_utils import create_masked_lm_predictions, pad_and_convert_to_numpy
from megatron.data.samplers import DistributedBatchSampler
from megatron import get_args, get_tokenizer, print_rank_0, mpu
@@ -23,6 +22,8 @@ def get_one_epoch_dataloader(dataset, batch_size=None):
    sampler = torch.utils.data.SequentialSampler(dataset)
    # importantly, drop_last must be False to get all the data.
    assert False, 'DistributedBatchSampler deprecated, change the implementation'
    from megatron.data.samplers import DistributedBatchSampler
    batch_sampler = DistributedBatchSampler(sampler,
                                            batch_size=global_batch_size,
                                            drop_last=False,
...
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Batch samplers that work with either random or sequential data samplers."""
import torch
from torch.utils import data
class RandomSampler(data.sampler.Sampler):
"""Based off of pytorch RandomSampler and DistributedSampler. Essentially
a RandomSampler, but this class lets the user set an epoch like
DistributedSampler. Samples elements randomly. If without replacement, then
sample from a shuffled dataset. If with replacement, then user can
specify ``num_samples`` to draw.
Arguments:
data_source (Dataset): dataset to sample from
num_samples (int): number of samples to draw, default=len(dataset)
replacement (bool): samples are drawn with replacement if ``True``,
default=False
"""
def __init__(self, data_source, replacement=False, num_samples=None):
self.data_source = data_source
self.replacement = replacement
self._num_samples = num_samples
self.epoch = -1
if self._num_samples is not None and replacement is False:
raise ValueError("With replacement=False, num_samples should not "
"be specified, since a random permute will be "
"performed.")
if not isinstance(self.num_samples, int) or self.num_samples <= 0:
raise ValueError("num_samples should be a positive integer "
"value, but got num_samples={}".format(
self.num_samples))
if not isinstance(self.replacement, bool):
raise ValueError("replacement should be a boolean value, but got "
"replacement={}".format(self.replacement))
@property
def num_samples(self):
# dataset size might change at runtime
if self._num_samples is None:
return len(self.data_source)
return self._num_samples
def __iter__(self):
n = len(self.data_source)
g = torch.Generator()
if self.epoch >= 0:
g.manual_seed(self.epoch)
if self.replacement:
return iter(torch.randint(high=n, size=(self.num_samples,),
dtype=torch.int64, generator=g).tolist())
return iter(torch.randperm(n, generator=g).tolist())
def __len__(self):
return self.num_samples
def set_epoch(self, epoch):
self.epoch = epoch
class DistributedBatchSampler(data.sampler.BatchSampler):
"""Similar to normal implementation of distributed sampler, except
implementation is at the batch sampler level, instead of just the
sampler level. This allows wrapping of arbitrary data samplers
(sequential, random, WeightedRandomSampler, etc.) with this batch
sampler.
The `interleave` argument specifies how to distribute a batch. A value
of True combined with the above random sampler is equivalent to pytorch's
torch.utils.data.distributed.DistributedSampler.
For the following batch [0,1,2,3,4,5,6,7] and data parallelism of 2
specifying True will result in the following samples for each gpu:
GPU0: [0,2,4,6] GPU1: [1,3,5,7]
specifying False will result in the following samples:
GPU0: [0,1,2,3] GPU1: [4,5,6,7]"""
def __init__(self, sampler, batch_size, drop_last, rank=-1,
world_size=2, wrap_last=False, interleave=False):
super(DistributedBatchSampler, self).__init__(sampler, batch_size,
drop_last)
if rank == -1:
assert False, 'should not be here'
rank = torch.distributed.get_rank()
self.rank = rank
self.world_size = world_size
self.sampler.wrap_around = 0
self.wrap_around = 0
self.wrap_last = wrap_last
self.start_iter = 0
self.interleave = interleave
def __iter__(self):
batch = []
i = 0
for idx in self.data_iterator(self.sampler, wrap_around=False):
batch.append(idx)
if len(batch) == self.batch_size:
tbatch = self._batch(batch)
if i >= self.start_iter:
yield tbatch
self.start_iter = 0
i += 1
batch = []
batch_len = len(batch)
if batch_len > 0 and not self.drop_last:
if self.wrap_last:
self.sampler.wrap_around -= (self.batch_size)
self.wrap_around += (len(batch))
self.wrap_around %= self.batch_size
yield self._batch(batch)
if self.wrap_last:
self.sampler.wrap_around += self.batch_size
def data_iterator(self, _iter, wrap_around=False):
"""iterates through data and handles wrap around"""
for i, idx in enumerate(_iter):
if i < self.wrap_around % self.batch_size:
continue
if wrap_around:
self.wrap_around += 1
self.wrap_around %= self.batch_size
yield idx
def _batch(self, batch):
"""extracts samples only pertaining to this worker's batch"""
if self.interleave:
return batch[self.rank:self.batch_size:self.world_size]
start = self.rank * self.batch_size // self.world_size
end = (self.rank + 1) * self.batch_size // self.world_size
return batch[start:end]
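A small self-contained illustration of the two slicing modes described in the docstring of the (now deprecated) sampler above, for the batch [0..7] and a data-parallel world size of 2; it only mimics _batch, it does not exercise the class.

batch = list(range(8))
batch_size, world_size = 8, 2

for rank in range(world_size):
    interleaved = batch[rank:batch_size:world_size]
    contiguous = batch[rank * batch_size // world_size:
                       (rank + 1) * batch_size // world_size]
    print(rank, interleaved, contiguous)
# 0 [0, 2, 4, 6] [0, 1, 2, 3]
# 1 [1, 3, 5, 7] [4, 5, 6, 7]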
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""utils for creating datasets"""
import os
import math
import torch
from .samplers import DistributedBatchSampler
from .datasets import json_dataset, csv_dataset, split_ds, ConcatDataset, SplitDataset, bert_sentencepair_dataset, GPT2Dataset
from .lazy_loader import exists_lazy, make_lazy, lazy_array_loader
from .tokenization import Tokenization, CommandToken, Tokenizer, CharacterLevelTokenizer, BertWordPieceTokenizer, GPT2BPETokenizer, make_tokenizer
from . import corpora
TRAIN_DATA = 0
VAL_DATA = 1
TEST_DATA = 2
def should_split(split):
"""
given split proportions checks if should split
Examples:
>>> should_split([10,0,0])
False
>>> should_split([1,.1,.2])
True
"""
return max(split) / sum(split) != 1.
def get_ext(path):
"""gets path extension"""
return os.path.splitext(path)[1]
def get_dataset(path, **kwargs):
"""gets dataset object based on keyword args and file at `path`"""
if supported_corpus(path):
return corpora.NAMED_CORPORA[path](**kwargs)
ext = get_ext(path)
if '.json' in ext:
text = json_dataset(path, **kwargs)
elif ext in ['.csv', '.tsv']:
text = csv_dataset(path, **kwargs)
else:
raise NotImplementedError('data file type %s is not supported' % (ext))
return text
def supported_corpus(corpus_name):
"""checks if corpus name is defined in `corpora.py`"""
return corpus_name in corpora.NAMED_CORPORA
def make_dataset(path, seq_length, text_key, label_key, lazy=False, process_fn=None, split=[1.],
delim=',', loose=False, binarize_sent=False, drop_unlabeled=False, tokenizer=None,
tokenizer_type='CharacterLevelTokenizer', tokenizer_model_path=None, vocab_size=None,
model_type='bpe', pad_token=0, character_converage=1.0, non_binary_cols=None,
parallel_group=None, **kwargs):
"""function to create datasets+tokenizers for common options"""
if isinstance(process_fn, str):
process_fn = eval(process_fn)
if non_binary_cols is not None:
# multilabel dataset support (only for csvs)
label_key = non_binary_cols
def get_dataset_from_path(path_):
if lazy:
# get lazily loaded dataset
named_corpora = False
if supported_corpus(path_):
named_corpora = True
name = path_
path_ = corpora.NAMED_CORPORA[path_].PATH
if torch.distributed.get_rank() == 0 and not exists_lazy(path_, data_type='data'):
# create cached version of dataset for lazy loading if it doesn't exist
text = get_dataset(name if named_corpora else path_, text_key=text_key, label_key=label_key, binarize_sent=binarize_sent,
delim=delim, drop_unlabeled=drop_unlabeled, loose_json=loose)
make_lazy(path_, text.X, data_type='data')
# This should be a barrier but nccl barrier assumes
# device_index=rank which is not the case for model
# parallel case
counts = torch.cuda.LongTensor([1])
torch.distributed.all_reduce(counts, group=parallel_group)
assert counts[0].item() == torch.distributed.get_world_size(
group=parallel_group)
text = lazy_array_loader(path_, data_type='data', map_fn=process_fn)
else:
# get dataset
text = get_dataset(path_, text_key=text_key, label_key=label_key, binarize_sent=binarize_sent,
delim=delim, drop_unlabeled=drop_unlabeled, loose_json=loose, preprocess_fn=process_fn)
return text
# get one or multiple datasets and concatenate
if isinstance(path, str):
path = [path]
datasets = [get_dataset_from_path(p) for p in path]
if len(datasets) == 1:
ds = datasets[0]
else:
ds = ConcatDataset(datasets)
# make tokenizer for dataset
if tokenizer is None:
tokenizer = make_tokenizer(tokenizer_type, ds, tokenizer_model_path, vocab_size, model_type,
pad_token, character_converage, **kwargs)
ds_type = ''
if 'ds_type' in kwargs:
ds_type = kwargs['ds_type']
ds.SetTokenizer(tokenizer)
# Split dataset into train/val/test (and wrap bert dataset)
if should_split(split):
ds = split_ds(ds, split)
if 'bert' in ds_type.lower():
presplit_sentences = kwargs['presplit_sentences'] if 'presplit_sentences' in kwargs else False
dstype = bert_sentencepair_dataset
ds = [dstype(d, max_seq_len=seq_length, presplit_sentences=presplit_sentences)
if d is not None else None for d in ds]
elif ds_type.lower() == 'gpt2':
ds = [GPT2Dataset(d, max_seq_len=seq_length) if d is not None else None for d in ds]
else:
if 'bert' in ds_type.lower():
presplit_sentences = kwargs['presplit_sentences'] if 'presplit_sentences' in kwargs else False
dstype = bert_sentencepair_dataset
ds = dstype(ds, max_seq_len=seq_length, presplit_sentences=presplit_sentences)
elif ds_type.lower() == 'gpt2':
ds = GPT2Dataset(ds, max_seq_len=seq_length)
return ds, tokenizer
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""parses arguments and preps data loader"""
import copy
import torch
from megatron import data_utils
from megatron import mpu
class DataConfig:
def __init__(self, defaults={}):
super(DataConfig, self).__init__()
self.defaults = defaults
def apply(self, args):
if torch.distributed.get_rank() == 0:
print('configuring data')
self.apply_defaults(args)
return make_loaders(args)
def set_defaults(self, **kwargs):
for k, v in kwargs.items():
self.defaults[k] = v
def apply_defaults(self, args):
for k, v in self.defaults.items():
k = k.replace('-', '_')
if not hasattr(args, k):
setattr(args, k, v)
def make_data_loader(dataset, batch_size, args):
shuffle = args.shuffle
if shuffle:
sampler = data_utils.samplers.RandomSampler(
dataset, replacement=True, num_samples=batch_size * args.train_iters)
else:
sampler = torch.utils.data.SequentialSampler(dataset)
world_size = torch.distributed.get_world_size(
group=mpu.get_data_parallel_group())
rank = torch.distributed.get_rank(group=mpu.get_data_parallel_group())
distributed = world_size > 1
drop_last = distributed
if distributed:
batch_sampler = data_utils.samplers.DistributedBatchSampler(sampler,
batch_size,
drop_last,
rank,
world_size)
else:
batch_sampler = torch.utils.data.BatchSampler(sampler,
batch_size,
drop_last)
data_loader = torch.utils.data.DataLoader(dataset,
batch_sampler=batch_sampler,
num_workers=args.num_workers,
pin_memory=True)
return data_loader
def make_tfrecord_loaders(args):
"""Load train/val/test dataset from shuffled TFRecords"""
import data_utils.tf_dl
data_set_args = {'batch_size': args.batch_size,
'max_seq_len': args.seq_length,
'max_preds_per_seq': args.max_preds_per_seq,
'train': True,
'num_workers': max(args.num_workers, 1),
'seed': args.seed + args.rank + 1,
'threaded_dl': args.num_workers > 0
}
train = data_utils.tf_dl.TFRecordDataLoader(args.train_data,
**data_set_args)
data_set_args['train'] = False
if args.eval_seq_length is not None:
data_set_args['max_seq_len'] = args.eval_seq_length
if args.eval_max_preds_per_seq is not None:
data_set_args['max_preds_per_seq'] = args.eval_max_preds_per_seq
valid = None
if args.valid_data is not None:
valid = data_utils.tf_dl.TFRecordDataLoader(args.valid_data,
**data_set_args)
test = None
if args.test_data is not None:
test = data_utils.tf_dl.TFRecordDataLoader(args.test_data,
**data_set_args)
tokenizer = data_utils.make_tokenizer(args.tokenizer_type,
train,
args.tokenizer_path,
args.vocab_size,
args.tokenizer_model_type,
cache_dir=args.cache_dir)
return (train, valid, test), tokenizer
def make_loaders(args):
"""makes training/val/test"""
if args.data_loader == 'tfrecords':
return make_tfrecord_loaders(args)
world_size = torch.distributed.get_world_size(
group=mpu.get_data_parallel_group())
batch_size = args.batch_size * world_size
eval_batch_size = batch_size
if args.eval_batch_size is not None:
eval_batch_size = args.eval_batch_size * world_size
seq_length = args.seq_length
if seq_length < 0:
seq_length = seq_length * world_size
eval_seq_length = args.eval_seq_length
if eval_seq_length is not None and eval_seq_length < 0:
eval_seq_length = eval_seq_length * world_size
split = get_split(args)
if args.data_path is not None:
args.train_data = args.data_path
data_set_args = {
'path': args.train_data,
'seq_length': seq_length,
'lazy': args.data_loader == 'lazy',
'delim': args.delim,
'text_key': args.text_key,
'label_key': 'label',
'non_binary_cols': None,
'ds_type': args.data_set_type,
'split': split,
'loose': args.loose_json,
'tokenizer_type': args.tokenizer_type,
'tokenizer_model_path': args.tokenizer_path,
'vocab_size': args.vocab_size,
'model_type': args.tokenizer_model_type,
'cache_dir': args.cache_dir,
'max_preds_per_seq': args.max_preds_per_seq,
'presplit_sentences': args.presplit_sentences,
'parallel_group': mpu.get_data_parallel_group()}
eval_set_args = copy.copy(data_set_args)
eval_set_args['split'] = [1.]
# if optional eval args were set then replace their
# equivalent values in the arg dict
if eval_seq_length:
eval_set_args['seq_length'] = eval_seq_length
if args.eval_max_preds_per_seq:
eval_set_args['max_preds_per_seq'] = args.eval_max_preds_per_seq
if args.eval_text_key is not None:
eval_set_args['text_key'] = args.eval_text_key
# make datasets splits and tokenizer
train = None
valid = None
test = None
if args.train_data is not None:
train, tokenizer = data_utils.make_dataset(**data_set_args)
if data_utils.should_split(split):
train, valid, test = train
eval_set_args['tokenizer'] = tokenizer
# make training and val dataset if necessary
if valid is None and args.valid_data is not None:
eval_set_args['path'] = args.valid_data
valid, tokenizer = data_utils.make_dataset(**eval_set_args)
eval_set_args['tokenizer'] = tokenizer
if test is None and args.test_data is not None:
eval_set_args['path'] = args.test_data
test, tokenizer = data_utils.make_dataset(**eval_set_args)
# wrap datasets with data loader
if train is not None and args.batch_size > 0:
train = make_data_loader(train, batch_size, args)
args.do_train = True
else:
args.do_train = False
eval_batch_size = eval_batch_size if eval_batch_size != 0 else batch_size
if valid is not None:
valid = make_data_loader(valid, eval_batch_size, args)
args.do_valid = True
else:
args.do_valid = False
if test is not None:
test = make_data_loader(test, eval_batch_size, args)
args.do_test = True
else:
args.do_test = False
return (train, valid, test), tokenizer
def get_split(args):
"""
Get dataset splits from comma separated string list
"""
splits = []
if args.split.find(',') != -1:
splits = [float(s) for s in args.split.split(',')]
elif args.split.find('/') != -1:
splits = [float(s) for s in args.split.split('/')]
else:
splits = [float(args.split)]
split_total = sum(splits)
if split_total < 1.:
splits.append(1 - split_total)
while len(splits) < 3:
splits.append(0.)
splits = splits[:3]
if args.valid_data is not None:
splits[1] = 0.
if args.test_data is not None:
splits[2] = 0.
final_sum = sum(splits)
return [s / final_sum for s in splits]
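A hedged example of get_split above with the repository's default '969, 30, 1' split string and no separate validation/test files; argparse.Namespace stands in for the real args object, and the function is assumed to be in scope.

from argparse import Namespace

args = Namespace(split='969, 30, 1', valid_data=None, test_data=None)
print(get_split(args))  # [0.969, 0.03, 0.001]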
def configure_data():
"""add cmdline flags for configuring datasets"""
# These are options that are used by data_utils, but are either
# deprecated or not meant to be exposed to the command line user.
# These options are intended to be set in code by specific scripts.
defaults = {
'world_size': 1,
'rank': -1,
'persist_state': 0,
'lazy': False,
'transpose': False,
'data_set_type': 'supervised',
'seq_length': 256,
'eval_seq_length': 256,
'samples_per_shard': 100
}
return DataConfig(defaults=defaults)
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""several datasets with preset arguments"""
from .datasets import json_dataset, csv_dataset
import os
class wikipedia(json_dataset):
"""
dataset for wikipedia with arguments configured for convenience
command line usage: `--train-data wikipedia`
"""
PATH = 'data/wikipedia/wikidump_lines.json'
assert_str = "make sure to set PATH for wikipedia data_utils/corpora.py"
def __init__(self, **kwargs):
assert os.path.exists(wikipedia.PATH), \
wikipedia.assert_str
if not kwargs:
kwargs = {}
kwargs['text_key'] = 'text'
kwargs['loose_json'] = True
super(wikipedia, self).__init__(wikipedia.PATH, **kwargs)
class webtext(json_dataset):
"""
dataset for webtext with arguments configured for convenience
command line usage: `--train-data webtext`
"""
PATH = 'data/webtext/data.json'
assert_str = "make sure to set PATH for webtext data_utils/corpora.py"
def __init__(self, **kwargs):
assert os.path.exists(webtext.PATH), \
webtext.assert_str
if not kwargs:
kwargs = {}
kwargs['text_key'] = 'text'
kwargs['loose_json'] = True
super(webtext, self).__init__(webtext.PATH, **kwargs)
NAMED_CORPORA = {
'wikipedia': wikipedia,
'webtext': webtext,
}
# This file is provided as is from:
# https://github.com/huggingface/pytorch-pretrained-BERT
# Please refer to their repository for copyright.
"""
Utilities for working with the local dataset cache.
This file is adapted from the AllenNLP library at https://github.com/allenai/allennlp
Copyright by the AllenNLP authors.
"""
from __future__ import (absolute_import, division, print_function, unicode_literals)
import json
import logging
import os
import shutil
import tempfile
from functools import wraps
from hashlib import sha256
import sys
from io import open
import boto3
import requests
from botocore.exceptions import ClientError
from tqdm import tqdm
try:
from urllib.parse import urlparse
except ImportError:
from urlparse import urlparse
try:
from pathlib import Path
PYTORCH_PRETRAINED_BERT_CACHE = Path(os.getenv('PYTORCH_PRETRAINED_BERT_CACHE',
Path.home() / '.pytorch_pretrained_bert'))
except (AttributeError, ImportError):
PYTORCH_PRETRAINED_BERT_CACHE = os.getenv('PYTORCH_PRETRAINED_BERT_CACHE',
os.path.join(os.path.expanduser("~"), '.pytorch_pretrained_bert'))
logger = logging.getLogger(__name__) # pylint: disable=invalid-name
def url_to_filename(url, etag=None):
"""
Convert `url` into a hashed filename in a repeatable way.
If `etag` is specified, append its hash to the url's, delimited
by a period.
"""
url_bytes = url.encode('utf-8')
url_hash = sha256(url_bytes)
filename = url_hash.hexdigest()
if etag:
etag_bytes = etag.encode('utf-8')
etag_hash = sha256(etag_bytes)
filename += '.' + etag_hash.hexdigest()
return filename
def filename_to_url(filename, cache_dir=None):
"""
Return the url and etag (which may be ``None``) stored for `filename`.
Raise ``EnvironmentError`` if `filename` or its stored metadata do not exist.
"""
if cache_dir is None:
cache_dir = PYTORCH_PRETRAINED_BERT_CACHE
if sys.version_info[0] == 3 and isinstance(cache_dir, Path):
cache_dir = str(cache_dir)
cache_path = os.path.join(cache_dir, filename)
if not os.path.exists(cache_path):
raise EnvironmentError("file {} not found".format(cache_path))
meta_path = cache_path + '.json'
if not os.path.exists(meta_path):
raise EnvironmentError("file {} not found".format(meta_path))
with open(meta_path, encoding="utf-8") as meta_file:
metadata = json.load(meta_file)
url = metadata['url']
etag = metadata['etag']
return url, etag
def cached_path(url_or_filename, cache_dir=None):
"""
Given something that might be a URL (or might be a local path),
determine which. If it's a URL, download the file and cache it, and
return the path to the cached file. If it's already a local path,
make sure the file exists and then return the path.
"""
if cache_dir is None:
cache_dir = PYTORCH_PRETRAINED_BERT_CACHE
if sys.version_info[0] == 3 and isinstance(url_or_filename, Path):
url_or_filename = str(url_or_filename)
if sys.version_info[0] == 3 and isinstance(cache_dir, Path):
cache_dir = str(cache_dir)
parsed = urlparse(url_or_filename)
if parsed.scheme in ('http', 'https', 's3'):
# URL, so get it from the cache (downloading if necessary)
return get_from_cache(url_or_filename, cache_dir)
elif os.path.exists(url_or_filename):
# File, and it exists.
return url_or_filename
elif parsed.scheme == '':
# File, but it doesn't exist.
raise EnvironmentError("file {} not found".format(url_or_filename))
else:
# Something unknown
raise ValueError("unable to parse {} as a URL or as a local path".format(url_or_filename))
def split_s3_path(url):
"""Split a full s3 path into the bucket name and path."""
parsed = urlparse(url)
if not parsed.netloc or not parsed.path:
raise ValueError("bad s3 path {}".format(url))
bucket_name = parsed.netloc
s3_path = parsed.path
# Remove '/' at beginning of path.
if s3_path.startswith("/"):
s3_path = s3_path[1:]
return bucket_name, s3_path
def s3_request(func):
"""
Wrapper function for s3 requests in order to create more helpful error
messages.
"""
@wraps(func)
def wrapper(url, *args, **kwargs):
try:
return func(url, *args, **kwargs)
except ClientError as exc:
if int(exc.response["Error"]["Code"]) == 404:
raise EnvironmentError("file {} not found".format(url))
else:
raise
return wrapper
@s3_request
def s3_etag(url):
"""Check ETag on S3 object."""
s3_resource = boto3.resource("s3")
bucket_name, s3_path = split_s3_path(url)
s3_object = s3_resource.Object(bucket_name, s3_path)
return s3_object.e_tag
@s3_request
def s3_get(url, temp_file):
"""Pull a file directly from S3."""
s3_resource = boto3.resource("s3")
bucket_name, s3_path = split_s3_path(url)
s3_resource.Bucket(bucket_name).download_fileobj(s3_path, temp_file)
def http_get(url, temp_file):
req = requests.get(url, stream=True)
content_length = req.headers.get('Content-Length')
total = int(content_length) if content_length is not None else None
progress = tqdm(unit="B", total=total)
for chunk in req.iter_content(chunk_size=1024):
if chunk: # filter out keep-alive new chunks
progress.update(len(chunk))
temp_file.write(chunk)
progress.close()
def get_from_cache(url, cache_dir=None):
"""
Given a URL, look for the corresponding dataset in the local cache.
If it's not there, download it. Then return the path to the cached file.
"""
if cache_dir is None:
cache_dir = PYTORCH_PRETRAINED_BERT_CACHE
if sys.version_info[0] == 3 and isinstance(cache_dir, Path):
cache_dir = str(cache_dir)
if not os.path.exists(cache_dir):
os.makedirs(cache_dir)
# Get eTag to add to filename, if it exists.
if url.startswith("s3://"):
etag = s3_etag(url)
else:
response = requests.head(url, allow_redirects=True)
if response.status_code != 200:
raise IOError("HEAD request failed for url {} with status code {}"
.format(url, response.status_code))
etag = response.headers.get("ETag")
filename = url_to_filename(url, etag)
# get cache path to put the file
cache_path = os.path.join(cache_dir, filename)
if not os.path.exists(cache_path):
# Download to temporary file, then copy to cache dir once finished.
# Otherwise you get corrupt cache entries if the download gets interrupted.
with tempfile.NamedTemporaryFile() as temp_file:
logger.info("%s not found in cache, downloading to %s", url, temp_file.name)
# GET file object
if url.startswith("s3://"):
s3_get(url, temp_file)
else:
http_get(url, temp_file)
# we are copying the file before closing it, so flush to avoid truncation
temp_file.flush()
# shutil.copyfileobj() starts at the current position, so go to the start
temp_file.seek(0)
logger.info("copying %s to cache at %s", temp_file.name, cache_path)
with open(cache_path, 'wb') as cache_file:
shutil.copyfileobj(temp_file, cache_file)
logger.info("creating metadata file for %s", cache_path)
meta = {'url': url, 'etag': etag}
meta_path = cache_path + '.json'
with open(meta_path, 'w', encoding="utf-8") as meta_file:
json.dump(meta, meta_file)
logger.info("removing temp file %s", temp_file.name)
return cache_path
def read_set_from_file(filename):
'''
Extract a de-duped collection (set) of text from a file.
Expected file format is one item per line.
'''
collection = set()
with open(filename, 'r', encoding='utf-8') as file_:
for line in file_:
collection.add(line.rstrip())
return collection
def get_file_extension(path, dot=True, lower=True):
ext = os.path.splitext(path)[1]
ext = ext if dot else ext[1:]
return ext.lower() if lower else ext
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""utils for loading text from disk"""
import os
import mmap
import pickle as pkl
import time
from itertools import accumulate
import torch
from torch.multiprocessing import Lock
def get_lazy_path(path):
"""
Gets directory path where lazy files are stored.
"""
return os.path.splitext(path)[0] + '.lazy'
def exists_lazy(path, data_type='data'):
"""
Check if we've already made a lazy version of this file for the `data_type` field.
"""
if not os.path.exists(get_lazy_path(path)):
return False
contents = os.listdir(get_lazy_path(path))
if data_type not in contents:
return False
if data_type + '.len.pkl' not in contents:
return False
return True
def make_lazy(path, strs, data_type='data'):
"""
Make lazy version of `data_type` field of the file. Byte offsets
corresponding to data indices are stored in a `.len.pkl` data file.
"""
lazypath = get_lazy_path(path)
if not os.path.exists(lazypath):
os.makedirs(lazypath)
datapath = os.path.join(lazypath, data_type)
lenpath = os.path.join(lazypath, data_type + '.len.pkl')
if not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0:
with open(datapath, 'wb') as f:
str_lens = []
str_cnt = 0
for s in strs:
if isinstance(s, dict):
s = s['text']
encoded = s.encode('utf-8')
f.write(encoded)
str_cnt = len(encoded)
str_lens.append(str_cnt)
pkl.dump(str_lens, open(lenpath, 'wb'))
else:
while not os.path.exists(lenpath):
time.sleep(1)
def split_strings(strings, start, chr_lens):
"""
Split strings based on string lengths and given start.
"""
return [strings[i - start:j - start] for i, j in zip([start] + chr_lens[:-1], chr_lens)]
class ProcessorTokenizer:
"""
callable class that runs a preprocessing, as well as tokenization step,
on input text.
"""
def __init__(self, tokenizer, process_fn=None):
self.tokenizer = tokenizer
self.process_fn = process_fn
def __call__(self, string):
if self.tokenizer is not None:
string = self.tokenizer(string, process_fn=self.process_fn)
elif self.process_fn is not None:
string = self.process_fn(string)
return string
class lazy_array_loader(object):
"""
Arguments:
path: path to directory where array entries are concatenated into one big string file
and the .len file are located
data_type (str): Some datasets have multiple fields that are stored in different paths.
`data_type` specifies which of these fields to load in this class
mem_map (boolean): Specifies whether to memory map file `path`
map_fn (callable): Fetched strings are passed through map_fn before being returned.
Example of lazy loader directory structure:
file.json
file.lazy/
data_type1
data_type1.len.pkl
data_type2
data_type2.len.pkl
"""
def __init__(self, path, data_type='data', mem_map=False, map_fn=None):
lazypath = get_lazy_path(path)
datapath = os.path.join(lazypath, data_type)
# get file where array entries are concatenated into one big string
self._file = open(datapath, 'rb', buffering=0)
self.file = self._file
# memory map file if necessary
self.mem_map = mem_map
if self.mem_map:
self.file = mmap.mmap(self.file.fileno(), 0, prot=mmap.PROT_READ)
lenpath = os.path.join(lazypath, data_type + '.len.pkl')
self.lens = pkl.load(open(lenpath, 'rb'))
self.ends = list(accumulate(self.lens))
self.dumb_ends = list(self.ends)
self.read_lock = Lock()
self.process_fn = map_fn
self.map_fn = map_fn
self._tokenizer = None
def SetTokenizer(self, tokenizer):
"""
logic to set and remove (set to None) tokenizer.
combines preprocessing/tokenization into one callable.
"""
if tokenizer is None:
if not hasattr(self, '_tokenizer'):
self._tokenizer = tokenizer
else:
self._tokenizer = tokenizer
self.map_fn = ProcessorTokenizer(tokenizer, self.process_fn)
def GetTokenizer(self):
return self._tokenizer
def __getitem__(self, index):
"""
read file and splice strings based on string ending array `self.ends`
"""
if not isinstance(index, slice):
if index == 0:
start = 0
else:
start = self.ends[index - 1]
end = self.ends[index]
rtn = self.file_read(start, end)
if self.map_fn is not None:
return self.map_fn(rtn)
else:
# if slice, fetch strings with 1 diskread and then splice in memory
chr_lens = self.ends[index]
if index.start == 0 or index.start is None:
start = 0
else:
start = self.ends[index.start - 1]
stop = chr_lens[-1]
strings = self.file_read(start, stop)
rtn = split_strings(strings, start, chr_lens)
if self.map_fn is not None:
return self.map_fn([s for s in rtn])
return rtn
def __len__(self):
return len(self.ends)
def file_read(self, start=0, end=None):
"""read specified portion of file"""
# atomic reads to avoid race conditions with multiprocess dataloader
self.read_lock.acquire()
# seek to start of file read
self.file.seek(start)
# read to end of file if no end point provided
if end is None:
rtn = self.file.read()
# else read amount needed to reach end point
else:
rtn = self.file.read(end - start)
self.read_lock.release()
# TODO: @raulp figure out mem map byte string bug
# if mem map'd need to decode byte string to string
rtn = rtn.decode('utf-8', 'ignore')
# rtn = str(rtn)
if self.mem_map:
rtn = rtn.decode('unicode_escape')
return rtn
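A hedged sketch of the lazy-loading round trip the (now removed) module above supported, assuming its functions are importable; the corpus path and document strings are hypothetical, and a pass-through map_fn is supplied since __getitem__ relies on one.

path = 'data/my_corpus.json'                    # hypothetical source file
docs = ['first document', 'second document']    # strings to cache lazily

if not exists_lazy(path, data_type='data'):
    # Writes data/my_corpus.lazy/data plus the .len.pkl offsets file.
    make_lazy(path, docs, data_type='data')

loader = lazy_array_loader(path, data_type='data', map_fn=lambda s: s)
print(len(loader), loader[1])                   # 2 second document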
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""batch samplers that work with either random or sequential data samplers"""
import math
import os
import sys
import torch
from torch.utils import data
import numpy as np
class RandomSampler(data.sampler.Sampler):
r"""
Based off of pytorch RandomSampler and DistributedSampler. Essentially a RandomSampler,
but this class lets the user set an epoch like DistributedSampler.
Samples elements randomly. If without replacement, then sample from a shuffled dataset.
If with replacement, then user can specify ``num_samples`` to draw.
Arguments:
data_source (Dataset): dataset to sample from
num_samples (int): number of samples to draw, default=len(dataset)
replacement (bool): samples are drawn with replacement if ``True``, default=False
"""
def __init__(self, data_source, replacement=False, num_samples=None):
self.data_source = data_source
self.replacement = replacement
self._num_samples = num_samples
self.epoch = -1
if self._num_samples is not None and replacement is False:
raise ValueError("With replacement=False, num_samples should not be specified, "
"since a random permute will be performed.")
if not isinstance(self.num_samples, int) or self.num_samples <= 0:
raise ValueError("num_samples should be a positive integer "
"value, but got num_samples={}".format(self.num_samples))
if not isinstance(self.replacement, bool):
raise ValueError("replacement should be a boolean value, but got "
"replacement={}".format(self.replacement))
@property
def num_samples(self):
# dataset size might change at runtime
if self._num_samples is None:
return len(self.data_source)
return self._num_samples
def __iter__(self):
n = len(self.data_source)
g = torch.Generator()
if self.epoch >= 0:
g.manual_seed(self.epoch)
if self.replacement:
return iter(torch.randint(high=n, size=(self.num_samples,),
dtype=torch.int64, generator=g).tolist())
return iter(torch.randperm(n, generator=g).tolist())
def __len__(self):
return self.num_samples
def set_epoch(self, epoch):
self.epoch = epoch
class DistributedBatchSampler(data.sampler.BatchSampler):
"""
similar to normal implementation of distributed sampler, except implementation is at the
batch sampler level, instead of just the sampler level. This allows wrapping of arbitrary
data samplers (sequential, random, WeightedRandomSampler, etc.) with this batch sampler.
"""
def __init__(self, sampler, batch_size, drop_last, rank=-1, world_size=2, wrap_last=False):
super(DistributedBatchSampler, self).__init__(sampler, batch_size, drop_last)
if rank == -1:
assert False, 'should not be here'
rank = torch.distributed.get_rank()
self.rank = rank
self.world_size = world_size
self.sampler.wrap_around = 0
self.wrap_around = 0
self.wrap_last = wrap_last
self.start_iter = 0
def __iter__(self):
batch = []
last_batch = None
i = 0
for idx in self.data_iterator(self.sampler, wrap_around=False):
batch.append(idx)
if len(batch) == self.batch_size:
tbatch = self._batch(batch)
if i >= self.start_iter:
yield tbatch
self.start_iter = 0
i += 1
last_batch = np.array(list(tbatch))
batch = []
batch_len = len(batch)
if batch_len > 0 and not self.drop_last:
if self.wrap_last:
self.sampler.wrap_around -= (self.batch_size)
self.wrap_around += (len(batch))
self.wrap_around %= self.batch_size
if isinstance(self.sampler, TransposedSampler):
for i, idx in enumerate(self.data_iterator(self.sampler, wrap_around=True)):
if i == 0:
continue
batch.append(idx)
new_batch_len = len(batch)
if len(batch) == self.batch_size:
break
yield self._batch(batch)
if self.wrap_last:
self.sampler.wrap_around += self.batch_size
def data_iterator(self, _iter, wrap_around=False):
"""iterates through data and handles wrap around"""
for i, idx in enumerate(_iter):
if i < self.wrap_around % self.batch_size:
continue
if wrap_around:
self.wrap_around += 1
self.wrap_around %= self.batch_size
yield idx
def _batch(self, batch):
"""extracts samples only pertaining to this worker's batch"""
start = self.rank * self.batch_size // self.world_size
end = (self.rank + 1) * self.batch_size // self.world_size
return batch[start:end]
"""
Usage:
python scripts/presplit_sentences_json.py <original loose json file> <output loose json file>
"""
import sys
import json
import nltk
nltk.download('punkt')
input_file = sys.argv[1]
output_file = sys.argv[2]
line_seperator = "\n"
with open(input_file, 'r') as ifile:
with open(output_file, "w") as ofile:
for doc in ifile.readlines():
parsed = json.loads(doc)
sent_list = []
for line in parsed['text'].split('\n'):
if line != '\n':
sent_list.extend(nltk.tokenize.sent_tokenize(line))
parsed['text'] = line_seperator.join(sent_list)
ofile.write(json.dumps(parsed) + '\n')
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Takes a corpus of files (specified by `--input_files`) with json data separated
by newlines (loose json). Splits the data into train.json, dev.json, and test.json
files under `--output_dir`.
Note: This script may overwrite existing files named train.json, dev.json, and
test.json in `--output_dir`.
"""
import os
import argparse
import math
import random
parser = argparse.ArgumentParser('resplit loose json data into train/dev/test')
parser.add_argument('--input_files', nargs='+', required=True,
help='whitespace separated list of input data files')
parser.add_argument('--output_dir', required=True,
help='output directory where to put files')
parser.add_argument('--test_percent', type=float, nargs='+', default=[0.05, 0],
                    help='fraction(s) of the data to hold out for the dev and test splits')
args = parser.parse_args()
def get_lines(filepath):
    lines = []
    with open(filepath, 'r') as f:
        for line in f:
            lines.append(line.strip())
    return lines
def get_splits(lines, line_counts):
all_lines = []
line_idx = []
file_mappings = []
for i, l in enumerate(lines):
all_lines.extend(l)
line_idx.extend(list(range(len(l))))
file_mappings.extend([i] * len(l))
indices = list(range(len(all_lines)))
random.shuffle(indices)
all_lines = [all_lines[idx] for idx in indices]
line_idx = [line_idx[idx] for idx in indices]
file_mappings = [file_mappings[idx] for idx in indices]
splits = []
mappings = []
start = 0
for end in line_counts:
end += start
splits.append(all_lines[start:end])
mappings.append(format_mappings(line_idx[start:end], file_mappings[start:end]))
start = end
return splits, mappings
def format_mappings(line_idx, file_mappings):
lines = []
for m, l in zip(file_mappings, line_idx):
lines.append(str(m).strip() + '\t' + str(l).strip())
return lines
def get_filepaths(filepaths, output_dir):
paths = []
train_path = 'train.json'
dev_path = 'dev.json'
test_path = 'test.json'
paths.append(os.path.join(output_dir, train_path))
paths.append(os.path.join(output_dir, dev_path))
paths.append(os.path.join(output_dir, test_path))
return paths
def write_files(lines, mappings, filepaths):
for l, m, path in zip(lines, mappings, filepaths):
write_file(l, path)
write_mapping_file(m, path)
def write_file(lines, path):
print('Writing:', path)
with open(path, 'w') as f:
for l in lines:
f.write(l + '\n')
def write_mapping_file(m, path):
path = path + '.map'
m = [get_mapping_header()] + m
write_file(m, path)
def get_mapping_header():
return 'file\tline #'
if not os.path.exists(args.output_dir):
os.makedirs(args.output_dir)
lines = []
for filepath in args.input_files:
_lines = get_lines(filepath)
lines.append(_lines)
# calculate number of lines to use for each split
line_counts = [len(l) for l in lines]
total_lines = sum(line_counts)
dev_percent = args.test_percent[0]
dev_lines = math.ceil(dev_percent * total_lines)
test_percent = 0
if len(args.test_percent) == 2:
test_percent = args.test_percent[1]
test_lines = math.ceil(test_percent * total_lines)
train_lines = total_lines - (test_lines + dev_lines)
normed_lines = [train_lines, dev_lines, test_lines]
normed_lines = [int(l) for l in normed_lines]
splits, mappings = get_splits(lines, normed_lines)
filepaths = get_filepaths(args.input_files, args.output_dir)
print('Writing output to:', filepaths)
write_files(splits, mappings, filepaths)
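# Hypothetical invocation (file names are made up; the script's own file name is not
# shown here). With --test_percent 0.05 0.01, roughly 5% of all lines go to dev.json,
# 1% to test.json, and the remainder to train.json, with a .map file written next to
# each split recording the source file and line number of every example:
#   python <this_script>.py --input_files corpus_a.json corpus_b.json \
#       --output_dir data/splits --test_percent 0.05 0.01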
"""
Takes a corpora of files (specified by `--input_files`) with json data separated
by newlines (loose json). Splits data into train.json, val.json, test.json files
under `output_dir`.
Note: This code has the potential to override files with the names
train.json, val.json, test.json in `--output_dir`.
"""
import os
import argparse
import math
import random
parser = argparse.ArgumentParser('resplit loose json data into train/val/test')
parser.add_argument('--input_files', nargs='+', required=True,
help='whitespace separated list of input data files')
parser.add_argument('--output_dir', required=True,
help='output directory where to put files')
parser.add_argument('--test_percent', type=float, nargs='+', default=[0.05, 0],
help='percentage of available data to use for val/test dataset')
args = parser.parse_args()
def get_lines(filepath):
lines = []
with open(filepath, 'r') as f:
for i, l in enumerate(f.readlines()):
l = l.strip()
lines.append(l)
return lines
def get_splits(lines, line_counts):
all_lines = []
line_idx = []
file_mappings = []
for i, l in enumerate(lines):
all_lines.extend(l)
line_idx.extend(list(range(len(l))))
file_mappings.extend([i] * len(l))
indices = list(range(len(all_lines)))
random.shuffle(indices)
all_lines = [all_lines[idx] for idx in indices]
line_idx = [line_idx[idx] for idx in indices]
file_mappings = [file_mappings[idx] for idx in indices]
splits = []
mappings = []
start = 0
for end in line_counts:
end += start
splits.append(all_lines[start:end])
mappings.append(format_mappings(line_idx[start:end], file_mappings[start:end]))
start = end
return splits, mappings
def format_mappings(line_idx, file_mappings):
lines = []
for m, l in zip(file_mappings, line_idx):
lines.append(str(m).strip() + '\t' + str(l).strip())
return lines
def get_filepaths(filepaths, output_dir):
paths = []
train_path = 'train.json'
dev_path = 'dev.json'
test_path = 'test.json'
paths.append(os.path.join(output_dir, train_path))
paths.append(os.path.join(output_dir, dev_path))
paths.append(os.path.join(output_dir, test_path))
return paths
def write_files(lines, mappings, filepaths):
for l, m, path in zip(lines, mappings, filepaths):
write_file(l, path)
write_mapping_file(m, path)
def write_file(lines, path):
print('Writing:', path)
with open(path, 'w') as f:
for l in lines:
f.write(l + '\n')
def write_mapping_file(m, path):
path = path + '.map'
m = [get_mapping_header()] + m
write_file(m, path)
def get_mapping_header():
return 'file\tline #'
if not os.path.exists(args.output_dir):
os.makedirs(args.output_dir)
lines = []
for filepath in args.input_files:
_lines = get_lines(filepath)
lines.append(_lines)
# calculate number of lines to use for each
line_counts = [len(l) for l in lines]
total_lines = sum(line_counts)
dev_percent = args.test_percent[0]
dev_lines = math.ceil(dev_percent * total_lines)
test_percent = 0
if len(args.test_percent) == 2:
test_percent = args.test_percent[1]
test_lines = math.ceil(test_percent * total_lines)
train_lines = total_lines - (test_lines + dev_lines)
normed_lines = [train_lines, dev_lines, test_lines]
normed_lines = [int(l) for l in normed_lines]
splits, mappings = get_splits(lines, normed_lines)
filepaths = get_filepaths(args.input_files, args.output_dir)
print('Writing output to:', filepaths)
write_files(splits, mappings, filepaths)
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""PyTorch DataLoader for TFRecords"""
import numpy as np
import torch
import queue
import threading
import tensorflow as tf
tf.enable_eager_execution()
class TFRecordDataLoader(object):
def __init__(self, records, batch_size, max_seq_len, max_preds_per_seq,
train, num_workers=2, seed=1, threaded_dl=False):
assert max_preds_per_seq is not None, "--max-preds-per-seq MUST BE SPECIFIED when using tfrecords"
tf.set_random_seed(seed)
if isinstance(records, str):
records = [records]
self.record_converter = Record2Example({"input_ids": tf.FixedLenFeature([max_seq_len], tf.int64),
"input_mask": tf.FixedLenFeature([max_seq_len], tf.int64),
"segment_ids": tf.FixedLenFeature([max_seq_len], tf.int64),
"masked_lm_positions": tf.FixedLenFeature([max_preds_per_seq], tf.int64),
"masked_lm_ids": tf.FixedLenFeature([max_preds_per_seq], tf.int64),
"masked_lm_weights": tf.FixedLenFeature([max_preds_per_seq], tf.float32),
"next_sentence_labels": tf.FixedLenFeature([1], tf.int64)})
# Instantiate dataset according to original BERT implementation
if train:
self.dataset = tf.data.Dataset.from_tensor_slices(tf.constant(records))
self.dataset = self.dataset.repeat()
self.dataset = self.dataset.shuffle(buffer_size=len(records))
# use sloppy tfrecord dataset
self.dataset = self.dataset.apply(
tf.contrib.data.parallel_interleave(
tf.data.TFRecordDataset,
sloppy=train,
cycle_length=min(num_workers, len(records))))
self.dataset = self.dataset.shuffle(buffer_size=100)
else:
self.dataset = tf.data.TFRecordDataset(records)
self.dataset = self.dataset.repeat()
# Instantiate dataloader (do not drop remainder for eval)
loader_args = {'batch_size': batch_size,
'num_parallel_batches': num_workers,
'drop_remainder': train}
self.dataloader = self.dataset.apply(
tf.contrib.data.map_and_batch(
self.record_converter, **loader_args))
self.threaded_dl = threaded_dl
self.num_workers = num_workers
def __iter__(self):
if self.threaded_dl:
data_iter = iter(MultiprocessLoader(self.dataloader, self.num_workers))
for item in data_iter:
yield item
else:
data_iter = iter(self.dataloader)
for item in data_iter:
yield convert_tf_example_to_torch_tensors(item)
class Record2Example(object):
def __init__(self, feature_map):
self.feature_map = feature_map
def __call__(self, record):
"""Decodes a BERT TF record to a TF example."""
example = tf.parse_single_example(record, self.feature_map)
for k, v in list(example.items()):
if v.dtype == tf.int64:
example[k] = tf.to_int32(v)
return example
def convert_tf_example_to_torch_tensors(example):
item = {k: (v.numpy()) for k, v in example.items()}
mask = np.zeros_like(item['input_ids'])
mask_labels = np.ones_like(item['input_ids']) * -1
for b, row in enumerate(item['masked_lm_positions'].astype(int)):
for i, idx in enumerate(row):
if item['masked_lm_weights'][b, i] != 0:
mask[b, idx] = 1
mask_labels[b, idx] = item['masked_lm_ids'][b, i]
output = {'text': item['input_ids'], 'types': item['segment_ids'], 'is_random': item['next_sentence_labels'],
'pad_mask': 1 - item['input_mask'], 'mask': mask, 'mask_labels': mask_labels}
return {k: torch.from_numpy(v) for k, v in output.items()}
class MultiprocessLoader(object):
def __init__(self, dataloader, num_workers=2):
self.dl = dataloader
self.queue_size = 2 * num_workers
def __iter__(self):
output_queue = queue.Queue(self.queue_size)
output_thread = threading.Thread(target=_multiproc_iter,
args=(self.dl, output_queue))
output_thread.daemon = True
output_thread.start()
while output_thread.is_alive():
yield output_queue.get(block=True)
else:
            raise RuntimeError('TF record data loader thread exited unexpectedly')
def _multiproc_iter(dl, output_queue):
data_iter = iter(dl)
for item in data_iter:
tensors = convert_tf_example_to_torch_tensors(item)
output_queue.put(tensors, block=True)
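# A minimal usage sketch (hypothetical record path and sizes; requires the TF 1.x
# APIs used above, including tf.contrib and eager execution):
if __name__ == '__main__':
    loader = TFRecordDataLoader(records=['bert_pretrain.tfrecord'],  # hypothetical file
                                batch_size=8, max_seq_len=128,
                                max_preds_per_seq=20, train=True)
    for batch in loader:
        # batch is a dict of torch tensors: 'text', 'types', 'is_random',
        # 'pad_mask', 'mask', and 'mask_labels'
        print({k: v.shape for k, v in batch.items()})
        break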