Commit 9ea9d50f authored by Mohammad Shoeybi

added write option for index map

parent 690291dd
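In outline, the change caches the samples index map on disk: rank 0 builds it and writes it out as a .npy file, the other ranks wait on a barrier and then load the saved file. Below is a minimal sketch of that pattern only, with hypothetical names (get_cached_mapping, build_fn) and the distributed barrier reduced to a comment; it is not the Megatron code itself.

import os
import numpy as np

def get_cached_mapping(prefix, build_fn, rank=0):
    # Hypothetical helper showing the build-once / load-everywhere pattern.
    filename = prefix + '_indexmap.npy'
    if rank == 0 and not os.path.isfile(filename):
        mapping = build_fn()  # the expensive helpers.build_mapping step in the real code
        np.save(filename, mapping, allow_pickle=True)
    # In the real code torch.distributed.barrier() goes here, so non-zero ranks
    # wait until rank 0 has finished writing the file.
    return np.load(filename, allow_pickle=True)

# Example with a dummy builder:
# mapping = get_cached_mapping('/tmp/demo', lambda: np.zeros((4, 3), dtype=np.int64))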
@@ -2,6 +2,7 @@
import random
import time
import os
import numpy as np
import torch
@@ -11,55 +12,37 @@ from .dataset_utils import build_training_sample
#from data.mapping import build_training_samples_mapping
from . import helpers
from megatron.data import FullBertTokenizer, indexed_dataset
from megatron.data import FullBertTokenizer
from megatron.data.indexed_dataset import make_dataset as make_indexed_dataset
from megatron.utils import print_rank_0
class AlbertDataset(Dataset):
def __init__(self, indexed_dataset, tokenizer, num_epochs, max_num_samples,
def __init__(self,
vocab_file, data_prefix, data_impl, skip_warmup,
num_epochs, max_num_samples,
masked_lm_prob, max_seq_length, short_seq_prob, seed):
# Params to store.
self.seed = seed
self.masked_lm_prob = masked_lm_prob
self.max_seq_length = max_seq_length
self.tokenizer = tokenizer
self.tokenizer = FullBertTokenizer(vocab_file, do_lower_case=True)
# Indexed dataset.
self.indexed_dataset = indexed_dataset
self.indexed_dataset = self._get_indexed_dataset(data_prefix, data_impl,
skip_warmup)
# Build the samples mapping.
if not num_epochs:
if not max_num_samples:
raise ValueError("Need to specify either max_num_samples "
"or num_epochs")
num_epochs = np.iinfo(np.int32).max - 1
if not max_num_samples:
max_num_samples = np.iinfo(np.int64).max - 1
# Make sure the types match the helpers input types.
assert indexed_dataset.doc_idx.dtype == np.int64
assert indexed_dataset.sizes.dtype == np.int32
# Build samples mapping
verbose = torch.distributed.get_rank()==0
start_time = time.time()
self.samples_mapping = helpers.build_mapping(
indexed_dataset.doc_idx,
indexed_dataset.sizes,
num_epochs,
max_num_samples,
self.max_seq_length-3, # account for added tokens
short_seq_prob,
self.seed,
verbose)
# Make sure all the ranks have built the mapping
torch.distributed.barrier()
print_rank_0('> elapsed time to build samples mapping (seconds): '
'{:.2f}'.format(time.time() - start_time))
exit()
self.samples_mapping = self._get_samples_mapping(self.indexed_dataset,
data_prefix,
num_epochs,
max_num_samples,
self.max_seq_length,
short_seq_prob,
self.seed)
# Vocab stuff.
self.vocab_id_list = list(tokenizer.inv_vocab.keys())
@@ -68,27 +51,19 @@ class AlbertDataset(Dataset):
self.sep_id = tokenizer.vocab['[SEP]']
self.mask_id = tokenizer.vocab['[MASK]']
self.pad_id = tokenizer.vocab['[PAD]']
exit()
@classmethod
def from_paths(cls, vocab, data_prefix, data_impl,
num_epochs, max_num_samples, masked_lm_prob,
max_seq_length, short_seq_prob, seed, skip_warmup=False):
tokenizer = FullBertTokenizer(vocab, do_lower_case=True)
print_rank_0("> Reading dataset index ...")
idx_ds = indexed_dataset.make_dataset(data_prefix, data_impl,
skip_warmup)
print_rank_0("> Finished creating indexed dataset")
return cls(idx_ds, tokenizer, num_epochs, max_num_samples,
masked_lm_prob, max_seq_length, short_seq_prob, seed)
def num_tokens(self):
return self.tokenizer.vocab_size()
def __len__(self):
return self.samples_mapping.shape[0]
def __getitem__(self, idx):
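# A per-sample rng seeded with (seed + idx), so the masking drawn for a given
# sample index is reproducible.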
rng = random.Random(self.seed + idx)
start_index, end_index, seq_length = self.samples_mapping[idx]
sample = []
@@ -98,13 +73,93 @@ class AlbertDataset(Dataset):
if len(s) > 1000:
print(self.tokenizer.convert_ids_to_tokens(s))
return build_training_sample(sample, seq_length,
self.max_seq_length,
self.max_seq_length, # needed for padding
self.vocab_id_list,
self.vocab_id_to_token_dict,
self.cls_id, self.sep_id,
self.mask_id, self.pad_id,
self.masked_lm_prob, rng)
def _get_indexed_dataset(self, data_prefix, data_impl, skip_warmup):
start_time = time.time()
print_rank_0("> Reading dataset index ...")
indexed_dataset = make_indexed_dataset(data_prefix,
data_impl,
skip_warmup)
print_rank_0("> Finished creating indexed dataset in {:4f} "
"seconds".format(time.time() - start_time))
return indexed_dataset
def _get_samples_mapping(self,
indexed_dataset,
data_prefix,
num_epochs,
max_num_samples,
max_seq_length,
short_seq_prob,
seed):
if not num_epochs:
if not max_num_samples:
raise ValueError("Need to specify either max_num_samples "
"or num_epochs")
num_epochs = np.iinfo(np.int32).max - 1
if not max_num_samples:
max_num_samples = np.iinfo(np.int64).max - 1
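# With one limit left unset, its near-max sentinel means the mapping builder
# effectively stops on whichever of num_epochs / max_num_samples is hit first.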
# Filename of the index mapping
indexmap_filename = data_prefix
indexmap_filename += '_indexmap'
indexmap_filename += '_{}ep'.format(num_epochs)
indexmap_filename += '_{}mns'.format(max_num_samples)
indexmap_filename += '_{}msl'.format(max_seq_length)
indexmap_filename += '_{:0.2f}ssp'.format(short_seq_prob)
indexmap_filename += '_{}s'.format(seed)
indexmap_filename += '.npy'
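# Illustrative example of the resulting cache filename (values made up):
#   <data_prefix>_indexmap_10ep_1000000mns_512msl_0.10ssp_1234s.npy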
# Build the indexed mapping if not exist.
if torch.distributed.get_rank() == 0 and \
not os.path.isfile(indexmap_filename):
print('WARNING: could not find index map file {}, building '
'the indices on rank 0 ...'.format(indexmap_filename))
# Make sure the types match the helpers input types.
assert indexed_dataset.doc_idx.dtype == np.int64
assert indexed_dataset.sizes.dtype == np.int32
# Build samples mapping
verbose = torch.distributed.get_rank() == 0
start_time = time.time()
samples_mapping = helpers.build_mapping(
indexed_dataset.doc_idx,
indexed_dataset.sizes,
num_epochs,
max_num_samples,
max_seq_length-3, # account for added tokens
short_seq_prob,
seed,
verbose)
np.save(indexmap_filename, samples_mapping, allow_pickle=True)
# Make sure all the ranks have built the mapping
print_rank_0('> elapsed time to build and save samples mapping '
'(seconds): {:.4f}'.format(
time.time() - start_time))
torch.distributed.barrier()
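# The barrier guarantees the file exists before any other rank tries to read it below.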
# Load the indexed mapping.
print_rank_0('> loading indexed mapping from {}'.format(
indexmap_filename))
start_time = time.time()
samples_mapping = np.load(indexmap_filename, allow_pickle=True)
print_rank_0(' loaded indexed file in {:3.3f} seconds'.format(
time.time() - start_time))
print_rank_0(' total number of samples: {}'.format(
samples_mapping.shape[0]))
return samples_mapping
'''
def get_target_seq_length(max_num_tokens, short_seq_prob, np_rng):
"""With probability `short_seq_prob` generate a smaller sequence lenght."""
@@ -121,14 +121,19 @@ def get_train_val_test_data(args):
if not args.data_path:
print("Albert currently only supports a unified dataset specified with --data-path")
exit(1)
print("Creating AlbertDataset...")
full_data = AlbertDataset.from_paths(args.vocab, args.data_path,
args.data_impl, args.data_epochs,
args.max_num_samples,
args.mask_prob, args.seq_length,
args.short_seq_prob,
args.seed, args.skip_mmap_warmup)
print("Finished creating AlbertDataset...")
print_rank_0("Creating AlbertDataset...")
full_data = AlbertDataset(
vocab_file=args.vocab,
data_prefix=args.data_path,
data_impl=args.data_impl,
skip_warmup=args.skip_mmap_warmup,
num_epochs=args.data_epochs,
max_num_samples=args.max_num_samples,
masked_lm_prob=args.mask_prob,
max_seq_length=args.seq_length,
short_seq_prob=args.short_seq_prob,
seed=args.seed)
print_rank_0("Finished creating AlbertDataset...")
split = split_dataset.get_split(args)
if split_dataset.should_split(split):
train_ds, val_ds, test_ds = split_dataset.split_ds(full_data, split, args.shuffle)
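As a usage note, the saved index map can be inspected offline. A hedged example follows; the filename is made up (the real one follows the pattern built in _get_samples_mapping), and each row holds (start_index, end_index, seq_length), exactly as __getitem__ unpacks self.samples_mapping[idx].

import numpy as np

# Illustrative filename only.
mapping = np.load('mycorpus_indexmap_10ep_1000000mns_512msl_0.10ssp_1234s.npy',
                  allow_pickle=True)
print('total number of samples:', mapping.shape[0])
start_index, end_index, seq_length = mapping[0]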