Commit 8ba76558 authored by Neel Kant

Merge block-dataset into ict-stable

parents 1a3f5663 3dc39c4a
......@@ -42,6 +42,11 @@ def build_train_valid_test_datasets(data_prefix, data_impl, splits_string,
data_impl,
skip_warmup)
if ict_dataset:
titles_dataset = get_indexed_dataset_(data_prefix + '-titles',
data_impl,
skip_warmup)
# Get start and end indices of train/valid/test into doc-idx
# Note that doc-idx is designed to be num-docs + 1 so we can
# easily iterate over it.
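(For illustration, a minimal sketch of how a doc-idx array with num-docs + 1 entries is iterated; the array values here are made up.)

import numpy as np

# Hypothetical doc-idx for 3 documents: entry i marks where document i's
# sentences begin, and the trailing entry marks the end of the last document.
doc_idx = np.array([0, 4, 9, 12], dtype=np.int64)

for d in range(len(doc_idx) - 1):
    # Sentences of document d live in the half-open range [doc_idx[d], doc_idx[d + 1]).
    first, last = doc_idx[d], doc_idx[d + 1]
    print('doc', d, 'covers sentences', first, 'to', last - 1)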
......@@ -78,7 +83,7 @@ def build_train_valid_test_datasets(data_prefix, data_impl, splits_string,
# Build the dataset accordingly.
kwargs = dict(
name=name,
indexed_dataset=indexed_dataset,
context_dataset=indexed_dataset,
data_prefix=data_prefix,
num_epochs=None,
max_num_samples=train_valid_test_num_samples[index],
......@@ -88,7 +93,7 @@ def build_train_valid_test_datasets(data_prefix, data_impl, splits_string,
)
if ict_dataset:
dataset = InverseClozeDataset(**kwargs)
dataset = InverseClozeDataset(titles_dataset=titles_dataset, **kwargs)
else:
dataset = BertDataset(masked_lm_prob=masked_lm_prob, **kwargs)
# Set the original pointer so dataset remains the main dataset.
......
......@@ -304,7 +304,211 @@ py::array build_mapping(const py::array_t<int64_t>& docs_,
}
}
template<typename DocIdx>
py::array build_blocks_mapping_impl(const py::array_t<int64_t>& docs_,
const py::array_t<int32_t>& sizes_,
const py::array_t<int32_t>& titles_sizes_,
const int32_t num_epochs,
const uint64_t max_num_samples,
const int32_t max_seq_length,
const int32_t seed,
const bool verbose) {
/* Build a mapping of (start-index, end-index, doc-index) where
start and end index are the indices of the sentences in the block
and doc-index is the index of the document the block was drawn from
(used to look up its title). */
// Consistency checks.
assert(num_epochs > 0);
assert(max_seq_length > 1);
assert(seed > 0);
// Remove bound checks.
auto docs = docs_.unchecked<1>();
auto sizes = sizes_.unchecked<1>();
auto titles_sizes = titles_sizes_.unchecked<1>();
if (verbose) {
const auto sent_start_index = docs[0];
const auto sent_end_index = docs[docs_.shape(0) - 1];
const auto num_sentences = sent_end_index - sent_start_index;
cout << " using:" << endl << std::flush;
cout << " number of documents: " << docs_.shape(0) - 1 <<
endl << std::flush;
cout << " sentences range: [" << sent_start_index <<
", " << sent_end_index << ")" << endl << std::flush;
cout << " total number of sentences: " << num_sentences <<
endl << std::flush;
cout << " number of epochs: " << num_epochs <<
endl << std::flush;
cout << " maximum number of samples: " << max_num_samples <<
endl << std::flush;
cout << " maximum sequence length: " << max_seq_length <<
endl << std::flush;
cout << " seed: " << seed << endl <<
std::flush;
}
// Mapping and its length (1D).
int64_t num_samples = -1;
DocIdx* maps = NULL;
// Perform two iterations: in the first, compute the size and allocate
// memory; in the second, populate the map.
bool second = false;
for (int32_t iteration=0; iteration<2; ++iteration) {
// Set the flag on second iteration.
second = (iteration == 1);
// Current map index.
uint64_t map_index = 0;
// For each epoch:
for (int32_t epoch=0; epoch<num_epochs; ++epoch) {
if (map_index >= max_num_samples) {
if (verbose && (!second)) {
cout << " reached " << max_num_samples << " samples after "
<< epoch << " epochs ..." << endl << std::flush;
}
break;
}
// For each document:
for (int32_t doc=0; doc<(docs.shape(0) - 1); ++doc) {
// Document sentences are in [sent_index_first, sent_index_last)
const auto sent_index_first = docs[doc];
const auto sent_index_last = docs[doc + 1];
const auto target_seq_len = max_seq_length - titles_sizes[doc];
// At the beginning of the document, the previous index is the
// start index.
auto prev_start_index = sent_index_first;
// Remaining sentences in the document.
auto num_remain_sent = sent_index_last - sent_index_first;
// Detect documents with long sentences.
bool contains_long_sentence = false;
if (num_remain_sent > 1) {
for (auto sent_index=sent_index_first;
sent_index < sent_index_last; ++sent_index) {
if (sizes[sent_index] > LONG_SENTENCE_LEN){
contains_long_sentence = true;
break;
}
}
}
// If we have at least two sentences and none of them is too long.
if ((num_remain_sent > 1) && (!contains_long_sentence)) {
// Set values.
auto seq_len = int32_t{0};
auto num_sent = int32_t{0};
// Loop through sentences.
for (auto sent_index=sent_index_first;
sent_index < sent_index_last; ++sent_index) {
// Add the size and number of sentences.
seq_len += sizes[sent_index];
++num_sent;
--num_remain_sent;
// If we have reached the target length,
// and more than one sentence is left in the document,
// and the block has at least two sentences,
// or if we have reached the end of the document.
if (((seq_len >= target_seq_len) &&
(num_remain_sent > 1) &&
(num_sent > 1) ) || (num_remain_sent == 0)) {
// Populate the map.
if (second) {
const auto map_index_0 = 3 * map_index;
maps[map_index_0] = static_cast<DocIdx>(prev_start_index);
maps[map_index_0 + 1] = static_cast<DocIdx>(sent_index + 1);
maps[map_index_0 + 2] = static_cast<DocIdx>(doc);
}
// Update indices / counters.
++map_index;
prev_start_index = sent_index + 1;
seq_len = 0;
num_sent = 0;
}
} // for (auto sent_index=sent_index_first; ...
} // if (num_remain_sent > 1) {
} // for (int doc=0; doc < num_docs; ++doc) {
} // for (int epoch=0; epoch < num_epochs; ++epoch) {
if (!second) {
if (verbose) {
cout << " will create mapping for " << map_index <<
" samples" << endl << std::flush;
}
assert(maps == NULL);
assert(num_samples < 0);
maps = new DocIdx[3*map_index];
num_samples = static_cast<int64_t>(map_index);
}
} // for (int iteration=0; iteration < 2; ++iteration) {
// Shuffle.
// We need a 64 bit random number generator as we might have more
// than 2 billion samples.
std::mt19937_64 rand64_gen(seed + 1);
for (auto i=(num_samples - 1); i > 0; --i) {
const auto j = static_cast<int64_t>(rand64_gen() % (i + 1));
const auto i0 = 3 * i;
const auto j0 = 3 * j;
// Swap values.
swap(maps[i0], maps[j0]);
swap(maps[i0 + 1], maps[j0 + 1]);
swap(maps[i0 + 2], maps[j0 + 2]);
}
// Method to deallocate memory.
py::capsule free_when_done(maps, [](void *mem_) {
DocIdx *mem = reinterpret_cast<DocIdx*>(mem_);
delete[] mem;
});
// Return the numpy array.
const auto byte_size = sizeof(DocIdx);
return py::array(std::vector<int64_t>{num_samples, 3}, // shape
{3*byte_size, byte_size}, // C-style contiguous strides
maps, // the data pointer
free_when_done); // numpy array references
}
py::array build_blocks_mapping(const py::array_t<int64_t>& docs_,
const py::array_t<int>& sizes_,
const py::array_t<int>& titles_sizes_,
const int num_epochs,
const uint64_t max_num_samples,
const int max_seq_length,
const int seed,
const bool verbose) {
if (sizes_.size() > std::numeric_limits<uint32_t>::max()) {
if (verbose) {
cout << " using uint64 for data mapping..." << endl << std::flush;
}
return build_blocks_mapping_impl<uint64_t>(docs_, sizes_, titles_sizes_,
num_epochs, max_num_samples, max_seq_length, seed, verbose);
} else {
if (verbose) {
cout << " using uint32 for data mapping..." << endl << std::flush;
}
return build_blocks_mapping_impl<uint32_t>(docs_, sizes_, titles_sizes_,
num_epochs, max_num_samples, max_seq_length, seed, verbose);
}
}
PYBIND11_MODULE(helpers, m) {
m.def("build_mapping", &build_mapping);
m.def("build_blocks_mapping", &build_blocks_mapping);
}
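For readers skimming the C++ above: build_blocks_mapping_impl greedily packs consecutive sentences of a document into blocks whose token count reaches max_seq_length minus the document's title length, records one (start-sentence, end-sentence, doc) triple per block, and then shuffles the triples with a 64-bit generator (the wrapper switches to uint64 indices once the sentence count no longer fits in uint32). Below is a rough, simplified Python equivalent of the packing loop, covering a single epoch and omitting the long-sentence filter; all names are illustrative:

def build_block_map(doc_idx, sizes, title_sizes, max_seq_length):
    # Returns (start_sentence, end_sentence, doc) triples, mirroring the C++
    # loop above in simplified form (one epoch, no long-sentence filtering).
    maps = []
    for doc in range(len(doc_idx) - 1):
        first, last = doc_idx[doc], doc_idx[doc + 1]
        target = max_seq_length - title_sizes[doc]
        remaining = last - first
        if remaining < 2:
            continue
        start, seq_len, num_sent = first, 0, 0
        for sent in range(first, last):
            seq_len += sizes[sent]
            num_sent += 1
            remaining -= 1
            # Close the block once it is long enough (keeping at least two
            # sentences in it and more than one sentence for what follows),
            # or when the document is exhausted.
            if (seq_len >= target and remaining > 1 and num_sent > 1) or remaining == 0:
                maps.append((start, sent + 1, doc))
                start, seq_len, num_sent = sent + 1, 0, 0
    return maps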
import itertools
import random
import os
import sys
import time
import numpy as np
......@@ -11,17 +13,28 @@ from megatron import print_rank_0
from megatron import mpu
from megatron.data import helpers
class InverseClozeDataset(Dataset):
"""Dataset containing sentences and various 'blocks' for an inverse cloze task."""
def __init__(self, name, indexed_dataset, data_prefix,
"""Dataset containing sentences and their blocks for an inverse cloze task."""
def __init__(self, name, context_dataset, titles_dataset, data_prefix,
num_epochs, max_num_samples, max_seq_length,
short_seq_prob, seed):
self.name = name
self.seed = seed
self.max_seq_length = max_seq_length
self.indexed_dataset = indexed_dataset
self.context_dataset = context_dataset
self.titles_dataset = titles_dataset
self.short_seq_prob = short_seq_prob
self.rng = random.Random(self.seed)
self.samples_mapping = get_samples_mapping(self.context_dataset,
self.titles_dataset,
data_prefix,
num_epochs,
max_num_samples,
self.max_seq_length,
self.seed,
self.name)
tokenizer = get_tokenizer()
self.vocab_id_list = list(tokenizer.inv_vocab.keys())
self.vocab_id_to_token_list = tokenizer.inv_vocab
......@@ -29,23 +42,35 @@ class InverseClozeDataset(Dataset):
self.sep_id = tokenizer.sep
self.mask_id = tokenizer.mask
self.pad_id = tokenizer.pad
self.offset = 0
def __len__(self):
return self.indexed_dataset.doc_idx.shape[0]
return self.samples_mapping.shape[0]
def __getitem__(self, idx):
# get rng state corresponding to index (allows deterministic random pair)
rng = random.Random(idx + 20000 + self.seed)
start_idx, end_idx, doc_idx = self.samples_mapping[idx]
title = list(self.titles_dataset[int(doc_idx)])
context = [list(self.context_dataset[i]) for i in range(start_idx, end_idx)]
assert len(context) > 1
# avoid selecting the first or last sentence to be the query.
if len(context) == 2:
rand_sent_idx = int(self.rng.random() > 0.5)
else:
rand_sent_idx = self.rng.randint(1, len(context) - 2)
# keep the query in the context 10% of the time.
if self.rng.random() < 0.1:
input = context[rand_sent_idx].copy()
else:
input = context.pop(rand_sent_idx)
# get seq length. Save 2 tokens for beginning and end
target_seq_length = self.max_seq_length - 2
if rng.random() < self.short_seq_prob:
target_seq_length = rng.randint(5, target_seq_length)
# may still need to truncate because a block is only closed off once
# the cumulative sentence length exceeds max_seq_length.
input = input[:self.max_seq_length - 2]
context = list(itertools.chain(*context))[:self.max_seq_length - (3 + len(title))]
input_data, context_data = self.get_input_and_context(idx, target_seq_length, rng)
input_tokens, input_token_types, input_pad_mask = input_data
context_tokens, context_token_types, context_pad_mask = context_data
input_tokens, input_token_types, input_pad_mask = self.concat_and_pad_tokens(input)
context_tokens, context_token_types, context_pad_mask = self.concat_and_pad_tokens(context, title)
sample = {
'input_text': np.array(input_tokens),
......@@ -58,20 +83,12 @@ class InverseClozeDataset(Dataset):
return sample
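As a usage note, __getitem__ returns a dictionary of fixed-length numpy arrays; only the 'input_text' key is visible in this hunk, so the sketch below sticks to it (everything here is illustrative and assumes an already-constructed InverseClozeDataset named dataset):

sample = dataset[0]                      # `dataset` is assumed to exist
query_ids = sample['input_text']         # padded token ids of the query sentence
assert query_ids.shape[0] == dataset.max_seq_length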
def get_sentence_split_doc(self, idx):
"""fetch document at index idx and split into sentences"""
doc_start = self.indexed_dataset.doc_idx[idx]
doc_end = self.indexed_dataset.doc_idx[idx + 1]
doc_sentences_array = self.indexed_dataset[doc_start:doc_end]
doc_sentences = [list(arr) for arr in doc_sentences_array]
return doc_sentences
def concat_and_pad_tokens(self, tokens):
def concat_and_pad_tokens(self, tokens, title=None):
"""concat with special tokens and pad sequence to self.max_seq_length"""
tokens = [self.cls_id] + tokens + [self.sep_id]
assert len(tokens) <= self.max_seq_length
if title is not None:
tokens += title + [self.sep_id]
assert len(tokens) <= self.max_seq_length, len(tokens)
num_pad = self.max_seq_length - len(tokens)
pad_mask = [0] * len(tokens) + [1] * num_pad
......@@ -79,65 +96,82 @@ class InverseClozeDataset(Dataset):
token_types = [0] * self.max_seq_length
return tokens, token_types, pad_mask
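A small worked example of the scheme above, assuming the elided padding step fills with self.pad_id (all ids below are made up):

max_seq_length = 8
cls_id, sep_id, pad_id = 101, 102, 0                        # made-up ids
tokens, title = [7, 8], [5]

tokens = [cls_id] + tokens + [sep_id] + title + [sep_id]    # [101, 7, 8, 102, 5, 102]
num_pad = max_seq_length - len(tokens)
pad_mask = [0] * len(tokens) + [1] * num_pad                # [0, 0, 0, 0, 0, 0, 1, 1]
tokens += [pad_id] * num_pad                                # padded to length 8
token_types = [0] * max_seq_length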
def get_input_and_context(self, idx, target_seq_length, rng):
"""fetches a sentence and its surrounding context"""
num_tries = 0
while num_tries < 20:
num_tries += 1
doc = None
while doc is None:
doc = self.get_sentence_split_doc(idx + self.offset)
if not doc:
doc = None
self.offset += 1
num_sentences = len(doc)
padless_max_len = self.max_seq_length - 2
# select a random sentence from the document as input
# TODO: consider adding multiple input sentences.
input_sentence_idx = rng.randint(0, num_sentences - 1)
input_tokens = doc[input_sentence_idx][:target_seq_length]
if not len(input_tokens) > 0:
self.offset += 1
continue
context_tokens = []
# 10% of the time, the input sentence is left in the context.
# The other 90% of the time, keep it out.
if rng.random() < 0.1:
context_tokens = input_tokens.copy()
view_preceding = True
view_radius = 1
while len(context_tokens) < padless_max_len:
# keep adding sentences while the context can accommodate more.
if view_preceding:
examine_idx = input_sentence_idx - view_radius
if examine_idx >= 0:
new_tokens = doc[examine_idx]
context_tokens = new_tokens + context_tokens
else:
examine_idx = input_sentence_idx + view_radius
if examine_idx < num_sentences:
new_tokens = doc[examine_idx]
context_tokens += new_tokens
view_radius += 1
view_preceding = not view_preceding
if view_radius > num_sentences:
break
# assemble the tokens and token types of the context
context_tokens = context_tokens[:padless_max_len]
if not len(context_tokens) > 0:
self.offset += 1
continue
# concatenate 'CLS' and 'SEP' tokens and add extra token types
input_tokens, input_token_types, input_pad_mask = self.concat_and_pad_tokens(input_tokens)
context_tokens, context_token_types, context_pad_mask = self.concat_and_pad_tokens(context_tokens)
return (input_tokens, input_token_types, input_pad_mask), \
(context_tokens, context_token_types, context_pad_mask)
else:
raise RuntimeError("Could not get a valid data point from InverseClozeDataset")
def get_samples_mapping(context_dataset,
titles_dataset,
data_prefix,
num_epochs,
max_num_samples,
max_seq_length,
seed,
name):
if not num_epochs:
if not max_num_samples:
raise ValueError("Need to specify either max_num_samples "
"or num_epochs")
num_epochs = np.iinfo(np.int32).max - 1
if not max_num_samples:
max_num_samples = np.iinfo(np.int64).max - 1
# Filename of the index mapping
indexmap_filename = data_prefix
indexmap_filename += '_{}_indexmap'.format(name)
if num_epochs != (np.iinfo(np.int32).max - 1):
indexmap_filename += '_{}ep'.format(num_epochs)
if max_num_samples != (np.iinfo(np.int64).max - 1):
indexmap_filename += '_{}mns'.format(max_num_samples)
indexmap_filename += '_{}msl'.format(max_seq_length)
indexmap_filename += '_{}s'.format(seed)
indexmap_filename += '.npy'
# Build the index mapping if it does not already exist.
if torch.distributed.get_rank() == 0 and \
not os.path.isfile(indexmap_filename):
print(' > WARNING: could not find index map file {}, building '
'the indices on rank 0 ...'.format(indexmap_filename))
# Make sure the types match the helpers input types.
assert context_dataset.doc_idx.dtype == np.int64
assert context_dataset.sizes.dtype == np.int32
# Build samples mapping
verbose = torch.distributed.get_rank() == 0
start_time = time.time()
print_rank_0(' > building samples index mapping for {} ...'.format(
name))
samples_mapping = helpers.build_blocks_mapping(
context_dataset.doc_idx,
context_dataset.sizes,
titles_dataset.sizes,
num_epochs,
max_num_samples,
max_seq_length-3, # account for added tokens
seed,
verbose)
print_rank_0(' > done building samples index mapping')
np.save(indexmap_filename, samples_mapping, allow_pickle=True)
print_rank_0(' > saved the index mapping in {}'.format(
indexmap_filename))
# Make sure all the ranks have built the mapping
print_rank_0(' > elapsed time to build and save samples mapping '
'(seconds): {:4f}'.format(
time.time() - start_time))
# This should be a barrier, but the NCCL barrier assumes
# device_index == rank, which does not hold in the
# model-parallel case.
counts = torch.cuda.LongTensor([1])
torch.distributed.all_reduce(counts, group=mpu.get_data_parallel_group())
assert counts[0].item() == torch.distributed.get_world_size(
group=mpu.get_data_parallel_group())
# Load the index mapping.
print_rank_0(' > loading indexed mapping from {}'.format(
indexmap_filename))
start_time = time.time()
samples_mapping = np.load(indexmap_filename, allow_pickle=True)
print_rank_0(' loaded indexed file in {:3.3f} seconds'.format(
time.time() - start_time))
print_rank_0(' total number of samples: {}'.format(
samples_mapping.shape[0]))
return samples_mapping
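For reference, the cache filename assembled above encodes every knob that affects the mapping; here is a hedged helper that mirrors the same construction (the function name and example values are made up):

import numpy as np

def index_map_filename(data_prefix, name, num_epochs, max_num_samples, max_seq_length, seed):
    # Mirrors the filename construction in get_samples_mapping (illustrative only).
    fn = '{}_{}_indexmap'.format(data_prefix, name)
    if num_epochs != (np.iinfo(np.int32).max - 1):
        fn += '_{}ep'.format(num_epochs)
    if max_num_samples != (np.iinfo(np.int64).max - 1):
        fn += '_{}mns'.format(max_num_samples)
    return fn + '_{}msl_{}s.npy'.format(max_seq_length, seed)

# index_map_filename('my-corpus', 'train', np.iinfo(np.int32).max - 1, 1000, 256, 1234)
# -> 'my-corpus_train_indexmap_1000mns_256msl_1234s.npy'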
import argparse
import itertools
import json
import multiprocessing
import nltk
......@@ -43,18 +44,28 @@ class Encoder(object):
def encode(self, json_line):
text = json.loads(json_line)[self.args.json_key]
if not text:
text = "no text"
doc_ids = []
for sentence in Encoder.splitter.tokenize(text):
tokens = Encoder.tokenizer.tokenize(sentence)
ids = Encoder.tokenizer.convert_tokens_to_ids(tokens)
if len(ids) > 0:
doc_ids.append(ids)
else:
print("no ids!", flush=True)
tokens = Encoder.tokenizer.tokenize("no text")
ids = Encoder.tokenizer.convert_tokens_to_ids(tokens)
doc_ids.append(ids)
if self.args.flatten and len(doc_ids) > 1:
doc_ids = [list(itertools.chain(*doc_ids))]
return doc_ids, len(json_line)
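To make the --flatten behaviour concrete, a tiny illustrative example (token ids are made up):

import itertools

doc_ids = [[12, 34, 56], [78, 90]]                # one list of ids per sentence
flattened = [list(itertools.chain(*doc_ids))]     # what --flatten produces
print(flattened)                                  # [[12, 34, 56, 78, 90]]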
def main():
parser = argparse.ArgumentParser()
parser.add_argument('--input', type=str, help='Path to input JSON')
parser.add_argument('--vocab', type=str, help='Path to vocab.txt')
parser.add_argument('--flatten', action='store_true', help='Flatten sentence-split documents into a single list of ids')
parser.add_argument('--json-key', type=str, default='text',
help='Key to extract from json')
parser.add_argument('--output-prefix', type=str, help='Path to binary output file without suffix')
......
......@@ -24,7 +24,7 @@ from megatron import get_adlr_autoresume
from megatron import mpu
from megatron import print_rank_0
from megatron.checkpointing import save_checkpoint
from megatron.data.samplers import DistributedBatchSampler, RandomSampler
from megatron.data.samplers import DistributedBatchSampler
from megatron.fp16 import FP16_Optimizer
......@@ -102,16 +102,12 @@ def make_data_loader(dataset):
num_workers = args.num_workers
# Use a simple sampler with distributed batch sampler.
#sampler = torch.utils.data.SequentialSampler(dataset)
sampler = RandomSampler(dataset,
replacement=True,
num_samples=global_batch_size*args.train_iters)
sampler = torch.utils.data.SequentialSampler(dataset)
batch_sampler = DistributedBatchSampler(sampler=sampler,
batch_size=global_batch_size,
drop_last=True,
rank=rank,
world_size=world_size,
wrap_last=True)
world_size=world_size)
# Torch dataloader.
return torch.utils.data.DataLoader(dataset,
batch_sampler=batch_sampler,
......
......@@ -102,7 +102,7 @@ def get_train_val_test_data():
"""Load the data on rank zero and boradcast number of tokens to all GPUS."""
args = get_args()
(train_data, val_data, test_data) = (None, None, None)
(train_data, valid_data, test_data) = (None, None, None)
# Data loader only on rank 0 of each model parallel group.
if mpu.get_model_parallel_rank() == 0:
......@@ -115,7 +115,7 @@ def get_train_val_test_data():
# Number of train/valid/test samples.
train_iters = args.train_iters
eval_iters = args.eval_iters
eval_iters = (train_iters // args.eval_iters + 1) * args.eval_iters
test_iters = args.eval_iters
train_val_test_num_samples = [train_iters * global_batch_size,
eval_iters * global_batch_size,
......
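A quick sanity check of the new eval_iters formula with made-up numbers:

# With train_iters = 1000 and args.eval_iters = 100:
#   eval_iters = (1000 // 100 + 1) * 100 = 1100
# so the validation split is sized to cover an evaluation run at every
# interval during training plus the final one.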