OpenDAS / Megatron-LM / Commits

Commit 690291dd
Authored Nov 20, 2019 by Mohammad Shoeybi
Parent: 1a1cfcff

    fixed couple of bugs in helpers

Showing 3 changed files with 194 additions and 116 deletions (+194 -116).
megatron/data/albert_dataset.py   +27  -10
megatron/data/helpers.cpp        +157  -97
megatron/data/indexed_dataset.py  +10   -9
megatron/data/albert_dataset.py  (view file @ 690291dd)
@@ -12,6 +12,7 @@ from .dataset_utils import build_training_sample
 from . import helpers
 from megatron.data import FullBertTokenizer, indexed_dataset
+from megatron.utils import print_rank_0


 class AlbertDataset(Dataset):
...
@@ -31,11 +32,19 @@ class AlbertDataset(Dataset):
         # Build the samples mapping.
         if not num_epochs:
             if not max_num_samples:
-                raise ValueError("Need to specify either max_num_samples or num_epochs")
-            num_epochs = int(max_num_samples / len(indexed_dataset)) + 1
+                raise ValueError("Need to specify either max_num_samples "
+                                 "or num_epochs")
+            num_epochs = np.iinfo(np.int32).max - 1
         if not max_num_samples:
-            max_num_samples = len(indexed_dataset) * num_epochs
-        print(f"Building the sample map for {num_epochs} epochs or {max_num_samples} samples.")
+            max_num_samples = np.iinfo(np.int64).max - 1
+
+        # Make sure the types match the helpers input types.
+        assert indexed_dataset.doc_idx.dtype == np.int64
+        assert indexed_dataset.sizes.dtype == np.int32
+
+        # Build samples mapping
+        verbose = torch.distributed.get_rank() == 0
+        start_time = time.time()
         self.samples_mapping = helpers.build_mapping(
             indexed_dataset.doc_idx,
             indexed_dataset.sizes,
...
@@ -43,7 +52,14 @@ class AlbertDataset(Dataset):
             max_num_samples,
             self.max_seq_length - 3,  # account for added tokens
             short_seq_prob,
-            self.seed)
+            self.seed,
+            verbose)
+        # Make sure all the ranks have built the mapping
+        torch.distributed.barrier()
+        print_rank_0('> elasped time to build samples mapping (seconds): '
+                     '{:2f}'.format(time.time() - start_time))
+        exit()

         # Vocab stuff.
         self.vocab_id_list = list(tokenizer.inv_vocab.keys())
...
@@ -59,11 +75,12 @@ class AlbertDataset(Dataset):
                     num_epochs, max_num_samples, masked_lm_prob,
                     max_seq_length, short_seq_prob, seed, skip_warmup=False):
         tokenizer = FullBertTokenizer(vocab, do_lower_case=True)
-        print("> Reading dataset index")
-        idx_ds = indexed_dataset.make_dataset(data_prefix, data_impl, skip_warmup)
-        print("> Finished creating indexed dataset")
-        return cls(idx_ds, tokenizer, num_epochs, max_num_samples, masked_lm_prob,
-                   max_seq_length, short_seq_prob, seed)
+        print_rank_0("> Reading dataset index ...")
+        idx_ds = indexed_dataset.make_dataset(data_prefix, data_impl, skip_warmup)
+        print_rank_0("> Finished creating indexed dataset")
+        return cls(idx_ds, tokenizer, num_epochs, max_num_samples, masked_lm_prob,
+                   max_seq_length, short_seq_prob, seed)

     def num_tokens(self):
         return self.tokenizer.vocab_size()
...
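Note: the dtype asserts added above exist because the rewritten C++ entry point now takes int64 document offsets and int32 sentence sizes. Below is a minimal, illustrative call of the compiled extension; the array values and argument constants are made up for the sketch, only the dtypes and argument order come from this diff.

    import numpy as np
    from megatron.data import helpers  # compiled pybind11 extension

    # doc_idx: sentence offsets per document (int64); sizes: per-sentence lengths (int32).
    doc_idx = np.array([0, 3, 5], dtype=np.int64)
    sizes = np.array([60, 40, 30, 80, 20], dtype=np.int32)

    samples = helpers.build_mapping(
        doc_idx, sizes,
        3,        # num_epochs
        1 << 16,  # max_num_samples
        509,      # max_seq_length (e.g. 512 minus 3 added tokens)
        0.1,      # short_seq_prob
        1234,     # seed
        True)     # verbose

    # Each row of the result is (start sentence index, end sentence index, target length).
    print(samples.shape, samples[0])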
megatron/data/helpers.cpp  (view file @ 690291dd)

 /* Helper methods for fast index mapping builds */
 #include <algorithm>
 #include <iostream>
...
@@ -6,46 +7,61 @@
 #include <stdexcept>
 #include <pybind11/pybind11.h>
 #include <pybind11/numpy.h>
+#include <random>

 namespace py = pybind11;
 using namespace std;

-inline uint32_t get_sample_len(const int short_seq_ratio,
-                               const uint32_t max_length) {
+inline int32_t get_target_sample_len(const int32_t short_seq_ratio,
+                                     const int32_t max_length,
+                                     std::mt19937& rand32_gen) {
     /* Training sample length. */
-    const auto random_number = rand();
+    const auto random_number = rand32_gen();
     if ((random_number % short_seq_ratio) == 0) {
         return 2 + random_number % (max_length - 1);
     }
     return max_length;
 }

 template<typename DocIdx>
-py::array build_mapping_impl(const py::array_t<uint32_t>& docs_,
-                             const py::array_t<uint16_t>& sizes_,
-                             const int num_epochs,
+py::array build_mapping_impl(const py::array_t<int64_t>& docs_,
+                             const py::array_t<int32_t>& sizes_,
+                             const int32_t num_epochs,
                              const uint64_t max_num_samples,
-                             const int max_seq_length,
+                             const int32_t max_seq_length,
                              const double short_seq_prob,
-                             const int seed) {
+                             const int32_t seed,
+                             const bool verbose) {
     /* Build a mapping of (start-index, end-index, sequence-length) where
        start and end index are the indices of the sentences in the sample
        and sequence-length is the target sequence length.
     */

-    cout << "> building dataset mapping for " << docs_.shape(0) - 1 \
-         << " documents with " << sizes_.shape(0) << " sentences ..."
-         << std::flush << endl;
+    if (verbose) {
+        cout << " > using " << docs_.shape(0) - 1 << " documents with "
+             << sizes_.shape(0) << " sentences ..." << endl << std::flush;
+    }

     // Consistency checks.
     assert(num_epochs > 0);
     assert(max_seq_length > 1);
     assert(short_seq_prob > 0.0);
     assert(short_seq_prob <= 1.0);
     assert(seed > 0);

-    // For efficiency, convert probability to ratio.
-    const auto short_seq_ratio = static_cast<int>(round(1.0 / short_seq_prob));
+    // For efficiency, convert probability to ratio. Note: rand() generates int.
+    const auto short_seq_ratio = static_cast<int32_t>(round(1.0 / short_seq_prob));

     // Remove bound checks.
     auto docs = docs_.unchecked<1>();
     auto sizes = sizes_.unchecked<1>();

     // Check for consistency.
     if (docs[docs.shape(0) - 1] != sizes.shape(0)) {
         cout << "document values is not consistent with length of sizes: "
              << docs[docs.shape(0) - 1] << " != " << sizes.shape(0) << endl;
-        throw(-1);
+        throw std::length_error("docs and sizes");
     }

     // Mapping and it's length (1D).
...
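The docstring above pins down the contract: each map row is (start-index, end-index, sequence-length) over the flattened sentence list, with the sentence range half-open. As a rough illustration of how such a row could be consumed on the Python side (a hedged sketch only, not the actual AlbertDataset logic):

    import numpy as np

    def gather_sample(sample_row, sizes, tokens):
        # sizes: per-sentence lengths; tokens: the flat token stream.
        start_sent, end_sent, target_len = (int(x) for x in sample_row)
        offsets = np.concatenate(([0], np.cumsum(sizes)))   # sentence -> token offset
        span = tokens[offsets[start_sent]:offsets[end_sent]]  # [start, end) sentences
        return span, target_len  # target_len drives truncation / masking downstream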
@@ -55,36 +71,39 @@ py::array build_mapping_impl(const py::array_t<uint32_t>& docs_,
     // Perform two iterations, in the first iteration get the size
     // and allocate memory and in the second iteration populate the map.
     bool second = false;
-    for (int iteration = 0; iteration < 2; ++iteration) {
+    for (int32_t iteration = 0; iteration < 2; ++iteration) {

         // Set the seed so both iterations produce the same results.
-        srand(seed);
+        std::mt19937 rand32_gen(seed);

         // Set the flag on second iteration.
-        second = iteration == 1;
+        second = (iteration == 1);

         // Counters:
-        uint32_t empty_docs = 0;
-        uint32_t one_sent_docs = 0;
+        uint64_t empty_docs = 0;
+        uint64_t one_sent_docs = 0;

         // Current map index.
         uint64_t map_index = 0;

         // For each epoch:
-        for (int epoch = 0; epoch < num_epochs; ++epoch) {
-            if (map_index >= max_num_samples && !second) {
-                cout << " > reached " << max_num_samples << " samples after "
-                     << epoch << " epochs ..." << std::flush << endl;
+        for (int32_t epoch = 0; epoch < num_epochs; ++epoch) {
+            if (map_index >= max_num_samples) {
+                if (verbose && (!second)) {
+                    cout << " > reached " << max_num_samples << " samples after "
+                         << epoch << " epochs ..." << endl << std::flush;
+                }
                 break;
             }
             // For each document:
-            for (int doc = 0; doc < (docs.shape(0) - 1); ++doc) {
+            for (int32_t doc = 0; doc < (docs.shape(0) - 1); ++doc) {

-                // Document sentences are in [sent_index_first, sent_index_last).
+                // Document sentences are in [sent_index_first, sent_index_last)
                 const auto sent_index_first = docs[doc];
                 const auto sent_index_last = docs[doc + 1];

-                // At the begining of the document previous index is the start index.
+                // At the begining of the document previous index is the
+                // start index.
                 auto prev_start_index = sent_index_first;

                 // Remaining documents.
...
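The two-iteration comment above is the heart of the allocation strategy: pass one only counts samples so the map can be sized exactly, pass two fills it, and reseeding the generator at the top of each pass keeps both passes making identical sampling decisions. A small Python sketch of the same pattern, purely for illustration:

    import random

    def two_pass_build(items, seed):
        out, counted = None, 0
        for iteration in range(2):
            rng = random.Random(seed)      # same seed -> identical decisions both passes
            second = (iteration == 1)
            index = 0
            for item in items:
                if rng.random() < 0.5:     # stand-in for the sample-boundary test
                    if second:
                        out[index] = item
                    index += 1
            if not second:
                counted = index
                out = [None] * counted     # allocate exactly once, sized by pass one
        return out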
@@ -93,13 +112,10 @@ py::array build_mapping_impl(const py::array_t<uint32_t>& docs_,
                 // Some bookkeeping
                 if ((epoch == 0) && (!second)) {
                     if (num_remain_sent == 0) {
-                        cout << "***WARNING*** document " << doc << " is empty" << endl;
-                        empty_docs += 1;
+                        ++empty_docs;
                     }
                     if (num_remain_sent == 1) {
-                        // cout << "***WARNING*** document " << doc <<
-                        //  " has one sentence" << endl;
-                        one_sent_docs += 1;
+                        ++one_sent_docs;
                     }
                 }
...
@@ -107,110 +123,154 @@ py::array build_mapping_impl(const py::array_t<uint32_t>& docs_,
                 if (num_remain_sent > 1) {

                     // Set values.
-                    auto size = uint32_t{0};
-                    auto num_sent = uint32_t{0};
-                    auto seq_len = get_sample_len(short_seq_ratio, max_seq_length);
+                    auto seq_len = int32_t{0};
+                    auto num_sent = int32_t{0};
+                    auto target_seq_len = get_target_sample_len(short_seq_ratio,
+                                                                max_seq_length,
+                                                                rand32_gen);

                     // Loop through sentences.
                     for (auto sent_index = sent_index_first;
                          sent_index < sent_index_last; ++sent_index) {

                         // Add the size and number of sentences.
-                        size += sizes[sent_index];
-                        num_sent += 1;
-                        num_remain_sent -= 1;
-
-                        // If we have reached the target length.
-                        // and if not only one sentence is left in the document.
-                        // and if we have at least two sentneces.
-                        // and if we have reached end of the document.
-                        if (((size >= seq_len) && (num_remain_sent > 1) &&
-                             (num_sent > 1)) || (num_remain_sent == 0)) {
-
-                            // Populate the map.
-                            if (second) {
-                                const auto map_index_0 = 3 * map_index;
-                                maps[map_index_0] = prev_start_index;
-                                maps[map_index_0 + 1] = sent_index + 1;
-                                maps[map_index_0 + 2] = seq_len;
-                            }
-
-                            // Update indices / counters.
-                            // check for overflow
-                            if (map_index == std::numeric_limits<DocIdx>::max()) {
-                                cout << "number of samples exceeded maximum allowed by type: "
-                                     << std::numeric_limits<DocIdx>::max() << endl;
-                                throw std::overflow_error("Number of samples");
-                            }
-                            map_index += 1;
-                            prev_start_index = sent_index + 1;
-                            seq_len = get_sample_len(short_seq_ratio, max_seq_length);
-                            size = 0;
-                            num_sent = 0;
-                        }
-                    }
-                }
-            }
-        }
+                        seq_len += sizes[sent_index];
+                        ++num_sent;
+                        --num_remain_sent;
+
+                        // If we have reached the target length.
+                        // and if not only one sentence is left in the document.
+                        // and if we have at least two sentneces.
+                        // and if we have reached end of the document.
+                        if (((seq_len >= target_seq_len) && (num_remain_sent > 1) &&
+                             (num_sent > 1)) || (num_remain_sent == 0)) {
+
+                            // Check for overflow.
+                            if ((3 * map_index + 2) >
+                                std::numeric_limits<int64_t>::max()) {
+                                cout << "number of samples exceeded maximum "
+                                     << "allowed by type int64: "
+                                     << std::numeric_limits<int64_t>::max() << endl;
+                                throw std::overflow_error("Number of samples");
+                            }
+
+                            // Populate the map.
+                            if (second) {
+                                const auto map_index_0 = 3 * map_index;
+                                maps[map_index_0] = static_cast<DocIdx>(prev_start_index);
+                                maps[map_index_0 + 1] = static_cast<DocIdx>(sent_index + 1);
+                                maps[map_index_0 + 2] = static_cast<DocIdx>(target_seq_len);
+                            }
+
+                            // Update indices / counters.
+                            ++map_index;
+                            prev_start_index = sent_index + 1;
+                            target_seq_len = get_target_sample_len(short_seq_ratio,
+                                                                   max_seq_length,
+                                                                   rand32_gen);
+                            seq_len = 0;
+                            num_sent = 0;
+                        }
+                    } // for (auto sent_index=sent_index_first; ...
+                } // if (num_remain_sent > 1) {
+            } // for (int doc=0; doc < num_docs; ++doc) {
+        } // for (int epoch=0; epoch < num_epochs; ++epoch) {

         if (!second) {
-            cout << "   number of samples:                     " << map_index << endl;
-            cout << "   number of empty documents:             " << empty_docs << endl;
-            cout << "   number of documents with one sentence: " << one_sent_docs << endl;
+            if (verbose) {
+                cout << " > number of empty documents: " << empty_docs
+                     << endl << std::flush;
+                cout << " > number of documents with one sentence: "
+                     << one_sent_docs << endl << std::flush;
+                cout << " > will create mapping for " << map_index
+                     << " samples" << endl << std::flush;
+            }
             assert(maps == NULL);
             assert(num_samples < 0);
             maps = new DocIdx[3 * map_index];
-            num_samples = map_index;
+            num_samples = static_cast<int64_t>(map_index);
         }

-    }
+    } // for (int iteration=0; iteration < 2; ++iteration) {

     // Shuffle.
+    // We need a 64 bit random number generator as we might have more
+    // than 2 billion samples.
+    std::mt19937_64 rand64_gen(seed + 1);
     for (auto i = (num_samples - 1); i > 0; --i) {
-        const auto j = rand() % (i + 1);
+        const auto j = static_cast<int64_t>(rand64_gen() % (i + 1));
         const auto i0 = 3 * i;
         const auto j0 = 3 * j;
         // Swap values.
         swap(maps[i0], maps[j0]);
         swap(maps[i0 + 1], maps[j0 + 1]);
         swap(maps[i0 + 2], maps[j0 + 2]);
     }

-    cout << " > done building the mapping." << endl;
+    if (verbose) {
+        cout << "> done building the mapping." << endl;
+    }

     // Method to deallocate memory.
     py::capsule free_when_done(maps, [](void *mem_) {
             DocIdx *mem = reinterpret_cast<DocIdx*>(mem_);
-            cout << "freeing memory for the dataset mapping" << endl;
             delete[] mem;
         });

     // Return the numpy array.
     const auto byte_size = sizeof(DocIdx);
     return py::array(std::vector<int64_t>{num_samples, 3},  // shape
-                     {3*4, 4},                  // C-style contiguous strides
+                     {3*byte_size, byte_size},  // C-style contiguous strides
                      maps,                      // the data pointer
                      free_when_done);           // numpy array references

 }

-py::array build_mapping(const py::array& docs_,
-                        const py::array& sizes_,
+py::array build_mapping(const py::array_t<int64_t>& docs_,
+                        const py::array_t<int>& sizes_,
                         const int num_epochs,
                         const uint64_t max_num_samples,
                         const int max_seq_length,
                         const double short_seq_prob,
-                        const int seed) {
+                        const int seed,
+                        const bool verbose) {

+    if (verbose) {
+        cout << "> building sample map using: " << endl << std::flush;
+        cout << "    number of epochs:           " << num_epochs << endl << std::flush;
+        cout << "    maximum number of samples:  " << max_num_samples << endl << std::flush;
+        cout << "    maximum sequence length:    " << max_seq_length << endl << std::flush;
+        cout << "    short sequence probability: " << short_seq_prob << endl << std::flush;
+        cout << "    seed:                       " << seed << endl << std::flush;
+    }
+
     if (sizes_.size() > std::numeric_limits<uint32_t>::max()) {
-        return build_mapping_impl<uint64_t>(docs_, sizes_, num_epochs,
-                                            max_num_samples, max_seq_length,
-                                            short_seq_prob, seed);
+        if (verbose) {
+            cout << " > using uint64 for data mapping..." << endl << std::flush;
+        }
+        return build_mapping_impl<uint64_t>(docs_, sizes_, num_epochs,
+                                            max_num_samples, max_seq_length,
+                                            short_seq_prob, seed, verbose);
     }
     else {
-        return build_mapping_impl<uint32_t>(docs_, sizes_, num_epochs,
-                                            max_num_samples, max_seq_length,
-                                            short_seq_prob, seed);
+        if (verbose) {
+            cout << " > using uint32 for data mapping..." << endl << std::flush;
+        }
+        return build_mapping_impl<uint32_t>(docs_, sizes_, num_epochs,
+                                            max_num_samples, max_seq_length,
+                                            short_seq_prob, seed, verbose);
     }
 }


 PYBIND11_MODULE(helpers, m) {
     m.def("build_mapping", &build_mapping);
 }
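Since helpers is a plain pybind11 module, it has to be compiled before `from . import helpers` in albert_dataset.py can succeed. One possible build recipe is sketched below; this is an assumption for illustration (pybind11 installed, a C++11 compiler available), not the build step this repository actually uses:

    # setup_helpers.py -- illustrative build script, not part of this commit.
    from setuptools import setup, Extension
    import pybind11

    ext = Extension(
        'megatron.data.helpers',
        sources=['megatron/data/helpers.cpp'],
        include_dirs=[pybind11.get_include()],
        extra_compile_args=['-O3', '-std=c++11'],
        language='c++',
    )

    setup(name='helpers', ext_modules=[ext])

Running `python setup_helpers.py build_ext --inplace` would then drop the compiled shared object into megatron/data/ so the relative import resolves.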
megatron/data/indexed_dataset.py  (view file @ 690291dd)
...
@@ -18,6 +18,7 @@ from itertools import accumulate
 import numpy as np
 import torch
+from megatron.utils import print_rank_0


 def __best_fitting_dtype(vocab_size=None):
     if vocab_size is not None and vocab_size < 65500:
...
@@ -317,7 +318,7 @@ class IndexedDatasetBuilder(object):

 def _warmup_mmap_file(path):
     with open(path, 'rb') as stream:
-        while stream.read(1 * 1024 * 1024):
+        while stream.read(100 * 1024 * 1024):
             pass
...
@@ -390,17 +391,17 @@ class MMapIndexedDataset(torch.utils.data.Dataset):
                 offset = stream.tell()

             if not skip_warmup:
-                print("> Warming up index mmap file...")
+                print_rank_0("> Warming up index mmap file...")
                 _warmup_mmap_file(path)

             self._bin_buffer_mmap = np.memmap(path, mode='r', order='C')
             self._bin_buffer = memoryview(self._bin_buffer_mmap)
-            print("> Reading sizes...")
+            print_rank_0("> Reading sizes...")
             self._sizes = np.frombuffer(self._bin_buffer, dtype=np.int32,
                                         count=self._len, offset=offset)
-            print("> Reading pointers...")
+            print_rank_0("> Reading pointers...")
             self._pointers = np.frombuffer(self._bin_buffer, dtype=np.int64,
                                            count=self._len,
                                            offset=offset + self._sizes.nbytes)
-            print("> Reading document index...")
+            print_rank_0("> Reading document index...")
             self._doc_idx = np.frombuffer(self._bin_buffer, dtype=np.int64,
                                           count=self._doc_count,
                                           offset=offset + self._sizes.nbytes + self._pointers.nbytes)

         def __del__(self):
...
@@ -446,13 +447,13 @@ class MMapIndexedDataset(torch.utils.data.Dataset):
         self._index = self.Index(index_file_path(self._path), skip_warmup)

         if not skip_warmup:
-            print("> Warming up data mmap file...")
+            print_rank_0("> Warming up data mmap file...")
             _warmup_mmap_file(data_file_path(self._path))
-        print("> Creating numpy buffer of mmap...")
+        print_rank_0("> Creating numpy buffer of mmap...")
         self._bin_buffer_mmap = np.memmap(data_file_path(self._path), mode='r', order='C')
-        print("> Creating memory view of numpy buffer...")
+        print_rank_0("> Creating memory view of numpy buffer...")
         self._bin_buffer = memoryview(self._bin_buffer_mmap)
-        print("> Done")
+        print_rank_0("> Done")

     def __del__(self):
         self._bin_buffer_mmap._mmap.close()
...
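All the bare print calls replaced above now go through print_rank_0 from megatron.utils, so a multi-process data-parallel job logs each message once instead of once per rank. A minimal sketch of such a rank-guarded helper (the real implementation in megatron.utils may differ in details):

    import torch

    def print_rank_0(message):
        # Only global rank 0 prints; fall back to a plain print when
        # torch.distributed has not been initialized.
        if torch.distributed.is_initialized():
            if torch.distributed.get_rank() == 0:
                print(message, flush=True)
        else:
            print(message, flush=True)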