"vscode:/vscode.git/clone" did not exist on "1fa6990c498b91ee08936196a4c6f021c800798e"
Commit 0058b1a5 authored by Jared Casper

Plugging new dataset stuff together.

Some cleanup of types, etc. in the helpers cpp code.
parent b1714c14
CXXFLAGS += -O3 -Wall -shared -std=c++11 -fPIC -fdiagnostics-color
CPPFLAGS += $(shell python3 -m pybind11 --includes)
LIBNAME = helpers
LIBEXT = $(shell python3-config --extension-suffix)
default: $(LIBNAME)$(LIBEXT)
%$(LIBEXT): %.cpp
$(CXX) $(CXXFLAGS) $(CPPFLAGS) $< -o $@
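
This Makefile builds the pybind11 extension in place. Assuming it and helpers.cpp sit in the megatron/data package directory (which the relative `from . import helpers` used below suggests), a minimal smoke test after running `make` might look like this sketch:

# Minimal sketch, assuming `make` has produced helpers.<abi-tag>.so next to the
# package's __init__.py so that the package import resolves.
from megatron.data import helpers
print(helpers.build_mapping.__doc__)  # pybind11-generated signature of the bound C++ function
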
from . import indexed_dataset
from .bert_tokenization import FullTokenizer as FullBertTokenizer
from .dataset import AlbertDataset
@@ -7,27 +7,36 @@ import numpy as np
import torch
from torch.utils.data import Dataset
from dataset_utils import build_training_sample
from .dataset_utils import build_training_sample
#from data.mapping import build_training_samples_mapping
class AlbertDataSet(Dataset):
from . import helpers
from megatron.data import FullBertTokenizer, indexed_dataset
def __init__(self, indexed_dataset, tokenizer, num_epochs,
class AlbertDataset(Dataset):
def __init__(self, indexed_dataset, tokenizer, num_epochs, max_num_samples,
masked_lm_prob, max_seq_length, short_seq_prob, seed):
# Params to store.
self.seed = seed
self.masked_lm_prob = masked_lm_prob
self.max_seq_length = max_seq_length
self.tokenizer = tokenizer
# Indexed dataset.
self.indexed_dataset = indexed_dataset
# Build the samples mapping.
self.samples_mapping = build_training_samples_mapping(
indexed_dataset,
if not max_num_samples:
max_num_samples = len(indexed_dataset) * num_epochs
self.samples_mapping = helpers.build_mapping(
indexed_dataset.doc_idx,
indexed_dataset.sizes,
num_epochs,
self.max_seq_length,
max_num_samples,
self.max_seq_length-3, # account for added tokens
short_seq_prob,
self.seed)
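
Each row of the mapping built above is a triple of sentence indices plus a target length. A small hypothetical helper (illustrative only, not part of the commit) shows how a row is expanded, mirroring the __getitem__ logic further down:

# Hypothetical helper, mirroring __getitem__: expand one samples_mapping row.
def expand_mapping_row(samples_mapping, indexed_dataset, idx):
    # Each row is [first_sentence, last_sentence_exclusive, target_seq_length],
    # matching what the C++ build_mapping writes into the maps array.
    start_index, end_index, seq_length = samples_mapping[idx]
    sentences = [indexed_dataset[i] for i in range(start_index, end_index)]
    return sentences, seq_length
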
@@ -40,8 +49,17 @@ class AlbertDataSet(Dataset):
self.pad_id = tokenizer.vocab['[PAD]']
@classmethod
def from_paths(cls, vocab, data_prefix, data_impl,
num_epochs, max_num_samples, masked_lm_prob,
max_seq_length, short_seq_prob, seed):
tokenizer = FullBertTokenizer(vocab, do_lower_case=True)
idx_ds = indexed_dataset.make_dataset(data_prefix, data_impl)
return cls(idx_ds, tokenizer, num_epochs, max_num_samples, masked_lm_prob,
max_seq_length, short_seq_prob, seed)
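
A hedged usage sketch of the new classmethod; the vocab and data prefix below are placeholders and must point at a BERT vocab file and an already-built indexed dataset:

# Usage sketch with placeholder paths (hypothetical, for illustration only).
from megatron.data import AlbertDataset
ds = AlbertDataset.from_paths('path/to/vocab.txt',   # vocab
                              'path/to/corpus',      # data_prefix
                              'mmap',                # data_impl
                              num_epochs=5, max_num_samples=None,
                              masked_lm_prob=0.15, max_seq_length=512,
                              short_seq_prob=0.1, seed=1234)
sample = ds[0]  # dict with 'text', 'labels', 'is_random', 'loss_mask', 'padding_mask', 'truncated', ...
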
def __len__(self):
return self.samples.shape[0]
return self.samples_mapping.shape[0]
def __getitem__(self, idx):
rng = random.Random(self.seed + idx)
@@ -49,6 +67,9 @@ class AlbertDataSet(Dataset):
sample = []
for index in range(start_index, end_index):
sample.append(self.indexed_dataset[index])
for s in sample:
if len(s) > 1000:
print(self.tokenizer.convert_ids_to_tokens(s))
return build_training_sample(sample, seq_length,
self.max_seq_length,
self.vocab_id_list,
@@ -186,7 +207,6 @@ class JaredDataset(object):
if __name__ == '__main__':
print('dataset ...')
from bert_tokenization import FullTokenizer
@@ -207,8 +227,8 @@ if __name__ == '__main__':
sentences.extend(sent)
yield sentences
input_file = '/raid/mshoeybi/data/albert/sample/samples_11.json'
vocab_file = '/raid/mshoeybi/data/albert/bert_vocab/vocab.txt'
input_file = 'test/samples_10000.json'
vocab_file = 'test/vocab.txt'
tokenizer = FullTokenizer(vocab_file, do_lower_case=True)
document_generator = document_generator_provider(input_file)
@@ -35,9 +35,8 @@ def build_training_sample(sample,
tokens_a, tokens_b, is_next_random = get_a_and_b_segments(sample, rng)
# Truncate to `target_sequence_length`.
# Note that we have to account for [CLS] A [SEP] B [SEP]
max_num_tokens = target_seq_length - 3
truncate_segments(tokens_a, tokens_b, len(tokens_a), len(tokens_b),
max_num_tokens = target_seq_length
truncated = truncate_segments(tokens_a, tokens_b, len(tokens_a), len(tokens_b),
max_num_tokens, rng)
# Build tokens and tokentypes.
@@ -48,7 +47,7 @@ def build_training_sample(sample,
max_predictions_per_seq = masked_lm_prob * max_num_tokens
(tokens, masked_positions, masked_labels, _) = create_masked_lm_predictions(
tokens, vocab_id_list, vocab_id_to_token_dict, masked_lm_prob,
cls_id, sep_id, mask_id, max_predictions_per_seq)
cls_id, sep_id, mask_id, max_predictions_per_seq, rng)
# Padding.
tokens_np, tokentypes_np, labels, padding_mask, loss_mask \
@@ -61,7 +60,8 @@ def build_training_sample(sample,
'labels': labels,
'is_random': int(is_next_random),
'loss_mask': loss_mask,
'padding_mask': padding_mask}
'padding_mask': padding_mask,
'truncated': int(truncated)}
return train_sample
@@ -99,11 +99,12 @@ def get_a_and_b_segments(sample, rng):
def truncate_segments(tokens_a, tokens_b, len_a, len_b, max_num_tokens, rng):
"""Truncates a pair of sequences to a maximum sequence length."""
#print(len_a, len_b, max_num_tokens)
assert len_a > 0
assert len_b > 0
if (len_a + len_b) <= max_num_tokens:
return
else:
if len_a + len_b <= max_num_tokens:
return False
while len_a + len_b > max_num_tokens:
if len_a > len_b:
len_a -= 1
tokens = tokens_a
@@ -114,8 +115,7 @@ def truncate_segments(tokens_a, tokens_b, len_a, len_b, max_num_tokens, rng):
del tokens[0]
else:
tokens.pop()
truncate_segments(tokens_a, tokens_b, len_a, len_b, max_num_tokens, rng)
return True
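
The recursion is replaced by a loop, and the function now reports whether anything was dropped. A standalone sketch of the same idea (the committed version uses rng to decide whether the longer segment loses a token from the front or the back; this sketch always trims the back):

def truncate_pair_sketch(tokens_a, tokens_b, max_num_tokens):
    # Illustrative stand-in for truncate_segments: trim the longer segment
    # until the pair fits, and report whether any truncation happened.
    if len(tokens_a) + len(tokens_b) <= max_num_tokens:
        return False
    while len(tokens_a) + len(tokens_b) > max_num_tokens:
        longer = tokens_a if len(tokens_a) > len(tokens_b) else tokens_b
        longer.pop()  # the committed code picks front or back at random via rng
    return True
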
def create_tokens_and_tokentypes(tokens_a, tokens_b, cls_id, sep_id):
"""Merge segments A and B, add [CLS] and [SEP] and build tokentypes."""
@@ -161,6 +161,7 @@ def create_masked_lm_predictions(tokens,
masked_lm_prob,
cls_id, sep_id, mask_id,
max_predictions_per_seq,
rng,
max_ngrams=3,
do_whole_word_mask=True,
favor_longer_ngram=False,
@@ -468,4 +469,3 @@ if __name__ == '__main__':
string += '{:5d}'.format(tokentype)
string += '{:5d}'.format(padding_mask)
print(string)
@@ -3,6 +3,7 @@
#include <iostream>
#include <limits>
#include <math.h>
#include <stdexcept>
#include <pybind11/pybind11.h>
#include <pybind11/numpy.h>
@@ -20,11 +21,11 @@ inline uint32_t get_sample_len(const int short_seq_ratio,
return max_length;
}
py::array_t<uint32_t> build_mapping(const py::array_t<uint32_t>& docs_,
template<typename DocIdx>
py::array build_mapping_impl(const py::array_t<uint32_t>& docs_,
const py::array_t<uint16_t>& sizes_,
const int num_epochs,
const int max_num_samples,
const uint64_t max_num_samples,
const int max_seq_length,
const double short_seq_prob,
const int seed) {
@@ -33,7 +34,7 @@ py::array_t<uint32_t> build_mapping(const py::array_t<uint32_t>& docs_,
" documents with " << sizes_.shape(0) << " sentences ..." << endl;
// For efficiency, convert probability to ratio.
const int short_seq_ratio = int(round(1.0 / short_seq_prob));
const auto short_seq_ratio = static_cast<int>(round(1.0 / short_seq_prob));
// Remove bound checks.
auto docs = docs_.unchecked<1>();
@@ -47,8 +48,8 @@ py::array_t<uint32_t> build_mapping(const py::array_t<uint32_t>& docs_,
}
// Mapping and its length (1D).
int num_samples = -1;
uint32_t* maps = NULL;
int64_t num_samples = -1;
DocIdx* maps = NULL;
// Perform two iterations, in the first iteration get the size
// and allocate memory and in the second iteration populate the map.
@@ -59,9 +60,7 @@ py::array_t<uint32_t> build_mapping(const py::array_t<uint32_t>& docs_,
srand(seed);
// Set the flag on second iteration.
if (iteration == 1) {
second = true;
}
second = iteration == 1;
// Counters:
uint32_t empty_docs = 0;
@@ -72,7 +71,7 @@ py::array_t<uint32_t> build_mapping(const py::array_t<uint32_t>& docs_,
// For each epoch:
for (int epoch=0; epoch < num_epochs; ++epoch) {
if (map_index >= max_num_samples) {
if (map_index >= max_num_samples && !second) {
cout << " > reached " << max_num_samples << " samples after " <<
epoch << " epochs ..." << endl;
break;
@@ -81,14 +80,14 @@ py::array_t<uint32_t> build_mapping(const py::array_t<uint32_t>& docs_,
for (int doc=0; doc < (docs.shape(0) - 1); ++doc) {
// Document sentences are in [sent_index_first, sent_index_last).
const uint32_t sent_index_first = docs[doc];
const uint32_t sent_index_last = docs[doc + 1];
const auto sent_index_first = docs[doc];
const auto sent_index_last = docs[doc + 1];
// At the beginning of the document, the previous index is the start index.
uint32_t prev_start_index = sent_index_first;
auto prev_start_index = sent_index_first;
// Remaining sentences.
uint32_t num_remain_sent = sent_index_last - sent_index_first;
auto num_remain_sent = sent_index_last - sent_index_first;
// Some bookkeeping
if ((epoch == 0) && (!second)) {
@@ -107,12 +106,12 @@ py::array_t<uint32_t> build_mapping(const py::array_t<uint32_t>& docs_,
if (num_remain_sent > 1) {
// Set values.
uint32_t size = 0;
uint32_t num_sent = 0;
uint32_t seq_len = get_sample_len(short_seq_ratio, max_seq_length);
auto size = uint32_t{0};
auto num_sent = uint32_t{0};
auto seq_len = get_sample_len(short_seq_ratio, max_seq_length);
// Loop through sentences.
for (uint32_t sent_index=sent_index_first;
for (auto sent_index=sent_index_first;
sent_index < sent_index_last; ++sent_index) {
// Add the size and number of sentences.
@@ -129,13 +128,19 @@ py::array_t<uint32_t> build_mapping(const py::array_t<uint32_t>& docs_,
// Populate the map.
if (second) {
const uint64_t map_index_0 = 3 * map_index;
const auto map_index_0 = 3 * map_index;
maps[map_index_0] = prev_start_index;
maps[map_index_0 + 1] = sent_index + 1;
maps[map_index_0 + 2] = seq_len;
}
// Update indices / counters.
// check for overflow
if (map_index == std::numeric_limits<DocIdx>::max()) {
cout << "number of samples exceeded maximum allowed by type: "
<< std::numeric_limits<DocIdx>::max() << endl;
throw std::overflow_error("Number of samples");
}
map_index += 1;
prev_start_index = sent_index + 1;
seq_len = get_sample_len(short_seq_ratio, max_seq_length);
@@ -148,29 +153,24 @@ py::array_t<uint32_t> build_mapping(const py::array_t<uint32_t>& docs_,
} // for (int doc=0; doc < num_docs; ++doc) {
} // for (int epoch=0; epoch < num_epochs; ++epoch) {
// For now only support mappings up to MAX_INT.
if (map_index > std::numeric_limits<int>::max()) {
cout << "number of samples ("<< map_index <<") exceeded MAX_INT" << endl;
throw(-1);
}
else if (!second) {
if (!second) {
cout << " number of samples: " <<
map_index << endl;
cout << " number of empty documents: " <<
empty_docs << endl;
cout << " number of documents with one sentence: " <<
one_sent_docs << endl;
maps = new uint32_t[3*map_index];
num_samples = int(map_index);
maps = new DocIdx[3*map_index];
num_samples = map_index;
}
} // for (int iteration=0; iteration < 2; ++iteration) {
// Shuffle.
for (int i=(num_samples - 1); i > 0; --i) {
const int j = rand() % (i + 1);
uint64_t i0 = 3 * i;
uint64_t j0 = 3 * j;
for (auto i=(num_samples - 1); i > 0; --i) {
const auto j = rand() % (i + 1);
const auto i0 = 3 * i;
const auto j0 = 3 * j;
// Swap values.
swap(maps[i0], maps[j0]);
swap(maps[i0 + 1], maps[j0 + 1]);
@@ -181,22 +181,35 @@ py::array_t<uint32_t> build_mapping(const py::array_t<uint32_t>& docs_,
// Method to deallocate memory.
py::capsule free_when_done(maps, [](void *mem_) {
uint32_t *mem = reinterpret_cast<uint32_t *>(mem_);
DocIdx *mem = reinterpret_cast<DocIdx*>(mem_);
cout << "freeing memory for the dataset mapping" << endl;
delete[] mem;
});
// Return the numpy array.
return py::array_t<uint32_t>({num_samples, 3}, // shape
return py::array(std::vector<int64_t>{num_samples, 3}, // shape
{3*4, 4}, // C-style contiguous strides
maps, // the data pointer
free_when_done); // numpy array references
}
py::array build_mapping(const py::array& docs_,
const py::array& sizes_,
const int num_epochs,
const uint64_t max_num_samples,
const int max_seq_length,
const double short_seq_prob,
const int seed) {
if (sizes_.size() > std::numeric_limits<uint32_t>::max()) {
return build_mapping_impl<uint64_t>(docs_, sizes_, num_epochs, max_num_samples,
max_seq_length, short_seq_prob, seed);
} else {
return build_mapping_impl<uint32_t>(docs_, sizes_, num_epochs, max_num_samples,
max_seq_length, short_seq_prob, seed);
}
}
PYBIND11_MODULE(helpers, m) {
m.def("build_mapping", &build_mapping);
}
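
For completeness, a sketch of exercising the binding directly from Python (assumptions: the module has been built with the Makefile at the top of this commit, and the dtypes below follow the uint32/uint16 signatures of build_mapping_impl, though pybind11 can also convert compatible arrays):

import numpy as np
from megatron.data import helpers  # compiled extension, see the Makefile above

# Two toy documents: sentences [0, 3) and [3, 5); sizes are per-sentence token counts.
doc_idx = np.array([0, 3, 5], dtype=np.uint32)
sizes = np.array([20, 35, 10, 40, 25], dtype=np.uint16)

mapping = helpers.build_mapping(doc_idx, sizes,
                                3,     # num_epochs
                                100,   # max_num_samples
                                125,   # max_seq_length (dataset.py passes max_seq_length - 3)
                                0.1,   # short_seq_prob
                                1234)  # seed
print(mapping.shape, mapping.dtype)  # (num_samples, 3), rows as in the samples mapping above
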
@@ -458,7 +458,7 @@ class MMapIndexedDataset(torch.utils.data.Dataset):
if self._index.dtype != np.int64:
np_array = np_array.astype(np.int64)
return torch.from_numpy(np_array)
return np_array
elif isinstance(idx, slice):
start, stop, step = idx.indices(len(self))
if step != 1:
@@ -7,7 +7,7 @@ import torch
script_dir = os.path.dirname(os.path.realpath(__file__))
sys.path.append(os.path.join(script_dir, "../../../"))
from megatron.data import indexed_dataset, FullBertTokenizer
from megatron.data import indexed_dataset, FullBertTokenizer, AlbertDataset
def test_indexed_dataset(args):
ds = indexed_dataset.make_dataset(args.data, args.dataset_impl)
@@ -31,18 +31,47 @@ def test_indexed_dataset(args):
print("Newline in string!")
print(i)
def test_albert_dataset(args):
# tokenizer = FullBertTokenizer(args.vocab, do_lower_case=True)
# idataset = indexed_dataset.make_dataset(args.data, args.dataset_impl)
# ds = AlbertDataset(idataset, tokenizer)
ds = AlbertDataset.from_paths(args.vocab, args.data, args.dataset_impl,
args.epochs, args.max_num_samples,
args.masked_lm_prob, args.seq_length,
args.short_seq_prob, args.seed)
truncated = 0
total = 0
for s in ds:
ids = s['text']
tokens = ds.tokenizer.convert_ids_to_tokens(ids)
print(tokens)
exit()
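
The truncated and total counters initialized above are not used yet in this version of the test; a plausible continuation (an assumption, not part of the commit) would tally the new 'truncated' flag instead of exiting after the first sample:

# Hypothetical continuation (not in the commit): tally the truncation rate.
truncated, total = 0, 0
for s in ds:
    total += 1
    truncated += s['truncated']
print('truncated {} of {} samples'.format(truncated, total))
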
def main():
parser = argparse.ArgumentParser()
parser.add_argument('--data', type=str, help='prefix to data files')
parser.add_argument('--vocab', type=str, help='Path to vocab.txt')
parser.add_argument('--dataset-impl', type=str, default='infer',
choices=['lazy', 'cached', 'mmap', 'infer'])
parser.add_argument('--epochs', type=int, default=5,
help='Number of epochs to plan for')
parser.add_argument('--max-num-samples', type=int, default=None,
help='Maximum number of samples to plan for')
parser.add_argument('--masked-lm-prob', type=float, default=0.15,
help='probability of masking tokens')
parser.add_argument('--seq-length', type=int, default=512,
help='maximum sequence length')
parser.add_argument('--short-seq-prob', type=float, default=0.1,
help='probability of creating a short sequence')
parser.add_argument('--seed', type=int, default=1234,
help='random seed')
args = parser.parse_args()
if args.dataset_impl == "infer":
args.dataset_impl = indexed_dataset.infer_dataset_impl(args.data)
test_indexed_dataset(args)
test_albert_dataset(args)
# test_indexed_dataset(args)
if __name__ == "__main__":
main()