Commit f51ceb7c authored by Mohammad Shoeybi

c++ code working

parent 6140718f
@@ -8,7 +8,7 @@ import torch
from torch.utils.data import Dataset
from dataset_utils import build_training_sample
#from data.mapping import build_training_samples_mapping
class AlbertDataSet(Dataset):
@@ -57,7 +57,7 @@ class AlbertDataSet(Dataset):
self.mask_id, self.pad_id,
self.masked_lm_prob, rng)
'''
def get_target_seq_length(max_num_tokens, short_seq_prob, np_rng):
"""With probability `short_seq_prob` generate a smaller sequence lenght."""
if np_rng.random() < short_seq_prob:
@@ -169,7 +169,7 @@ def build_training_samples_mapping(indexed_dataset, num_epochs, max_seq_length,
print('****************************************************************\n')
return samples_np
'''
# WILL BE REPLACED WITH JARED'S
class JaredDataset(object):
@@ -207,7 +207,7 @@ if __name__ == '__main__':
sentences.extend(sent)
yield sentences
input_file = '/raid/mshoeybi/data/albert/sample/samples_1000.json'
input_file = '/raid/mshoeybi/data/albert/sample/samples_11.json'
vocab_file = '/raid/mshoeybi/data/albert/bert_vocab/vocab.txt'
tokenizer = FullTokenizer(vocab_file, do_lower_case=True)
@@ -236,6 +236,55 @@ if __name__ == '__main__':
for i in range(1, len(doc_idx)):
doc_idx[i] += doc_idx[i-1]
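# After this prefix sum, doc_idx holds cumulative sentence offsets: the
# sentences of document d occupy indices [doc_idx[d], doc_idx[d+1]) of
# `sizes`, which is the layout the C++ build_mapping below expects (it
# checks that the last document value equals len(sizes)). Hypothetical
# example: documents with 2 and 3 sentences give doc_idx = [0, 2, 5], so
# document 1 owns sizes[2:5].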
#max_size = np.iinfo(np.int32).max // 32
import time
docs_np = np.array(doc_idx, dtype=np.uint32)
sizes_np = np.array(sizes, dtype=np.uint16)
start_time = time.time()
max_seq_length = 512
max_size = docs_np.shape[0]
lens = np.full(max_size, max_seq_length-3, dtype=np.uint16)
lens_rand = np.random.randint(low=2, high=(max_seq_length-2),
size=max_size//10, dtype=np.uint16)
lens_view = lens[:max_size//10]
np.copyto(lens_view, lens_rand)
np.random.shuffle(lens)
print('num docs', max_size)
print('lens time', time.time() - start_time)
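# The block above pre-computes target lengths in NumPy: roughly one entry
# in ten gets a random short length in [2, max_seq_length - 2) and the rest
# stay at max_seq_length - 3, mirroring short_seq_prob = 0.1 in the C++
# helper below. An illustrative sanity check (an assumption, not part of
# the original script):
assert lens.min() >= 2 and lens.max() <= max_seq_length - 3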
import helpers
start_time = time.time()
maps = helpers.build_mapping(docs_np, sizes_np, 10, 100, 509, 0.1, 1234)
print('maps time', time.time() - start_time)
print(maps)
exit()
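# helpers.build_mapping returns a (num_samples, 3) uint32 array whose rows
# are [first_sentence, last_sentence_exclusive, target_seq_length] into the
# flat sentence arrays. A hypothetical decoding of the first sample:
#
#   start, end, target_len = maps[0]
#   sample_tokens = sizes_np[start:end].sum()  # >= target_len unless the
#                                              # document ended early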
start_time = time.time()
max_size = 10 #np.iinfo(np.int32).max // 32
docs = np.arange(10, dtype=np.uint32)
print(docs)
a = example.doit(docs, max_size)
print(type(a))
print(a.shape)
print(a)
print(time.time() - start_time)
exit()
#start_time = time.time()
count = doit(maps, docs_np, sizes_np, lens, docs_np.shape[0]-1, 10)
print(count)
maps = maps[:count]
np.random.shuffle(maps)
print(time.time() - start_time)
exit()
indexed_dataset = JaredDataset(doc_idx, sizes, sentences_list)
dataset = AlbertDataSet(indexed_dataset=indexed_dataset,
tokenizer=tokenizer,
......
#include <algorithm>
#include <iostream>
#include <limits>
#include <math.h>
#include <stdexcept>
#include <pybind11/pybind11.h>
#include <pybind11/numpy.h>
namespace py = pybind11;
using namespace std;
inline uint32_t get_sample_len(const int short_seq_ratio,
const uint32_t max_length) {
/* Training sample length. */
const auto random_number = rand();
if ((random_number % short_seq_ratio) == 0) {
return 2 + random_number % (max_length - 1);
}
return max_length;
}
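// Note: for short_seq_prob = 0.1 the caller passes short_seq_ratio =
// round(1 / 0.1) = 10, so (rand() % 10 == 0) fires for roughly one call in
// ten, and the short length 2 + random_number % (max_length - 1) is drawn
// from [2, max_length]. All randomness comes from rand(), which is
// re-seeded with srand(seed) at the start of each of the two passes below
// so both passes draw identical lengths.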
py::array_t<uint32_t> build_mapping(const py::array_t<uint32_t>& docs_,
const py::array_t<uint16_t>& sizes_,
const int num_epochs,
const int max_num_samples,
const int max_seq_length,
const double short_seq_prob,
const int seed) {
cout << "> building dataset mapping for " << docs_.shape(0) - 1 <<
" documents with " << sizes_.shape(0) << " sentences ..." << endl;
// For efficiency, convert probability to ratio.
const int short_seq_ratio = int(round(1.0 / short_seq_prob));
// Remove bound checks.
auto docs = docs_.unchecked<1>();
auto sizes = sizes_.unchecked<1>();
// Check for consistency.
if (docs[docs.shape(0) - 1] != sizes.shape(0)) {
cout << "document values is not consistent with length of sizes: " <<
docs[docs.shape(0) - 1] << " != " << sizes.shape(0) << endl;
throw(-1);
}
// Mapping and its length (1D).
int num_samples = -1;
uint32_t* maps = nullptr;
// Perform two iterations: the first iteration computes the number of
// samples and allocates memory, and the second iteration populates the map.
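// (This count-then-fill pattern keeps the allocation exact and avoids the
// reallocation/copy cost of growing a std::vector; the price is running
// the sampling loop twice over identical rand() draws.)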
bool second = false;
for (int iteration=0; iteration < 2; ++iteration) {
// Set the seed so both iterations produce the same results.
srand(seed);
// Set the flag on second iteration.
if (iteration == 1) {
second = true;
}
// Counters:
uint32_t empty_docs = 0;
uint32_t one_sent_docs = 0;
// Current map index.
uint64_t map_index = 0;
// For each epoch:
for (int epoch=0; epoch < num_epochs; ++epoch) {
if (map_index >= uint64_t(max_num_samples)) {
cout << " > reached " << max_num_samples << " samples after " <<
epoch << " epochs ..." << endl;
break;
}
// For each document:
for (int doc=0; doc < (docs.shape(0) - 1); ++doc) {
// Document sentences are in [sent_index_first, sent_index_last).
const uint32_t sent_index_first = docs[doc];
const uint32_t sent_index_last = docs[doc + 1];
// At the beginning of the document, the previous index is the start index.
uint32_t prev_start_index = sent_index_first;
// Number of sentences remaining in the document.
uint32_t num_remain_sent = sent_index_last - sent_index_first;
// Some bookkeeping
if ((epoch == 0) && (!second)) {
if (num_remain_sent == 0) {
cout << "***WARNING*** document " << doc << " is empty" << endl;
empty_docs += 1;
}
if (num_remain_sent == 1) {
cout << "***WARNING*** document " << doc <<
" has one sentence" << endl;
one_sent_docs += 1;
}
}
// If the document has at least two sentences.
if (num_remain_sent > 1) {
// Set values.
uint32_t size = 0;
uint32_t num_sent = 0;
uint32_t seq_len = get_sample_len(short_seq_ratio, max_seq_length);
// Loop through sentences.
for (uint32_t sent_index=sent_index_first;
sent_index < sent_index_last; ++sent_index) {
// Add the size and number of sentences.
size += sizes[sent_index];
num_sent += 1;
num_remain_sent -= 1;
// Emit a sample if we have reached the target length,
// more than one sentence remains in the document,
// and the sample has at least two sentences;
// or if we have reached the end of the document.
if (((size >= seq_len) && (num_remain_sent > 1) &&
(num_sent > 1) ) || (num_remain_sent == 0)) {
// Populate the map.
if (second) {
const uint64_t map_index_0 = 3 * map_index;
maps[map_index_0] = prev_start_index;
maps[map_index_0 + 1] = sent_index + 1;
maps[map_index_0 + 2] = seq_len;
}
// Update indices / counters.
map_index += 1;
prev_start_index = sent_index + 1;
seq_len = get_sample_len(short_seq_ratio, max_seq_length);
size = 0;
num_sent = 0;
}
}
} // if (num_remain_sent > 1) {
} // for (int doc=0; doc < num_docs; ++doc) {
} // for (int epoch=0; epoch < num_epochs; ++epoch) {
// For now only support mappings up to MAX_INT.
if (map_index > std::numeric_limits<int>::max()) {
cout << "number of samples ("<< map_index <<") exceeded MAX_INT" << endl;
throw std::overflow_error("number of samples exceeded MAX_INT");
}
else if (!second) {
cout << " number of samples: " <<
map_index << endl;
cout << " number of empty documents: " <<
empty_docs << endl;
cout << " number of documents with one sentence: " <<
one_sent_docs << endl;
maps = new uint32_t[3*map_index];
num_samples = int(map_index);
}
} // for (int iteration=0; iteration < 2; ++iteration) {
// Shuffle the samples with an in-place Fisher-Yates pass over 3-element rows.
for (int i=(num_samples - 1); i > 0; --i) {
const int j = rand() % (i + 1);
uint64_t i0 = 3 * i;
uint64_t j0 = 3 * j;
// Swap values.
swap(maps[i0], maps[j0]);
swap(maps[i0 + 1], maps[j0 + 1]);
swap(maps[i0 + 2], maps[j0 + 2]);
}
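// Note: both passes above make the same sequence of rand() calls, so this
// shuffle starts from a deterministic position in the stream and the final
// sample order depends only on `seed`.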
cout << " > done building the mapping." << endl;
// Capsule that frees the mapping memory once the numpy array is destroyed.
py::capsule free_when_done(maps, [](void *mem_) {
uint32_t *mem = reinterpret_cast<uint32_t *>(mem_);
cout << "freeing memory for the dataset mapping" << endl;
delete[] mem;
});
// Return the numpy array.
return py::array_t<uint32_t>({num_samples, 3}, // shape
{3*4, 4}, // C-style contiguous strides
maps, // the data pointer
free_when_done); // numpy array references
}
PYBIND11_MODULE(helpers, m) {
m.def("build_mapping", &build_mapping);
}
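A minimal usage sketch, assuming the new file is saved as helpers.cpp (the filename is not shown in this commit) and pybind11 is installed. The build line is the standard pybind11 recipe; the arguments mirror the call in the Python driver above:

# c++ -O3 -shared -std=c++11 -fPIC $(python3 -m pybind11 --includes) \
#     helpers.cpp -o helpers$(python3-config --extension-suffix)
import numpy as np
import helpers

# Two hypothetical documents with 2 and 3 sentences; docs holds cumulative
# sentence offsets and sizes the per-sentence token counts.
docs = np.array([0, 2, 5], dtype=np.uint32)
sizes = np.array([200, 300, 150, 250, 400], dtype=np.uint16)
maps = helpers.build_mapping(docs, sizes,
                             10,    # num_epochs
                             100,   # max_num_samples
                             509,   # max_seq_length
                             0.1,   # short_seq_prob
                             1234)  # seed
print(maps.shape)  # (num_samples, 3): [start, end, target_seq_length] rows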