Addressed Jared's comments

1c1a55da · Mohammad · 898fcb94 · 1c1a55da · 1c1a55da
Commit 1c1a55da authored Apr 09, 2020 by Mohammad
Hide whitespace changes
Inline Side-by-side

Showing with 5 additions and 3 deletions

megatron/data/gpt2_dataset.py megatron/data/gpt2_dataset.py +1 -1

megatron/data/helpers.cpp megatron/data/helpers.cpp +4 -2

No files found.
--- a/megatron/data/gpt2_dataset.py
+++ b/megatron/data/gpt2_dataset.py
@@ -253,7 +253,7 @@ def _build_sample_idx(sizes, doc_idx, seq_length,
                      num_epochs, tokens_per_epoch):
    """Sample index mapping is a 2D array with sizes
    [number-of-samples + 1, 2] where [..., 0] contains
-    the index into `doc_idx` and [..., 0] is the
+    the index into `doc_idx` and [..., 1] is the
    starting offset in that document."""

    # Total number of samples. For -1 see comments in `_num_epochs`.

--- a/megatron/data/helpers.cpp
+++ b/megatron/data/helpers.cpp
@@ -38,8 +38,10 @@ py::array build_sample_idx(const py::array_t<int32_t>& sizes_,
 			   const int32_t seq_length,
 			   const int32_t num_epochs,
 			   const int64_t tokens_per_epoch) {
-    /* Sample index mapping is a 2D array with sizes [number-of-samples + 1, 2]
-       where [..., 0] contains the index into `doc_idx` and [..., 0] is the
+    /* Sample index (sample_idx) is used for GPT-2-like datasets, for which
+       the documents are flattened and the samples are built based on this
+       1-D flattened array. It is a 2D array with sizes [number-of-samples + 1, 2]
+       where [..., 0] contains the index into `doc_idx` and [..., 1] is the
       starting offset in that document.*/

    // Consistency checks.