Commit 6140718f, authored Nov 08, 2019 by Mohammad Shoeybi

before optimization

parent c125d247
Showing 2 changed files with 85 additions and 64 deletions (+85 -64):

megatron/data/dataset.py        +72 -57
megatron/data/dataset_utils.py  +13 -7
megatron/data/dataset.py
@@ -7,18 +7,55 @@ import numpy as np
 import torch
 from torch.utils.data import Dataset
-
-# WILL BE REPLACED WITH JARED'S
-class JaredDataset(object):
-
-    def __init__(self, doc_idx, sizes, sentences):
-        self.doc_idx = doc_idx
-        self.num_docs = len(self.doc_idx) - 1
-        self.sizes = sizes
-        self.sentences = sentences
-
-    def __getitem__(self, idx):
-        return self.sentences[idx]
+from dataset_utils import build_training_sample
+
+
+class AlbertDataSet(Dataset):
+
+    def __init__(self, indexed_dataset, tokenizer, num_epochs,
+                 masked_lm_prob, max_seq_length, short_seq_prob,
+                 seed):
+
+        # Params to store.
+        self.seed = seed
+        self.masked_lm_prob = masked_lm_prob
+        self.max_seq_length = max_seq_length
+
+        # Indexed dataset.
+        self.indexed_dataset = indexed_dataset
+
+        # Build the samples mapping.
+        self.samples_mapping = build_training_samples_mapping(
+            indexed_dataset, num_epochs, self.max_seq_length,
+            short_seq_prob, self.seed)
+
+        # Vocab stuff.
+        self.vocab_id_list = list(tokenizer.inv_vocab.keys())
+        self.vocab_id_to_token_dict = tokenizer.inv_vocab
+        self.cls_id = tokenizer.vocab['[CLS]']
+        self.sep_id = tokenizer.vocab['[SEP]']
+        self.mask_id = tokenizer.vocab['[MASK]']
+        self.pad_id = tokenizer.vocab['[PAD]']
+
+    def __len__(self):
+        return self.samples.shape[0]
+
+    def __getitem__(self, idx):
+        rng = random.Random(self.seed + idx)
+        start_index, end_index, seq_length = self.samples_mapping[idx]
+        sample = []
+        for index in range(start_index, end_index):
+            sample.append(self.indexed_dataset[index])
+        return build_training_sample(sample, seq_length,
+                                     self.max_seq_length,
+                                     self.vocab_id_list,
+                                     self.vocab_id_to_token_dict,
+                                     self.cls_id, self.sep_id,
+                                     self.mask_id, self.pad_id,
+                                     self.masked_lm_prob, rng)
 
 
 def get_target_seq_length(max_num_tokens, short_seq_prob, np_rng):
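For orientation, here is a minimal sketch of the data flow the new __getitem__ relies on: each row of samples_mapping is expected to carry (start_index, end_index, seq_length), and the index range selects consecutive tokenized sentences from the indexed dataset. The toy data below (toy_sentences, toy_mapping) is illustrative only and not part of the commit.

    import numpy as np

    # Hypothetical stand-ins for the structures built in the __main__ block:
    # a flat list of tokenized sentences and the samples-mapping rows.
    toy_sentences = [[101, 7592, 102], [101, 2088, 102], [101, 2742, 102]]
    toy_mapping = np.array([[0, 2, 6],   # sentences 0..1, target length 6
                            [1, 3, 6]])  # sentences 1..2, target length 6

    idx = 0
    start_index, end_index, seq_length = toy_mapping[idx]
    # This is the `sample` list that __getitem__ hands to build_training_sample.
    sample = [toy_sentences[i] for i in range(start_index, end_index)]
    print(sample, seq_length)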
@@ -87,6 +124,7 @@ def build_training_samples_mapping(indexed_dataset, num_epochs, max_seq_length,
         while sent_index < sent_index_last:
             # Get the size.
+            assert indexed_dataset.sizes[sent_index] > 0
             size += indexed_dataset.sizes[sent_index]
             sent_index += 1
@@ -133,51 +171,17 @@ def build_training_samples_mapping(indexed_dataset, num_epochs, max_seq_length,
     return samples_np
 
 
-class AlbertDataSet(Dataset):
-
-    def __init__(self, indexed_dataset, tokenizer, num_epochs,
-                 masked_lm_prob, max_seq_length, short_seq_prob,
-                 seed):
-
-        # Params to store.
-        self.seed = seed
-        self.masked_lm_prob = masked_lm_prob
-        self.max_seq_length = max_seq_length
-
-        # Indexed dataset.
-        self.indexed_dataset = indexed_dataset
-
-        # Build the samples mapping.
-        self.samples_mapping = build_training_samples_mapping(
-            indexed_dataset, num_epochs, self.max_seq_length,
-            short_seq_prob, self.seed)
-
-        # Vocab stuff.
-        self.vocab_id_list = list(tokenizer.inv_vocab.keys())
-        self.vocab_id_to_token_dict = tokenizer.inv_vocab
-        self.cls_id = tokenizer.vocab['[CLS]']
-        self.sep_id = tokenizer.vocab['[SEP]']
-        self.mask_id = tokenizer.vocab['[MASK]']
-        self.pad_id = tokenizer.vocab['[PAD]']
-
-    def __len__(self):
-        return self.samples.shape[0]
-
-    def __getitem__(self, idx):
-        rng = random.Random(self.seed + idx)
-        start_index, end_index = self.samples_mapping[idx]
-        sample = []
-        for index in range(start_index, end_index):
-            sample.append(self.indexed_dataset[index])
-        return build_training_sample(sample,
-                                     self.vocab_id_list,
-                                     self.vocab_id_to_token_dict,
-                                     self.cls_id, self.sep_id,
-                                     self.mask_id, self.pad_id,
-                                     self.masked_lm_prob,
-                                     self.max_seq_length, rng)
+# WILL BE REPLACED WITH JARED'S
+class JaredDataset(object):
+
+    def __init__(self, doc_idx, sizes, sentences):
+        self.doc_idx = doc_idx
+        self.num_docs = len(self.doc_idx) - 1
+        self.sizes = sizes
+        self.sentences = sentences
+
+    def __getitem__(self, idx):
+        return self.sentences[idx]
@@ -198,10 +202,12 @@ if __name__ == '__main__':
         sentences = []
         for line in text.split('\n'):
             if line != '\n':
-                sentences.extend(nltk.tokenize.sent_tokenize(line))
+                sent = nltk.tokenize.sent_tokenize(line)
+                if sent:
+                    sentences.extend(sent)
         yield sentences
 
-    input_file = '/raid/mshoeybi/data/albert/sample/samples_11.json'
+    input_file = '/raid/mshoeybi/data/albert/sample/samples_1000.json'
    vocab_file = '/raid/mshoeybi/data/albert/bert_vocab/vocab.txt'
    tokenizer = FullTokenizer(vocab_file, do_lower_case=True)
@@ -212,19 +218,28 @@ if __name__ == '__main__':
     sentences_list = []
     for sentences in document_generator:
-        doc_idx.append(len(sentences))
+        num_sent = 0
         for sentence in sentences:
             tokens = tokenizer.tokenize(sentence)
-            ids = tokenizer.convert_tokens_to_ids(tokens)
-            sizes.append(len(ids))
-            sentences_list.append(ids)
+            if tokens:
+                ids = tokenizer.convert_tokens_to_ids(tokens)
+                if len(ids) == 0:
+                    print('****************')
+                    print(sentence)
+                    print(tokens)
+                    print(ids)
+                    print('****************')
+                sizes.append(len(ids))
+                sentences_list.append(ids)
+                num_sent += 1
+        doc_idx.append(num_sent)
 
     for i in range(1, len(doc_idx)):
         doc_idx[i] += doc_idx[i-1]
 
     indexed_dataset = JaredDataset(doc_idx, sizes, sentences_list)
 
     dataset = AlbertDataSet(indexed_dataset=indexed_dataset,
                             tokenizer=tokenizer,
-                            num_epochs=3,
+                            num_epochs=10,
                             masked_lm_prob=0.15,
                             max_seq_length=512,
                             short_seq_prob=0.1,
...
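One detail of the __main__ wiring above that is easy to miss: doc_idx collects per-document sentence counts, and the doc_idx[i] += doc_idx[i-1] loop turns them into cumulative offsets into the flat sentences_list (hence num_docs = len(doc_idx) - 1 in JaredDataset). A minimal sketch with made-up counts; the leading 0 sentinel is an assumption, since doc_idx's initialization lies outside the hunks shown.

    # Made-up counts for three documents with 3, 2 and 4 sentences.
    doc_idx = [0, 3, 2, 4]
    for i in range(1, len(doc_idx)):
        doc_idx[i] += doc_idx[i - 1]
    print(doc_idx)               # [0, 3, 5, 9]: document d spans sentences
                                 # doc_idx[d]..doc_idx[d+1] in sentences_list
    num_docs = len(doc_idx) - 1  # 3, as computed in JaredDataset.__init__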
megatron/data/dataset_utils.py
@@ -5,13 +5,18 @@ import collections
 import numpy as np
 
 
-def build_training_sample(sample, vocab_id_list, vocab_id_to_token_dict,
+def build_training_sample(sample, target_seq_length, max_seq_length,
+                          vocab_id_list, vocab_id_to_token_dict,
                           cls_id, sep_id, mask_id, pad_id,
-                          masked_lm_prob, max_seq_length, rng):
+                          masked_lm_prob, rng):
     """Biuld training sample.
 
     Arguments:
         sample: A list of sentences in which each sentence is a list token ids.
+        target_seq_length: Desired sequence length.
+        max_seq_length: Maximum length of the sequence. All values are padded to
+            this length.
         vocab_id_list: List of vocabulary ids. Used to pick a random id.
         vocab_id_to_token_dict: A dictionary from vocab ids to text tokens.
         cls_id: Start of example id.
@@ -19,20 +24,19 @@ def build_training_sample(sample, vocab_id_list, vocab_id_to_token_dict,
         mask_id: Mask token id.
         pad_id: Padding token id.
         masked_lm_prob: Probability to mask tokens.
-        max_seq_length: Maximum length of the sequence. All values are padded to
-            this length.
         rng: Random number genenrator.
     """
 
     # We assume that we have at least two sentences in the sample
     assert len(sample) > 1
+    assert target_seq_length <= max_seq_length
 
     # Divide sample into two segments (A and B).
     tokens_a, tokens_b, is_next_random = get_a_and_b_segments(sample, rng)
 
-    # Truncate to `max_sequence_length`.
+    # Truncate to `target_sequence_length`.
     # Note that we have account for [CLS] A [SEP] B [SEP]
-    max_num_tokens = max_seq_length - 3
+    max_num_tokens = target_seq_length - 3
     truncate_segments(tokens_a, tokens_b, len(tokens_a), len(tokens_b),
                       max_num_tokens, rng)
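As a quick check of the "- 3" accounting above (one [CLS] plus two [SEP] tokens around segments A and B), a small worked sketch with hypothetical lengths; the numbers are illustrative, not from the commit.

    target_seq_length = 128
    max_seq_length = 512

    max_num_tokens = target_seq_length - 3   # token budget shared by A and B
    len_a, len_b = 70, 55                    # hypothetical lengths after truncate_segments
    assert len_a + len_b <= max_num_tokens
    total = 1 + len_a + 1 + len_b + 1        # [CLS] A [SEP] B [SEP] = 128 tokens
    num_pad = max_seq_length - total         # 384 positions filled with pad_id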
@@ -421,11 +425,13 @@ if __name__ == '__main__':
     for s in samples[0]:
         sample.append(tokenizer.convert_tokens_to_ids(s))
 
     max_seq_length = 512
+    target_seq_length = 444
     masked_lm_prob = 0.15
-    example = build_training_sample(sample,
+    example = build_training_sample(sample, target_seq_length, max_seq_length,
                                     vocab_id_list, vocab_id_to_token_dict,
                                     cls_id, sep_id, mask_id, pad_id,
-                                    masked_lm_prob, max_seq_length, rng)
+                                    masked_lm_prob, rng)
 
     orig_tokens = []
     for s in samples[0]:
...
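get_target_seq_length(max_num_tokens, short_seq_prob, np_rng) is referenced in dataset.py but its body lies outside the hunks shown. The sketch below is a guess at the usual BERT-style behaviour (with probability short_seq_prob, draw a shorter random target length); it is not the commit's implementation.

    import numpy as np

    def get_target_seq_length(max_num_tokens, short_seq_prob, np_rng):
        """Sketch: mostly use the full token budget, sometimes a shorter one."""
        if np_rng.rand() < short_seq_prob:
            return np_rng.randint(2, max_num_tokens + 1)
        return max_num_tokens

    np_rng = np.random.RandomState(1234)
    print(get_target_seq_length(509, 0.1, np_rng))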