"src/libtorchaudio/sox/io.cpp" did not exist on "fd7ace17938c74d0928987b9525dbe7799b328fa"
Commit f6a6811f authored by Mohammad Shoeybi

fixed padding issue

parent 9ea9d50f
"""TO BE ADDED """
import os
import random
import time
import os
import numpy as np
import torch
from torch.utils.data import Dataset
from .dataset_utils import build_training_sample
#from data.mapping import build_training_samples_mapping
from . import helpers
from megatron.data import helpers
from megatron.data import FullBertTokenizer
from megatron.data.dataset_utils import build_training_sample
from megatron.data.indexed_dataset import make_dataset as make_indexed_dataset
from megatron.utils import print_rank_0
class AlbertDataset(Dataset):
def __init__(self,
vocab_file, data_prefix, data_impl, skip_warmup,
num_epochs, max_num_samples,
masked_lm_prob, max_seq_length, short_seq_prob, seed):
def __init__(self, vocab_file, data_prefix, data_impl, skip_warmup,
num_epochs, max_num_samples, masked_lm_prob, max_seq_length,
short_seq_prob, seed):
# Params to store.
self.seed = seed
@@ -32,25 +28,26 @@ class AlbertDataset(Dataset):
self.tokenizer = FullBertTokenizer(vocab_file, do_lower_case=True)
# Indexed dataset.
self.indexed_dataset = self._get_indexed_dataset(data_prefix, data_impl,
skip_warmup)
self.indexed_dataset = get_indexed_dataset_(data_prefix,
data_impl,
skip_warmup)
# Build the samples mapping.
self.samples_mapping = self._get_samples_mapping(self.indexed_dataset,
data_prefix,
num_epochs,
max_num_samples,
self.max_seq_length,
short_seq_prob,
self.seed)
self.samples_mapping = get_samples_mapping_(self.indexed_dataset,
data_prefix,
num_epochs,
max_num_samples,
self.max_seq_length,
short_seq_prob,
self.seed)
# Vocab stuff.
self.vocab_id_list = list(tokenizer.inv_vocab.keys())
self.vocab_id_to_token_dict = tokenizer.inv_vocab
self.cls_id = tokenizer.vocab['[CLS]']
self.sep_id = tokenizer.vocab['[SEP]']
self.mask_id = tokenizer.vocab['[MASK]']
self.pad_id = tokenizer.vocab['[PAD]']
self.vocab_id_list = list(self.tokenizer.inv_vocab.keys())
self.vocab_id_to_token_dict = self.tokenizer.inv_vocab
self.cls_id = self.tokenizer.vocab['[CLS]']
self.sep_id = self.tokenizer.vocab['[SEP]']
self.mask_id = self.tokenizer.vocab['[MASK]']
self.pad_id = self.tokenizer.vocab['[PAD]']
exit()
@@ -64,6 +61,8 @@ class AlbertDataset(Dataset):
def __getitem__(self, idx):
# Note that this rng state should be python and not numpy since
# python randint is inclusive whereas the numpy one is exclusive.
rng = random.Random(self.seed + idx)
start_index, end_index, seq_length = self.samples_mapping[idx]
sample = []
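The comment above about python vs. numpy randint is easy to verify; a minimal standalone check (not part of the diff) is:

import random
import numpy as np

# Per-sample determinism: seeding with (seed + idx), as __getitem__ does,
# gives every index its own reproducible stream.
py_rng = random.Random(1234 + 0)
np_rng = np.random.RandomState(1234 + 0)

# Python's randint(a, b) includes b; numpy's randint(a, b) excludes b.
py_draws = {py_rng.randint(0, 2) for _ in range(1000)}       # will contain 2
np_draws = {int(np_rng.randint(0, 2)) for _ in range(1000)}  # only 0 and 1
assert 2 not in np_draws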
@@ -82,82 +81,81 @@ class AlbertDataset(Dataset):
def _get_indexed_dataset(self, data_prefix, data_impl, skip_warmup):
start_time = time.time()
print_rank_0("> Reading dataset index ...")
indexed_dataset = make_indexed_dataset(data_prefix,
data_impl,
skip_warmup)
print_rank_0("> Finished creating indexed dataset in {:4f} "
"seconds".format(time.time() - start_time))
return indexed_dataset
def _get_samples_mapping(self,
indexed_dataset,
data_prefix,
num_epochs,
max_num_samples,
max_seq_length,
short_seq_prob,
seed):
if not num_epochs:
if not max_num_samples:
raise ValueError("Need to specify either max_num_samples "
"or num_epochs")
num_epochs = np.iinfo(np.int32).max - 1
def get_indexed_dataset_(data_prefix, data_impl, skip_warmup):
start_time = time.time()
print_rank_0("> Reading dataset index ...")
indexed_dataset = make_indexed_dataset(data_prefix,
data_impl,
skip_warmup)
print_rank_0("> Finished creating indexed dataset in {:4f} "
"seconds".format(time.time() - start_time))
return indexed_dataset
def get_samples_mapping_(indexed_dataset,
data_prefix,
num_epochs,
max_num_samples,
max_seq_length,
short_seq_prob,
seed):
if not num_epochs:
if not max_num_samples:
max_num_samples = np.iinfo(np.int64).max - 1
# Filename of the index mapping
indexmap_filename = data_prefix
indexmap_filename += '_indexmap'
indexmap_filename += '_{}ep'.format(num_epochs)
indexmap_filename += '_{}mns'.format(max_num_samples)
indexmap_filename += '_{}msl'.format(max_seq_length)
indexmap_filename += '_{:0.2f}ssp'.format(short_seq_prob)
indexmap_filename += '_{}s'.format(seed)
indexmap_filename += '.npy'
# Build the indexed mapping if not exist.
if torch.distributed.get_rank() == 0 and \
not os.path.isfile(indexmap_filename):
print('WARNING: could not find index map file {}, building '
'the indices on rank 0 ...'.format(indexmap_filename))
# Make sure the types match the helpers input types.
assert indexed_dataset.doc_idx.dtype == np.int64
assert indexed_dataset.sizes.dtype == np.int32
# Build samples mapping
verbose = torch.distributed.get_rank()==0
start_time = time.time()
samples_mapping = helpers.build_mapping(
indexed_dataset.doc_idx,
indexed_dataset.sizes,
num_epochs,
max_num_samples,
max_seq_length-3, # account for added tokens
short_seq_prob,
seed,
verbose)
np.save(indexmap_filename, samples_mapping, allow_pickle=True)
# Make sure all the ranks have built the mapping
print_rank_0('> elapsed time to build and save samples mapping '
'(seconds): {:4f}'.format(
time.time() - start_time))
torch.distributed.barrier()
# Load indexed dataset.
print_rank_0('> loading indexed mapping from {}'.format(
indexmap_filename))
raise ValueError("Need to specify either max_num_samples "
"or num_epochs")
num_epochs = np.iinfo(np.int32).max - 1
if not max_num_samples:
max_num_samples = np.iinfo(np.int64).max - 1
# Filename of the index mapping
indexmap_filename = data_prefix
indexmap_filename += '_indexmap'
indexmap_filename += '_{}ep'.format(num_epochs)
indexmap_filename += '_{}mns'.format(max_num_samples)
indexmap_filename += '_{}msl'.format(max_seq_length)
indexmap_filename += '_{:0.2f}ssp'.format(short_seq_prob)
indexmap_filename += '_{}s'.format(seed)
indexmap_filename += '.npy'
# Build the indexed mapping if not exist.
if torch.distributed.get_rank() == 0 and \
not os.path.isfile(indexmap_filename):
print('WARNING: could not find index map file {}, building '
'the indices on rank 0 ...'.format(indexmap_filename))
# Make sure the types match the helpers input types.
assert indexed_dataset.doc_idx.dtype == np.int64
assert indexed_dataset.sizes.dtype == np.int32
# Build samples mapping
verbose = torch.distributed.get_rank() == 0
start_time = time.time()
samples_mapping = np.load(indexmap_filename, allow_pickle=True)
print_rank_0(' loaded indexed file in {:3.3f} seconds'.format(
time.time() - start_time))
print_rank_0(' total number of samples: {}'.format(
samples_mapping.shape[0]))
samples_mapping = helpers.build_mapping(
indexed_dataset.doc_idx,
indexed_dataset.sizes,
num_epochs,
max_num_samples,
max_seq_length-3, # account for added tokens
short_seq_prob,
seed,
verbose)
np.save(indexmap_filename, samples_mapping, allow_pickle=True)
# Make sure all the ranks have built the mapping
print_rank_0('> elapsed time to build and save samples mapping '
'(seconds): {:4f}'.format(
time.time() - start_time))
torch.distributed.barrier()
# Load indexed dataset.
print_rank_0('> loading indexed mapping from {}'.format(
indexmap_filename))
start_time = time.time()
samples_mapping = np.load(indexmap_filename, allow_pickle=True)
print_rank_0(' loaded indexed file in {:3.3f} seconds'.format(
time.time() - start_time))
print_rank_0(' total number of samples: {}'.format(
samples_mapping.shape[0]))
return samples_mapping
return samples_mapping
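get_samples_mapping_ above follows a build-once, cache-on-disk pattern: rank 0 creates and saves the mapping if the .npy file is missing, every rank synchronizes at a barrier, and then all ranks load the same file. A condensed sketch of just that pattern (build_fn is a hypothetical stand-in for helpers.build_mapping):

import os
import time
import numpy as np
import torch

def build_or_load_cached(filename, build_fn):
    # Only rank 0 does the expensive build, and only if the cache is missing.
    if torch.distributed.get_rank() == 0 and not os.path.isfile(filename):
        start_time = time.time()
        np.save(filename, build_fn(), allow_pickle=True)
        print('built and saved {} in {:.1f}s'.format(filename,
                                                     time.time() - start_time))
    # All ranks wait here so the file exists before anyone reads it.
    torch.distributed.barrier()
    return np.load(filename, allow_pickle=True)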
'''
@@ -274,6 +272,7 @@ def build_training_samples_mapping(indexed_dataset, num_epochs, max_seq_length,
return samples_np
'''
'''
# WILL BE REPLACED WITH JARED'S
class JaredDataset(object):
@@ -395,3 +394,4 @@ if __name__ == '__main__':
max_seq_length=512,
short_seq_prob=0.1,
seed=1234)
'''
@@ -24,7 +24,9 @@ def build_training_sample(sample,
mask_id: Mask token id.
pad_id: Padding token id.
masked_lm_prob: Probability to mask tokens.
rng: Random number generator.
rng: Random number generator. Note that this rng state should be
python and not numpy since python randint is inclusive for
the upper bound whereas the numpy one is exclusive.
"""
# We assume that we have at least two sentences in the sample
@@ -36,8 +38,8 @@ def build_training_sample(sample,
# Truncate to `target_sequence_length`.
max_num_tokens = target_seq_length
truncated = truncate_segments(tokens_a, tokens_b, len(tokens_a), len(tokens_b),
max_num_tokens, rng)
truncated = truncate_segments(tokens_a, tokens_b, len(tokens_a),
len(tokens_b), max_num_tokens, rng)
# Build tokens and tokentypes.
tokens, tokentypes = create_tokens_and_tokentypes(tokens_a, tokens_b,
@@ -50,17 +52,17 @@ def build_training_sample(sample,
cls_id, sep_id, mask_id, max_predictions_per_seq, rng)
# Padding.
tokens_np, tokentypes_np, labels, padding_mask, loss_mask \
tokens_np, tokentypes_np, labels_np, padding_mask_np, loss_mask_np \
= pad_and_convert_to_numpy(tokens, tokentypes, masked_positions,
masked_labels, pad_id, max_seq_length)
train_sample = {
'text': tokens_np,
'types': tokentypes_np,
'labels': labels,
'labels': labels_np,
'is_random': int(is_next_random),
'loss_mask': loss_mask,
'padding_mask': padding_mask,
'loss_mask': loss_mask_np,
'padding_mask': padding_mask_np,
'truncated': int(truncated)}
return train_sample
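create_tokens_and_tokentypes is not shown in this hunk; for a standard BERT-style pair it presumably lays out [CLS] A [SEP] B [SEP], which is also why the mapping builder above reserves three positions (max_seq_length - 3). A rough sketch of that layout, assuming the usual convention rather than the actual helper:

def create_tokens_and_tokentypes_sketch(tokens_a, tokens_b, cls_id, sep_id):
    # [CLS] A ... [SEP] B ... [SEP]  -> three added special tokens.
    tokens = [cls_id] + tokens_a + [sep_id] + tokens_b + [sep_id]
    # Segment id 0 covers [CLS], A and the first [SEP]; segment id 1 covers
    # B and the final [SEP].
    tokentypes = [0] * (len(tokens_a) + 2) + [1] * (len(tokens_b) + 1)
    return tokens, tokentypes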
@@ -357,7 +359,8 @@ def pad_and_convert_to_numpy(tokens, tokentypes, masked_positions,
tokentypes_np = np.array(tokentypes + filler, dtype=np.int64)
# Padding mask.
padding_mask_np = np.array([1]*num_tokens + [0]*padding_length, dtype=np.int64)
padding_mask_np = np.array([1]*num_tokens + [0]*padding_length,
dtype=np.int64)
# Labels and loss mask.
labels = [-1] * max_seq_length
@@ -372,8 +375,7 @@ def pad_and_convert_to_numpy(tokens, tokentypes, masked_positions,
return tokens_np, tokentypes_np, labels_np, padding_mask_np, loss_mask_np
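Only fragments of pad_and_convert_to_numpy appear in the hunks above. Read together with the renamed return values in build_training_sample, it pads everything out to max_seq_length; a self-contained sketch, assuming labels of -1 mark positions the loss should ignore:

import numpy as np

def pad_and_convert_to_numpy_sketch(tokens, tokentypes, masked_positions,
                                    masked_labels, pad_id, max_seq_length):
    num_tokens = len(tokens)
    padding_length = max_seq_length - num_tokens
    assert padding_length >= 0

    # Pad tokens and tokentypes with the pad id.
    filler = [pad_id] * padding_length
    tokens_np = np.array(tokens + filler, dtype=np.int64)
    tokentypes_np = np.array(tokentypes + filler, dtype=np.int64)
    # Padding mask: 1 for real tokens, 0 for padding.
    padding_mask_np = np.array([1] * num_tokens + [0] * padding_length,
                               dtype=np.int64)
    # Labels and loss mask: -1 / 0 everywhere except the masked positions.
    labels = [-1] * max_seq_length
    loss_mask = [0] * max_seq_length
    for position, label in zip(masked_positions, masked_labels):
        labels[position] = label
        loss_mask[position] = 1
    labels_np = np.array(labels, dtype=np.int64)
    loss_mask_np = np.array(loss_mask, dtype=np.int64)
    return tokens_np, tokentypes_np, labels_np, padding_mask_np, loss_mask_np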
'''
if __name__ == '__main__':
@@ -469,3 +471,4 @@ if __name__ == '__main__':
string += '{:5d}'.format(tokentype)
string += '{:5d}'.format(padding_mask)
print(string)
'''
@@ -145,7 +145,7 @@ class BertModel(MegatronModule):
init_method=init_method,
scaled_init_method=scaled_init_method_normal(init_method_std,
num_layers),
residual_connection_post_layernorm=True)
residual_connection_post_layernorm=False)
self.lm_head = BertLMHead(
self.language_model.embedding.word_embeddings.weight.size(0),
......
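The bert_model.py change flips residual_connection_post_layernorm from True to False. In Megatron-style transformer layers this flag usually selects which tensor feeds the residual branch; the sketch below is illustrative only and does not reproduce the actual transformer code:

def sublayer_with_residual(x, sublayer, layernorm,
                           residual_connection_post_layernorm):
    ln_out = layernorm(x)
    out = sublayer(ln_out)
    # True: residual starts from the layernormed activation (ln_out).
    # False (the new setting here): residual starts from the raw input x.
    residual = ln_out if residual_connection_post_layernorm else x
    return residual + out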
@@ -73,7 +73,7 @@ def get_batch(data_iterator, timers):
sentence_order = data_b['is_random'].long()
loss_mask = data_b['loss_mask'].float()
lm_labels = data_b['labels'].long()
padding_mask = data_b['padding_mask'].byte()
padding_mask = data_b['padding_mask'].long()
return tokens, types, sentence_order, loss_mask, lm_labels, padding_mask
@@ -88,7 +88,7 @@ def forward_step(data_iterator, model, args, timers):
timers('batch generator').stop()
# Forward model.
lm_logits, sop_logits = model(tokens, 1-padding_mask, tokentype_ids=types)
lm_logits, sop_logits = model(tokens, padding_mask, tokentype_ids=types)
sop_loss = F.cross_entropy(sop_logits.view(-1, 2).contiguous().float(),
sentence_order.view(-1).contiguous(),
......
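This is the heart of the "fixed padding issue" commit: get_batch now keeps the mask as a 0/1 long tensor and forward_step passes padding_mask to the model directly instead of 1 - padding_mask. Whichever convention the model uses internally, the mask polarity at the call site has to match it; the sketch below assumes the common additive-bias formulation in which 1 means "attend to this position":

import torch

# Mask as produced by the data pipeline: 1 = real token, 0 = padding.
padding_mask = torch.tensor([1, 1, 1, 0, 0], dtype=torch.long)

# Additive attention bias: 0 where attention is allowed, very negative where not.
bias = (1.0 - padding_mask.float()) * -10000.0            # [0, 0, 0, -1e4, -1e4]

# Inverting the mask before the call flips the polarity: the bias would then
# block the three real tokens and leave the two pad positions attendable.
bias_from_inverted = (1.0 - (1 - padding_mask).float()) * -10000.0
# -> [-1e4, -1e4, -1e4, 0, 0]

The .byte() to .long() change in the same get_batch functions (here and in the hunk below) keeps the same 0/1 values and only changes the dtype handed to the model.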
@@ -72,7 +72,7 @@ def get_batch(data_iterator, timers):
next_sentence = data_b['is_random'].long()
loss_mask = data_b['mask'].float()
lm_labels = data_b['mask_labels'].long()
padding_mask = data_b['pad_mask'].byte()
padding_mask = data_b['pad_mask'].long()
return tokens, types, next_sentence, loss_mask, lm_labels, padding_mask
......