Commit 75a86a1d authored by Mohammad Shoeybi

minor cleaning

parent 5fc301aa
"""TO BE ADDED """ # coding=utf-8
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""ALBERT Style dataset."""
import os
import time
...@@ -140,11 +155,6 @@ class AlbertDataset(Dataset):
        sample = []
        for index in range(start_index, end_index):
            sample.append(self.indexed_dataset[index])
        '''
        for s in sample:
            if len(s) > 1000:
                print(self.tokenizer.convert_ids_to_tokens(s))
        '''
        # Note that this rng state should be numpy and not python since
        # python randint is inclusive whereas the numpy one is exclusive.
        np_rng = np.random.RandomState(seed=(self.seed + idx))
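        # For example (standard-library `random` vs. NumPy, shown only to
        # illustrate the comment above):
        #     random.Random(0).randint(0, 9)            # may return 9
        #     np.random.RandomState(0).randint(0, 9)    # returns 0..8 only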
...@@ -285,242 +295,3 @@ def get_samples_mapping_(indexed_dataset,
        samples_mapping.shape[0]))
    return samples_mapping
'''
def get_target_seq_length(max_num_tokens, short_seq_prob, np_rng):
    """With probability `short_seq_prob` generate a smaller sequence length."""
    if np_rng.random() < short_seq_prob:
        return np_rng.randint(2, max_num_tokens + 1)
    return max_num_tokens
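
# For example, with max_num_tokens = 509 and short_seq_prob = 0.1, roughly one
# call in ten returns a length drawn uniformly from [2, 509]; the rest return
# the full 509 tokens.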

def build_training_samples_mapping(indexed_dataset, num_epochs, max_seq_length,
                                   short_seq_prob, seed):
    """Build a mapping to reconstruct training samples."""

    start_time = time.time()
    print('> building training samples mapping ...')

    # RNG:
    np_rng = np.random.RandomState(seed=seed)

    # List of start sentence index and end sentence index (end is exclusive)
    # to retrieve.
    samples = []

    # Account for [CLS], [SEP], [SEP]
    max_num_tokens = max_seq_length - 3

    # Number of documents processed:
    total_docs = 0
    # Number of documents that are skipped:
    skipped_docs = 0
    # Number of empty documents:
    empty_docs = 0

    # For each epoch:
    for epoch in range(num_epochs):
        # For each document:
        for doc_index in range(indexed_dataset.num_docs):
            if epoch == 0:
                total_docs += 1

            # Document sentences are in [sent_index_first, sent_index_last).
            sent_index_first = indexed_dataset.doc_idx[doc_index]
            sent_index_last = indexed_dataset.doc_idx[doc_index + 1]
            assert sent_index_last >= sent_index_first

            # Empty docs.
            if (sent_index_last - sent_index_first) == 0:
                if epoch == 0:
                    print('***WARNING*** document {} is empty'.format(
                        doc_index))
                    empty_docs += 1
                continue

            # Skip documents that have only one sentence.
            if (sent_index_last - sent_index_first) == 1:
                if epoch == 0:
                    print('***WARNING*** document {} has only one sentence, '
                          'skipping ...'.format(doc_index))
                    skipped_docs += 1
                continue

            # Loop through sentences.
            sent_index = sent_index_first
            target_seq_length = get_target_seq_length(max_num_tokens,
                                                      short_seq_prob, np_rng)
            size = 0
            while sent_index < sent_index_last:
                # Get the size.
                assert indexed_dataset.sizes[sent_index] > 0
                size += indexed_dataset.sizes[sent_index]
                sent_index += 1
                # If we have reached the target length.
                exceeded_target_size = (size >= target_seq_length)
                # If only one sentence is left in the document.
                only_one_sent_left = (sent_index == (sent_index_last - 1))
                # If we have at least two sentences.
                have_more_than_one_sent = (sent_index - sent_index_first) > 1
                # If we have reached the end of the document.
                reached_end_of_doc = (sent_index == sent_index_last)

                if (exceeded_target_size and not only_one_sent_left and
                    have_more_than_one_sent) or reached_end_of_doc:
                    assert (sent_index - sent_index_first) > 1
                    assert size > 1
                    # Add the sample.
                    samples.append([sent_index_first, sent_index,
                                    target_seq_length])
                    # Reset indices.
                    sent_index_first = sent_index
                    target_seq_length = get_target_seq_length(max_num_tokens,
                                                              short_seq_prob,
                                                              np_rng)
                    size = 0
                    num_sentences = 0

    # Convert to numpy array.
    samples_np = np.array(samples, dtype=np.int64)
    # Shuffle.
    np_rng.shuffle(samples_np)

    elapsed_time = time.time() - start_time
    # Print some stats:
    print('\n***************************** info *****************************')
    print(' elapsed time (sec) ..................... {}'.format(elapsed_time))
    print(' number of epochs ....................... {}'.format(num_epochs))
    print(' number of samples ...................... {}'.format(
        samples_np.shape[0]))
    print(' number of documents .................... {}'.format(total_docs))
    print(' number of empty documents .............. {}'.format(empty_docs))
    print(' number of documents with one sentence .. {}'.format(skipped_docs))
    print('****************************************************************\n')

    return samples_np
'''
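# Each row of the mapping above is [first_sentence_index, end_sentence_index,
# target_sequence_length], with the end index exclusive. A minimal sketch of
# how one row could be consumed (the names below are illustrative, not
# functions defined in this file):
#
#     start, end, target_len = samples_np[i]
#     sentences = [indexed_dataset[j] for j in range(start, end)]
#     # pack the sentences into two segments, truncate to target_len tokens,
#     # and apply masking to form one training example.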
'''
# WILL BE REPLACED WITH JARED'S
class JaredDataset(object):

    def __init__(self, doc_idx, sizes, sentences):
        self.doc_idx = doc_idx
        self.num_docs = len(self.doc_idx) - 1
        self.sizes = sizes
        self.sentences = sentences

    def __getitem__(self, idx):
        return self.sentences[idx]


if __name__ == '__main__':

    print('dataset ...')

    from bert_tokenization import FullTokenizer
    import json
    import nltk
    nltk.download('punkt')

    def document_generator_provider(input_file):
        with open(input_file, 'r') as ifile:
            for document in ifile:
                data = json.loads(document)
                text = data['text']
                sentences = []
                for line in text.split('\n'):
                    if line != '\n':
                        sent = nltk.tokenize.sent_tokenize(line)
                        if sent:
                            sentences.extend(sent)
                yield sentences

    input_file = 'test/samples_10000.json'
    vocab_file = 'test/vocab.txt'
    tokenizer = FullTokenizer(vocab_file, do_lower_case=True)
    document_generator = document_generator_provider(input_file)

    doc_idx = [0]
    sizes = []
    sentences_list = []
    for sentences in document_generator:
        num_sent = 0
        for sentence in sentences:
            tokens = tokenizer.tokenize(sentence)
            if tokens:
                ids = tokenizer.convert_tokens_to_ids(tokens)
                if len(ids) == 0:
                    print('****************')
                    print(sentence)
                    print(tokens)
                    print(ids)
                    print('****************')
                sizes.append(len(ids))
                sentences_list.append(ids)
                num_sent += 1
        doc_idx.append(num_sent)
    for i in range(1, len(doc_idx)):
        doc_idx[i] += doc_idx[i-1]
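    # Equivalent prefix sum in a single call (doc_idx[0] is already 0, so the
    # cumulative sum leaves it in place):
    #     doc_idx = np.cumsum(doc_idx).tolist()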
    # max_size = np.iinfo(np.int32).max // 32
    import time
    docs_np = np.array(doc_idx, dtype=np.uint32)
    sizes_np = np.array(sizes, dtype=np.uint16)

    start_time = time.time()
    max_seq_length = 512
    max_size = docs_np.shape[0]
    lens = np.full(max_size, max_seq_length - 3, dtype=np.uint16)
    lens_rand = np.random.randint(low=2, high=(max_seq_length - 2),
                                  size=max_size // 10, dtype=np.uint16)
    lens_view = lens[:max_size // 10]
    np.copyto(lens_view, lens_rand)
    np.random.shuffle(lens)
    print('num docs', max_size)
    print('lens time', time.time() - start_time)

    import helpers
    start_time = time.time()
    maps = helpers.build_mapping(docs_np, sizes_np, 10, 100, 509, 0.1, 1234)
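    # Inferring from the pure-Python build_training_samples_mapping above:
    # 10 is num_epochs, 509 is max_num_tokens (512 - 3 for [CLS]/[SEP]/[SEP]),
    # 0.1 is short_seq_prob, and 1234 is the seed; the role of the 100 is not
    # shown in this diff.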
    print('maps time', time.time() - start_time)
    print(maps)
    exit()

    start_time = time.time()
    max_size = 10  # np.iinfo(np.int32).max // 32
    docs = np.arange(10, dtype=np.uint32)
    print(docs)
    a = example.doit(docs, max_size)
    print(type(a))
    print(a.shape)
    print(a)
    print(time.time() - start_time)
    exit()

    # start_time = time.time()
    count = doit(maps, docs_np, sizes_np, lens, docs_np.shape[0] - 1, 10)
    print(count)
    maps = maps[:count]
    np.random.shuffle(maps)
    print(time.time() - start_time)
    exit()

    indexed_dataset = JaredDataset(doc_idx, sizes, sentences_list)
    dataset = AlbertDataset(indexed_dataset=indexed_dataset,
                            tokenizer=tokenizer,
                            num_epochs=10,
                            masked_lm_prob=0.15,
                            max_seq_length=512,
                            short_seq_prob=0.1,
                            seed=1234)
'''
"""TO BE ADDED""" # coding=utf-8
# Copyright 2018 The Google AI Language Team Authors, and NVIDIA.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import collections
...@@ -373,102 +386,3 @@ def pad_and_convert_to_numpy(tokens, tokentypes, masked_positions,
    loss_mask_np = np.array(loss_mask, dtype=np.int64)
    return tokens_np, tokentypes_np, labels_np, padding_mask_np, loss_mask_np
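# Judging from the names (only the tail of pad_and_convert_to_numpy appears in
# this hunk), the five returned arrays are the padded token ids, token type
# ids, masked-LM labels, a padding mask, and a loss mask.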
'''
if __name__ == '__main__':

    print('building the dataset ...')

    from bert_tokenization import FullTokenizer
    import json
    import nltk
    nltk.download('punkt')

    def document_generator_provider(input_file):
        with open(input_file, 'r') as ifile:
            for document in ifile:
                data = json.loads(document)
                text = data['text']
                sentences = []
                for line in text.split('\n'):
                    if line != '\n':
                        sentences.extend(nltk.tokenize.sent_tokenize(line))
                yield sentences

    input_file = '/raid/mshoeybi/data/albert/sample/samples_11.json'
    vocab_file = '/raid/mshoeybi/data/albert/bert_vocab/vocab.txt'
    tokenizer = FullTokenizer(vocab_file, do_lower_case=True)
    document_generator = document_generator_provider(input_file)

    samples = []
    sizes = []
    for sentences in document_generator:
        tokens_list = []
        size = 0
        for sentence in sentences:
            tokens = tokenizer.tokenize(sentence)
            tokens_list.append(tokens)
            size += len(tokens)
        samples.append(tokens_list)
        sizes.append(size)
    print(sizes)

    import random
    rng = random.Random(123567)
    vocab_id_list = list(tokenizer.inv_vocab.keys())
    cls_id = tokenizer.vocab['[CLS]']
    sep_id = tokenizer.vocab['[SEP]']
    mask_id = tokenizer.vocab['[MASK]']
    pad_id = tokenizer.vocab['[PAD]']
    vocab_id_to_token_dict = tokenizer.inv_vocab

    sample = []
    for s in samples[0]:
        sample.append(tokenizer.convert_tokens_to_ids(s))
    max_seq_length = 512
    target_seq_length = 444
    masked_lm_prob = 0.15
    example = build_training_sample(sample,
                                    target_seq_length, max_seq_length,
                                    vocab_id_list, vocab_id_to_token_dict,
                                    cls_id, sep_id, mask_id, pad_id,
                                    masked_lm_prob, rng)
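    # Keys of `example` as used below: 'text' (token ids), 'types' (token type
    # ids), 'labels' (masked-LM targets, negative where there is no target),
    # 'loss_mask', 'padding_mask', and the boolean 'is_random'.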
    orig_tokens = []
    for s in samples[0]:
        orig_tokens.extend(s)

    is_random = example['is_random']
    if is_random:
        print('random')
    else:
        print('not-random')
    # exit()

    ii = 0
    for i in range(max_seq_length):
        token = tokenizer.inv_vocab[example['text'][i]]
        if token in ['[CLS]', '[SEP]']:
            orig_token = token
        elif ii < len(orig_tokens):
            orig_token = orig_tokens[ii]
            ii += 1
        else:
            orig_token = 'EMPTY'
        tokentype = example['types'][i]
        label_id = example['labels'][i]
        label = 'NONE'
        if label_id >= 0:
            label = tokenizer.inv_vocab[label_id]
        loss_mask = example['loss_mask'][i]
        padding_mask = example['padding_mask'][i]
        string = ''
        string += '{:15s}'.format(orig_token)
        string += '{:15s}'.format(token)
        string += '{:15s}'.format(label)
        string += '{:5d}'.format(loss_mask)
        string += '{:5d}'.format(tokentype)
        string += '{:5d}'.format(padding_mask)
        print(string)
'''
/*
coding=utf-8
Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
/* Helper methods for fast index mapping builds */
#include <algorithm>
...