OpenDAS / Megatron-LM / Commits

Commit 75a86a1d
authored Dec 03, 2019 by Mohammad Shoeybi

minor cleaning

parent 5fc301aa
Showing 3 changed files with 48 additions and 345 deletions:

  megatron/data/albert_dataset.py   +16  -245
  megatron/data/dataset_utils.py    +14  -100
  megatron/data/helpers.cpp         +18  -0
megatron/data/albert_dataset.py
"""TO BE ADDED """
# coding=utf-8
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""ALBERT Style dataset."""
import os
import time
...
...
@@ -140,11 +155,6 @@ class AlbertDataset(Dataset):
        sample = []
        for index in range(start_index, end_index):
            sample.append(self.indexed_dataset[index])
        '''
        for s in sample:
            if len(s) > 1000:
                print(self.tokenizer.convert_ids_to_tokens(s))
        '''
        # Note that this rng state should be numpy and not python since
        # python randint is inclusive whereas the numpy one is exclusive.
        np_rng = np.random.RandomState(seed=(self.seed + idx))
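As an aside on the comment above: the inclusive vs. exclusive upper bound is easy to verify with a small standalone sketch (illustrative only, not part of this diff):

    import random
    import numpy as np

    np_rng = np.random.RandomState(seed=1234)
    # Python's random.randint(0, 9) can return 9 (inclusive upper bound);
    # numpy's randint(0, 9) never returns 9 (exclusive upper bound).
    python_draws = {random.randint(0, 9) for _ in range(1000)}
    numpy_draws = {int(np_rng.randint(0, 9)) for _ in range(1000)}
    print(9 in python_draws, 9 in numpy_draws)  # typically: True False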
...
...
@@ -285,242 +295,3 @@ def get_samples_mapping_(indexed_dataset,
        samples_mapping.shape[0]))
    return samples_mapping
'''
def get_target_seq_length(max_num_tokens, short_seq_prob, np_rng):
"""With probability `short_seq_prob` generate a smaller sequence lenght."""
if np_rng.random() < short_seq_prob:
return np_rng.randint(2, max_num_tokens + 1)
return max_num_tokens
def build_training_samples_mapping(indexed_dataset, num_epochs, max_seq_length,
short_seq_prob, seed):
"""Build a mapping to reconstruct training samples."""
start_time = time.time()
print('> building training samples mapping ...')
# RNG:
np_rng = np.random.RandomState(seed=seed)
# List of start sentence index and end sentence index (end is exclusive)
# to retrieve.
samples = []
# Account for [CLS], [SEP], [SEP]
max_num_tokens = max_seq_length - 3
# Number of documents processed:
total_docs = 0
# Number of documents that are skipped:
skipped_docs = 0
# Number of empty documents:
empty_docs = 0
# For each epoch:
for epoch in range(num_epochs):
# For each document:
for doc_index in range(indexed_dataset.num_docs):
if epoch == 0:
total_docs += 1
# Document sentences are in [sent_index_first, sent_index_last).
sent_index_first = indexed_dataset.doc_idx[doc_index]
sent_index_last = indexed_dataset.doc_idx[doc_index+1]
assert sent_index_last >= sent_index_first
# Empty docs.
if (sent_index_last - sent_index_first) == 0:
if epoch == 0:
print('***WARNING*** document {} is empty'.format(
doc_index))
empty_docs += 1
continue
# Skip documents that only have one sentences.
if (sent_index_last - sent_index_first) == 1:
if epoch == 0:
                    print('***WARNING*** document {} has only one sentence, '
'skipping ...'.format(doc_index))
skipped_docs += 1
continue
# Loop through sentences.
sent_index = sent_index_first
target_seq_length = get_target_seq_length(max_num_tokens,
short_seq_prob, np_rng)
size = 0
while sent_index < sent_index_last:
# Get the size.
assert indexed_dataset.sizes[sent_index] > 0
size += indexed_dataset.sizes[sent_index]
sent_index += 1
# If we have reached the target length.
exceeded_target_size = (size >= target_seq_length)
# If only one sentence is left in the document.
only_one_sent_left = (sent_index == (sent_index_last - 1))
# If we have at least two sentneces.
have_more_than_one_sent = (sent_index - sent_index_first) > 1
# If we have reached end of the document.
reached_end_of_doc = (sent_index == sent_index_last)
if (exceeded_target_size and not only_one_sent_left and
have_more_than_one_sent) or reached_end_of_doc:
assert (sent_index - sent_index_first) > 1
assert size > 1
# Add the sample.
samples.append([sent_index_first, sent_index,
target_seq_length])
# Reset indices
sent_index_first = sent_index
target_seq_length = get_target_seq_length(max_num_tokens,
short_seq_prob,
np_rng)
size = 0
num_sentences = 0
# Convert to numpy array.
samples_np = np.array(samples, dtype=np.int64)
# Shuffle.
np_rng.shuffle(samples_np)
elapsed_time = time.time() - start_time
# Print some stats:
    print('\n***************************** info *****************************')
print(' elapsed time (sec) ..................... {}'.format(elapsed_time))
print(' number of epochs ....................... {}'.format(num_epochs))
print(' number of samples ...................... {}'.format(
samples_np.shape[0]))
print(' number of documents .................... {}'.format(total_docs))
print(' number of empty documents .............. {}'.format(empty_docs))
print(' number of documents with one sentence .. {}'.format(skipped_docs))
    print('****************************************************************\n')
return samples_np
'''
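The removed build_training_samples_mapping above emits rows of the form [sent_index_first, sent_index, target_seq_length]; the __getitem__ code retained earlier in this diff gathers sentences for exactly such a row. A toy, self-contained illustration of consuming one row (all names and values here are illustrative, not taken from the file):

    import numpy as np

    # Stand-in for the sentence-level indexed dataset: one list of token ids per sentence.
    indexed_dataset = [[100 + i] * (3 + i) for i in range(12)]

    # One mapping row: sentences [5, 9) of the dataset, target sequence length 128.
    samples_mapping = np.array([[5, 9, 128]], dtype=np.int64)

    start_index, end_index, target_seq_length = samples_mapping[0]
    sample = [indexed_dataset[index] for index in range(start_index, end_index)]
    print(len(sample), target_seq_length)  # 4 sentences, target length 128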
'''
# WILL BE REPLACED WITH JARED'S
class JaredDataset(object):
def __init__(self, doc_idx, sizes, sentences):
self.doc_idx = doc_idx
self.num_docs = len(self.doc_idx) - 1
self.sizes = sizes
self.sentences = sentences
def __getitem__(self, idx):
return self.sentences[idx]
if __name__ == '__main__':
print('dataset ...')
from bert_tokenization import FullTokenizer
import json
import nltk
nltk.download('punkt')
def document_generator_provider(input_file):
with open(input_file, 'r') as ifile:
for document in ifile:
data = json.loads(document)
text = data['text']
sentences = []
for line in text.split('\n'):
    if line != '\n':
sent = nltk.tokenize.sent_tokenize(line)
if sent:
sentences.extend(sent)
yield sentences
input_file = 'test/samples_10000.json'
vocab_file = 'test/vocab.txt'
tokenizer = FullTokenizer(vocab_file, do_lower_case=True)
document_generator = document_generator_provider(input_file)
doc_idx = [0]
sizes = []
sentences_list = []
for sentences in document_generator:
num_sent = 0
for sentence in sentences:
tokens = tokenizer.tokenize(sentence)
if tokens:
ids = tokenizer.convert_tokens_to_ids(tokens)
if len(ids) == 0:
print('****************')
print(sentence)
print(tokens)
print(ids)
print('****************')
sizes.append(len(ids))
sentences_list.append(ids)
num_sent += 1
doc_idx.append(num_sent)
for i in range(1, len(doc_idx)):
doc_idx[i] += doc_idx[i-1]
#max_size = np.iinfo(np.int32).max // 32
import time
docs_np = np.array(doc_idx, dtype=np.uint32)
sizes_np = np.array(sizes, dtype=np.uint16)
start_time = time.time()
max_seq_length = 512
max_size = docs_np.shape[0]
lens = np.full(max_size, max_seq_length-3, dtype=np.uint16)
lens_rand = np.random.randint(low=2, high=(max_seq_length-2),
size=max_size//10, dtype=np.uint16)
lens_view = lens[:max_size//10]
np.copyto(lens_view, lens_rand)
np.random.shuffle(lens)
print('num docs', max_size)
print('lens time', time.time() - start_time)
import helpers
start_time = time.time()
maps = helpers.build_mapping(docs_np, sizes_np, 10, 100, 509, 0.1, 1234)
print('maps time', time.time() - start_time)
print(maps)
exit()
start_time = time.time()
max_size = 10 #np.iinfo(np.int32).max // 32
docs = np.arange(10, dtype=np.uint32)
print(docs)
a = example.doit(docs, max_size)
print(type(a))
print(a.shape)
print(a)
print(time.time() - start_time)
exit()
#start_time = time.time()
count = doit(maps, docs_np, sizes_np, lens,docs_np.shape[0]-1, 10)
print(count)
maps = maps[:count]
np.random.shuffle(maps)
print(time.time() - start_time)
exit()
indexed_dataset = JaredDataset(doc_idx, sizes, sentences_list)
dataset = AlbertDataset(indexed_dataset=indexed_dataset,
tokenizer=tokenizer,
num_epochs=10,
masked_lm_prob=0.15,
max_seq_length=512,
short_seq_prob=0.1,
seed=1234)
'''
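The removed test harness above drives the C++ index-mapping helper as helpers.build_mapping(docs_np, sizes_np, 10, 100, 509, 0.1, 1234). A minimal usage sketch of that call site, assuming the extension has been compiled from megatron/data/helpers.cpp and is importable as helpers, and assuming (unverified, inferred only from the surrounding code) that the positional arguments are document offsets, per-sentence sizes, number of epochs, maximum number of samples, maximum number of tokens, short-sequence probability, and seed:

    import numpy as np
    import helpers  # compiled from megatron/data/helpers.cpp

    # Toy layout: 3 documents with 3, 2, and 4 sentences (cumulative sentence offsets).
    docs_np = np.array([0, 3, 5, 9], dtype=np.uint32)
    # Token count of each sentence.
    sizes_np = np.array([20, 35, 12, 50, 8, 15, 22, 30, 40], dtype=np.uint16)

    # Assumed argument order: (docs, sizes, num_epochs, max_num_samples,
    # max_num_tokens, short_seq_prob, seed).
    maps = helpers.build_mapping(docs_np, sizes_np, 10, 100, 509, 0.1, 1234)
    print(maps)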
megatron/data/dataset_utils.py
"""TO BE ADDED"""
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors, and NVIDIA.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import collections
...
...
@@ -373,102 +386,3 @@ def pad_and_convert_to_numpy(tokens, tokentypes, masked_positions,
    loss_mask_np = np.array(loss_mask, dtype=np.int64)
    return tokens_np, tokentypes_np, labels_np, padding_mask_np, loss_mask_np
'''
if __name__ == '__main__':
print('building the dataset ...')
from bert_tokenization import FullTokenizer
import json
import nltk
nltk.download('punkt')
def document_generator_provider(input_file):
with open(input_file, 'r') as ifile:
for document in ifile:
data = json.loads(document)
text = data['text']
sentences = []
for line in text.split('\n'):
    if line != '\n':
sentences.extend(nltk.tokenize.sent_tokenize(line))
yield sentences
input_file = '/raid/mshoeybi/data/albert/sample/samples_11.json'
vocab_file = '/raid/mshoeybi/data/albert/bert_vocab/vocab.txt'
tokenizer = FullTokenizer(vocab_file, do_lower_case=True)
document_generator = document_generator_provider(input_file)
samples = []
sizes = []
for sentences in document_generator:
tokens_list = []
size = 0
for sentence in sentences:
tokens = tokenizer.tokenize(sentence)
tokens_list.append(tokens)
size += len(tokens)
samples.append(tokens_list)
sizes.append(size)
print(sizes)
import random
rng = random.Random(123567)
vocab_id_list = list(tokenizer.inv_vocab.keys())
cls_id = tokenizer.vocab['[CLS]']
sep_id = tokenizer.vocab['[SEP]']
mask_id = tokenizer.vocab['[MASK]']
pad_id = tokenizer.vocab['[PAD]']
vocab_id_to_token_dict = tokenizer.inv_vocab
sample = []
for s in samples[0]:
sample.append(tokenizer.convert_tokens_to_ids(s))
max_seq_length = 512
target_seq_length = 444
masked_lm_prob = 0.15
example = build_training_sample(sample,
target_seq_length, max_seq_length,
vocab_id_list, vocab_id_to_token_dict,
cls_id, sep_id, mask_id, pad_id,
masked_lm_prob, rng)
orig_tokens = []
for s in samples[0]:
orig_tokens.extend(s)
is_random = example['is_random']
if is_random:
print('random')
else:
print('not-random')
#exit()
ii = 0
for i in range(max_seq_length):
token = tokenizer.inv_vocab[example['text'][i]]
if token in ['[CLS]', '[SEP]'] :
orig_token = token
elif ii < len(orig_tokens):
orig_token = orig_tokens[ii]
ii += 1
else:
orig_token = 'EMPTY'
tokentype = example['types'][i]
label_id = example['labels'][i]
label = 'NONE'
if label_id >= 0:
label = tokenizer.inv_vocab[label_id]
loss_mask = example['loss_mask'][i]
padding_mask = example['padding_mask'][i]
string = ''
string += '{:15s}'.format(orig_token)
string += '{:15s}'.format(token)
string += '{:15s}'.format(label)
string += '{:5d}'.format(loss_mask)
string += '{:5d}'.format(tokentype)
string += '{:5d}'.format(padding_mask)
print(string)
'''
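For context on pad_and_convert_to_numpy, whose tail appears in the hunk above, here is a minimal sketch of the padding-and-conversion pattern it implements; the parameter list beyond the three names visible in the hunk header, the pad values, and the masking convention are assumptions, not taken from the full source:

    import numpy as np

    def pad_and_convert_to_numpy_sketch(tokens, tokentypes, masked_positions,
                                        masked_labels, pad_id, max_seq_length):
        """Hypothetical stand-in: pad lists to max_seq_length and build int64 arrays."""
        num_tokens = len(tokens)
        padding_length = max_seq_length - num_tokens
        assert padding_length >= 0

        # Tokens and token types, padded up to max_seq_length.
        tokens_np = np.array(tokens + [pad_id] * padding_length, dtype=np.int64)
        tokentypes_np = np.array(tokentypes + [0] * padding_length, dtype=np.int64)

        # Padding mask: 1 for real tokens, 0 for padding.
        padding_mask_np = np.array([1] * num_tokens + [0] * padding_length,
                                   dtype=np.int64)

        # Labels and loss mask: set only at the masked positions.
        labels = [-1] * max_seq_length
        loss_mask = [0] * max_seq_length
        for position, label in zip(masked_positions, masked_labels):
            labels[position] = label
            loss_mask[position] = 1
        labels_np = np.array(labels, dtype=np.int64)
        loss_mask_np = np.array(loss_mask, dtype=np.int64)

        return tokens_np, tokentypes_np, labels_np, padding_mask_np, loss_mask_np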
megatron/data/helpers.cpp
/*
coding=utf-8
Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
/* Helper methods for fast index mapping builds */
#include <algorithm>
...
...