Commit 75a86a1d authored by Mohammad Shoeybi

minor cleaning

parent 5fc301aa
"""TO BE ADDED """ # coding=utf-8
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""ALBERT Style dataset."""
import os
import time
...@@ -140,11 +155,6 @@ class AlbertDataset(Dataset):
        sample = []
        for index in range(start_index, end_index):
            sample.append(self.indexed_dataset[index])
        '''
        for s in sample:
            if len(s) > 1000:
                print(self.tokenizer.convert_ids_to_tokens(s))
        '''
        # Note that this rng state should be numpy and not python since
        # python randint is inclusive whereas the numpy one is exclusive.
        np_rng = np.random.RandomState(seed=(self.seed + idx))
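        # For example (standard-library `random` vs. NumPy, shown only to
        # illustrate the comment above):
        #     random.Random(0).randint(0, 9)            # may return 9
        #     np.random.RandomState(0).randint(0, 9)    # returns 0..8 only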
...@@ -285,242 +295,3 @@ def get_samples_mapping_(indexed_dataset,
        samples_mapping.shape[0]))
    return samples_mapping
'''
def get_target_seq_length(max_num_tokens, short_seq_prob, np_rng):
    """With probability `short_seq_prob` generate a smaller sequence length."""
    if np_rng.random() < short_seq_prob:
        return np_rng.randint(2, max_num_tokens + 1)
    return max_num_tokens
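
# For example, with max_num_tokens = 509 and short_seq_prob = 0.1, roughly one
# call in ten returns a length drawn uniformly from [2, 509]; the rest return
# the full 509 tokens.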

def build_training_samples_mapping(indexed_dataset, num_epochs, max_seq_length,
                                   short_seq_prob, seed):
    """Build a mapping to reconstruct training samples."""

    start_time = time.time()
    print('> building training samples mapping ...')

    # RNG:
    np_rng = np.random.RandomState(seed=seed)

    # List of start sentence index and end sentence index (end is exclusive)
    # to retrieve.
    samples = []

    # Account for [CLS], [SEP], [SEP]
    max_num_tokens = max_seq_length - 3

    # Number of documents processed:
    total_docs = 0
    # Number of documents that are skipped:
    skipped_docs = 0
    # Number of empty documents:
    empty_docs = 0

    # For each epoch:
    for epoch in range(num_epochs):
        # For each document:
        for doc_index in range(indexed_dataset.num_docs):
            if epoch == 0:
                total_docs += 1

            # Document sentences are in [sent_index_first, sent_index_last).
            sent_index_first = indexed_dataset.doc_idx[doc_index]
            sent_index_last = indexed_dataset.doc_idx[doc_index + 1]
            assert sent_index_last >= sent_index_first

            # Empty docs.
            if (sent_index_last - sent_index_first) == 0:
                if epoch == 0:
                    print('***WARNING*** document {} is empty'.format(
                        doc_index))
                    empty_docs += 1
                continue

            # Skip documents that have only one sentence.
            if (sent_index_last - sent_index_first) == 1:
                if epoch == 0:
                    print('***WARNING*** document {} has only one sentence, '
                          'skipping ...'.format(doc_index))
                    skipped_docs += 1
                continue

            # Loop through sentences.
            sent_index = sent_index_first
            target_seq_length = get_target_seq_length(max_num_tokens,
                                                      short_seq_prob, np_rng)
            size = 0
            while sent_index < sent_index_last:
                # Get the size.
                assert indexed_dataset.sizes[sent_index] > 0
                size += indexed_dataset.sizes[sent_index]
                sent_index += 1
                # If we have reached the target length.
                exceeded_target_size = (size >= target_seq_length)
                # If only one sentence is left in the document.
                only_one_sent_left = (sent_index == (sent_index_last - 1))
                # If we have at least two sentences.
                have_more_than_one_sent = (sent_index - sent_index_first) > 1
                # If we have reached the end of the document.
                reached_end_of_doc = (sent_index == sent_index_last)

                if (exceeded_target_size and not only_one_sent_left and
                    have_more_than_one_sent) or reached_end_of_doc:
                    assert (sent_index - sent_index_first) > 1
                    assert size > 1
                    # Add the sample.
                    samples.append([sent_index_first, sent_index,
                                    target_seq_length])
                    # Reset indices.
                    sent_index_first = sent_index
                    target_seq_length = get_target_seq_length(max_num_tokens,
                                                              short_seq_prob,
                                                              np_rng)
                    size = 0
                    num_sentences = 0

    # Convert to numpy array.
    samples_np = np.array(samples, dtype=np.int64)
    # Shuffle.
    np_rng.shuffle(samples_np)

    elapsed_time = time.time() - start_time
    # Print some stats:
    print('\n***************************** info *****************************')
    print(' elapsed time (sec) ..................... {}'.format(elapsed_time))
    print(' number of epochs ....................... {}'.format(num_epochs))
    print(' number of samples ...................... {}'.format(
        samples_np.shape[0]))
    print(' number of documents .................... {}'.format(total_docs))
    print(' number of empty documents .............. {}'.format(empty_docs))
    print(' number of documents with one sentence .. {}'.format(skipped_docs))
    print('****************************************************************\n')

    return samples_np
'''
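# Each row of the mapping above is [first_sentence_index, end_sentence_index,
# target_sequence_length], with the end index exclusive. A minimal sketch of
# how one row could be consumed (the names below are illustrative, not
# functions defined in this file):
#
#     start, end, target_len = samples_np[i]
#     sentences = [indexed_dataset[j] for j in range(start, end)]
#     # pack the sentences into two segments, truncate to target_len tokens,
#     # and apply masking to form one training example.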
'''
# WILL BE REPLACED WITH JARED'S
class JaredDataset(object):

    def __init__(self, doc_idx, sizes, sentences):
        self.doc_idx = doc_idx
        self.num_docs = len(self.doc_idx) - 1
        self.sizes = sizes
        self.sentences = sentences

    def __getitem__(self, idx):
        return self.sentences[idx]


if __name__ == '__main__':

    print('dataset ...')

    from bert_tokenization import FullTokenizer
    import json
    import nltk
    nltk.download('punkt')

    def document_generator_provider(input_file):
        with open(input_file, 'r') as ifile:
            for document in ifile:
                data = json.loads(document)
                text = data['text']
                sentences = []
                for line in text.split('\n'):
                    if line != '\n':
                        sent = nltk.tokenize.sent_tokenize(line)
                        if sent:
                            sentences.extend(sent)
                yield sentences

    input_file = 'test/samples_10000.json'
    vocab_file = 'test/vocab.txt'
    tokenizer = FullTokenizer(vocab_file, do_lower_case=True)
    document_generator = document_generator_provider(input_file)

    doc_idx = [0]
    sizes = []
    sentences_list = []
    for sentences in document_generator:
        num_sent = 0
        for sentence in sentences:
            tokens = tokenizer.tokenize(sentence)
            if tokens:
                ids = tokenizer.convert_tokens_to_ids(tokens)
                if len(ids) == 0:
                    print('****************')
                    print(sentence)
                    print(tokens)
                    print(ids)
                    print('****************')
                sizes.append(len(ids))
                sentences_list.append(ids)
                num_sent += 1
        doc_idx.append(num_sent)
    for i in range(1, len(doc_idx)):
        doc_idx[i] += doc_idx[i-1]
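    # Equivalent prefix sum in a single call (doc_idx[0] is already 0, so the
    # cumulative sum leaves it in place):
    #     doc_idx = np.cumsum(doc_idx).tolist()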
    # max_size = np.iinfo(np.int32).max // 32
    import time
    docs_np = np.array(doc_idx, dtype=np.uint32)
    sizes_np = np.array(sizes, dtype=np.uint16)

    start_time = time.time()
    max_seq_length = 512
    max_size = docs_np.shape[0]
    lens = np.full(max_size, max_seq_length - 3, dtype=np.uint16)
    lens_rand = np.random.randint(low=2, high=(max_seq_length - 2),
                                  size=max_size // 10, dtype=np.uint16)
    lens_view = lens[:max_size // 10]
    np.copyto(lens_view, lens_rand)
    np.random.shuffle(lens)
    print('num docs', max_size)
    print('lens time', time.time() - start_time)

    import helpers
    start_time = time.time()
    maps = helpers.build_mapping(docs_np, sizes_np, 10, 100, 509, 0.1, 1234)
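    # Inferring from the pure-Python build_training_samples_mapping above:
    # 10 is num_epochs, 509 is max_num_tokens (512 - 3 for [CLS]/[SEP]/[SEP]),
    # 0.1 is short_seq_prob, and 1234 is the seed; the role of the 100 is not
    # shown in this diff.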
    print('maps time', time.time() - start_time)
    print(maps)
    exit()

    start_time = time.time()
    max_size = 10  # np.iinfo(np.int32).max // 32
    docs = np.arange(10, dtype=np.uint32)
    print(docs)
    a = example.doit(docs, max_size)
    print(type(a))
    print(a.shape)
    print(a)
    print(time.time() - start_time)
    exit()

    # start_time = time.time()
    count = doit(maps, docs_np, sizes_np, lens, docs_np.shape[0] - 1, 10)
    print(count)
    maps = maps[:count]
    np.random.shuffle(maps)
    print(time.time() - start_time)
    exit()

    indexed_dataset = JaredDataset(doc_idx, sizes, sentences_list)
    dataset = AlbertDataset(indexed_dataset=indexed_dataset,
                            tokenizer=tokenizer,
                            num_epochs=10,
                            masked_lm_prob=0.15,
                            max_seq_length=512,
                            short_seq_prob=0.1,
                            seed=1234)
'''
"""TO BE ADDED""" # coding=utf-8
# Copyright 2018 The Google AI Language Team Authors, and NVIDIA.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import collections
...@@ -373,102 +386,3 @@ def pad_and_convert_to_numpy(tokens, tokentypes, masked_positions,
    loss_mask_np = np.array(loss_mask, dtype=np.int64)
    return tokens_np, tokentypes_np, labels_np, padding_mask_np, loss_mask_np
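# Judging from the names (only the tail of pad_and_convert_to_numpy appears in
# this hunk), the five returned arrays are the padded token ids, token type
# ids, masked-LM labels, a padding mask, and a loss mask.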
'''
if __name__ == '__main__':

    print('building the dataset ...')

    from bert_tokenization import FullTokenizer
    import json
    import nltk
    nltk.download('punkt')

    def document_generator_provider(input_file):
        with open(input_file, 'r') as ifile:
            for document in ifile:
                data = json.loads(document)
                text = data['text']
                sentences = []
                for line in text.split('\n'):
                    if line != '\n':
                        sentences.extend(nltk.tokenize.sent_tokenize(line))
                yield sentences

    input_file = '/raid/mshoeybi/data/albert/sample/samples_11.json'
    vocab_file = '/raid/mshoeybi/data/albert/bert_vocab/vocab.txt'
    tokenizer = FullTokenizer(vocab_file, do_lower_case=True)
    document_generator = document_generator_provider(input_file)

    samples = []
    sizes = []
    for sentences in document_generator:
        tokens_list = []
        size = 0
        for sentence in sentences:
            tokens = tokenizer.tokenize(sentence)
            tokens_list.append(tokens)
            size += len(tokens)
        samples.append(tokens_list)
        sizes.append(size)
    print(sizes)

    import random
    rng = random.Random(123567)
    vocab_id_list = list(tokenizer.inv_vocab.keys())
    cls_id = tokenizer.vocab['[CLS]']
    sep_id = tokenizer.vocab['[SEP]']
    mask_id = tokenizer.vocab['[MASK]']
    pad_id = tokenizer.vocab['[PAD]']
    vocab_id_to_token_dict = tokenizer.inv_vocab

    sample = []
    for s in samples[0]:
        sample.append(tokenizer.convert_tokens_to_ids(s))
    max_seq_length = 512
    target_seq_length = 444
    masked_lm_prob = 0.15
    example = build_training_sample(sample,
                                    target_seq_length, max_seq_length,
                                    vocab_id_list, vocab_id_to_token_dict,
                                    cls_id, sep_id, mask_id, pad_id,
                                    masked_lm_prob, rng)
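    # Keys of `example` as used below: 'text' (token ids), 'types' (token type
    # ids), 'labels' (masked-LM targets, negative where there is no target),
    # 'loss_mask', 'padding_mask', and the boolean 'is_random'.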
    orig_tokens = []
    for s in samples[0]:
        orig_tokens.extend(s)

    is_random = example['is_random']
    if is_random:
        print('random')
    else:
        print('not-random')
    # exit()

    ii = 0
    for i in range(max_seq_length):
        token = tokenizer.inv_vocab[example['text'][i]]
        if token in ['[CLS]', '[SEP]']:
            orig_token = token
        elif ii < len(orig_tokens):
            orig_token = orig_tokens[ii]
            ii += 1
        else:
            orig_token = 'EMPTY'
        tokentype = example['types'][i]
        label_id = example['labels'][i]
        label = 'NONE'
        if label_id >= 0:
            label = tokenizer.inv_vocab[label_id]
        loss_mask = example['loss_mask'][i]
        padding_mask = example['padding_mask'][i]
        string = ''
        string += '{:15s}'.format(orig_token)
        string += '{:15s}'.format(token)
        string += '{:15s}'.format(label)
        string += '{:5d}'.format(loss_mask)
        string += '{:5d}'.format(tokentype)
        string += '{:5d}'.format(padding_mask)
        print(string)
'''
/*
coding=utf-8
Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
/* Helper methods for fast index mapping builds */
#include <algorithm>
...