"vscode:/vscode.git/clone" did not exist on "1fa6990c498b91ee08936196a4c6f021c800798e"
Commit 0058b1a5 authored by Jared Casper

Plugging new dataset stuff together.

Some cleanup of types, etc. in the helpers cpp code.
parent b1714c14
CXXFLAGS += -O3 -Wall -shared -std=c++11 -fPIC -fdiagnostics-color
CPPFLAGS += $(shell python3 -m pybind11 --includes)
LIBNAME = helpers
LIBEXT = $(shell python3-config --extension-suffix)
default: $(LIBNAME)$(LIBEXT)
%$(LIBEXT): %.cpp
$(CXX) $(CXXFLAGS) $(CPPFLAGS) $< -o $@
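
This Makefile builds the pybind11 extension in place. Assuming it and helpers.cpp sit in the megatron/data package directory (which the relative `from . import helpers` used below suggests), a minimal smoke test after running `make` might look like this sketch:

# Minimal sketch, assuming `make` has produced helpers.<abi-tag>.so next to the
# package's __init__.py so that the package import resolves.
from megatron.data import helpers
print(helpers.build_mapping.__doc__)  # pybind11-generated signature of the bound C++ function
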
from . import indexed_dataset
from .bert_tokenization import FullTokenizer as FullBertTokenizer
from .dataset import AlbertDataset
@@ -7,27 +7,36 @@ import numpy as np
import torch
from torch.utils.data import Dataset
from dataset_utils import build_training_sample
from .dataset_utils import build_training_sample
#from data.mapping import build_training_samples_mapping
class AlbertDataSet(Dataset):
from . import helpers
from megatron.data import FullBertTokenizer, indexed_dataset
def __init__(self, indexed_dataset, tokenizer, num_epochs,
class AlbertDataset(Dataset):
def __init__(self, indexed_dataset, tokenizer, num_epochs, max_num_samples,
masked_lm_prob, max_seq_length, short_seq_prob, seed):
# Params to store.
self.seed = seed
self.masked_lm_prob = masked_lm_prob
self.max_seq_length = max_seq_length
self.tokenizer = tokenizer
# Indexed dataset.
self.indexed_dataset = indexed_dataset
# Build the samples mapping.
self.samples_mapping = build_training_samples_mapping(
indexed_dataset,
if not max_num_samples:
max_num_samples = len(indexed_dataset) * num_epochs
self.samples_mapping = helpers.build_mapping(
indexed_dataset.doc_idx,
indexed_dataset.sizes,
num_epochs,
self.max_seq_length,
max_num_samples,
self.max_seq_length-3, # account for added tokens
short_seq_prob,
self.seed)
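
Each row of the mapping built above is a triple of sentence indices plus a target length. A small hypothetical helper (illustrative only, not part of the commit) shows how a row is expanded, mirroring the __getitem__ logic further down:

# Hypothetical helper, mirroring __getitem__: expand one samples_mapping row.
def expand_mapping_row(samples_mapping, indexed_dataset, idx):
    # Each row is [first_sentence, last_sentence_exclusive, target_seq_length],
    # matching what the C++ build_mapping writes into the maps array.
    start_index, end_index, seq_length = samples_mapping[idx]
    sentences = [indexed_dataset[i] for i in range(start_index, end_index)]
    return sentences, seq_length
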
@@ -40,8 +49,17 @@ class AlbertDataSet(Dataset):
self.pad_id = tokenizer.vocab['[PAD]']
@classmethod
def from_paths(cls, vocab, data_prefix, data_impl,
num_epochs, max_num_samples, masked_lm_prob,
max_seq_length, short_seq_prob, seed):
tokenizer = FullBertTokenizer(vocab, do_lower_case=True)
idx_ds = indexed_dataset.make_dataset(data_prefix, data_impl)
return cls(idx_ds, tokenizer, num_epochs, max_num_samples, masked_lm_prob,
max_seq_length, short_seq_prob, seed)
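
A hedged usage sketch of the new classmethod; the vocab and data prefix below are placeholders and must point at a BERT vocab file and an already-built indexed dataset:

# Usage sketch with placeholder paths (hypothetical, for illustration only).
from megatron.data import AlbertDataset
ds = AlbertDataset.from_paths('path/to/vocab.txt',   # vocab
                              'path/to/corpus',      # data_prefix
                              'mmap',                # data_impl
                              num_epochs=5, max_num_samples=None,
                              masked_lm_prob=0.15, max_seq_length=512,
                              short_seq_prob=0.1, seed=1234)
sample = ds[0]  # dict with 'text', 'labels', 'is_random', 'loss_mask', 'padding_mask', 'truncated', ...
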
def __len__(self):
return self.samples.shape[0]
return self.samples_mapping.shape[0]
def __getitem__(self, idx):
rng = random.Random(self.seed + idx)
@@ -49,6 +67,9 @@ class AlbertDataSet(Dataset):
sample = []
for index in range(start_index, end_index):
sample.append(self.indexed_dataset[index])
for s in sample:
if len(s) > 1000:
print(self.tokenizer.convert_ids_to_tokens(s))
return build_training_sample(sample, seq_length,
self.max_seq_length,
self.vocab_id_list,
@@ -186,7 +207,6 @@ class JaredDataset(object):
if __name__ == '__main__':
print('dataset ...')
from bert_tokenization import FullTokenizer
@@ -207,8 +227,8 @@ if __name__ == '__main__':
sentences.extend(sent)
yield sentences
input_file = '/raid/mshoeybi/data/albert/sample/samples_11.json'
vocab_file = '/raid/mshoeybi/data/albert/bert_vocab/vocab.txt'
input_file = 'test/samples_10000.json'
vocab_file = 'test/vocab.txt'
tokenizer = FullTokenizer(vocab_file, do_lower_case=True)
document_generator = document_generator_provider(input_file)
@@ -35,9 +35,8 @@ def build_training_sample(sample,
tokens_a, tokens_b, is_next_random = get_a_and_b_segments(sample, rng)
# Truncate to `target_sequence_length`.
# Note that we have to account for [CLS] A [SEP] B [SEP]
max_num_tokens = target_seq_length - 3
truncate_segments(tokens_a, tokens_b, len(tokens_a), len(tokens_b),
max_num_tokens = target_seq_length
truncated = truncate_segments(tokens_a, tokens_b, len(tokens_a), len(tokens_b),
max_num_tokens, rng)
# Build tokens and tokentypes.
@@ -48,7 +47,7 @@ def build_training_sample(sample,
max_predictions_per_seq = masked_lm_prob * max_num_tokens
(tokens, masked_positions, masked_labels, _) = create_masked_lm_predictions(
tokens, vocab_id_list, vocab_id_to_token_dict, masked_lm_prob,
cls_id, sep_id, mask_id, max_predictions_per_seq)
cls_id, sep_id, mask_id, max_predictions_per_seq, rng)
# Padding.
tokens_np, tokentypes_np, labels, padding_mask, loss_mask \
@@ -61,7 +60,8 @@ def build_training_sample(sample,
'labels': labels,
'is_random': int(is_next_random),
'loss_mask': loss_mask,
'padding_mask': padding_mask}
'padding_mask': padding_mask,
'truncated': int(truncated)}
return train_sample
@@ -99,11 +99,12 @@ def get_a_and_b_segments(sample, rng):
def truncate_segments(tokens_a, tokens_b, len_a, len_b, max_num_tokens, rng):
"""Truncates a pair of sequences to a maximum sequence length."""
#print(len_a, len_b, max_num_tokens)
assert len_a > 0
assert len_b > 0
if (len_a + len_b) <= max_num_tokens:
return
else:
if len_a + len_b <= max_num_tokens:
return False
while len_a + len_b > max_num_tokens:
if len_a > len_b:
len_a -= 1
tokens = tokens_a
@@ -114,8 +115,7 @@ def truncate_segments(tokens_a, tokens_b, len_a, len_b, max_num_tokens, rng):
del tokens[0]
else:
tokens.pop()
truncate_segments(tokens_a, tokens_b, len_a, len_b, max_num_tokens, rng)
return True
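
The recursion is replaced by a loop, and the function now reports whether anything was dropped. A standalone sketch of the same idea (the committed version uses rng to decide whether the longer segment loses a token from the front or the back; this sketch always trims the back):

def truncate_pair_sketch(tokens_a, tokens_b, max_num_tokens):
    # Illustrative stand-in for truncate_segments: trim the longer segment
    # until the pair fits, and report whether any truncation happened.
    if len(tokens_a) + len(tokens_b) <= max_num_tokens:
        return False
    while len(tokens_a) + len(tokens_b) > max_num_tokens:
        longer = tokens_a if len(tokens_a) > len(tokens_b) else tokens_b
        longer.pop()  # the committed code picks front or back at random via rng
    return True
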
def create_tokens_and_tokentypes(tokens_a, tokens_b, cls_id, sep_id):
"""Merge segments A and B, add [CLS] and [SEP] and build tokentypes."""
@@ -161,6 +161,7 @@ def create_masked_lm_predictions(tokens,
masked_lm_prob,
cls_id, sep_id, mask_id,
max_predictions_per_seq,
rng,
max_ngrams=3,
do_whole_word_mask=True,
favor_longer_ngram=False,
@@ -468,4 +469,3 @@ if __name__ == '__main__':
string += '{:5d}'.format(tokentype)
string += '{:5d}'.format(padding_mask)
print(string)
@@ -3,6 +3,7 @@
#include <iostream>
#include <limits>
#include <math.h>
#include <stdexcept>
#include <pybind11/pybind11.h>
#include <pybind11/numpy.h>
@@ -20,11 +21,11 @@ inline uint32_t get_sample_len(const int short_seq_ratio,
return max_length;
}
py::array_t<uint32_t> build_mapping(const py::array_t<uint32_t>& docs_,
template<typename DocIdx>
py::array build_mapping_impl(const py::array_t<uint32_t>& docs_,
const py::array_t<uint16_t>& sizes_,
const int num_epochs,
const int max_num_samples,
const uint64_t max_num_samples,
const int max_seq_length,
const double short_seq_prob,
const int seed) {
@@ -33,7 +34,7 @@ py::array_t<uint32_t> build_mapping(const py::array_t<uint32_t>& docs_,
" documents with " << sizes_.shape(0) << " sentences ..." << endl;
// For efficiency, convert probability to ratio.
const int short_seq_ratio = int(round(1.0 / short_seq_prob));
const auto short_seq_ratio = static_cast<int>(round(1.0 / short_seq_prob));
// Remove bound checks.
auto docs = docs_.unchecked<1>();
@@ -47,8 +48,8 @@ py::array_t<uint32_t> build_mapping(const py::array_t<uint32_t>& docs_,
}
// Mapping and its length (1D).
int num_samples = -1;
uint32_t* maps = NULL;
int64_t num_samples = -1;
DocIdx* maps = NULL;
// Perform two iterations, in the first iteration get the size
// and allocate memory and in the second iteration populate the map.
@@ -59,9 +60,7 @@ py::array_t<uint32_t> build_mapping(const py::array_t<uint32_t>& docs_,
srand(seed);
// Set the flag on second iteration.
if (iteration == 1) {
second = true;
}
second = iteration == 1;
// Counters:
uint32_t empty_docs = 0;
@@ -72,7 +71,7 @@ py::array_t<uint32_t> build_mapping(const py::array_t<uint32_t>& docs_,
// For each epoch:
for (int epoch=0; epoch < num_epochs; ++epoch) {
if (map_index >= max_num_samples) {
if (map_index >= max_num_samples && !second) {
cout << " > reached " << max_num_samples << " samples after " <<
epoch << " epochs ..." << endl;
break;
@@ -81,14 +80,14 @@ py::array_t<uint32_t> build_mapping(const py::array_t<uint32_t>& docs_,
for (int doc=0; doc < (docs.shape(0) - 1); ++doc) {
// Document sentences are in [sent_index_first, sent_index_last).
const uint32_t sent_index_first = docs[doc];
const uint32_t sent_index_last = docs[doc + 1];
const auto sent_index_first = docs[doc];
const auto sent_index_last = docs[doc + 1];
// At the beginning of the document, the previous index is the start index.
uint32_t prev_start_index = sent_index_first;
auto prev_start_index = sent_index_first;
// Remaining sentences.
uint32_t num_remain_sent = sent_index_last - sent_index_first;
auto num_remain_sent = sent_index_last - sent_index_first;
// Some bookkeeping
if ((epoch == 0) && (!second)) {
@@ -107,12 +106,12 @@ py::array_t<uint32_t> build_mapping(const py::array_t<uint32_t>& docs_,
if (num_remain_sent > 1) {
// Set values.
uint32_t size = 0;
uint32_t num_sent = 0;
uint32_t seq_len = get_sample_len(short_seq_ratio, max_seq_length);
auto size = uint32_t{0};
auto num_sent = uint32_t{0};
auto seq_len = get_sample_len(short_seq_ratio, max_seq_length);
// Loop through sentences.
for (uint32_t sent_index=sent_index_first;
for (auto sent_index=sent_index_first;
sent_index < sent_index_last; ++sent_index) {
// Add the size and number of sentences.
@@ -129,13 +128,19 @@ py::array_t<uint32_t> build_mapping(const py::array_t<uint32_t>& docs_,
// Populate the map.
if (second) {
const uint64_t map_index_0 = 3 * map_index;
const auto map_index_0 = 3 * map_index;
maps[map_index_0] = prev_start_index;
maps[map_index_0 + 1] = sent_index + 1;
maps[map_index_0 + 2] = seq_len;
}
// Update indices / counters.
// check for overflow
if (map_index == std::numeric_limits<DocIdx>::max()) {
cout << "number of samples exceeded maximum allowed by type: "
<< std::numeric_limits<DocIdx>::max() << endl;
throw std::overflow_error("Number of samples");
}
map_index += 1;
prev_start_index = sent_index + 1;
seq_len = get_sample_len(short_seq_ratio, max_seq_length);
@@ -148,29 +153,24 @@ py::array_t<uint32_t> build_mapping(const py::array_t<uint32_t>& docs_,
} // for (int doc=0; doc < num_docs; ++doc) {
} // for (int epoch=0; epoch < num_epochs; ++epoch) {
// For now only support mappings up to MAX_INT.
if (map_index > std::numeric_limits<int>::max()) {
cout << "number of samples ("<< map_index <<") exceeded MAX_INT" << endl;
throw(-1);
}
else if (!second) {
if (!second) {
cout << " number of samples: " <<
map_index << endl;
cout << " number of empty documents: " <<
empty_docs << endl;
cout << " number of documents with one sentence: " <<
one_sent_docs << endl;
maps = new uint32_t[3*map_index];
num_samples = int(map_index);
maps = new DocIdx[3*map_index];
num_samples = map_index;
}
} // for (int iteration=0; iteration < 2; ++iteration) {
// Shuffle.
for (int i=(num_samples - 1); i > 0; --i) {
const int j = rand() % (i + 1);
uint64_t i0 = 3 * i;
uint64_t j0 = 3 * j;
for (auto i=(num_samples - 1); i > 0; --i) {
const auto j = rand() % (i + 1);
const auto i0 = 3 * i;
const auto j0 = 3 * j;
// Swap values.
swap(maps[i0], maps[j0]);
swap(maps[i0 + 1], maps[j0 + 1]);
@@ -181,22 +181,35 @@ py::array_t<uint32_t> build_mapping(const py::array_t<uint32_t>& docs_,
// Method to deallocate memory.
py::capsule free_when_done(maps, [](void *mem_) {
uint32_t *mem = reinterpret_cast<uint32_t *>(mem_);
DocIdx *mem = reinterpret_cast<DocIdx*>(mem_);
cout << "freeing memory for the dataset mapping" << endl;
delete[] mem;
});
// Return the numpy array.
return py::array_t<uint32_t>({num_samples, 3}, // shape
return py::array(std::vector<int64_t>{num_samples, 3}, // shape
{3*4, 4}, // C-style contiguous strides
maps, // the data pointer
free_when_done); // numpy array references
}
py::array build_mapping(const py::array& docs_,
const py::array& sizes_,
const int num_epochs,
const uint64_t max_num_samples,
const int max_seq_length,
const double short_seq_prob,
const int seed) {
if (sizes_.size() > std::numeric_limits<uint32_t>::max()) {
return build_mapping_impl<uint64_t>(docs_, sizes_, num_epochs, max_num_samples,
max_seq_length, short_seq_prob, seed);
} else {
return build_mapping_impl<uint32_t>(docs_, sizes_, num_epochs, max_num_samples,
max_seq_length, short_seq_prob, seed);
}
}
PYBIND11_MODULE(helpers, m) {
m.def("build_mapping", &build_mapping);
}
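
For completeness, a sketch of exercising the binding directly from Python (assumptions: the module has been built with the Makefile at the top of this commit, and the dtypes below follow the uint32/uint16 signatures of build_mapping_impl, though pybind11 can also convert compatible arrays):

import numpy as np
from megatron.data import helpers  # compiled extension, see the Makefile above

# Two toy documents: sentences [0, 3) and [3, 5); sizes are per-sentence token counts.
doc_idx = np.array([0, 3, 5], dtype=np.uint32)
sizes = np.array([20, 35, 10, 40, 25], dtype=np.uint16)

mapping = helpers.build_mapping(doc_idx, sizes,
                                3,     # num_epochs
                                100,   # max_num_samples
                                125,   # max_seq_length (dataset.py passes max_seq_length - 3)
                                0.1,   # short_seq_prob
                                1234)  # seed
print(mapping.shape, mapping.dtype)  # (num_samples, 3), rows as in the samples mapping above
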
@@ -458,7 +458,7 @@ class MMapIndexedDataset(torch.utils.data.Dataset):
if self._index.dtype != np.int64:
np_array = np_array.astype(np.int64)
return torch.from_numpy(np_array)
return np_array
elif isinstance(idx, slice):
start, stop, step = idx.indices(len(self))
if step != 1:
@@ -7,7 +7,7 @@ import torch
script_dir = os.path.dirname(os.path.realpath(__file__))
sys.path.append(os.path.join(script_dir, "../../../"))
from megatron.data import indexed_dataset, FullBertTokenizer
from megatron.data import indexed_dataset, FullBertTokenizer, AlbertDataset
def test_indexed_dataset(args):
ds = indexed_dataset.make_dataset(args.data, args.dataset_impl)
@@ -31,18 +31,47 @@ def test_indexed_dataset(args):
print("Newline in string!")
print(i)
def test_albert_dataset(args):
# tokenizer = FullBertTokenizer(args.vocab, do_lower_case=True)
# idataset = indexed_dataset.make_dataset(args.data, args.dataset_impl)
# ds = AlbertDataset(idataset, tokenizer)
ds = AlbertDataset.from_paths(args.vocab, args.data, args.dataset_impl,
args.epochs, args.max_num_samples,
args.masked_lm_prob, args.seq_length,
args.short_seq_prob, args.seed)
truncated = 0
total = 0
for s in ds:
ids = s['text']
tokens = ds.tokenizer.convert_ids_to_tokens(ids)
print(tokens)
exit()
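
The truncated and total counters initialized above are not used yet in this version of the test; a plausible continuation (an assumption, not part of the commit) would tally the new 'truncated' flag instead of exiting after the first sample:

# Hypothetical continuation (not in the commit): tally the truncation rate.
truncated, total = 0, 0
for s in ds:
    total += 1
    truncated += s['truncated']
print('truncated {} of {} samples'.format(truncated, total))
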
def main():
parser = argparse.ArgumentParser()
parser.add_argument('--data', type=str, help='prefix to data files')
parser.add_argument('--vocab', type=str, help='Path to vocab.txt')
parser.add_argument('--dataset-impl', type=str, default='infer',
choices=['lazy', 'cached', 'mmap', 'infer'])
parser.add_argument('--epochs', type=int, default=5,
help='Number of epochs to plan for')
parser.add_argument('--max-num-samples', type=int, default=None,
help='Maximum number of samples to plan for')
parser.add_argument('--masked-lm-prob', type=float, default=0.15,
help='probability of masking tokens')
parser.add_argument('--seq-length', type=int, default=512,
help='maximum sequence length')
parser.add_argument('--short-seq-prob', type=float, default=0.1,
help='probability of creating a short sequence')
parser.add_argument('--seed', type=int, default=1234,
help='random seed')
args = parser.parse_args()
if args.dataset_impl == "infer":
args.dataset_impl = indexed_dataset.infer_dataset_impl(args.data)
test_indexed_dataset(args)
test_albert_dataset(args)
# test_indexed_dataset(args)
if __name__ == "__main__":
main()