Commit 8ba76558 authored by Neel Kant

Merge block-dataset into ict-stable

parents 1a3f5663 3dc39c4a
......@@ -42,6 +42,11 @@ def build_train_valid_test_datasets(data_prefix, data_impl, splits_string,
data_impl,
skip_warmup)
if ict_dataset:
titles_dataset = get_indexed_dataset_(data_prefix + '-titles',
data_impl,
skip_warmup)
# Get start and end indices of train/valid/test into doc-idx
# Note that doc-idx is designed to be num-docs + 1 so we can
# easily iterate over it.
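(For illustration, a minimal sketch of how a doc-idx array with num-docs + 1 entries is iterated; the array values here are made up.)

import numpy as np

# Hypothetical doc-idx for 3 documents: entry i marks where document i's
# sentences begin, and the trailing entry marks the end of the last document.
doc_idx = np.array([0, 4, 9, 12], dtype=np.int64)

for d in range(len(doc_idx) - 1):
    # Sentences of document d live in the half-open range [doc_idx[d], doc_idx[d + 1]).
    first, last = doc_idx[d], doc_idx[d + 1]
    print('doc', d, 'covers sentences', first, 'to', last - 1)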
......@@ -78,7 +83,7 @@ def build_train_valid_test_datasets(data_prefix, data_impl, splits_string,
# Build the dataset accordingly.
kwargs = dict(
name=name,
indexed_dataset=indexed_dataset,
context_dataset=indexed_dataset,
data_prefix=data_prefix,
num_epochs=None,
max_num_samples=train_valid_test_num_samples[index],
......@@ -88,7 +93,7 @@ def build_train_valid_test_datasets(data_prefix, data_impl, splits_string,
)
if ict_dataset:
dataset = InverseClozeDataset(**kwargs)
dataset = InverseClozeDataset(titles_dataset=titles_dataset, **kwargs)
else:
dataset = BertDataset(masked_lm_prob=masked_lm_prob, **kwargs)
# Set the original pointer so dataset remains the main dataset.
......
......@@ -304,7 +304,211 @@ py::array build_mapping(const py::array_t<int64_t>& docs_,
}
}
template<typename DocIdx>
py::array build_blocks_mapping_impl(const py::array_t<int64_t>& docs_,
const py::array_t<int32_t>& sizes_,
const py::array_t<int32_t>& titles_sizes_,
const int32_t num_epochs,
const uint64_t max_num_samples,
const int32_t max_seq_length,
const int32_t seed,
const bool verbose) {
/* Build a mapping of (start-index, end-index, doc-index) where
start and end index are the indices of the sentences in the block
and doc-index is the index of the document the block was drawn from
(used to look up its title). */
// Consistency checks.
assert(num_epochs > 0);
assert(max_seq_length > 1);
assert(seed > 0);
// Remove bound checks.
auto docs = docs_.unchecked<1>();
auto sizes = sizes_.unchecked<1>();
auto titles_sizes = titles_sizes_.unchecked<1>();
if (verbose) {
const auto sent_start_index = docs[0];
const auto sent_end_index = docs[docs_.shape(0) - 1];
const auto num_sentences = sent_end_index - sent_start_index;
cout << " using:" << endl << std::flush;
cout << " number of documents: " << docs_.shape(0) - 1 <<
endl << std::flush;
cout << " sentences range: [" << sent_start_index <<
", " << sent_end_index << ")" << endl << std::flush;
cout << " total number of sentences: " << num_sentences <<
endl << std::flush;
cout << " number of epochs: " << num_epochs <<
endl << std::flush;
cout << " maximum number of samples: " << max_num_samples <<
endl << std::flush;
cout << " maximum sequence length: " << max_seq_length <<
endl << std::flush;
cout << " seed: " << seed << endl <<
std::flush;
}
// Mapping and its length (1D).
int64_t num_samples = -1;
DocIdx* maps = NULL;
// Perform two iterations: in the first, compute the size and allocate
// memory; in the second, populate the map.
bool second = false;
for (int32_t iteration=0; iteration<2; ++iteration) {
// Set the flag on second iteration.
second = (iteration == 1);
// Current map index.
uint64_t map_index = 0;
// For each epoch:
for (int32_t epoch=0; epoch<num_epochs; ++epoch) {
if (map_index >= max_num_samples) {
if (verbose && (!second)) {
cout << " reached " << max_num_samples << " samples after "
<< epoch << " epochs ..." << endl << std::flush;
}
break;
}
// For each document:
for (int32_t doc=0; doc<(docs.shape(0) - 1); ++doc) {
// Document sentences are in [sent_index_first, sent_index_last)
const auto sent_index_first = docs[doc];
const auto sent_index_last = docs[doc + 1];
const auto target_seq_len = max_seq_length - titles_sizes[doc];
// At the beginning of the document, the previous index is the
// start index.
auto prev_start_index = sent_index_first;
// Remaining sentences in the document.
auto num_remain_sent = sent_index_last - sent_index_first;
// Detect documents with long sentences.
bool contains_long_sentence = false;
if (num_remain_sent > 1) {
for (auto sent_index=sent_index_first;
sent_index < sent_index_last; ++sent_index) {
if (sizes[sent_index] > LONG_SENTENCE_LEN){
contains_long_sentence = true;
break;
}
}
}
// If we have at least two sentences and none of them is too long.
if ((num_remain_sent > 1) && (!contains_long_sentence)) {
// Set values.
auto seq_len = int32_t{0};
auto num_sent = int32_t{0};
// Loop through sentences.
for (auto sent_index=sent_index_first;
sent_index < sent_index_last; ++sent_index) {
// Add the size and number of sentences.
seq_len += sizes[sent_index];
++num_sent;
--num_remain_sent;
// If we have reached the target length,
// and more than one sentence is left in the document,
// and the block has at least two sentences,
// or if we have reached the end of the document.
if (((seq_len >= target_seq_len) &&
(num_remain_sent > 1) &&
(num_sent > 1) ) || (num_remain_sent == 0)) {
// Populate the map.
if (second) {
const auto map_index_0 = 3 * map_index;
maps[map_index_0] = static_cast<DocIdx>(prev_start_index);
maps[map_index_0 + 1] = static_cast<DocIdx>(sent_index + 1);
maps[map_index_0 + 2] = static_cast<DocIdx>(doc);
}
// Update indices / counters.
++map_index;
prev_start_index = sent_index + 1;
seq_len = 0;
num_sent = 0;
}
} // for (auto sent_index=sent_index_first; ...
} // if (num_remain_sent > 1) {
} // for (int doc=0; doc < num_docs; ++doc) {
} // for (int epoch=0; epoch < num_epochs; ++epoch) {
if (!second) {
if (verbose) {
cout << " will create mapping for " << map_index <<
" samples" << endl << std::flush;
}
assert(maps == NULL);
assert(num_samples < 0);
maps = new DocIdx[3*map_index];
num_samples = static_cast<int64_t>(map_index);
}
} // for (int iteration=0; iteration < 2; ++iteration) {
// Shuffle.
// We need a 64 bit random number generator as we might have more
// than 2 billion samples.
std::mt19937_64 rand64_gen(seed + 1);
for (auto i=(num_samples - 1); i > 0; --i) {
const auto j = static_cast<int64_t>(rand64_gen() % (i + 1));
const auto i0 = 3 * i;
const auto j0 = 3 * j;
// Swap values.
swap(maps[i0], maps[j0]);
swap(maps[i0 + 1], maps[j0 + 1]);
swap(maps[i0 + 2], maps[j0 + 2]);
}
// Method to deallocate memory.
py::capsule free_when_done(maps, [](void *mem_) {
DocIdx *mem = reinterpret_cast<DocIdx*>(mem_);
delete[] mem;
});
// Return the numpy array.
const auto byte_size = sizeof(DocIdx);
return py::array(std::vector<int64_t>{num_samples, 3}, // shape
{3*byte_size, byte_size}, // C-style contiguous strides
maps, // the data pointer
free_when_done); // numpy array references
}
py::array build_blocks_mapping(const py::array_t<int64_t>& docs_,
const py::array_t<int>& sizes_,
const py::array_t<int>& titles_sizes_,
const int num_epochs,
const uint64_t max_num_samples,
const int max_seq_length,
const int seed,
const bool verbose) {
if (sizes_.size() > std::numeric_limits<uint32_t>::max()) {
if (verbose) {
cout << " using uint64 for data mapping..." << endl << std::flush;
}
return build_blocks_mapping_impl<uint64_t>(docs_, sizes_, titles_sizes_,
num_epochs, max_num_samples, max_seq_length, seed, verbose);
} else {
if (verbose) {
cout << " using uint32 for data mapping..." << endl << std::flush;
}
return build_blocks_mapping_impl<uint32_t>(docs_, sizes_, titles_sizes_,
num_epochs, max_num_samples, max_seq_length, seed, verbose);
}
}
PYBIND11_MODULE(helpers, m) {
m.def("build_mapping", &build_mapping);
m.def("build_blocks_mapping", &build_blocks_mapping);
}
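For readers skimming the C++ above: build_blocks_mapping_impl greedily packs consecutive sentences of a document into blocks whose token count reaches max_seq_length minus the document's title length, records one (start-sentence, end-sentence, doc) triple per block, and then shuffles the triples with a 64-bit generator (the wrapper switches to uint64 indices once the sentence count no longer fits in uint32). Below is a rough, simplified Python equivalent of the packing loop, covering a single epoch and omitting the long-sentence filter; all names are illustrative:

def build_block_map(doc_idx, sizes, title_sizes, max_seq_length):
    # Returns (start_sentence, end_sentence, doc) triples, mirroring the C++
    # loop above in simplified form (one epoch, no long-sentence filtering).
    maps = []
    for doc in range(len(doc_idx) - 1):
        first, last = doc_idx[doc], doc_idx[doc + 1]
        target = max_seq_length - title_sizes[doc]
        remaining = last - first
        if remaining < 2:
            continue
        start, seq_len, num_sent = first, 0, 0
        for sent in range(first, last):
            seq_len += sizes[sent]
            num_sent += 1
            remaining -= 1
            # Close the block once it is long enough (keeping at least two
            # sentences in it and more than one sentence for what follows),
            # or when the document is exhausted.
            if (seq_len >= target and remaining > 1 and num_sent > 1) or remaining == 0:
                maps.append((start, sent + 1, doc))
                start, seq_len, num_sent = sent + 1, 0, 0
    return maps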
import itertools
import random
import os
import sys
import time
import numpy as np
......@@ -11,17 +13,28 @@ from megatron import print_rank_0
from megatron import mpu
from megatron.data import helpers
class InverseClozeDataset(Dataset):
"""Dataset containing sentences and various 'blocks' for an inverse cloze task."""
def __init__(self, name, indexed_dataset, data_prefix,
"""Dataset containing sentences and their blocks for an inverse cloze task."""
def __init__(self, name, context_dataset, titles_dataset, data_prefix,
num_epochs, max_num_samples, max_seq_length,
short_seq_prob, seed):
self.name = name
self.seed = seed
self.max_seq_length = max_seq_length
self.indexed_dataset = indexed_dataset
self.context_dataset = context_dataset
self.titles_dataset = titles_dataset
self.short_seq_prob = short_seq_prob
self.rng = random.Random(self.seed)
self.samples_mapping = get_samples_mapping(self.context_dataset,
self.titles_dataset,
data_prefix,
num_epochs,
max_num_samples,
self.max_seq_length,
self.seed,
self.name)
tokenizer = get_tokenizer()
self.vocab_id_list = list(tokenizer.inv_vocab.keys())
self.vocab_id_to_token_list = tokenizer.inv_vocab
......@@ -29,23 +42,35 @@ class InverseClozeDataset(Dataset):
self.sep_id = tokenizer.sep
self.mask_id = tokenizer.mask
self.pad_id = tokenizer.pad
self.offset = 0
def __len__(self):
return self.indexed_dataset.doc_idx.shape[0]
return self.samples_mapping.shape[0]
def __getitem__(self, idx):
# get rng state corresponding to index (allows deterministic random pair)
rng = random.Random(idx + 20000 + self.seed)
start_idx, end_idx, doc_idx = self.samples_mapping[idx]
title = list(self.titles_dataset[int(doc_idx)])
context = [list(self.context_dataset[i]) for i in range(start_idx, end_idx)]
assert len(context) > 1
# avoid selecting the first or last sentence to be the query.
if len(context) == 2:
rand_sent_idx = int(self.rng.random() > 0.5)
else:
rand_sent_idx = self.rng.randint(1, len(context) - 2)
# keep the query in the context 10% of the time.
if self.rng.random() < 0.1:
input = context[rand_sent_idx].copy()
else:
input = context.pop(rand_sent_idx)
# get seq length. Save 2 tokens for beginning and end
target_seq_length = self.max_seq_length - 2
if rng.random() < self.short_seq_prob:
target_seq_length = rng.randint(5, target_seq_length)
# may still need to truncate because a block is only closed off once
# the cumulative sentence length exceeds max_seq_length.
input = input[:self.max_seq_length - 2]
context = list(itertools.chain(*context))[:self.max_seq_length - (3 + len(title))]
input_data, context_data = self.get_input_and_context(idx, target_seq_length, rng)
input_tokens, input_token_types, input_pad_mask = input_data
context_tokens, context_token_types, context_pad_mask = context_data
input_tokens, input_token_types, input_pad_mask = self.concat_and_pad_tokens(input)
context_tokens, context_token_types, context_pad_mask = self.concat_and_pad_tokens(context, title)
sample = {
'input_text': np.array(input_tokens),
......@@ -58,20 +83,12 @@ class InverseClozeDataset(Dataset):
return sample
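As a usage note, __getitem__ returns a dictionary of fixed-length numpy arrays; only the 'input_text' key is visible in this hunk, so the sketch below sticks to it (everything here is illustrative and assumes an already-constructed InverseClozeDataset named dataset):

sample = dataset[0]                      # `dataset` is assumed to exist
query_ids = sample['input_text']         # padded token ids of the query sentence
assert query_ids.shape[0] == dataset.max_seq_length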
def get_sentence_split_doc(self, idx):
"""fetch document at index idx and split into sentences"""
doc_start = self.indexed_dataset.doc_idx[idx]
doc_end = self.indexed_dataset.doc_idx[idx + 1]
doc_sentences_array = self.indexed_dataset[doc_start:doc_end]
doc_sentences = [list(arr) for arr in doc_sentences_array]
return doc_sentences
def concat_and_pad_tokens(self, tokens):
def concat_and_pad_tokens(self, tokens, title=None):
"""concat with special tokens and pad sequence to self.max_seq_length"""
tokens = [self.cls_id] + tokens + [self.sep_id]
assert len(tokens) <= self.max_seq_length
if title is not None:
tokens += title + [self.sep_id]
assert len(tokens) <= self.max_seq_length, len(tokens)
num_pad = self.max_seq_length - len(tokens)
pad_mask = [0] * len(tokens) + [1] * num_pad
......@@ -79,65 +96,82 @@ class InverseClozeDataset(Dataset):
token_types = [0] * self.max_seq_length
return tokens, token_types, pad_mask
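A small worked example of the scheme above, assuming the elided padding step fills with self.pad_id (all ids below are made up):

max_seq_length = 8
cls_id, sep_id, pad_id = 101, 102, 0                        # made-up ids
tokens, title = [7, 8], [5]

tokens = [cls_id] + tokens + [sep_id] + title + [sep_id]    # [101, 7, 8, 102, 5, 102]
num_pad = max_seq_length - len(tokens)
pad_mask = [0] * len(tokens) + [1] * num_pad                # [0, 0, 0, 0, 0, 0, 1, 1]
tokens += [pad_id] * num_pad                                # padded to length 8
token_types = [0] * max_seq_length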
def get_input_and_context(self, idx, target_seq_length, rng):
"""fetches a sentence and its surrounding context"""
num_tries = 0
while num_tries < 20:
num_tries += 1
doc = None
while doc is None:
doc = self.get_sentence_split_doc(idx + self.offset)
if not doc:
doc = None
self.offset += 1
num_sentences = len(doc)
padless_max_len = self.max_seq_length - 2
# select a random sentence from the document as input
# TODO: consider adding multiple input sentences.
input_sentence_idx = rng.randint(0, num_sentences - 1)
input_tokens = doc[input_sentence_idx][:target_seq_length]
if not len(input_tokens) > 0:
self.offset += 1
continue
context_tokens = []
# 10% of the time, the input sentence is left in the context.
# The other 90% of the time, keep it out.
if rng.random() < 0.1:
context_tokens = input_tokens.copy()
view_preceding = True
view_radius = 1
while len(context_tokens) < padless_max_len:
# keep adding sentences while the context can accommodate more.
if view_preceding:
examine_idx = input_sentence_idx - view_radius
if examine_idx >= 0:
new_tokens = doc[examine_idx]
context_tokens = new_tokens + context_tokens
else:
examine_idx = input_sentence_idx + view_radius
if examine_idx < num_sentences:
new_tokens = doc[examine_idx]
context_tokens += new_tokens
view_radius += 1
view_preceding = not view_preceding
if view_radius > num_sentences:
break
# assemble the tokens and token types of the context
context_tokens = context_tokens[:padless_max_len]
if not len(context_tokens) > 0:
self.offset += 1
continue
# concatenate 'CLS' and 'SEP' tokens and add extra token types
input_tokens, input_token_types, input_pad_mask = self.concat_and_pad_tokens(input_tokens)
context_tokens, context_token_types, context_pad_mask = self.concat_and_pad_tokens(context_tokens)
return (input_tokens, input_token_types, input_pad_mask), \
(context_tokens, context_token_types, context_pad_mask)
else:
raise RuntimeError("Could not get a valid data point from InverseClozeDataset")
def get_samples_mapping(context_dataset,
titles_dataset,
data_prefix,
num_epochs,
max_num_samples,
max_seq_length,
seed,
name):
if not num_epochs:
if not max_num_samples:
raise ValueError("Need to specify either max_num_samples "
"or num_epochs")
num_epochs = np.iinfo(np.int32).max - 1
if not max_num_samples:
max_num_samples = np.iinfo(np.int64).max - 1
# Filename of the index mapping
indexmap_filename = data_prefix
indexmap_filename += '_{}_indexmap'.format(name)
if num_epochs != (np.iinfo(np.int32).max - 1):
indexmap_filename += '_{}ep'.format(num_epochs)
if max_num_samples != (np.iinfo(np.int64).max - 1):
indexmap_filename += '_{}mns'.format(max_num_samples)
indexmap_filename += '_{}msl'.format(max_seq_length)
indexmap_filename += '_{}s'.format(seed)
indexmap_filename += '.npy'
# Build the index mapping if it does not already exist.
if torch.distributed.get_rank() == 0 and \
not os.path.isfile(indexmap_filename):
print(' > WARNING: could not find index map file {}, building '
'the indices on rank 0 ...'.format(indexmap_filename))
# Make sure the types match the helpers input types.
assert context_dataset.doc_idx.dtype == np.int64
assert context_dataset.sizes.dtype == np.int32
# Build samples mapping
verbose = torch.distributed.get_rank() == 0
start_time = time.time()
print_rank_0(' > building samples index mapping for {} ...'.format(
name))
samples_mapping = helpers.build_blocks_mapping(
context_dataset.doc_idx,
context_dataset.sizes,
titles_dataset.sizes,
num_epochs,
max_num_samples,
max_seq_length-3, # account for added tokens
seed,
verbose)
print_rank_0(' > done building samples index mapping')
np.save(indexmap_filename, samples_mapping, allow_pickle=True)
print_rank_0(' > saved the index mapping in {}'.format(
indexmap_filename))
# Make sure all the ranks have built the mapping
print_rank_0(' > elapsed time to build and save samples mapping '
'(seconds): {:4f}'.format(
time.time() - start_time))
# This should be a barrier, but the NCCL barrier assumes
# device_index == rank, which does not hold in the
# model-parallel case.
counts = torch.cuda.LongTensor([1])
torch.distributed.all_reduce(counts, group=mpu.get_data_parallel_group())
assert counts[0].item() == torch.distributed.get_world_size(
group=mpu.get_data_parallel_group())
# Load the index mapping.
print_rank_0(' > loading indexed mapping from {}'.format(
indexmap_filename))
start_time = time.time()
samples_mapping = np.load(indexmap_filename, allow_pickle=True)
print_rank_0(' loaded indexed file in {:3.3f} seconds'.format(
time.time() - start_time))
print_rank_0(' total number of samples: {}'.format(
samples_mapping.shape[0]))
return samples_mapping
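For reference, the cache filename assembled above encodes every knob that affects the mapping; here is a hedged helper that mirrors the same construction (the function name and example values are made up):

import numpy as np

def index_map_filename(data_prefix, name, num_epochs, max_num_samples, max_seq_length, seed):
    # Mirrors the filename construction in get_samples_mapping (illustrative only).
    fn = '{}_{}_indexmap'.format(data_prefix, name)
    if num_epochs != (np.iinfo(np.int32).max - 1):
        fn += '_{}ep'.format(num_epochs)
    if max_num_samples != (np.iinfo(np.int64).max - 1):
        fn += '_{}mns'.format(max_num_samples)
    return fn + '_{}msl_{}s.npy'.format(max_seq_length, seed)

# index_map_filename('my-corpus', 'train', np.iinfo(np.int32).max - 1, 1000, 256, 1234)
# -> 'my-corpus_train_indexmap_1000mns_256msl_1234s.npy'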
import argparse
import itertools
import json
import multiprocessing
import nltk
......@@ -43,18 +44,28 @@ class Encoder(object):
def encode(self, json_line):
text = json.loads(json_line)[self.args.json_key]
if not text:
text = "no text"
doc_ids = []
for sentence in Encoder.splitter.tokenize(text):
tokens = Encoder.tokenizer.tokenize(sentence)
ids = Encoder.tokenizer.convert_tokens_to_ids(tokens)
if len(ids) > 0:
doc_ids.append(ids)
else:
print("no ids!", flush=True)
tokens = Encoder.tokenizer.tokenize("no text")
ids = Encoder.tokenizer.convert_tokens_to_ids(tokens)
doc_ids.append(ids)
if self.args.flatten and len(doc_ids) > 1:
doc_ids = [list(itertools.chain(*doc_ids))]
return doc_ids, len(json_line)
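To make the --flatten behaviour concrete, a tiny illustrative example (token ids are made up):

import itertools

doc_ids = [[12, 34, 56], [78, 90]]                # one list of ids per sentence
flattened = [list(itertools.chain(*doc_ids))]     # what --flatten produces
print(flattened)                                  # [[12, 34, 56, 78, 90]]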
def main():
parser = argparse.ArgumentParser()
parser.add_argument('--input', type=str, help='Path to input JSON')
parser.add_argument('--vocab', type=str, help='Path to vocab.txt')
parser.add_argument('--flatten', action='store_true', help='Flatten sentence-split documents into a single list of ids')
parser.add_argument('--json-key', type=str, default='text',
help='Key to extract from json')
parser.add_argument('--output-prefix', type=str, help='Path to binary output file without suffix')
......
......@@ -24,7 +24,7 @@ from megatron import get_adlr_autoresume
from megatron import mpu
from megatron import print_rank_0
from megatron.checkpointing import save_checkpoint
from megatron.data.samplers import DistributedBatchSampler, RandomSampler
from megatron.data.samplers import DistributedBatchSampler
from megatron.fp16 import FP16_Optimizer
......@@ -102,16 +102,12 @@ def make_data_loader(dataset):
num_workers = args.num_workers
# Use a simple sampler with distributed batch sampler.
#sampler = torch.utils.data.SequentialSampler(dataset)
sampler = RandomSampler(dataset,
replacement=True,
num_samples=global_batch_size*args.train_iters)
sampler = torch.utils.data.SequentialSampler(dataset)
batch_sampler = DistributedBatchSampler(sampler=sampler,
batch_size=global_batch_size,
drop_last=True,
rank=rank,
world_size=world_size,
wrap_last=True)
world_size=world_size)
# Torch dataloader.
return torch.utils.data.DataLoader(dataset,
batch_sampler=batch_sampler,
......
......@@ -102,7 +102,7 @@ def get_train_val_test_data():
"""Load the data on rank zero and boradcast number of tokens to all GPUS."""
args = get_args()
(train_data, val_data, test_data) = (None, None, None)
(train_data, valid_data, test_data) = (None, None, None)
# Data loader only on rank 0 of each model parallel group.
if mpu.get_model_parallel_rank() == 0:
......@@ -115,7 +115,7 @@ def get_train_val_test_data():
# Number of train/valid/test samples.
train_iters = args.train_iters
eval_iters = args.eval_iters
eval_iters = (train_iters // args.eval_iters + 1) * args.eval_iters
test_iters = args.eval_iters
train_val_test_num_samples = [train_iters * global_batch_size,
eval_iters * global_batch_size,
......
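A quick sanity check of the new eval_iters formula with made-up numbers:

# With train_iters = 1000 and args.eval_iters = 100:
#   eval_iters = (1000 // 100 + 1) * 100 = 1100
# so the validation split is sized to cover an evaluation run at every
# interval during training plus the final one.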