Commit 75bd9b54 authored by Jared Casper

Merge branch 'megatron_sampler' into 'main'

Simplified sampler (will be needed later for batch size increase) and removed deprecated data stuff

See merge request ADLR/megatron-lm!177
parents ea81d62f fac6718a
......@@ -72,6 +72,9 @@ def parse_args(extra_args_provider=None, defaults={},
print('using {} for parameters ...'.format(args.params_dtype),
flush=True)
    # Consumed samples.
args.consumed_train_samples = 0
args.consumed_valid_samples = 0
# Set input defaults.
for key in defaults:
......@@ -133,14 +136,16 @@ def parse_args(extra_args_provider=None, defaults={},
def _print_args(args):
"""Print arguments."""
if args.rank == 0:
print('-------------------- arguments --------------------', flush=True)
print('------------------------ arguments ------------------------',
flush=True)
str_list = []
for arg in vars(args):
dots = '.' * (32 - len(arg))
dots = '.' * (48 - len(arg))
str_list.append(' {} {} {}'.format(arg, dots, getattr(args, arg)))
for arg in sorted(str_list, key=lambda x: x.lower()):
print(arg, flush=True)
print('---------------- end of arguments ----------------', flush=True)
print('-------------------- end of arguments ---------------------',
flush=True)
def _check_arg_is_not_none(args, arg):
......@@ -275,7 +280,7 @@ def _add_learning_rate_args(parser):
                       'and initial warmup, the learning rate at each '
'iteration would be different.')
group.add_argument('--lr-decay-style', type=str, default='linear',
choices=['constant', 'linear', 'cosine', 'exponential'],
choices=['constant', 'linear', 'cosine'],
help='Learning rate decay function.')
group.add_argument('--lr-decay-iters', type=int, default=None,
help='number of iterations to decay learning rate over,'
......@@ -397,8 +402,11 @@ def _add_validation_args(parser):
def _add_data_args(parser):
group = parser.add_argument_group(title='data and dataloader')
group.add_argument('--data-path', type=str, default=None,
help='Path to combined dataset to split.')
group.add_argument('--data-path', nargs='*', default=None,
                       help='Path to the training dataset. Accepted formats: '
                       '1) a single data path, 2) multiple datasets in the '
                       'form: dataset1-weight dataset1-path dataset2-weight '
                       'dataset2-path ...')
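    # Illustrative aside (not part of this change): with the weighted form, the
    # flag would be passed roughly as
    #   --data-path 0.3 corpus-a_text_document 0.7 corpus-b_text_document
    # where the prefixes are placeholders; a single unweighted path remains valid.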
group.add_argument('--split', type=str, default='969, 30, 1',
help='Comma-separated list of proportions for training,'
' validation, and test split. For example the split '
......
......@@ -213,12 +213,15 @@ def load_checkpoint(model, optimizer, lr_scheduler, load_arg='load'):
'iteration from checkpoint {}, exiting'.format(
checkpoint_name))
sys.exit()
# Check arguments.
assert args.consumed_train_samples == 0
assert args.consumed_valid_samples == 0
if 'args' in state_dict:
checkpoint_args = state_dict['args']
check_checkpoint_args(checkpoint_args)
        args.consumed_train_samples = getattr(checkpoint_args,
                                              'consumed_train_samples', 0)
        args.consumed_valid_samples = getattr(checkpoint_args,
                                              'consumed_valid_samples', 0)
else:
print_rank_0('could not find arguments in the checkpoint ...')
......
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Blendable dataset."""
import time
import numpy as np
import torch
from megatron import print_rank_0
from megatron import mpu
class BlendableDataset(torch.utils.data.Dataset):
def __init__(self, datasets, weights):
self.datasets = datasets
num_datasets = len(datasets)
assert num_datasets == len(weights)
self.size = 0
for dataset in self.datasets:
self.size += len(dataset)
# Normalize weights.
weights = np.array(weights, dtype=np.float64)
sum_weights = np.sum(weights)
assert sum_weights > 0.0
weights /= sum_weights
        # Build indices.
start_time = time.time()
assert num_datasets < 255
self.dataset_index = np.zeros(self.size, dtype=np.uint8)
self.dataset_sample_index = np.zeros(self.size, dtype=np.int64)
if torch.distributed.get_rank() == 0:
from megatron.data.dataset_utils import compile_helper
compile_helper()
# Simple barrier
tmp = torch.cuda.LongTensor([1])
torch.distributed.all_reduce(tmp, group=mpu.get_data_parallel_group())
from megatron.data import helpers
helpers.build_blending_indices(self.dataset_index,
self.dataset_sample_index,
weights, num_datasets, self.size,
torch.distributed.get_rank() == 0)
print_rank_0('> elapsed time for building blendable dataset indices: '
'{:.2f} (sec)'.format(time.time() - start_time))
def __len__(self):
return self.size
def __getitem__(self, idx):
dataset_idx = self.dataset_index[idx]
sample_idx = self.dataset_sample_index[idx]
return self.datasets[dataset_idx][sample_idx]
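
# A minimal reference sketch (added for illustration, not part of this merge
# request): a pure-NumPy version of what the compiled
# helpers.build_blending_indices routine computes, handy for sanity-checking
# the blending behaviour without building the C++ helper. The function name is
# illustrative only.
def _build_blending_indices_reference(weights, size):
    """Greedily assign each blended sample to the dataset that is currently
    most under-sampled relative to its normalized weight."""
    weights = np.asarray(weights, dtype=np.float64)
    weights = weights / weights.sum()
    dataset_index = np.zeros(size, dtype=np.uint8)
    dataset_sample_index = np.zeros(size, dtype=np.int64)
    current_samples = np.zeros(len(weights), dtype=np.int64)
    for sample_idx in range(size):
        # Same error term as in helpers.cpp: target count minus achieved count.
        denom = max(float(sample_idx), 1.0)
        errors = weights * denom - current_samples
        chosen = int(np.argmax(errors))
        dataset_index[sample_idx] = chosen
        dataset_sample_index[sample_idx] = current_samples[chosen]
        current_samples[chosen] += 1
    return dataset_index, dataset_sample_index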
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Dataloaders."""
import torch
from megatron import get_args
from megatron import mpu
def build_pretraining_data_loader(dataset, consumed_samples):
"""Buld dataloader given an input dataset."""
if dataset is None:
return None
args = get_args()
world_size = mpu.get_data_parallel_world_size()
global_batch_size = args.batch_size * world_size
# Megatron sampler
batch_sampler = MegatronPretrainingSampler(
total_samples=len(dataset),
consumed_samples=consumed_samples,
global_batch_size=global_batch_size,
rank=mpu.get_data_parallel_rank(),
world_size=world_size)
# Torch dataloader.
return torch.utils.data.DataLoader(dataset,
batch_sampler=batch_sampler,
num_workers=args.num_workers,
pin_memory=True)
class MegatronPretrainingSampler:
def __init__(self, total_samples, consumed_samples,
global_batch_size, rank, world_size):
# Keep a copy of input params for later use.
self.total_samples = total_samples
self.consumed_samples = consumed_samples
self.global_batch_size = global_batch_size
self.rank = rank
# Sanity checks.
assert self.total_samples > 0, \
'no sample to consume: {}'.format(self.total_samples)
assert self.consumed_samples < self.total_samples, \
'no samples left to consume: {}, {}'.format(self.consumed_samples,
self.total_samples)
assert self.global_batch_size > 0, \
'Unexpected global batch size: {}'.format(self.global_batch_size)
assert world_size > 0,\
'non zero world size is expected: {}'.format(world_size)
assert self.rank < world_size,\
'rank should be smaller than world size: {}, {}'.format(
self.rank, world_size)
# Batch size per rank.
assert self.global_batch_size % world_size == 0,\
'global batch size must be divisible by world size: {}, {}'.format(
self.global_batch_size, world_size)
self.batch_size_per_rank = self.global_batch_size // world_size
def __len__(self):
return self.total_samples
def __iter__(self):
batch = []
# Last batch if not complete will be dropped.
for idx in range(self.consumed_samples, self.total_samples):
batch.append(idx)
if len(batch) == self.global_batch_size:
start_idx = self.rank * self.batch_size_per_rank
end_idx = start_idx + self.batch_size_per_rank
yield batch[start_idx:end_idx]
batch = []
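
# A hedged usage sketch (not part of this merge request): exercising the new
# sampler on its own with a single data-parallel rank, to show how
# consumed_samples lets iteration resume mid-epoch. The numbers are arbitrary.
if __name__ == '__main__':
    sampler = MegatronPretrainingSampler(total_samples=16,
                                         consumed_samples=4,
                                         global_batch_size=4,
                                         rank=0,
                                         world_size=1)
    for batch in sampler:
        # Prints [4, 5, 6, 7], then [8, 9, 10, 11], then [12, 13, 14, 15];
        # the four already-consumed indices are skipped.
        print(batch)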
......@@ -18,11 +18,13 @@
# https://github.com/google-research/albert/blob/master/create_pretraining_data.py
# with some modifications.
import math
import time
import collections
import numpy as np
from megatron import get_args, print_rank_0
from megatron.data.blendable_dataset import BlendableDataset
from megatron.data.indexed_dataset import make_dataset as make_indexed_dataset
DSET_TYPE_STD = 'standard_bert'
......@@ -31,6 +33,38 @@ DSET_TYPE_ICT = 'ict'
DSET_TYPES = [DSET_TYPE_ICT, DSET_TYPE_STD]
def get_datasets_weights_and_num_samples(data_prefix,
train_valid_test_num_samples):
# The data prefix should be in the format of:
# weight-1, data-prefix-1, weight-2, data-prefix-2, ..
assert len(data_prefix) % 2 == 0
num_datasets = len(data_prefix) // 2
weights = [0]*num_datasets
prefixes = [0]*num_datasets
for i in range(num_datasets):
weights[i] = float(data_prefix[2*i])
prefixes[i] = (data_prefix[2*i+1]).strip()
# Normalize weights
weight_sum = 0.0
for weight in weights:
weight_sum += weight
assert weight_sum > 0.0
weights = [weight / weight_sum for weight in weights]
    # Add 0.5% (the 1.005 factor) so that in case the blending dataset does
    # not uniformly distribute the number of samples, we still have
    # samples left to feed to the network.
datasets_train_valid_test_num_samples = []
for weight in weights:
datasets_train_valid_test_num_samples.append(
[int(math.ceil(val * weight * 1.005))
for val in train_valid_test_num_samples])
return prefixes, weights, datasets_train_valid_test_num_samples
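
# Worked illustration (not part of this change; the prefixes are made up):
#
#   prefixes, weights, nums = get_datasets_weights_and_num_samples(
#       ['0.3', 'corpus-a_text_document', '0.7', 'corpus-b_text_document'],
#       train_valid_test_num_samples=[1000, 100, 10])
#   # prefixes -> ['corpus-a_text_document', 'corpus-b_text_document']
#   # weights  -> [0.3, 0.7]
#   # nums     -> [[302, 31, 4], [704, 71, 8]]  (ceil of weight * count * 1.005)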
def compile_helper():
"""Compile helper function ar runtime. Make sure this
is invoked on a single process."""
......@@ -360,6 +394,46 @@ def build_train_valid_test_datasets(data_prefix, data_impl, splits_string,
short_seq_prob, seed, skip_warmup,
dataset_type='standard_bert'):
if len(data_prefix) == 1:
return _build_train_valid_test_datasets(data_prefix[0],
data_impl, splits_string,
train_valid_test_num_samples,
max_seq_length, masked_lm_prob,
short_seq_prob, seed,
skip_warmup,
dataset_type=dataset_type)
# Blending dataset.
# Parse the values.
output = get_datasets_weights_and_num_samples(data_prefix,
train_valid_test_num_samples)
prefixes, weights, datasets_train_valid_test_num_samples = output
# Build individual datasets.
train_datasets = []
valid_datasets = []
test_datasets = []
for i in range(len(prefixes)):
        train_ds, valid_ds, test_ds = _build_train_valid_test_datasets(
            prefixes[i], data_impl, splits_string,
            datasets_train_valid_test_num_samples[i],
            max_seq_length, masked_lm_prob, short_seq_prob,
            seed, skip_warmup, dataset_type=dataset_type)
        train_datasets.append(train_ds)
        valid_datasets.append(valid_ds)
        test_datasets.append(test_ds)
# Blend.
blending_train_dataset = BlendableDataset(train_datasets, weights)
blending_valid_dataset = BlendableDataset(valid_datasets, weights)
blending_test_dataset = BlendableDataset(test_datasets, weights)
return (blending_train_dataset, blending_valid_dataset,
blending_test_dataset)
def _build_train_valid_test_datasets(data_prefix, data_impl, splits_string,
train_valid_test_num_samples,
max_seq_length, masked_lm_prob,
short_seq_prob, seed, skip_warmup,
dataset_type='standard_bert'):
if dataset_type not in DSET_TYPES:
raise ValueError("Invalid dataset_type: ", dataset_type)
......
......@@ -22,6 +22,8 @@ import numpy as np
import torch
from megatron import mpu, print_rank_0
from megatron.data.blendable_dataset import BlendableDataset
from megatron.data.dataset_utils import get_datasets_weights_and_num_samples
from megatron.data.dataset_utils import get_train_valid_test_split_
from megatron.data.indexed_dataset import make_dataset as make_indexed_dataset
......@@ -31,6 +33,46 @@ def build_train_valid_test_datasets(data_prefix, data_impl, splits_string,
seq_length, seed, skip_warmup):
"""Build train, valid, and test datasets."""
# Single dataset.
if len(data_prefix) == 1:
return _build_train_valid_test_datasets(data_prefix[0],
data_impl, splits_string,
train_valid_test_num_samples,
seq_length, seed, skip_warmup)
# Blending dataset.
# Parse the values.
output = get_datasets_weights_and_num_samples(data_prefix,
train_valid_test_num_samples)
prefixes, weights, datasets_train_valid_test_num_samples = output
# Build individual datasets.
train_datasets = []
valid_datasets = []
test_datasets = []
for i in range(len(prefixes)):
train_ds, valid_ds, test_ds = _build_train_valid_test_datasets(
prefixes[i], data_impl, splits_string,
datasets_train_valid_test_num_samples[i],
seq_length, seed, skip_warmup)
train_datasets.append(train_ds)
valid_datasets.append(valid_ds)
test_datasets.append(test_ds)
# Blend.
blending_train_dataset = BlendableDataset(train_datasets, weights)
blending_valid_dataset = BlendableDataset(valid_datasets, weights)
blending_test_dataset = BlendableDataset(test_datasets, weights)
return (blending_train_dataset, blending_valid_dataset,
blending_test_dataset)
def _build_train_valid_test_datasets(data_prefix, data_impl, splits_string,
train_valid_test_num_samples,
seq_length, seed, skip_warmup):
"""Build train, valid, and test datasets."""
# Indexed dataset.
indexed_dataset = get_indexed_dataset_(data_prefix,
data_impl,
......
......@@ -33,6 +33,69 @@ using namespace std;
const int32_t LONG_SENTENCE_LEN = 512;
void build_blending_indices(py::array_t<uint8_t>& dataset_index,
py::array_t<int64_t>& dataset_sample_index,
const py::array_t<double>& weights,
const int32_t num_datasets,
const int64_t size, const bool verbose) {
  /* Given multiple datasets and a weighting array, build samples
     such that they follow those weights. */
if (verbose) {
std::cout << "> building indices for blendable datasets ..." << std::endl;
}
// Get the pointer access without the checks.
auto dataset_index_ptr = dataset_index.mutable_unchecked<1>();
auto dataset_sample_index_ptr = dataset_sample_index.mutable_unchecked<1>();
auto weights_ptr = weights.unchecked<1>();
// Initialize buffer for number of samples used for each dataset.
int64_t current_samples[num_datasets];
for(int64_t i = 0; i < num_datasets; ++i) {
current_samples[i] = 0;
}
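  // The loop below is effectively a greedy largest-shortfall assignment: at
  // each step it picks the dataset whose achieved sample count lags its
  // target (weight * samples_so_far) by the largest amount, which keeps the
  // realized mixture close to the requested weights over every prefix of the
  // sample stream.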
// For each sample:
for(int64_t sample_idx = 0; sample_idx < size; ++sample_idx) {
// Determine where the max error in sampling is happening.
auto sample_idx_double = std::max(static_cast<double>(sample_idx), 1.0);
int64_t max_error_index = 0;
double max_error = weights_ptr[0] * sample_idx_double -
static_cast<double>(current_samples[0]);
for (int64_t dataset_idx = 1; dataset_idx < num_datasets; ++dataset_idx) {
double error = weights_ptr[dataset_idx] * sample_idx_double -
static_cast<double>(current_samples[dataset_idx]);
if (error > max_error) {
max_error = error;
max_error_index = dataset_idx;
}
}
// Populate the indices.
dataset_index_ptr[sample_idx] = static_cast<uint8_t>(max_error_index);
dataset_sample_index_ptr[sample_idx] = current_samples[max_error_index];
// Update the total samples.
current_samples[max_error_index] += 1;
}
// print info
if (verbose) {
std::cout << " > sample ratios:" << std::endl;
for (int64_t dataset_idx = 0; dataset_idx < num_datasets; ++dataset_idx) {
auto ratio = static_cast<double>(current_samples[dataset_idx]) /
static_cast<double>(size);
std::cout << " dataset " << dataset_idx << ", input: " <<
weights_ptr[dataset_idx] << ", achieved: " << ratio << std::endl;
}
}
}
py::array build_sample_idx(const py::array_t<int32_t>& sizes_,
const py::array_t<int32_t>& doc_idx_,
const int32_t seq_length,
......@@ -640,4 +703,5 @@ PYBIND11_MODULE(helpers, m) {
m.def("build_mapping", &build_mapping);
m.def("build_blocks_mapping", &build_blocks_mapping);
m.def("build_sample_idx", &build_sample_idx);
m.def("build_blending_indices", &build_blending_indices);
}
......@@ -6,7 +6,6 @@ import torch
from megatron import mpu, print_rank_0
from megatron.data.dataset_utils import create_masked_lm_predictions, pad_and_convert_to_numpy
from megatron.data.samplers import DistributedBatchSampler
from megatron import get_args, get_tokenizer, print_rank_0, mpu
......@@ -23,6 +22,8 @@ def get_one_epoch_dataloader(dataset, batch_size=None):
sampler = torch.utils.data.SequentialSampler(dataset)
# importantly, drop_last must be False to get all the data.
assert False, 'DistributedBatchSampler deprecated, change the implementation'
from megatron.data.samplers import DistributedBatchSampler
batch_sampler = DistributedBatchSampler(sampler,
batch_size=global_batch_size,
drop_last=False,
......
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Batch samplers that work with either random or sequential data samplers."""
import torch
from torch.utils import data
class RandomSampler(data.sampler.Sampler):
"""Based off of pytorch RandomSampler and DistributedSampler. Essentially
a RandomSampler, but this class lets the user set an epoch like
DistributedSampler Samples elements randomly. If without replacement, then
sample from a shuffled dataset. If with replacement, then user can
specify ``num_samples`` to draw.
Arguments:
data_source (Dataset): dataset to sample from
num_samples (int): number of samples to draw, default=len(dataset)
replacement (bool): samples are drawn with replacement if ``True``,
default=False
"""
def __init__(self, data_source, replacement=False, num_samples=None):
self.data_source = data_source
self.replacement = replacement
self._num_samples = num_samples
self.epoch = -1
if self._num_samples is not None and replacement is False:
raise ValueError("With replacement=False, num_samples should not "
"be specified, since a random permute will be "
"performed.")
if not isinstance(self.num_samples, int) or self.num_samples <= 0:
raise ValueError("num_samples should be a positive integer "
"value, but got num_samples={}".format(
self.num_samples))
if not isinstance(self.replacement, bool):
raise ValueError("replacement should be a boolean value, but got "
"replacement={}".format(self.replacement))
@property
def num_samples(self):
# dataset size might change at runtime
if self._num_samples is None:
return len(self.data_source)
return self._num_samples
def __iter__(self):
n = len(self.data_source)
g = torch.Generator()
if self.epoch >= 0:
g.manual_seed(self.epoch)
if self.replacement:
return iter(torch.randint(high=n, size=(self.num_samples,),
dtype=torch.int64, generator=g).tolist())
return iter(torch.randperm(n, generator=g).tolist())
def __len__(self):
return self.num_samples
def set_epoch(self, epoch):
self.epoch = epoch
class DistributedBatchSampler(data.sampler.BatchSampler):
"""Similar to normal implementation of distributed sampler, except
implementation is at the batch sampler level, instead of just the
sampler level. This allows wrapping of arbitrary data samplers
(sequential, random, WeightedRandomSampler, etc.) with this batch
sampler.
The `interleave` argument specifies how to distribute a batch. A value
of True combined with the above random sampler is equivalent to pytorch's
torch.utils.data.distributed.DistributedSampler.
For the following batch [0,1,2,3,4,5,6,7] and data parallelism of 2
specifying True will result in the following samples for each gpu:
GPU0: [0,2,4,6] GPU1: [1,3,5,7]
specifying False will result in the following samples:
GPU0: [0,1,2,3] GPU1: [4,5,6,7]"""
def __init__(self, sampler, batch_size, drop_last, rank=-1,
world_size=2, wrap_last=False, interleave=False):
super(DistributedBatchSampler, self).__init__(sampler, batch_size,
drop_last)
if rank == -1:
assert False, 'should not be here'
rank = torch.distributed.get_rank()
self.rank = rank
self.world_size = world_size
self.sampler.wrap_around = 0
self.wrap_around = 0
self.wrap_last = wrap_last
self.start_iter = 0
self.interleave = interleave
def __iter__(self):
batch = []
i = 0
for idx in self.data_iterator(self.sampler, wrap_around=False):
batch.append(idx)
if len(batch) == self.batch_size:
tbatch = self._batch(batch)
if i >= self.start_iter:
yield tbatch
self.start_iter = 0
i += 1
batch = []
batch_len = len(batch)
if batch_len > 0 and not self.drop_last:
if self.wrap_last:
self.sampler.wrap_around -= (self.batch_size)
self.wrap_around += (len(batch))
self.wrap_around %= self.batch_size
yield self._batch(batch)
if self.wrap_last:
self.sampler.wrap_around += self.batch_size
def data_iterator(self, _iter, wrap_around=False):
"""iterates through data and handles wrap around"""
for i, idx in enumerate(_iter):
if i < self.wrap_around % self.batch_size:
continue
if wrap_around:
self.wrap_around += 1
self.wrap_around %= self.batch_size
yield idx
def _batch(self, batch):
"""extracts samples only pertaining to this worker's batch"""
if self.interleave:
return batch[self.rank:self.batch_size:self.world_size]
start = self.rank * self.batch_size // self.world_size
end = (self.rank + 1) * self.batch_size // self.world_size
return batch[start:end]
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""utils for creating datasets"""
import os
import math
import torch
from .samplers import DistributedBatchSampler
from .datasets import json_dataset, csv_dataset, split_ds, ConcatDataset, SplitDataset, bert_sentencepair_dataset, GPT2Dataset
from .lazy_loader import exists_lazy, make_lazy, lazy_array_loader
from .tokenization import Tokenization, CommandToken, Tokenizer, CharacterLevelTokenizer, BertWordPieceTokenizer, GPT2BPETokenizer, make_tokenizer
from . import corpora
TRAIN_DATA = 0
VAL_DATA = 1
TEST_DATA = 2
def should_split(split):
"""
    given split proportions, checks whether the dataset should be split
Examples:
>>> should_split([10,0,0])
False
>>> should_split([1,.1,.2])
True
"""
return max(split) / sum(split) != 1.
def get_ext(path):
"""gets path extension"""
return os.path.splitext(path)[1]
def get_dataset(path, **kwargs):
"""gets dataset object based on keyword args and file at `path`"""
if supported_corpus(path):
return corpora.NAMED_CORPORA[path](**kwargs)
ext = get_ext(path)
if '.json' in ext:
text = json_dataset(path, **kwargs)
elif ext in ['.csv', '.tsv']:
text = csv_dataset(path, **kwargs)
else:
raise NotImplementedError('data file type %s is not supported' % (ext))
return text
def supported_corpus(corpus_name):
"""checks if corpus name is defined in `corpora.py`"""
return corpus_name in corpora.NAMED_CORPORA
def make_dataset(path, seq_length, text_key, label_key, lazy=False, process_fn=None, split=[1.],
delim=',', loose=False, binarize_sent=False, drop_unlabeled=False, tokenizer=None,
tokenizer_type='CharacterLevelTokenizer', tokenizer_model_path=None, vocab_size=None,
model_type='bpe', pad_token=0, character_converage=1.0, non_binary_cols=None,
parallel_group=None, **kwargs):
"""function to create datasets+tokenizers for common options"""
if isinstance(process_fn, str):
process_fn = eval(process_fn)
if non_binary_cols is not None:
# multilabel dataset support (only for csvs)
label_key = non_binary_cols
def get_dataset_from_path(path_):
if lazy:
# get lazily loaded dataset
named_corpora = False
if supported_corpus(path_):
named_corpora = True
name = path_
path_ = corpora.NAMED_CORPORA[path_].PATH
if torch.distributed.get_rank() == 0 and not exists_lazy(path_, data_type='data'):
# create cached version of dataset for lazy loading if it doesn't exist
text = get_dataset(name if named_corpora else path_, text_key=text_key, label_key=label_key, binarize_sent=binarize_sent,
delim=delim, drop_unlabeled=drop_unlabeled, loose_json=loose)
make_lazy(path_, text.X, data_type='data')
# This should be a barrier but nccl barrier assumes
# device_index=rank which is not the case for model
# parallel case
counts = torch.cuda.LongTensor([1])
torch.distributed.all_reduce(counts, group=parallel_group)
assert counts[0].item() == torch.distributed.get_world_size(
group=parallel_group)
text = lazy_array_loader(path_, data_type='data', map_fn=process_fn)
else:
# get dataset
text = get_dataset(path_, text_key=text_key, label_key=label_key, binarize_sent=binarize_sent,
delim=delim, drop_unlabeled=drop_unlabeled, loose_json=loose, preprocess_fn=process_fn)
return text
# get one or multiple datasets and concatenate
if isinstance(path, str):
path = [path]
datasets = [get_dataset_from_path(p) for p in path]
if len(datasets) == 1:
ds = datasets[0]
else:
ds = ConcatDataset(datasets)
# make tokenizer for dataset
if tokenizer is None:
tokenizer = make_tokenizer(tokenizer_type, ds, tokenizer_model_path, vocab_size, model_type,
pad_token, character_converage, **kwargs)
ds_type = ''
if 'ds_type' in kwargs:
ds_type = kwargs['ds_type']
ds.SetTokenizer(tokenizer)
# Split dataset into train/val/test (and wrap bert dataset)
if should_split(split):
ds = split_ds(ds, split)
if 'bert' in ds_type.lower():
presplit_sentences = kwargs['presplit_sentences'] if 'presplit_sentences' in kwargs else False
dstype = bert_sentencepair_dataset
ds = [dstype(d, max_seq_len=seq_length, presplit_sentences=presplit_sentences)
if d is not None else None for d in ds]
elif ds_type.lower() == 'gpt2':
ds = [GPT2Dataset(d, max_seq_len=seq_length) if d is not None else None for d in ds]
else:
if 'bert' in ds_type.lower():
presplit_sentences = kwargs['presplit_sentences'] if 'presplit_sentences' in kwargs else False
dstype = bert_sentencepair_dataset
ds = dstype(ds, max_seq_len=seq_length, presplit_sentences=presplit_sentences)
elif ds_type.lower() == 'gpt2':
ds = GPT2Dataset(ds, max_seq_len=seq_length)
return ds, tokenizer
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""parses arguments and preps data loader"""
import copy
import torch
from megatron import data_utils
from megatron import mpu
class DataConfig:
def __init__(self, defaults={}):
super(DataConfig, self).__init__()
self.defaults = defaults
def apply(self, args):
if torch.distributed.get_rank() == 0:
print('configuring data')
self.apply_defaults(args)
return make_loaders(args)
def set_defaults(self, **kwargs):
for k, v in kwargs.items():
self.defaults[k] = v
def apply_defaults(self, args):
for k, v in self.defaults.items():
k = k.replace('-', '_')
if not hasattr(args, k):
setattr(args, k, v)
def make_data_loader(dataset, batch_size, args):
shuffle = args.shuffle
if shuffle:
sampler = data_utils.samplers.RandomSampler(
dataset, replacement=True, num_samples=batch_size * args.train_iters)
else:
sampler = torch.utils.data.SequentialSampler(dataset)
world_size = torch.distributed.get_world_size(
group=mpu.get_data_parallel_group())
rank = torch.distributed.get_rank(group=mpu.get_data_parallel_group())
distributed = world_size > 1
drop_last = distributed
if distributed:
batch_sampler = data_utils.samplers.DistributedBatchSampler(sampler,
batch_size,
drop_last,
rank,
world_size)
else:
batch_sampler = torch.utils.data.BatchSampler(sampler,
batch_size,
drop_last)
data_loader = torch.utils.data.DataLoader(dataset,
batch_sampler=batch_sampler,
num_workers=args.num_workers,
pin_memory=True)
return data_loader
def make_tfrecord_loaders(args):
"""Load train/val/test dataset from shuffled TFRecords"""
import data_utils.tf_dl
data_set_args = {'batch_size': args.batch_size,
'max_seq_len': args.seq_length,
'max_preds_per_seq': args.max_preds_per_seq,
'train': True,
'num_workers': max(args.num_workers, 1),
'seed': args.seed + args.rank + 1,
'threaded_dl': args.num_workers > 0
}
train = data_utils.tf_dl.TFRecordDataLoader(args.train_data,
**data_set_args)
data_set_args['train'] = False
if args.eval_seq_length is not None:
data_set_args['max_seq_len'] = args.eval_seq_length
if args.eval_max_preds_per_seq is not None:
data_set_args['max_preds_per_seq'] = args.eval_max_preds_per_seq
valid = None
if args.valid_data is not None:
valid = data_utils.tf_dl.TFRecordDataLoader(args.valid_data,
**data_set_args)
test = None
if args.test_data is not None:
test = data_utils.tf_dl.TFRecordDataLoader(args.test_data,
**data_set_args)
tokenizer = data_utils.make_tokenizer(args.tokenizer_type,
train,
args.tokenizer_path,
args.vocab_size,
args.tokenizer_model_type,
cache_dir=args.cache_dir)
return (train, valid, test), tokenizer
def make_loaders(args):
"""makes training/val/test"""
if args.data_loader == 'tfrecords':
return make_tfrecord_loaders(args)
world_size = torch.distributed.get_world_size(
group=mpu.get_data_parallel_group())
batch_size = args.batch_size * world_size
eval_batch_size = batch_size
if args.eval_batch_size is not None:
eval_batch_size = args.eval_batch_size * world_size
seq_length = args.seq_length
if seq_length < 0:
seq_length = seq_length * world_size
eval_seq_length = args.eval_seq_length
if eval_seq_length is not None and eval_seq_length < 0:
eval_seq_length = eval_seq_length * world_size
split = get_split(args)
if args.data_path is not None:
args.train_data = args.data_path
data_set_args = {
'path': args.train_data,
'seq_length': seq_length,
'lazy': args.data_loader == 'lazy',
'delim': args.delim,
'text_key': args.text_key,
'label_key': 'label',
'non_binary_cols': None,
'ds_type': args.data_set_type,
'split': split,
'loose': args.loose_json,
'tokenizer_type': args.tokenizer_type,
'tokenizer_model_path': args.tokenizer_path,
'vocab_size': args.vocab_size,
'model_type': args.tokenizer_model_type,
'cache_dir': args.cache_dir,
'max_preds_per_seq': args.max_preds_per_seq,
'presplit_sentences': args.presplit_sentences,
'parallel_group': mpu.get_data_parallel_group()}
eval_set_args = copy.copy(data_set_args)
eval_set_args['split'] = [1.]
# if optional eval args were set then replace their
# equivalent values in the arg dict
if eval_seq_length:
eval_set_args['seq_length'] = eval_seq_length
if args.eval_max_preds_per_seq:
eval_set_args['max_preds_per_seq'] = args.eval_max_preds_per_seq
if args.eval_text_key is not None:
eval_set_args['text_key'] = args.eval_text_key
# make datasets splits and tokenizer
train = None
valid = None
test = None
if args.train_data is not None:
train, tokenizer = data_utils.make_dataset(**data_set_args)
if data_utils.should_split(split):
train, valid, test = train
eval_set_args['tokenizer'] = tokenizer
# make training and val dataset if necessary
if valid is None and args.valid_data is not None:
eval_set_args['path'] = args.valid_data
valid, tokenizer = data_utils.make_dataset(**eval_set_args)
eval_set_args['tokenizer'] = tokenizer
if test is None and args.test_data is not None:
eval_set_args['path'] = args.test_data
test, tokenizer = data_utils.make_dataset(**eval_set_args)
# wrap datasets with data loader
if train is not None and args.batch_size > 0:
train = make_data_loader(train, batch_size, args)
args.do_train = True
else:
args.do_train = False
eval_batch_size = eval_batch_size if eval_batch_size != 0 else batch_size
if valid is not None:
valid = make_data_loader(valid, eval_batch_size, args)
args.do_valid = True
else:
args.do_valid = False
if test is not None:
test = make_data_loader(test, eval_batch_size, args)
args.do_test = True
else:
args.do_test = False
return (train, valid, test), tokenizer
def get_split(args):
"""
Get dataset splits from comma separated string list
"""
splits = []
if args.split.find(',') != -1:
splits = [float(s) for s in args.split.split(',')]
elif args.split.find('/') != -1:
splits = [float(s) for s in args.split.split('/')]
else:
splits = [float(args.split)]
split_total = sum(splits)
if split_total < 1.:
splits.append(1 - split_total)
while len(splits) < 3:
splits.append(0.)
splits = splits[:3]
if args.valid_data is not None:
splits[1] = 0.
if args.test_data is not None:
splits[2] = 0.
final_sum = sum(splits)
return [s / final_sum for s in splits]
def configure_data():
"""add cmdline flags for configuring datasets"""
# These are options that are used by data_utils, but are either
# deprecated or not meant to be exposed to the command line user.
    # These options are intended to be set in code by specific scripts.
defaults = {
'world_size': 1,
'rank': -1,
'persist_state': 0,
'lazy': False,
'transpose': False,
'data_set_type': 'supervised',
'seq_length': 256,
'eval_seq_length': 256,
'samples_per_shard': 100
}
return DataConfig(defaults=defaults)
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""several datasets with preset arguments"""
from .datasets import json_dataset, csv_dataset
import os
class wikipedia(json_dataset):
"""
dataset for wikipedia with arguments configured for convenience
command line usage: `--train-data wikipedia`
"""
PATH = 'data/wikipedia/wikidump_lines.json'
assert_str = "make sure to set PATH for wikipedia data_utils/corpora.py"
def __init__(self, **kwargs):
assert os.path.exists(wikipedia.PATH), \
wikipedia.assert_str
if not kwargs:
kwargs = {}
kwargs['text_key'] = 'text'
kwargs['loose_json'] = True
super(wikipedia, self).__init__(wikipedia.PATH, **kwargs)
class webtext(json_dataset):
"""
dataset for webtext with arguments configured for convenience
command line usage: `--train-data webtext`
"""
PATH = 'data/webtext/data.json'
assert_str = "make sure to set PATH for webtext data_utils/corpora.py"
def __init__(self, **kwargs):
assert os.path.exists(webtext.PATH), \
webtext.assert_str
if not kwargs:
kwargs = {}
kwargs['text_key'] = 'text'
kwargs['loose_json'] = True
super(webtext, self).__init__(webtext.PATH, **kwargs)
NAMED_CORPORA = {
'wikipedia': wikipedia,
'webtext': webtext,
}
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""dataset objects for jsons, csvs, and BERT datasets"""
import os
import time
from operator import itemgetter
from bisect import bisect_right
import json
import warnings
import csv
import math
import random
from itertools import accumulate
from torch.utils import data
import pandas as pd
import numpy as np
import nltk
from nltk import tokenize
from .lazy_loader import lazy_array_loader, exists_lazy, make_lazy
from .tokenization import Tokenization
class ConcatDataset(data.Dataset):
"""
Dataset to concatenate multiple datasets.
Purpose: useful to assemble different existing datasets, possibly
large-scale datasets as the concatenation operation is done in an
on-the-fly manner.
Arguments:
datasets (sequence): List of datasets to be concatenated.
"""
@staticmethod
def cumsum(sequence):
r, s = [], 0
for e in sequence:
l = len(e)
r.append(l + s)
s += l
return r
def __init__(self, datasets, **kwargs):
super(ConcatDataset, self).__init__()
assert len(datasets) > 0, 'datasets should not be an empty iterable'
self.datasets = list(datasets)
self.is_lazy = sum([isinstance(ds, lazy_array_loader)
for ds in self.datasets]) == len(self.datasets)
self.cumulative_sizes = self.cumsum(self.datasets)
self._X = None
self._Y = None
self._lens = None
def SetTokenizer(self, tokenizer):
for ds in self.datasets:
ds.SetTokenizer(tokenizer)
def GetTokenizer(self):
return self.datasets[0].GetTokenizer()
def __len__(self):
return self.cumulative_sizes[-1]
def __getitem__(self, idx):
dataset_idx = bisect_right(self.cumulative_sizes, idx)
if dataset_idx == 0:
sample_idx = idx
else:
sample_idx = idx - self.cumulative_sizes[dataset_idx - 1]
return self.datasets[dataset_idx][sample_idx]
@property
def lens(self):
if self._lens is None:
self._lens = []
if self.is_lazy:
for data in self.datasets:
self._lens.extend(data.lens)
else:
for data in self.datasets:
self._lens.extend([len(d['text']) if isinstance(
d, dict) else len(d) for d in data])
return self._lens
@property
def X(self):
if self._X is None:
self._X = []
for data in self.datasets:
self._X.extend(data.X)
return self._X
@property
def Y(self):
if self._Y is None:
self._Y = []
for data in self.datasets:
self._Y.extend(list(data.Y))
self._Y = np.array(self._Y)
return self._Y
@property
def cummulative_sizes(self):
warnings.warn("cummulative_sizes attribute is renamed to "
"cumulative_sizes", DeprecationWarning, stacklevel=2)
return self.cumulative_sizes
class SplitDataset(data.Dataset):
"""
Dataset wrapper to access a subset of another dataset.
Purpose: useful to index into existing datasets, possibly
large-scale datasets as the subindexing operation is done in an
on-the-fly manner.
Arguments:
ds (Dataset or array-like): List of datasets to be subindexed
split_inds (1D array-like): List of indices part of subset
"""
def __init__(self, ds, split_inds, **kwargs):
self.split_inds = list(split_inds)
self.wrapped_data = ds
self.is_lazy = isinstance(ds, lazy_array_loader) or (hasattr(ds, 'is_lazy') and ds.is_lazy)
if self.is_lazy:
self.lens = itemgetter(*self.split_inds)(list(self.wrapped_data.lens))
self._X = None
self._Y = None
def __len__(self):
return len(self.split_inds)
def __getitem__(self, index):
return self.wrapped_data[self.split_inds[index]]
def SetTokenizer(self, tokenizer):
self.wrapped_data.SetTokenizer(tokenizer)
def GetTokenizer(self):
return self.wrapped_data.GetTokenizer()
@property
def X(self):
if self._X is None:
self._X = itemgetter(*self.split_inds)(self.wrapped_data.X)
return self._X
@property
def Y(self):
if self._Y is None:
self._Y = np.array(itemgetter(*self.split_inds)(self.wrapped_data.Y))
return self._Y
def __iter__(self):
for idx in self.split_inds:
yield self.wrapped_data[idx]
def split_ds(ds, split=[.8, .2, .0], shuffle=True):
"""
Split a dataset into subsets given proportions of how
much to allocate per split. If a split is 0% returns None for that split.
Purpose: Useful for creating train/val/test splits
Arguments:
ds (Dataset or array-like): Data to be split.
split (1D array-like): proportions to split `ds`. `sum(splits) != 0`
shuffle (boolean): Randomly split dataset. Default: True
"""
split_sum = sum(split)
if split_sum == 0:
raise Exception('Split cannot sum to 0.')
split = np.array(split)
split /= split_sum
ds_len = len(ds)
inds = np.arange(ds_len)
if shuffle:
np.random.shuffle(inds)
start_idx = 0
residual_idx = 0
rtn_ds = [None] * len(split)
for i, f in enumerate(split):
if f != 0:
proportion = ds_len * split[i]
residual_idx += proportion % 1
split_ = int(int(proportion) + residual_idx)
split_inds = inds[start_idx:start_idx + max(split_, 1)]
rtn_ds[i] = SplitDataset(ds, split_inds)
start_idx += split_
residual_idx %= 1
return rtn_ds
class csv_dataset(data.Dataset):
"""
Class for loading datasets from csv files.
Purpose: Useful for loading data for unsupervised modeling or transfer tasks
Arguments:
path (str): Path to csv file with dataset.
tokenizer (data_utils.Tokenizer): Tokenizer to use when processing text. Default: None
preprocess_fn (callable): Callable that process a string into desired format.
delim (str): delimiter for csv. Default: ','
binarize_sent (bool): binarize label values to 0 or 1 if they\'re on a different scale. Default: False
        drop_unlabeled (bool): drop rows with unlabelled values. Always fills remaining empty
            columns with -1 (regardless of whether rows are dropped based on value). Default: False
text_key (str): key to get text from csv. Default: 'sentence'
label_key (str): key to get label from json dictionary. Default: 'label'
Attributes:
X (list): all strings from the csv file
Y (np.ndarray): labels to train with
"""
def __init__(self, path, tokenizer=None, preprocess_fn=None, delim=',',
binarize_sent=False, drop_unlabeled=False, text_key='sentence', label_key='label',
**kwargs):
self.is_lazy = False
self.preprocess_fn = preprocess_fn
self.SetTokenizer(tokenizer)
self.path = path
self.delim = delim
self.text_key = text_key
self.label_key = label_key
self.drop_unlabeled = drop_unlabeled
if '.tsv' in self.path:
self.delim = '\t'
self.X = []
self.Y = []
try:
cols = [text_key]
if isinstance(label_key, list):
cols += label_key
else:
cols += [label_key]
data = pd.read_csv(self.path, sep=self.delim, usecols=cols, encoding='latin-1')
except BaseException:
data = pd.read_csv(self.path, sep=self.delim, usecols=[text_key], encoding='latin-1')
data = data.dropna(axis=0)
self.X = data[text_key].values.tolist()
try:
self.Y = data[label_key].values
except Exception as e:
self.Y = np.ones(len(self.X)) * -1
if binarize_sent:
self.Y = binarize_labels(self.Y, hard=binarize_sent)
def SetTokenizer(self, tokenizer):
if tokenizer is None:
self.using_tokenizer = False
if not hasattr(self, '_tokenizer'):
self._tokenizer = tokenizer
else:
self.using_tokenizer = True
self._tokenizer = tokenizer
def GetTokenizer(self):
return self._tokenizer
@property
def tokenizer(self):
if self.using_tokenizer:
return self._tokenizer
return None
def __len__(self):
return len(self.X)
def __getitem__(self, index):
"""process+tokenize string and return string,label,and stringlen"""
x = self.X[index]
if self.tokenizer is not None:
x = self.tokenizer.EncodeAsIds(x, self.preprocess_fn)
elif self.preprocess_fn is not None:
x = self.preprocess_fn(x)
y = self.Y[index]
if isinstance(y, str):
if self.tokenizer is not None:
y = self.tokenizer.EncodeAsIds(y, self.preprocess_fn)
elif self.preprocess_fn is not None:
y = self.preprocess_fn(y)
return {'text': x, 'length': len(x), 'label': y}
def write(self, writer_gen=None, path=None, skip_header=False):
"""
given a generator of metrics for each of the data points X_i,
write the metrics, text, and labels to a csv file
"""
if path is None:
path = self.path + '.results'
print('generating csv at ' + path)
with open(path, 'w') as csvfile:
c = csv.writer(csvfile, delimiter=self.delim)
if writer_gen is not None:
# if first item of generator is a header of what the metrics mean then
# write header to csv file
if not skip_header:
header = (self.label_key,) + tuple(next(writer_gen)) + (self.text_key,)
c.writerow(header)
for i, row in enumerate(writer_gen):
row = (self.Y[i],) + tuple(row) + (self.X[i],)
c.writerow(row)
else:
c.writerow([self.label_key, self.text_key])
for row in zip(self.Y, self.X):
c.writerow(row)
class json_dataset(data.Dataset):
"""
Class for loading datasets from a json dump.
Purpose: Useful for loading data for unsupervised modeling or transfer tasks
Arguments:
path (str): path to json file with dataset.
tokenizer (data_utils.Tokenizer): Tokenizer to use when processing text. Default: None
preprocess_fn (callable): callable function that process a string into desired format.
Takes string, maxlen=None, encode=None as arguments. Default: process_str
text_key (str): key to get text from json dictionary. Default: 'sentence'
label_key (str): key to get label from json dictionary. Default: 'label'
Attributes:
all_strs (list): list of all strings from the dataset
all_labels (list): list of all labels from the dataset (if they have it)
"""
def __init__(self, path, tokenizer=None, preprocess_fn=None, binarize_sent=False,
text_key='sentence', label_key='label', loose_json=False, **kwargs):
self.is_lazy = False
self.preprocess_fn = preprocess_fn
self.path = path
self.SetTokenizer(tokenizer)
self.X = []
self.Y = []
self.text_key = text_key
self.label_key = label_key
self.loose_json = loose_json
for j in self.load_json_stream(self.path):
s = j[text_key]
self.X.append(s)
self.Y.append(j[label_key])
if binarize_sent:
self.Y = binarize_labels(self.Y, hard=binarize_sent)
def SetTokenizer(self, tokenizer):
if tokenizer is None:
self.using_tokenizer = False
if not hasattr(self, '_tokenizer'):
self._tokenizer = tokenizer
else:
self.using_tokenizer = True
self._tokenizer = tokenizer
def GetTokenizer(self):
return self._tokenizer
@property
def tokenizer(self):
if self.using_tokenizer:
return self._tokenizer
return None
def __getitem__(self, index):
"""gets the index'th string from the dataset"""
x = self.X[index]
if self.tokenizer is not None:
x = self.tokenizer.EncodeAsIds(x, self.preprocess_fn)
elif self.preprocess_fn is not None:
x = self.preprocess_fn(x)
y = self.Y[index]
if isinstance(y, str):
if self.tokenizer is not None:
y = self.tokenizer.EncodeAsIds(y, self.preprocess_fn)
elif self.preprocess_fn is not None:
y = self.preprocess_fn(y)
return {'text': x, 'length': len(x), 'label': y}
def __len__(self):
return len(self.X)
def write(self, writer_gen=None, path=None, skip_header=False):
"""
given a generator of metrics for each of the data points X_i,
write the metrics, text, and labels to a json file
"""
if path is None:
path = self.path + '.results'
jsons = []
if writer_gen is not None:
            # if the first item of the generator is a header describing the metrics,
            # use it to name the keys in the output json
def gen_helper():
keys = {}
keys[0] = self.label_key
if not skip_header:
for idx, k in enumerate(tuple(next(writer_gen))):
keys[idx + 1] = k
for i, row in enumerate(writer_gen):
if i == 0 and skip_header:
for idx, _ in enumerate(row):
keys[idx + 1] = 'metric_%d' % (idx,)
j = {}
for idx, v in enumerate((self.Y[i],) + tuple(row)):
k = keys[idx]
j[k] = v
yield j
else:
def gen_helper():
for y in self.Y:
j = {}
j[self.label_key] = y
yield j
def out_stream():
for i, j in enumerate(gen_helper()):
j[self.text_key] = self.X[i]
yield j
self.save_json_stream(path, out_stream())
def save_json_stream(self, save_path, json_stream):
if self.loose_json:
with open(save_path, 'w') as f:
for i, j in enumerate(json_stream):
write_string = ''
if i != 0:
write_string = '\n'
write_string += json.dumps(j)
f.write(write_string)
else:
jsons = [j for j in json_stream]
json.dump(jsons, open(save_path, 'w'), separators=(',', ':'))
def load_json_stream(self, load_path):
if not self.loose_json:
jsons = json.load(open(load_path, 'r'))
generator = iter(jsons)
else:
def gen_helper():
with open(load_path, 'r') as f:
for row in f:
yield json.loads(row)
generator = gen_helper()
for j in generator:
if self.label_key not in j:
j[self.label_key] = -1
yield j
class GPT2Dataset(data.Dataset):
def __init__(self, ds,
max_seq_len=1024,
num_samples=None,
weighted=True,
sample_across_doc=True,
random_across_doc_sampling=True,
bias_for_single_doc=False,
sentence_start=False, **kwargs):
self.ds = ds
self.ds_len = len(self.ds)
self.num_samples = num_samples
if num_samples is None:
self.num_samples = 1000 * self.ds_len
self.max_seq_len = max_seq_len
self.tokenizer = self.ds.GetTokenizer()
self.ds.SetTokenizer(None)
self.weighted = weighted
self.sample_across_doc = sample_across_doc
self.random_across_doc_sampling = random_across_doc_sampling
self.bias_for_single_doc = bias_for_single_doc
self.sentence_start = sentence_start
self.init_weighting()
def init_weighting(self):
if self.weighted:
if hasattr(self.ds, 'is_lazy') and self.ds.is_lazy:
lens = np.array(self.ds.lens)
else:
lens = np.array([len(d['text']) if isinstance(d, dict)
else len(d) for d in self.ds])
self.total_len = np.sum(lens)
self.weighting = list(accumulate(lens))
else:
self.weighting = None
def get_weighted_samples(self, np_rng):
if self.weighting is not None:
idx = np_rng.randint(self.total_len)
return bisect_right(self.weighting, idx)
else:
return np_rng.randint(self.ds_len)
def __len__(self):
return self.num_samples
def __getitem__(self, idx):
# init rng
rng = random.Random(idx)
rng = np.random.RandomState(seed=[rng.randint(0, 2**32 - 1) for _ in range(16)])
# get possibly weighted random index from dataset
data_idx = self.get_weighted_samples(rng)
# data_idx = rng.choice(self.ds_len, p=self.weighting)
tokens = self.getidx(data_idx)
# truncate or pad tokens
num_tokens = len(tokens)
if self.bias_for_single_doc:
tokens_to_strip = num_tokens - self.max_seq_len - 1
else:
tokens_to_strip = num_tokens - 1
if tokens_to_strip > 0:
strip_left_tokens = rng.randint(tokens_to_strip + 1)
tokens = tokens[strip_left_tokens:]
if self.sentence_start:
token_copy = list(tokens)
not_done = True
while (len(token_copy) > 0) and not_done:
tok = token_copy.pop(0)
if self.contains_sentence_end(tok):
tokens = token_copy
not_done = False
            strip_right_tokens = len(tokens) - self.max_seq_len - 1
            if strip_right_tokens > 0:
                tokens = tokens[:-strip_right_tokens]
if self.sample_across_doc:
while (len(tokens) < (self.max_seq_len + 1)):
if self.random_across_doc_sampling:
data_idx = self.get_weighted_samples(rng)
else:
data_idx = (data_idx + 1) % self.ds_len
tokens += self.getidx(data_idx)
tokens = tokens[:(self.max_seq_len + 1)]
tokens = self.pad_seq(tokens)
return {'text': np.array(tokens), }
def getidx(self, data_idx):
data = self.ds[data_idx]
if isinstance(data, dict):
data = data['text']
# tokenize
tokenization = self.tokenizer.EncodeAsIds(data)
tokenization.append(self.tokenizer.get_command('eos'))
tokens = tokenization.tokenization
return tokens
def pad_seq(self, seq):
total_tokens = self.max_seq_len + 1
num_pad_tokens = max(0, total_tokens - len(seq))
seq += [self.tokenizer.get_command('pad').Id] * (num_pad_tokens)
return seq
def contains_sentence_end(self, tok):
tok = self.tokenizer.IdToToken(tok)
if '.' in tok:
return True
if '?' in tok:
return True
if '!' in tok:
return True
return False
class bert_sentencepair_dataset(data.Dataset):
"""
Dataset containing sentencepairs for BERT training. Each index corresponds to a randomly generated sentence pair.
Arguments:
ds (Dataset or array-like): data corpus to use for training
max_seq_len (int): maximum sequence length to use for a sentence pair
mask_lm_prob (float): proportion of tokens to mask for masked LM
max_preds_per_seq (int): Maximum number of masked tokens per sentence pair. Default: math.ceil(max_seq_len*mask_lm_prob/10)*10
short_seq_prob (float): Proportion of sentence pairs purposefully shorter than max_seq_len
dataset_size (int): number of random sentencepairs in the dataset. Default: len(ds)*(len(ds)-1)
"""
def __init__(self, ds, max_seq_len=512, mask_lm_prob=.15, max_preds_per_seq=None,
short_seq_prob=.01, dataset_size=None, presplit_sentences=False, weighted=True, **kwargs):
self.ds = ds
self.ds_len = len(self.ds)
self.tokenizer = self.ds.GetTokenizer()
self.vocab_words = list(self.tokenizer.text_token_vocab.values())
self.ds.SetTokenizer(None)
self.max_seq_len = max_seq_len
self.mask_lm_prob = mask_lm_prob
if max_preds_per_seq is None:
max_preds_per_seq = math.ceil(max_seq_len * mask_lm_prob / 10) * 10
self.max_preds_per_seq = max_preds_per_seq
self.short_seq_prob = short_seq_prob
self.dataset_size = dataset_size
if self.dataset_size is None:
self.dataset_size = self.ds_len * (self.ds_len - 1)
self.presplit_sentences = presplit_sentences
if not self.presplit_sentences:
nltk.download('punkt', download_dir="./nltk")
self.weighted = weighted
self.get_weighting()
def get_weighting(self):
if self.weighted:
if hasattr(self.ds, 'is_lazy') and self.ds.is_lazy:
lens = np.array(self.ds.lens)
else:
lens = np.array([len(d['text']) if isinstance(d, dict) else len(d)
for d in self.ds])
self.total_len = np.sum(lens)
self.weighting = list(accumulate(lens))
else:
self.weighting = None
def get_weighted_samples(self, np_rng):
if self.weighting is not None:
idx = np_rng.randint(self.total_len)
return bisect_right(self.weighting, idx)
else:
return np_rng.randint(self.ds_len)
def __len__(self):
return self.dataset_size
def __getitem__(self, idx):
# get rng state corresponding to index (allows deterministic random pair)
rng = random.Random(idx)
np_rng = np.random.RandomState(seed=[rng.randint(0, 2**32 - 1) for _ in range(16)])
# get seq length
target_seq_length = self.max_seq_len
short_seq = False
if rng.random() < self.short_seq_prob:
target_seq_length = rng.randint(2, target_seq_length)
short_seq = True
# get sentence pair and label
is_random_next = None
lena = 0
lenb = 0
while (is_random_next is None) or (lena < 1) or (lenb < 1):
tokensa, tokensb, is_random_next = self.create_random_sentencepair(
target_seq_length, rng, np_rng)
lena = len(tokensa[0])
lenb = len(tokensb[0])
# truncate sentence pair to max_seq_len
tokensa, tokensb = self.truncate_seq_pair(tokensa, tokensb, self.max_seq_len, rng)
# join sentence pair, mask, and pad
tokens, mask, mask_labels, pad_mask = self.create_masked_lm_predictions(
tokensa, tokensb, self.mask_lm_prob, self.max_preds_per_seq, self.vocab_words, rng)
sample = {
'text': np.array(
tokens[0]),
'types': np.array(
tokens[1]),
'is_random': int(is_random_next),
'mask': np.array(mask),
'mask_labels': np.array(mask_labels),
'pad_mask': np.array(pad_mask)}
return sample
def sentence_split(self, document):
"""split document into sentences"""
lines = document.split('\n')
if self.presplit_sentences:
return [line for line in lines if line]
rtn = []
for line in lines:
if line != '':
rtn.extend(tokenize.sent_tokenize(line))
return rtn
def sentence_tokenize(self, sent, sentence_num=0, beginning=False, ending=False):
"""tokenize sentence and get token types"""
tokens = self.tokenizer.EncodeAsIds(sent).tokenization
str_type = 'str' + str(sentence_num)
token_types = [self.tokenizer.get_type(str_type).Id] * len(tokens)
return tokens, token_types
def get_doc(self, idx):
"""gets text of document corresponding to idx"""
rtn = self.ds[idx]
if isinstance(rtn, dict):
rtn = rtn['text']
return rtn
def create_random_sentencepair(self, target_seq_length, rng, np_rng):
"""
fetches a random sentencepair corresponding to rng state similar to
https://github.com/google-research/bert/blob/master/create_pretraining_data.py#L248-L294
"""
is_random_next = None
curr_strs = []
curr_str_types = []
curr_len = 0
while curr_len < 1:
curr_len = 0
doc_a = None
while doc_a is None:
if self.weighted:
# doc_a_idx = np_rng.choice(self.ds_len, p=self.weighting)
doc_a_idx = self.get_weighted_samples(np_rng)
else:
doc_a_idx = rng.randint(0, self.ds_len - 1)
doc_a = self.sentence_split(self.get_doc(doc_a_idx))
if not doc_a:
doc_a = None
random_start_a = rng.randint(0, len(doc_a) - 1)
while random_start_a < len(doc_a):
sentence = doc_a[random_start_a]
sentence, sentence_types = self.sentence_tokenize(
sentence, 0, random_start_a == 0, random_start_a == len(doc_a))
curr_strs.append(sentence)
curr_str_types.append(sentence_types)
curr_len += len(sentence)
if random_start_a == len(doc_a) - 1 or curr_len >= target_seq_length:
break
random_start_a = (random_start_a + 1)
if curr_strs:
num_a = 1
if len(curr_strs) >= 2:
num_a = rng.randint(0, len(curr_strs))
tokens_a = []
token_types_a = []
for j in range(num_a):
tokens_a.extend(curr_strs[j])
token_types_a.extend(curr_str_types[j])
tokens_b = []
token_types_b = []
is_random_next = False
if len(curr_strs) == 1 or rng.random() < 0.5:
is_random_next = True
target_b_length = target_seq_length - len(tokens_a)
b_len = 0
while b_len < 1:
doc_b = None
while doc_b is None:
doc_b_idx = rng.randint(0, self.ds_len - 2)
doc_b_idx += int(doc_b_idx >= doc_a_idx)
doc_b = self.sentence_split(self.get_doc(doc_b_idx))
if not doc_b:
doc_b = None
random_start_b = rng.randint(0, len(doc_b) - 1)
while random_start_b < len(doc_b):
sentence_b = doc_b[random_start_b]
new_b_tokens, new_b_types = self.sentence_tokenize(
sentence_b, 1, random_start_b == 0, random_start_b == len(doc_b))
b_len += len(new_b_tokens)
tokens_b.extend(new_b_tokens)
token_types_b.extend(new_b_types)
if len(tokens_b) >= target_b_length:
break
random_start_b = (random_start_b + 1)
else:
is_random_next = False
for j in range(num_a, len(curr_strs)):
tokens_b.extend(curr_strs[j])
token_types_b.extend(curr_str_types[j])
return (tokens_a, token_types_a), (tokens_b, token_types_b), is_random_next
def truncate_seq_pair(self, a, b, max_seq_len, rng):
"""
Truncate sequence pair according to original BERT implementation:
https://github.com/google-research/bert/blob/master/create_pretraining_data.py#L391
"""
tokens_a, token_types_a = a
tokens_b, token_types_b = b
max_num_tokens = self.calc_seq_len(max_seq_len)
# max_num_tokens = max_seq_len - 3
while True:
len_a = len(tokens_a)
len_b = len(tokens_b)
total_length = len_a + len_b
if total_length <= max_num_tokens:
break
if len(tokens_a) > len(tokens_b):
trunc_tokens = tokens_a
trunc_types = token_types_a
else:
trunc_tokens = tokens_b
trunc_types = token_types_b
assert len(trunc_tokens) >= 1
if rng.random() < 0.5:
trunc_tokens.pop(0)
trunc_types.pop(0)
else:
trunc_tokens.pop()
trunc_types.pop()
return (tokens_a, token_types_a), (tokens_b, token_types_b)
def calc_seq_len(self, max_seq_len):
return max_seq_len - 3
def mask_token(self, idx, tokens, types, vocab_words, rng):
"""
helper function to mask `idx` token from `tokens` according to
section 3.3.1 of https://arxiv.org/pdf/1810.04805.pdf
"""
label = tokens[idx]
if rng.random() < 0.8:
new_label = self.tokenizer.get_command('MASK').Id
else:
if rng.random() < 0.5:
new_label = label
else:
new_label = rng.choice(vocab_words)
tokens[idx] = new_label
return label
def pad_seq(self, seq):
"""helper function to pad sequence pair"""
num_pad = max(0, self.max_seq_len - len(seq))
pad_mask = [0] * len(seq) + [1] * num_pad
seq += [self.tokenizer.get_command('pad').Id] * num_pad
return seq, pad_mask
def concat_tokens(self, tokens_a, token_types_a, tokens_b, token_types_b):
tokens = [self.tokenizer.get_command('ENC').Id] + tokens_a + [self.tokenizer.get_command(
'sep').Id] + tokens_b + [self.tokenizer.get_command('sep').Id]
token_types = [token_types_a[0]] + token_types_a + \
[token_types_a[0]] + token_types_b + [token_types_b[0]]
return tokens, token_types
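# Illustrative layout sketch (added for clarity; values are assumptions): with
# tokens_a = [a1, a2] and tokens_b = [b1], concat_tokens returns
#     [ENC, a1, a2, sep, b1, sep]
# with token types [A, A, A, A, B, B]. The candidate indices built in
# create_masked_lm_predictions below are [1, 2] + [4], i.e. they skip position 0
# (ENC) and both sep positions, so special tokens are never selected for masking.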
def create_masked_lm_predictions(self, a, b, mask_lm_prob, max_preds_per_seq, vocab_words, rng):
"""
Mask sequence pair for BERT training according to:
https://github.com/google-research/bert/blob/master/create_pretraining_data.py#L338
"""
tokens_a, token_types_a = a
tokens_b, token_types_b = b
tokens, token_types = self.concat_tokens(tokens_a, token_types_a, tokens_b, token_types_b)
len_a = len(tokens_a)
len_b = len(tokens_b)
cand_indices = [idx + 1 for idx in range(len_a)] + [idx + 2 + len_a for idx in range(len_b)]
rng.shuffle(cand_indices)
output_tokens, pad_mask = self.pad_seq(list(tokens))
output_types, _ = self.pad_seq(list(token_types))
num_to_predict = min(max_preds_per_seq, max(1, int(round(len(tokens) * mask_lm_prob))))
mask = [0] * len(output_tokens)
mask_labels = [-1] * len(output_tokens)
for idx in sorted(cand_indices[:num_to_predict]):
mask[idx] = 1
label = self.mask_token(idx, output_tokens, output_types, vocab_words, rng)
mask_labels[idx] = label
return (output_tokens, output_types), mask, mask_labels, pad_mask
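# Worked example of the masking budget (a sketch with assumed numbers): for a joined
# pair of 100 tokens with mask_lm_prob = 0.15 and max_preds_per_seq >= 15,
# num_to_predict = min(max_preds_per_seq, max(1, round(100 * 0.15))) = 15. Those 15
# shuffled candidate positions are passed to mask_token, which replaces ~80% of them
# with the MASK id, keeps ~10% unchanged, and swaps ~10% for a random vocabulary word;
# mask_labels stores the original token id at each masked position and -1 elsewhere.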
# This file is provided as is from:
# https://github.com/huggingface/pytorch-pretrained-BERT
# Please refer to their repository for copyright.
"""
Utilities for working with the local dataset cache.
This file is adapted from the AllenNLP library at https://github.com/allenai/allennlp
Copyright by the AllenNLP authors.
"""
from __future__ import (absolute_import, division, print_function, unicode_literals)
import json
import logging
import os
import shutil
import tempfile
from functools import wraps
from hashlib import sha256
import sys
from io import open
import boto3
import requests
from botocore.exceptions import ClientError
from tqdm import tqdm
try:
from urllib.parse import urlparse
except ImportError:
from urlparse import urlparse
try:
from pathlib import Path
PYTORCH_PRETRAINED_BERT_CACHE = Path(os.getenv('PYTORCH_PRETRAINED_BERT_CACHE',
Path.home() / '.pytorch_pretrained_bert'))
except (AttributeError, ImportError):
PYTORCH_PRETRAINED_BERT_CACHE = os.getenv('PYTORCH_PRETRAINED_BERT_CACHE',
os.path.join(os.path.expanduser("~"), '.pytorch_pretrained_bert'))
logger = logging.getLogger(__name__) # pylint: disable=invalid-name
def url_to_filename(url, etag=None):
"""
Convert `url` into a hashed filename in a repeatable way.
If `etag` is specified, append its hash to the url's, delimited
by a period.
"""
url_bytes = url.encode('utf-8')
url_hash = sha256(url_bytes)
filename = url_hash.hexdigest()
if etag:
etag_bytes = etag.encode('utf-8')
etag_hash = sha256(etag_bytes)
filename += '.' + etag_hash.hexdigest()
return filename
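# Hypothetical usage sketch (URL and ETag are illustrative):
#   url_to_filename('https://example.com/vocab.txt', etag='"abc"')
# returns sha256(url).hexdigest() + '.' + sha256(etag).hexdigest(), so the same URL
# with a new ETag maps to a fresh cache entry instead of reusing a stale download.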
def filename_to_url(filename, cache_dir=None):
"""
Return the url and etag (which may be ``None``) stored for `filename`.
Raise ``EnvironmentError`` if `filename` or its stored metadata do not exist.
"""
if cache_dir is None:
cache_dir = PYTORCH_PRETRAINED_BERT_CACHE
if sys.version_info[0] == 3 and isinstance(cache_dir, Path):
cache_dir = str(cache_dir)
cache_path = os.path.join(cache_dir, filename)
if not os.path.exists(cache_path):
raise EnvironmentError("file {} not found".format(cache_path))
meta_path = cache_path + '.json'
if not os.path.exists(meta_path):
raise EnvironmentError("file {} not found".format(meta_path))
with open(meta_path, encoding="utf-8") as meta_file:
metadata = json.load(meta_file)
url = metadata['url']
etag = metadata['etag']
return url, etag
def cached_path(url_or_filename, cache_dir=None):
"""
Given something that might be a URL (or might be a local path),
determine which. If it's a URL, download the file and cache it, and
return the path to the cached file. If it's already a local path,
make sure the file exists and then return the path.
"""
if cache_dir is None:
cache_dir = PYTORCH_PRETRAINED_BERT_CACHE
if sys.version_info[0] == 3 and isinstance(url_or_filename, Path):
url_or_filename = str(url_or_filename)
if sys.version_info[0] == 3 and isinstance(cache_dir, Path):
cache_dir = str(cache_dir)
parsed = urlparse(url_or_filename)
if parsed.scheme in ('http', 'https', 's3'):
# URL, so get it from the cache (downloading if necessary)
return get_from_cache(url_or_filename, cache_dir)
elif os.path.exists(url_or_filename):
# File, and it exists.
return url_or_filename
elif parsed.scheme == '':
# File, but it doesn't exist.
raise EnvironmentError("file {} not found".format(url_or_filename))
else:
# Something unknown
raise ValueError("unable to parse {} as a URL or as a local path".format(url_or_filename))
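# Hedged usage sketch (paths and URLs are illustrative, not from the original code):
#   local = cached_path('/tmp/vocab.txt')                   # existing file: returned as-is
#   remote = cached_path('https://example.com/vocab.txt')   # downloaded into the cache dir
#   weights = cached_path('s3://my-bucket/model.bin')       # fetched via boto3 and cached
# A missing local path raises EnvironmentError; an unrecognized scheme raises ValueError.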
def split_s3_path(url):
"""Split a full s3 path into the bucket name and path."""
parsed = urlparse(url)
if not parsed.netloc or not parsed.path:
raise ValueError("bad s3 path {}".format(url))
bucket_name = parsed.netloc
s3_path = parsed.path
# Remove '/' at beginning of path.
if s3_path.startswith("/"):
s3_path = s3_path[1:]
return bucket_name, s3_path
def s3_request(func):
"""
Wrapper function for s3 requests in order to create more helpful error
messages.
"""
@wraps(func)
def wrapper(url, *args, **kwargs):
try:
return func(url, *args, **kwargs)
except ClientError as exc:
if int(exc.response["Error"]["Code"]) == 404:
raise EnvironmentError("file {} not found".format(url))
else:
raise
return wrapper
@s3_request
def s3_etag(url):
"""Check ETag on S3 object."""
s3_resource = boto3.resource("s3")
bucket_name, s3_path = split_s3_path(url)
s3_object = s3_resource.Object(bucket_name, s3_path)
return s3_object.e_tag
@s3_request
def s3_get(url, temp_file):
"""Pull a file directly from S3."""
s3_resource = boto3.resource("s3")
bucket_name, s3_path = split_s3_path(url)
s3_resource.Bucket(bucket_name).download_fileobj(s3_path, temp_file)
def http_get(url, temp_file):
req = requests.get(url, stream=True)
content_length = req.headers.get('Content-Length')
total = int(content_length) if content_length is not None else None
progress = tqdm(unit="B", total=total)
for chunk in req.iter_content(chunk_size=1024):
if chunk: # filter out keep-alive new chunks
progress.update(len(chunk))
temp_file.write(chunk)
progress.close()
def get_from_cache(url, cache_dir=None):
"""
Given a URL, look for the corresponding dataset in the local cache.
If it's not there, download it. Then return the path to the cached file.
"""
if cache_dir is None:
cache_dir = PYTORCH_PRETRAINED_BERT_CACHE
if sys.version_info[0] == 3 and isinstance(cache_dir, Path):
cache_dir = str(cache_dir)
if not os.path.exists(cache_dir):
os.makedirs(cache_dir)
# Get eTag to add to filename, if it exists.
if url.startswith("s3://"):
etag = s3_etag(url)
else:
response = requests.head(url, allow_redirects=True)
if response.status_code != 200:
raise IOError("HEAD request failed for url {} with status code {}"
.format(url, response.status_code))
etag = response.headers.get("ETag")
filename = url_to_filename(url, etag)
# get cache path to put the file
cache_path = os.path.join(cache_dir, filename)
if not os.path.exists(cache_path):
# Download to temporary file, then copy to cache dir once finished.
# Otherwise you get corrupt cache entries if the download gets interrupted.
with tempfile.NamedTemporaryFile() as temp_file:
logger.info("%s not found in cache, downloading to %s", url, temp_file.name)
# GET file object
if url.startswith("s3://"):
s3_get(url, temp_file)
else:
http_get(url, temp_file)
# we are copying the file before closing it, so flush to avoid truncation
temp_file.flush()
# shutil.copyfileobj() starts at the current position, so go to the start
temp_file.seek(0)
logger.info("copying %s to cache at %s", temp_file.name, cache_path)
with open(cache_path, 'wb') as cache_file:
shutil.copyfileobj(temp_file, cache_file)
logger.info("creating metadata file for %s", cache_path)
meta = {'url': url, 'etag': etag}
meta_path = cache_path + '.json'
with open(meta_path, 'w', encoding="utf-8") as meta_file:
json.dump(meta, meta_file)
logger.info("removing temp file %s", temp_file.name)
return cache_path
def read_set_from_file(filename):
'''
Extract a de-duped collection (set) of text from a file.
Expected file format is one item per line.
'''
collection = set()
with open(filename, 'r', encoding='utf-8') as file_:
for line in file_:
collection.add(line.rstrip())
return collection
def get_file_extension(path, dot=True, lower=True):
ext = os.path.splitext(path)[1]
ext = ext if dot else ext[1:]
return ext.lower() if lower else ext
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""utils for loading text from disk"""
import os
import mmap
import pickle as pkl
import time
from itertools import accumulate
import torch
from torch.multiprocessing import Lock
def get_lazy_path(path):
"""
Gets directory path where lazy files are stored.
"""
return os.path.splitext(path)[0] + '.lazy'
def exists_lazy(path, data_type='data'):
"""
Check if we've already made a lazy version of this file for the `data_type` field.
"""
if not os.path.exists(get_lazy_path(path)):
return False
contents = os.listdir(get_lazy_path(path))
if data_type not in contents:
return False
if data_type + '.len.pkl' not in contents:
return False
return True
def make_lazy(path, strs, data_type='data'):
"""
Make a lazy version of the `data_type` field of the file. Per-entry byte lengths
are stored in a `.len.pkl` file; cumulative byte offsets are computed at load time.
"""
lazypath = get_lazy_path(path)
if not os.path.exists(lazypath):
os.makedirs(lazypath)
datapath = os.path.join(lazypath, data_type)
lenpath = os.path.join(lazypath, data_type + '.len.pkl')
if not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0:
with open(datapath, 'wb') as f:
str_lens = []
str_cnt = 0
for s in strs:
if isinstance(s, dict):
s = s['text']
encoded = s.encode('utf-8')
f.write(encoded)
str_cnt = len(encoded)
str_lens.append(str_cnt)
pkl.dump(str_lens, open(lenpath, 'wb'))
else:
while not os.path.exists(lenpath):
time.sleep(1)
def split_strings(strings, start, chr_lens):
"""
Split strings based on string lengths and given start.
"""
return [strings[i - start:j - start] for i, j in zip([start] + chr_lens[:-1], chr_lens)]
class ProcessorTokenizer:
"""
callable class that runs preprocessing and tokenization steps on input text.
"""
def __init__(self, tokenizer, process_fn=None):
self.tokenizer = tokenizer
self.process_fn = process_fn
def __call__(self, string):
if self.tokenizer is not None:
string = self.tokenizer(string, process_fn=self.process_fn)
elif self.process_fn is not None:
string = self.process_fn(string)
return string
class lazy_array_loader(object):
"""
Arguments:
path: path to the directory where array entries are concatenated into one big string file
and the `.len.pkl` file is located
data_type (str): Some datasets have multiple fields that are stored in different paths.
`data_type` specifies which of these fields to load in this class
mem_map (boolean): Specifies whether to memory map file `path`
map_fn (callable): Fetched strings are passed through map_fn before being returned.
Example of lazy loader directory structure:
file.json
file.lazy/
data_type1
data_type1.len.pkl
data_type2
data_type2.len.pkl
"""
def __init__(self, path, data_type='data', mem_map=False, map_fn=None):
lazypath = get_lazy_path(path)
datapath = os.path.join(lazypath, data_type)
# get file where array entries are concatenated into one big string
self._file = open(datapath, 'rb', buffering=0)
self.file = self._file
# memory map file if necessary
self.mem_map = mem_map
if self.mem_map:
self.file = mmap.mmap(self.file.fileno(), 0, prot=mmap.PROT_READ)
lenpath = os.path.join(lazypath, data_type + '.len.pkl')
self.lens = pkl.load(open(lenpath, 'rb'))
self.ends = list(accumulate(self.lens))
self.dumb_ends = list(self.ends)
self.read_lock = Lock()
self.process_fn = map_fn
self.map_fn = map_fn
self._tokenizer = None
def SetTokenizer(self, tokenizer):
"""
Logic to set or remove (set to None) the tokenizer;
combines preprocessing and tokenization into one callable.
"""
if tokenizer is None:
if not hasattr(self, '_tokenizer'):
self._tokenizer = tokenizer
else:
self._tokenizer = tokenizer
self.map_fn = ProcessorTokenizer(tokenizer, self.process_fn)
def GetTokenizer(self):
return self._tokenizer
def __getitem__(self, index):
"""
read file and splice strings based on string ending array `self.ends`
"""
if not isinstance(index, slice):
if index == 0:
start = 0
else:
start = self.ends[index - 1]
end = self.ends[index]
rtn = self.file_read(start, end)
if self.map_fn is not None:
return self.map_fn(rtn)
else:
# if slice, fetch strings with 1 diskread and then splice in memory
chr_lens = self.ends[index]
if index.start == 0 or index.start is None:
start = 0
else:
start = self.ends[index.start - 1]
stop = chr_lens[-1]
strings = self.file_read(start, stop)
rtn = split_strings(strings, start, chr_lens)
if self.map_fn is not None:
return self.map_fn([s for s in rtn])
return rtn
def __len__(self):
return len(self.ends)
def file_read(self, start=0, end=None):
"""read specified portion of file"""
# atomic reads to avoid race conditions with multiprocess dataloader
self.read_lock.acquire()
# seek to start of file read
self.file.seek(start)
# read to end of file if no end point provided
if end is None:
rtn = self.file.read()
# else read amount needed to reach end point
else:
rtn = self.file.read(end - start)
self.read_lock.release()
# TODO: @raulp figure out mem map byte string bug
# if mem map'd need to decode byte string to string
rtn = rtn.decode('utf-8', 'ignore')
# rtn = str(rtn)
if self.mem_map:
rtn = rtn.decode('unicode_escape')
return rtn
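# End-to-end sketch using the helpers above (file names are illustrative):
#   docs = ['first document text', 'second document text']
#   make_lazy('corpus.json', docs, data_type='data')   # writes corpus.lazy/data and data.len.pkl
#   ds = lazy_array_loader('corpus.json', data_type='data', mem_map=False)
#   assert len(ds) == 2 and ds[0] == 'first document text'
# Rank 0 writes the lazy files while other ranks block in make_lazy until the
# .len.pkl file appears, so the call is safe from every process in a distributed job.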
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""batch samplers that work with either random or sequential data samplers"""
import math
import os
import sys
import torch
from torch.utils import data
import numpy as np
class RandomSampler(data.sampler.Sampler):
r"""
Based on the PyTorch RandomSampler and DistributedSampler. Essentially a RandomSampler,
but this class lets the user set an epoch like DistributedSampler does.
Samples elements randomly. If without replacement, samples are drawn from a shuffled dataset.
If with replacement, the user can specify ``num_samples`` to draw.
Arguments:
data_source (Dataset): dataset to sample from
num_samples (int): number of samples to draw, default=len(dataset)
replacement (bool): samples are drawn with replacement if ``True``, default=False
"""
def __init__(self, data_source, replacement=False, num_samples=None):
self.data_source = data_source
self.replacement = replacement
self._num_samples = num_samples
self.epoch = -1
if self._num_samples is not None and replacement is False:
raise ValueError("With replacement=False, num_samples should not be specified, "
"since a random permute will be performed.")
if not isinstance(self.num_samples, int) or self.num_samples <= 0:
raise ValueError("num_samples should be a positive integer "
"value, but got num_samples={}".format(self.num_samples))
if not isinstance(self.replacement, bool):
raise ValueError("replacement should be a boolean value, but got "
"replacement={}".format(self.replacement))
@property
def num_samples(self):
# dataset size might change at runtime
if self._num_samples is None:
return len(self.data_source)
return self._num_samples
def __iter__(self):
n = len(self.data_source)
g = torch.Generator()
if self.epoch >= 0:
g.manual_seed(self.epoch)
if self.replacement:
return iter(torch.randint(high=n, size=(self.num_samples,),
dtype=torch.int64, generator=g).tolist())
return iter(torch.randperm(n, generator=g).tolist())
def __len__(self):
return self.num_samples
def set_epoch(self, epoch):
self.epoch = epoch
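# Minimal sketch of the epoch-seeded shuffle (dataset is illustrative):
#   sampler = RandomSampler(list(range(8)))
#   sampler.set_epoch(3); order_a = list(iter(sampler))
#   sampler.set_epoch(3); order_b = list(iter(sampler))
#   assert order_a == order_b   # same epoch => same permutation on every worker
# Leaving epoch at -1 skips manual_seed, so the order then depends on the default
# state of the freshly constructed torch.Generator.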
class DistributedBatchSampler(data.sampler.BatchSampler):
"""
Similar to the normal implementation of a distributed sampler, except the implementation
is at the batch-sampler level instead of the sampler level. This allows wrapping of arbitrary
data samplers (sequential, random, WeightedRandomSampler, etc.) with this batch sampler.
"""
def __init__(self, sampler, batch_size, drop_last, rank=-1, world_size=2, wrap_last=False):
super(DistributedBatchSampler, self).__init__(sampler, batch_size, drop_last)
if rank == -1:
assert False, 'should not be here'
rank = torch.distributed.get_rank()
self.rank = rank
self.world_size = world_size
self.sampler.wrap_around = 0
self.wrap_around = 0
self.wrap_last = wrap_last
self.start_iter = 0
def __iter__(self):
batch = []
last_batch = None
i = 0
for idx in self.data_iterator(self.sampler, wrap_around=False):
batch.append(idx)
if len(batch) == self.batch_size:
tbatch = self._batch(batch)
if i >= self.start_iter:
yield tbatch
self.start_iter = 0
i += 1
last_batch = np.array(list(tbatch))
batch = []
batch_len = len(batch)
if batch_len > 0 and not self.drop_last:
if self.wrap_last:
self.sampler.wrap_around -= (self.batch_size)
self.wrap_around += (len(batch))
self.wrap_around %= self.batch_size
if isinstance(self.sampler, TransposedSampler):
for i, idx in enumerate(self.data_iterator(self.sampler, wrap_around=True)):
if i == 0:
continue
batch.append(idx)
new_batch_len = len(batch)
if len(batch) == self.batch_size:
break
yield self._batch(batch)
if self.wrap_last:
self.sampler.wrap_around += self.batch_size
def data_iterator(self, _iter, wrap_around=False):
"""iterates through data and handles wrap around"""
for i, idx in enumerate(_iter):
if i < self.wrap_around % self.batch_size:
continue
if wrap_around:
self.wrap_around += 1
self.wrap_around %= self.batch_size
yield idx
def _batch(self, batch):
"""extracts samples only pertaining to this worker's batch"""
start = self.rank * self.batch_size // self.world_size
end = (self.rank + 1) * self.batch_size // self.world_size
return batch[start:end]
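# Worked example of the rank split in _batch (numbers are illustrative): with
# batch_size = 8 and world_size = 2, rank 0 receives batch[0:4] and rank 1 receives
# batch[4:8], so each global batch of sampler indices is divided into disjoint,
# equal-sized per-rank micro-batches.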
"""
Usage:
python scripts/presplit_sentences_json.py <original loose json file> <output loose json file>
"""
import sys
import json
import nltk
nltk.download('punkt')
input_file = sys.argv[1]
output_file = sys.argv[2]
line_separator = "\n"
with open(input_file, 'r') as ifile:
with open(output_file, "w") as ofile:
for doc in ifile.readlines():
parsed = json.loads(doc)
sent_list = []
for line in parsed['text'].split('\n'):
if line != '\n':
sent_list.extend(nltk.tokenize.sent_tokenize(line))
parsed['text'] = line_separator.join(sent_list)
ofile.write(json.dumps(parsed) + '\n')
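# Hedged usage sketch (file names are illustrative):
#   python scripts/presplit_sentences_json.py corpus.json corpus_presplit.json
# Each output document keeps its loose-json structure but has its 'text' re-joined as
# newline-separated sentences, so a dataset configured with presplit_sentences=True can
# skip NLTK sentence tokenization at training time.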
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Takes a corpus of files (specified by `--input_files`) with json data separated
by newlines (loose json). Splits the data into train.json, dev.json, and test.json files
under `output_dir`.
Note: this script may overwrite existing files named
train.json, dev.json, or test.json in `--output_dir`.
"""
import os
import argparse
import math
import random
parser = argparse.ArgumentParser('resplit loose json data into train/val/test')
parser.add_argument('--input_files', nargs='+', required=True,
help='whitespace separated list of input data files')
parser.add_argument('--output_dir', required=True,
help='output directory where to put files')
parser.add_argument('--test_percent', type=float, nargs='+', default=[0.05, 0],
help='percentage of available data to use for val/test dataset')
args = parser.parse_args()
def get_lines(filepath):
lines = []
with open(filepath, 'r') as f:
for i, l in enumerate(f.readlines()):
l = l.strip()
lines.append(l)
return lines
def get_splits(lines, line_counts):
all_lines = []
line_idx = []
file_mappings = []
for i, l in enumerate(lines):
all_lines.extend(l)
line_idx.extend(list(range(len(l))))
file_mappings.extend([i] * len(l))
indices = list(range(len(all_lines)))
random.shuffle(indices)
all_lines = [all_lines[idx] for idx in indices]
line_idx = [line_idx[idx] for idx in indices]
file_mappings = [file_mappings[idx] for idx in indices]
splits = []
mappings = []
start = 0
for end in line_counts:
end += start
splits.append(all_lines[start:end])
mappings.append(format_mappings(line_idx[start:end], file_mappings[start:end]))
start = end
return splits, mappings
def format_mappings(line_idx, file_mappings):
lines = []
for m, l in zip(file_mappings, line_idx):
lines.append(str(m).strip() + '\t' + str(l).strip())
return lines
def get_filepaths(filepaths, output_dir):
paths = []
train_path = 'train.json'
dev_path = 'dev.json'
test_path = 'test.json'
paths.append(os.path.join(output_dir, train_path))
paths.append(os.path.join(output_dir, dev_path))
paths.append(os.path.join(output_dir, test_path))
return paths
def write_files(lines, mappings, filepaths):
for l, m, path in zip(lines, mappings, filepaths):
write_file(l, path)
write_mapping_file(m, path)
def write_file(lines, path):
print('Writing:', path)
with open(path, 'w') as f:
for l in lines:
f.write(l + '\n')
def write_mapping_file(m, path):
path = path + '.map'
m = [get_mapping_header()] + m
write_file(m, path)
def get_mapping_header():
return 'file\tline #'
if not os.path.exists(args.output_dir):
os.makedirs(args.output_dir)
lines = []
for filepath in args.input_files:
_lines = get_lines(filepath)
lines.append(_lines)
# calculate number of lines to use for each
line_counts = [len(l) for l in lines]
total_lines = sum(line_counts)
dev_percent = args.test_percent[0]
dev_lines = math.ceil(dev_percent * total_lines)
test_percent = 0
if len(args.test_percent) == 2:
test_percent = args.test_percent[1]
test_lines = math.ceil(test_percent * total_lines)
train_lines = total_lines - (test_lines + dev_lines)
normed_lines = [train_lines, dev_lines, test_lines]
normed_lines = [int(l) for l in normed_lines]
splits, mappings = get_splits(lines, normed_lines)
filepaths = get_filepaths(args.input_files, args.output_dir)
print('Writing output to:', filepaths)
write_files(splits, mappings, filepaths)
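# Hedged usage sketch (file names are illustrative; invoke this script by whatever
# path it has in the repository):
#   python resplit_loose_json.py --input_files corpus_a.json corpus_b.json \
#       --output_dir data --test_percent 0.05 0.01
# With 10,000 total input lines this reserves ceil(0.05 * 10000) = 500 lines for dev
# and ceil(0.01 * 10000) = 100 for test, shuffles all lines globally, and writes
# train.json, dev.json, and test.json plus tab-separated .map files recording each
# line's source file and original line number.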
"""
Takes a corpora of files (specified by `--input_files`) with json data separated
by newlines (loose json). Splits data into train.json, val.json, test.json files
under `output_dir`.
Note: This code has the potential to override files with the names
train.json, val.json, test.json in `--output_dir`.
"""
import os
import argparse
import math
import random
parser = argparse.ArgumentParser('resplit loose json data into train/val/test')
parser.add_argument('--input_files', nargs='+', required=True,
help='whitespace separated list of input data files')
parser.add_argument('--output_dir', required=True,
help='output directory where to put files')
parser.add_argument('--test_percent', type=float, nargs='+', default=[0.05, 0],
help='percentage of available data to use for val/test dataset')
args = parser.parse_args()
def get_lines(filepath):
lines = []
with open(filepath, 'r') as f:
for i, l in enumerate(f.readlines()):
l = l.strip()
lines.append(l)
return lines
def get_splits(lines, line_counts):
all_lines = []
line_idx = []
file_mappings = []
for i, l in enumerate(lines):
all_lines.extend(l)
line_idx.extend(list(range(len(l))))
file_mappings.extend([i] * len(l))
indices = list(range(len(all_lines)))
random.shuffle(indices)
all_lines = [all_lines[idx] for idx in indices]
line_idx = [line_idx[idx] for idx in indices]
file_mappings = [file_mappings[idx] for idx in indices]
splits = []
mappings = []
start = 0
for end in line_counts:
end += start
splits.append(all_lines[start:end])
mappings.append(format_mappings(line_idx[start:end], file_mappings[start:end]))
start = end
return splits, mappings
def format_mappings(line_idx, file_mappings):
lines = []
for m, l in zip(file_mappings, line_idx):
lines.append(str(m).strip() + '\t' + str(l).strip())
return lines
def get_filepaths(filepaths, output_dir):
paths = []
train_path = 'train.json'
dev_path = 'dev.json'
test_path = 'test.json'
paths.append(os.path.join(output_dir, train_path))
paths.append(os.path.join(output_dir, dev_path))
paths.append(os.path.join(output_dir, test_path))
return paths
def write_files(lines, mappings, filepaths):
for l, m, path in zip(lines, mappings, filepaths):
write_file(l, path)
write_mapping_file(m, path)
def write_file(lines, path):
print('Writing:', path)
with open(path, 'w') as f:
for l in lines:
f.write(l + '\n')
def write_mapping_file(m, path):
path = path + '.map'
m = [get_mapping_header()] + m
write_file(m, path)
def get_mapping_header():
return 'file\tline #'
if not os.path.exists(args.output_dir):
os.makedirs(args.output_dir)
lines = []
for filepath in args.input_files:
_lines = get_lines(filepath)
lines.append(_lines)
# calculate number of lines to use for each
line_counts = [len(l) for l in lines]
total_lines = sum(line_counts)
dev_percent = args.test_percent[0]
dev_lines = math.ceil(dev_percent * total_lines)
test_percent = 0
if len(args.test_percent) == 2:
test_percent = args.test_percent[1]
test_lines = math.ceil(test_percent * total_lines)
train_lines = total_lines - (test_lines + dev_lines)
normed_lines = [train_lines, dev_lines, test_lines]
normed_lines = [int(l) for l in normed_lines]
splits, mappings = get_splits(lines, normed_lines)
filepaths = get_filepaths(args.input_files, args.output_dir)
print('Writing output to:', filepaths)
write_files(splits, mappings, filepaths)
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""PyTorch DataLoader for TFRecords"""
import numpy as np
import torch
import queue
import threading
import tensorflow as tf
tf.enable_eager_execution()
class TFRecordDataLoader(object):
def __init__(self, records, batch_size, max_seq_len, max_preds_per_seq,
train, num_workers=2, seed=1, threaded_dl=False):
assert max_preds_per_seq is not None, "--max-preds-per-seq MUST BE SPECIFIED when using tfrecords"
tf.set_random_seed(seed)
if isinstance(records, str):
records = [records]
self.record_converter = Record2Example({"input_ids": tf.FixedLenFeature([max_seq_len], tf.int64),
"input_mask": tf.FixedLenFeature([max_seq_len], tf.int64),
"segment_ids": tf.FixedLenFeature([max_seq_len], tf.int64),
"masked_lm_positions": tf.FixedLenFeature([max_preds_per_seq], tf.int64),
"masked_lm_ids": tf.FixedLenFeature([max_preds_per_seq], tf.int64),
"masked_lm_weights": tf.FixedLenFeature([max_preds_per_seq], tf.float32),
"next_sentence_labels": tf.FixedLenFeature([1], tf.int64)})
# Instantiate dataset according to original BERT implementation
if train:
self.dataset = tf.data.Dataset.from_tensor_slices(tf.constant(records))
self.dataset = self.dataset.repeat()
self.dataset = self.dataset.shuffle(buffer_size=len(records))
# use sloppy tfrecord dataset
self.dataset = self.dataset.apply(
tf.contrib.data.parallel_interleave(
tf.data.TFRecordDataset,
sloppy=train,
cycle_length=min(num_workers, len(records))))
self.dataset = self.dataset.shuffle(buffer_size=100)
else:
self.dataset = tf.data.TFRecordDataset(records)
self.dataset = self.dataset.repeat()
# Instantiate dataloader (do not drop remainder for eval)
loader_args = {'batch_size': batch_size,
'num_parallel_batches': num_workers,
'drop_remainder': train}
self.dataloader = self.dataset.apply(
tf.contrib.data.map_and_batch(
self.record_converter, **loader_args))
self.threaded_dl = threaded_dl
self.num_workers = num_workers
def __iter__(self):
if self.threaded_dl:
data_iter = iter(MultiprocessLoader(self.dataloader, self.num_workers))
for item in data_iter:
yield item
else:
data_iter = iter(self.dataloader)
for item in data_iter:
yield convert_tf_example_to_torch_tensors(item)
class Record2Example(object):
def __init__(self, feature_map):
self.feature_map = feature_map
def __call__(self, record):
"""Decodes a BERT TF record to a TF example."""
example = tf.parse_single_example(record, self.feature_map)
for k, v in list(example.items()):
if v.dtype == tf.int64:
example[k] = tf.to_int32(v)
return example
def convert_tf_example_to_torch_tensors(example):
item = {k: (v.numpy()) for k, v in example.items()}
mask = np.zeros_like(item['input_ids'])
mask_labels = np.ones_like(item['input_ids']) * -1
for b, row in enumerate(item['masked_lm_positions'].astype(int)):
for i, idx in enumerate(row):
if item['masked_lm_weights'][b, i] != 0:
mask[b, idx] = 1
mask_labels[b, idx] = item['masked_lm_ids'][b, i]
output = {'text': item['input_ids'], 'types': item['segment_ids'], 'is_random': item['next_sentence_labels'],
'pad_mask': 1 - item['input_mask'], 'mask': mask, 'mask_labels': mask_labels}
return {k: torch.from_numpy(v) for k, v in output.items()}
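# Worked example of the conversion above (values are illustrative): if one row has
# masked_lm_positions = [2, 5, 0], masked_lm_ids = [1037, 2003, 0], and
# masked_lm_weights = [1.0, 1.0, 0.0], then mask gets 1 at columns 2 and 5,
# mask_labels gets 1037 and 2003 there and stays -1 elsewhere, and the zero-weight
# padding slot is skipped. pad_mask is flipped (1 - input_mask) so that 1 marks
# padding positions, matching the convention of the json/lazy BERT dataset above.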
class MultiprocessLoader(object):
def __init__(self, dataloader, num_workers=2):
self.dl = dataloader
self.queue_size = 2 * num_workers
def __iter__(self):
output_queue = queue.Queue(self.queue_size)
output_thread = threading.Thread(target=_multiproc_iter,
args=(self.dl, output_queue))
output_thread.daemon = True
output_thread.start()
while output_thread.is_alive():
yield output_queue.get(block=True)
else:
print(RuntimeError('TF record data loader thread exited unexpectedly'))
def _multiproc_iter(dl, output_queue):
data_iter = iter(dl)
for item in data_iter:
tensors = convert_tf_example_to_torch_tensors(item)
output_queue.put(tensors, block=True)