Commit cf0100cf authored by Neel Kant

Restructure BertDataset to help with RealmDataset

parent f332d7e1
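The core of the restructure: BertDataset now stores its sample builder as self.build_sample_fn instead of calling build_training_sample directly in __getitem__, so RealmDataset can inherit all of the setup and indexing logic and only swap the builder. A minimal, self-contained sketch of that pattern (toy class and function names, not the actual Megatron code):

import numpy as np
from torch.utils.data import Dataset


def build_full_sample(sample, np_rng):
    # Stand-in for the two-segment BERT sample construction.
    return {'text': np.asarray(sample), 'builder': 'bert'}


def build_simple_sample(sample, np_rng):
    # Stand-in for the single-segment REALM sample construction.
    return {'text': np.asarray(sample), 'builder': 'realm'}


class ToyBertDataset(Dataset):
    def __init__(self, data, seed):
        self.data = data
        self.seed = seed
        # The hook introduced by this commit: subclasses overwrite this attribute.
        self.build_sample_fn = build_full_sample

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        # Per-sample numpy RNG, seeded the same way as in the diff.
        np_rng = np.random.RandomState(seed=(self.seed + idx))
        return self.build_sample_fn(self.data[idx], np_rng)


class ToyRealmDataset(ToyBertDataset):
    def __init__(self, data, seed):
        super().__init__(data, seed)
        self.build_sample_fn = build_simple_sample  # only the builder changes

Indexing ToyRealmDataset([[1, 2], [3, 4]], seed=0)[1] then dispatches to build_simple_sample while reusing the inherited __getitem__, which is the shape of the RealmDataset change below.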
@@ -152,6 +152,7 @@ class BertDataset(Dataset):
         self.sep_id = tokenizer.sep
         self.mask_id = tokenizer.mask
         self.pad_id = tokenizer.pad
+        self.build_sample_fn = build_training_sample
 
     def __len__(self):
@@ -159,21 +160,18 @@ class BertDataset(Dataset):
     def __getitem__(self, idx):
-        start_index, end_index, seq_length = self.samples_mapping[idx]
-        sample = []
-        for index in range(start_index, end_index):
-            sample.append(self.indexed_dataset[index])
+        start_idx, end_idx, seq_length = self.samples_mapping[idx]
+        sample = [self.indexed_dataset[i] for i in range(start_idx, end_idx)]
         # Note that this rng state should be numpy and not python since
         # python randint is inclusive whereas the numpy one is exclusive.
         np_rng = np.random.RandomState(seed=(self.seed + idx))
-        return build_training_sample(sample, seq_length,
+        return self.build_sample_fn(sample, seq_length,
                                      self.max_seq_length,  # needed for padding
                                      self.vocab_id_list,
                                      self.vocab_id_to_token_dict,
                                      self.cls_id, self.sep_id,
                                      self.mask_id, self.pad_id,
                                      self.masked_lm_prob, np_rng)
 
 
 def get_indexed_dataset_(data_prefix, data_impl, skip_warmup):
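The comment kept in __getitem__ about using a numpy rather than a Python RNG refers to the different upper-bound conventions of the two randint APIs; a quick standalone check (standard-library behavior, not code from this commit):

import random

import numpy as np

# Python's random.randint(a, b) includes both endpoints ...
assert set(random.randint(0, 1) for _ in range(100)) <= {0, 1}

# ... while numpy's RandomState.randint(low, high) excludes high,
# so high=1 can only ever return 0.
np_rng = np.random.RandomState(seed=0)
assert all(np_rng.randint(0, 1, size=100) == 0)
assert set(np_rng.randint(0, 2, size=100).tolist()) <= {0, 1}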
@@ -82,33 +82,6 @@ def build_training_sample(sample,
     return train_sample
 
-
-def build_simple_training_sample(sample, target_seq_length, max_seq_length,
-                                 vocab_id_list, vocab_id_to_token_dict,
-                                 cls_id, sep_id, mask_id, pad_id,
-                                 masked_lm_prob, np_rng):
-    tokens = list(itertools.chain(*sample))[:max_seq_length - 2]
-    tokens, tokentypes = create_single_tokens_and_tokentypes(tokens)
-    max_predictions_per_seq = masked_lm_prob * max_seq_length
-    (tokens, masked_positions, masked_labels, _) = create_masked_lm_predictions(
-        tokens, vocab_id_list, vocab_id_to_token_dict, masked_lm_prob,
-        cls_id, sep_id, mask_id, max_predictions_per_seq, np_rng)
-    tokens_np, tokentypes_np, labels_np, padding_mask_np, loss_mask_np \
-        = pad_and_convert_to_numpy(tokens, tokentypes, masked_positions,
-                                   masked_labels, pad_id, max_seq_length)
-
-    train_sample = {
-        'text': tokens_np,
-        'types': tokentypes_np,
-        'labels': labels_np,
-        'loss_mask': loss_mask_np,
-        'padding_mask': padding_mask_np}
-    return train_sample
-
 
 def get_a_and_b_segments(sample, np_rng):
     """Divide sample into a and b segments."""
@@ -188,15 +161,6 @@ def create_tokens_and_tokentypes(tokens_a, tokens_b, cls_id, sep_id):
     return tokens, tokentypes
 
-
-def create_single_tokens_and_tokentypes(_tokens, cls_id, sep_id):
-    tokens = []
-    tokens.append(cls_id)
-    tokens.extend(list(_tokens))
-    tokens.append(sep_id)
-    tokentypes = [0] * len(tokens)
-    return tokens, tokentypes
-
 
 MaskedLmInstance = collections.namedtuple("MaskedLmInstance",
                                           ["index", "label"])
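build_simple_training_sample and create_single_tokens_and_tokentypes are not dropped; they reappear in the REALM dataset file below, with the helper call now receiving the cls_id and sep_id arguments it was missing here. For reference, the single-segment helper just brackets the tokens with [CLS] and [SEP] and assigns token type 0 everywhere; a small usage check (the token ids are made up):

def create_single_tokens_and_tokentypes(_tokens, cls_id, sep_id):
    tokens = []
    tokens.append(cls_id)
    tokens.extend(list(_tokens))
    tokens.append(sep_id)
    tokentypes = [0] * len(tokens)
    return tokens, tokentypes


# Hypothetical ids: 101 for [CLS], 102 for [SEP].
tokens, tokentypes = create_single_tokens_and_tokentypes([7, 8, 9], 101, 102)
assert tokens == [101, 7, 8, 9, 102]
assert tokentypes == [0, 0, 0, 0, 0]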
+import itertools
+
 import numpy as np
 import spacy
-from torch.utils.data import Dataset
 
 from megatron import get_tokenizer
-from megatron.data.bert_dataset import get_samples_mapping_
-from megatron.data.dataset_utils import build_simple_training_sample
+from megatron.data.bert_dataset import BertDataset, get_samples_mapping_
+from megatron.data.dataset_utils import create_masked_lm_predictions, pad_and_convert_to_numpy
 
 qa_nlp = spacy.load('en_core_web_lg')
 
 
-class RealmDataset(Dataset):
+class RealmDataset(BertDataset):
     """Dataset containing simple masked sentences for masked language modeling.
 
     The dataset should yield sentences just like the regular BertDataset
@@ -21,52 +23,48 @@ class RealmDataset(Dataset):
     def __init__(self, name, indexed_dataset, data_prefix,
                  num_epochs, max_num_samples, masked_lm_prob,
                  max_seq_length, short_seq_prob, seed):
-
-        # Params to store.
-        self.name = name
-        self.seed = seed
-        self.masked_lm_prob = masked_lm_prob
-        self.max_seq_length = max_seq_length
-
-        # Dataset.
-        self.indexed_dataset = indexed_dataset
-
-        # Build the samples mapping.
-        self.samples_mapping = get_samples_mapping_(self.indexed_dataset,
-                                                    data_prefix,
-                                                    num_epochs,
-                                                    max_num_samples,
-                                                    self.max_seq_length,
-                                                    short_seq_prob,
-                                                    self.seed,
-                                                    self.name)
-
-        # Vocab stuff.
-        tokenizer = get_tokenizer()
-        self.vocab_id_list = list(tokenizer.inv_vocab.keys())
-        self.vocab_id_to_token_dict = tokenizer.inv_vocab
-        self.cls_id = tokenizer.cls
-        self.sep_id = tokenizer.sep
-        self.mask_id = tokenizer.mask
-        self.pad_id = tokenizer.pad
-
-    def __len__(self):
-        return self.samples_mapping.shape[0]
-
-    def __getitem__(self, idx):
-        start_idx, end_idx, seq_length = self.samples_mapping[idx]
-        sample = [self.indexed_dataset[i] for i in range(start_idx, end_idx)]
-        # Note that this rng state should be numpy and not python since
-        # python randint is inclusive whereas the numpy one is exclusive.
-        np_rng = np.random.RandomState(seed=(self.seed + idx))
-        return build_simple_training_sample(sample, seq_length,
-                                            self.max_seq_length,  # needed for padding
-                                            self.vocab_id_list,
-                                            self.vocab_id_to_token_dict,
-                                            self.cls_id, self.sep_id,
-                                            self.mask_id, self.pad_id,
-                                            self.masked_lm_prob, np_rng)
+        super(RealmDataset, self).__init__(name, indexed_dataset, data_prefix,
+                                           num_epochs, max_num_samples, masked_lm_prob,
+                                           max_seq_length, short_seq_prob, seed)
+        self.build_sample_fn = build_simple_training_sample
+
+
+def build_simple_training_sample(sample, target_seq_length, max_seq_length,
+                                 vocab_id_list, vocab_id_to_token_dict,
+                                 cls_id, sep_id, mask_id, pad_id,
+                                 masked_lm_prob, np_rng):
+    tokens = list(itertools.chain(*sample))[:max_seq_length - 2]
+    tokens, tokentypes = create_single_tokens_and_tokentypes(tokens, cls_id, sep_id)
+    max_predictions_per_seq = masked_lm_prob * max_seq_length
+    (tokens, masked_positions, masked_labels, _) = create_masked_lm_predictions(
+        tokens, vocab_id_list, vocab_id_to_token_dict, masked_lm_prob,
+        cls_id, sep_id, mask_id, max_predictions_per_seq, np_rng)
+    tokens_np, tokentypes_np, labels_np, padding_mask_np, loss_mask_np \
+        = pad_and_convert_to_numpy(tokens, tokentypes, masked_positions,
+                                   masked_labels, pad_id, max_seq_length)
+
+    # REALM true sequence length is twice as long but none of that is to be predicted with LM
+    loss_mask_np = np.concatenate((loss_mask_np, np.ones(loss_mask_np.shape)), -1)
+    train_sample = {
+        'tokens': tokens_np,
+        'labels': labels_np,
+        'loss_mask': loss_mask_np,
+        'pad_mask': padding_mask_np
+    }
+    return train_sample
+
+
+def create_single_tokens_and_tokentypes(_tokens, cls_id, sep_id):
+    tokens = []
+    tokens.append(cls_id)
+    tokens.extend(list(_tokens))
+    tokens.append(sep_id)
+    tokentypes = [0] * len(tokens)
+    return tokens, tokentypes
 
 
 def spacy_ner(block_text):
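The one behavioral change in the relocated build_simple_training_sample is the loss-mask extension flagged by the comment in the diff: the stored mask is doubled to REALM's full sequence length (the query plus, presumably, a retrieved block of equal length). A small numpy illustration of the shapes involved (toy values, not real data):

import numpy as np

max_seq_length = 8
# Toy mask over the query tokens (values made up).
loss_mask_np = np.array([0., 1., 0., 0., 1., 0., 0., 0.])

# Same concatenation as in build_simple_training_sample above.
loss_mask_np = np.concatenate((loss_mask_np, np.ones(loss_mask_np.shape)), -1)

assert loss_mask_np.shape == (2 * max_seq_length,)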