import itertools

import numpy as np
import spacy

from megatron import get_tokenizer
from megatron.data.bert_dataset import BertDataset, get_samples_mapping_
from megatron.data.dataset_utils import create_masked_lm_predictions, pad_and_convert_to_numpy

# spaCy NER model used by spacy_ner() below. The load is commented out so the
# module imports cleanly when the en_core_web_lg model is not installed;
# uncomment it before calling spacy_ner().
# qa_nlp = spacy.load('en_core_web_lg')
qa_nlp = None

class RealmDataset(BertDataset):
    """Dataset containing simple masked sentences for masked language modeling.

    The dataset should yield sentences just like the regular BertDataset
    However, this dataset also needs to be able to return a set of blocks
    given their start and end indices.

    Presumably

    """
    def __init__(self, name, indexed_dataset, data_prefix,
                 num_epochs, max_num_samples, masked_lm_prob,
                 max_seq_length, short_seq_prob, seed):
        super(RealmDataset, self).__init__(name, indexed_dataset, data_prefix,
                                           num_epochs, max_num_samples, masked_lm_prob,
                                           max_seq_length, short_seq_prob, seed)
        self.build_sample_fn = build_simple_training_sample
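        # Assumption (not shown in this file): BertDataset.__getitem__ delegates
        # per-sample construction to self.build_sample_fn, so overriding it here
        # swaps in the single-segment sample builder below while reusing the
        # parent's indexing and shuffling logic.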


def build_simple_training_sample(sample, target_seq_length, max_seq_length,
                                 vocab_id_list, vocab_id_to_token_dict,
                                 cls_id, sep_id, mask_id, pad_id,
                                 masked_lm_prob, np_rng):
    """Build a single-segment masked-LM training sample (no sentence pair)."""

    # Flatten the sentences into one token stream, leaving room for [CLS]/[SEP].
    tokens = list(itertools.chain(*sample))[:max_seq_length - 2]
    tokens, tokentypes = create_single_tokens_and_tokentypes(tokens, cls_id, sep_id)

    # Mask out a masked_lm_prob fraction of the sequence.
    max_predictions_per_seq = masked_lm_prob * max_seq_length
    (tokens, masked_positions, masked_labels, _) = create_masked_lm_predictions(
        tokens, vocab_id_list, vocab_id_to_token_dict, masked_lm_prob,
        cls_id, sep_id, mask_id, max_predictions_per_seq, np_rng)

    # Pad everything out to max_seq_length and convert to numpy arrays.
    tokens_np, tokentypes_np, labels_np, padding_mask_np, loss_mask_np \
        = pad_and_convert_to_numpy(tokens, tokentypes, masked_positions,
                                   masked_labels, pad_id, max_seq_length)

    train_sample = {
        'tokens': tokens_np,
        'labels': labels_np,
        'loss_mask': loss_mask_np,
        'pad_mask': padding_mask_np
    }
    return train_sample
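# Rough sketch of the output, under assumed Megatron conventions: every array
# in the returned dict has length max_seq_length. E.g. for max_seq_length=8
# and sample=[[7, 8], [9]]:
#   tokens:    [cls_id, 7, 8, 9, sep_id, pad_id, pad_id, pad_id], with a few
#              positions replaced by mask_id (or a random/unchanged token)
#   labels:    original ids at masked positions, -1 elsewhere (assumed)
#   loss_mask: 1 at masked positions, 0 elsewhere
#   pad_mask:  1 over real tokens, 0 over padding (assumed convention)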

def create_single_tokens_and_tokentypes(_tokens, cls_id, sep_id):
    """Wrap a single segment as [CLS] tokens [SEP]; all tokentypes are zero."""
    tokens = []
    tokens.append(cls_id)
    tokens.extend(list(_tokens))
    tokens.append(sep_id)
    tokentypes = [0] * len(tokens)
    return tokens, tokentypes
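# For example, with the standard BERT-uncased special ids (101 = [CLS],
# 102 = [SEP]):
#   create_single_tokens_and_tokentypes([2023, 2003], 101, 102)
#   -> ([101, 2023, 2003, 102], [0, 0, 0, 0])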


def spacy_ner(block_text):
    """Extract named-entity answer candidates and their character offsets.

    Requires qa_nlp to be loaded (see the commented-out spacy.load above).
    """
    candidates = {}
    block = qa_nlp(block_text)
    starts = []
    answers = []
    for ent in block.ents:
        starts.append(int(ent.start_char))
        answers.append(str(ent.text))
    candidates['starts'] = starts
    candidates['answers'] = answers
    return candidates
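# Example, assuming the spacy.load('en_core_web_lg') line above is enabled
# (entity spans depend on the model, so exact output may vary):
#   spacy_ner("Barack Obama was born in Hawaii")
#   -> {'starts': [0, 25], 'answers': ['Barack Obama', 'Hawaii']}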