import itertools

import numpy as np
import spacy

from megatron import get_tokenizer
from megatron.data.bert_dataset import BertDataset, get_samples_mapping_
from megatron.data.dataset_utils import create_masked_lm_predictions, pad_and_convert_to_numpy

# qa_nlp must be assigned a loaded spaCy model before spacy_ner() below is
# called, e.g. by uncommenting the following line.
# qa_nlp = spacy.load('en_core_web_lg')
qa_nlp = None

class RealmDataset(BertDataset):
    """Dataset containing simple masked sentences for masked language modeling.

    The dataset should yield sentences just like the regular BertDataset
    However, this dataset also needs to be able to return a set of blocks
    given their start and end indices.

    Presumably

    """
    def __init__(self, name, indexed_dataset, data_prefix,
                 num_epochs, max_num_samples, masked_lm_prob,
                 max_seq_length, short_seq_prob, seed):
        super(RealmDataset, self).__init__(name, indexed_dataset, data_prefix,
                                           num_epochs, max_num_samples, masked_lm_prob,
                                           max_seq_length, short_seq_prob, seed)
        # Swap in the simpler single-segment sample builder defined below;
        # build_sample_fn is presumably consulted by BertDataset when it
        # builds each training sample.
        self.build_sample_fn = build_simple_training_sample
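
# A hypothetical construction sketch; indexed_dataset and data_prefix are
# placeholders for values produced by Megatron's usual data pipeline:
#
#   dataset = RealmDataset('realm', indexed_dataset, data_prefix,
#                          num_epochs=1, max_num_samples=None,
#                          masked_lm_prob=0.15, max_seq_length=512,
#                          short_seq_prob=0.1, seed=1234)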


def build_simple_training_sample(sample, target_seq_length, max_seq_length,
                                 vocab_id_list, vocab_id_to_token_dict,
                                 cls_id, sep_id, mask_id, pad_id,
                                 masked_lm_prob, np_rng):

    # Flatten the list of sentences into one stream of token ids, truncating
    # to leave room for [CLS] and [SEP].
    tokens = list(itertools.chain(*sample))[:max_seq_length - 2]
    tokens, tokentypes = create_single_tokens_and_tokentypes(tokens, cls_id, sep_id)

    max_predictions_per_seq = masked_lm_prob * max_seq_length
    (tokens, masked_positions, masked_labels, _) = create_masked_lm_predictions(
        tokens, vocab_id_list, vocab_id_to_token_dict, masked_lm_prob,
        cls_id, sep_id, mask_id, max_predictions_per_seq, np_rng)

    tokens_np, tokentypes_np, labels_np, padding_mask_np, loss_mask_np \
        = pad_and_convert_to_numpy(tokens, tokentypes, masked_positions,
                                   masked_labels, pad_id, max_seq_length)
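    # All five arrays come back padded to max_seq_length; loss_mask_np is
    # nonzero only at the masked positions, so the LM loss covers just them.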

    # The true REALM sequence length is twice as long, but none of the extra
    # tokens are to be predicted with the LM head.
    # loss_mask_np = np.concatenate((loss_mask_np, np.ones(loss_mask_np.shape)), -1).astype(np.int64)

    train_sample = {
        'tokens': tokens_np,
        'labels': labels_np,
        'loss_mask': loss_mask_np,
        'pad_mask': padding_mask_np
    }
    return train_sample
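
# A minimal sketch of invoking the sample builder directly; the vocab ids and
# special-token ids below are hypothetical stand-ins for a real tokenizer's.
def _demo_build_simple_training_sample():
    vocab_id_list = list(range(5, 25))
    vocab_id_to_token_dict = {i: 'tok{}'.format(i) for i in vocab_id_list}
    sample = [[7, 8, 9], [10, 11, 12]]  # two "sentences" of token ids
    np_rng = np.random.RandomState(1234)
    return build_simple_training_sample(
        sample, target_seq_length=10, max_seq_length=16,
        vocab_id_list=vocab_id_list,
        vocab_id_to_token_dict=vocab_id_to_token_dict,
        cls_id=1, sep_id=2, mask_id=3, pad_id=0,
        masked_lm_prob=0.15, np_rng=np_rng)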


def create_single_tokens_and_tokentypes(_tokens, cls_id, sep_id):
    """Wrap a single token sequence as [CLS] tokens [SEP], with all-zero tokentypes."""
    tokens = [cls_id] + list(_tokens) + [sep_id]
    tokentypes = [0] * len(tokens)
    return tokens, tokentypes
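
# For example (ids are illustrative):
#   create_single_tokens_and_tokentypes([7, 8, 9], cls_id=101, sep_id=102)
#   -> ([101, 7, 8, 9, 102], [0, 0, 0, 0, 0])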


def spacy_ner(block_text):
    """Extract answer candidates (named entity strings and their character
    offsets) from a block of text using the module-level spaCy model."""
    block = qa_nlp(block_text)
    starts = []
    answers = []
    for ent in block.ents:
        starts.append(int(ent.start_char))
        answers.append(str(ent.text))
    candidates = {'starts': starts, 'answers': answers}
    return candidates
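
# A minimal usage sketch (assumes the en_core_web_lg model is installed):
#
#   qa_nlp = spacy.load('en_core_web_lg')
#   candidates = spacy_ner('Abraham Lincoln was born in Kentucky.')
#   candidates['answers']  # entity strings, e.g. ['Abraham Lincoln', 'Kentucky']
#   candidates['starts']   # character offset of each entity in block_text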