import itertools
import random

import numpy as np
from torch.utils.data import Dataset

from megatron import get_tokenizer
from megatron.data.realm_dataset_utils import BlockSampleData, get_block_samples_mapping, join_str_list


class ICTDataset(Dataset):
    """Dataset containing sentences and their blocks for an inverse cloze task."""
    def __init__(self, name, block_dataset, title_dataset, data_prefix,
                 num_epochs, max_num_samples, max_seq_length, query_in_block_prob,
                 short_seq_prob, seed, use_titles=True, use_one_sent_docs=False):
        self.name = name
        self.seed = seed
        self.max_seq_length = max_seq_length
        self.query_in_block_prob = query_in_block_prob
        self.block_dataset = block_dataset
        self.title_dataset = title_dataset
        self.short_seq_prob = short_seq_prob
        self.rng = random.Random(self.seed)
        self.use_titles = use_titles
        self.use_one_sent_docs = use_one_sent_docs

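        # Build (or load from cache) the mapping from sample index to
        # (start_idx, end_idx, doc_idx, block_idx) tuples over the block data.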
        self.samples_mapping = get_block_samples_mapping(
            block_dataset, title_dataset, data_prefix, num_epochs,
            max_num_samples, max_seq_length, seed, name, use_one_sent_docs)

        self.tokenizer = get_tokenizer()
        self.vocab_id_list = list(self.tokenizer.inv_vocab.keys())
        self.vocab_id_to_token_list = self.tokenizer.inv_vocab
        self.cls_id = self.tokenizer.cls
        self.sep_id = self.tokenizer.sep
        self.mask_id = self.tokenizer.mask
        self.pad_id = self.tokenizer.pad

    def __len__(self):
        return self.samples_mapping.shape[0]

    def __getitem__(self, idx):
        """Get an ICT example of a pseudo-query and the block of text from which it was extracted"""
        sample_data = self.samples_mapping[idx]
        start_idx, end_idx, doc_idx, block_idx = sample_data.as_tuple()

        if self.use_titles:
            title = self.title_dataset[int(doc_idx)]
            title_pad_offset = 3 + len(title)
        else:
            title = None
            title_pad_offset = 2
        block = [self.block_dataset[i] for i in range(start_idx, end_idx)]
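        # A one-sentence block cannot be split into a disjoint query and
        # context, so it is only valid if the query always stays in the block
        # (query_in_block_prob == 1) or one-sentence docs are explicitly allowed.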
        assert len(block) > 1 or self.use_one_sent_docs or self.query_in_block_prob == 1

        # randint() is inclusive on both ends for Python's random module
        rand_sent_idx = self.rng.randint(0, len(block) - 1)

        # keep the query inside its block query_in_block_prob fraction of the time.
        if self.rng.random() < self.query_in_block_prob:
            query = block[rand_sent_idx].copy()
        else:
            query = block.pop(rand_sent_idx)

        # truncation is still needed: a block is only closed once the running
        # total of sentence lengths has exceeded max_seq_length, so it can overshoot.
        query = query[:self.max_seq_length - 2]
        block = list(itertools.chain(*block))[:self.max_seq_length - title_pad_offset]

        query_tokens, query_pad_mask = self.concat_and_pad_tokens(query)
        block_tokens, block_pad_mask = self.concat_and_pad_tokens(block, title)
        block_data = sample_data.as_array()

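        # Illustrative layout (hypothetical ids, assuming max_seq_length == 8):
        #   query_tokens:   [CLS, q1, q2, SEP, PAD, PAD, PAD, PAD]
        #   query_pad_mask: [1,   1,  1,  1,   0,   0,   0,   0]
        #   block_tokens:   [CLS, ttl, SEP, b1, b2, SEP, PAD, PAD]
        #   block_pad_mask: [1,   1,   1,   1,  1,  1,   0,   0]
        #   block_data:     array of (start_idx, end_idx, doc_idx, block_idx)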
        sample = {
            'query_tokens': query_tokens,
            'query_pad_mask': query_pad_mask,
            'block_tokens': block_tokens,
            'block_pad_mask': block_pad_mask,
            'block_data': block_data,
        }

        return sample

    def get_block(self, start_idx, end_idx, doc_idx):
        """Get the IDs for an evidence block plus the title of the corresponding document"""
        block = [self.block_dataset[i] for i in range(start_idx, end_idx)]
        title = self.title_dataset[int(doc_idx)]

        block = list(itertools.chain(*block))[:self.max_seq_length - (3 + len(title))]
        block_tokens, block_pad_mask = self.concat_and_pad_tokens(block, title)

        return block_tokens, block_pad_mask

    def get_null_block(self):
        """Get empty block and title - used in REALM pretraining"""
        block, title = [], []
        block_tokens, block_pad_mask = self.concat_and_pad_tokens(block, title)
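        # With empty block and title this reduces to [CLS, SEP, SEP] followed
        # by (max_seq_length - 3) PAD ids, with a pad mask of three 1s then 0s.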

        return block_tokens, block_pad_mask

    def concat_and_pad_tokens(self, tokens, title=None):
        """Concat with special tokens and pad sequence to self.max_seq_length"""
        tokens = list(tokens)
        if title is None:
            tokens = [self.cls_id] + tokens + [self.sep_id]
        else:
            title = list(title)
            tokens = [self.cls_id] + title + [self.sep_id] + tokens + [self.sep_id]
        assert len(tokens) <= self.max_seq_length

        num_pad = self.max_seq_length - len(tokens)
        pad_mask = [1] * len(tokens) + [0] * num_pad
        tokens += [self.pad_id] * num_pad

        return np.array(tokens), np.array(pad_mask)
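

# Minimal usage sketch (illustrative only). It assumes `block_dataset` and
# `title_dataset` are Megatron indexed datasets and that megatron's global
# state has been initialized so get_tokenizer() works; every argument value
# below is a placeholder:
#
#     dataset = ICTDataset(
#         name='ict', block_dataset=block_dataset, title_dataset=title_dataset,
#         data_prefix='wiki_text_sentence', num_epochs=1,
#         max_num_samples=1000000, max_seq_length=288,
#         query_in_block_prob=0.1, short_seq_prob=0.1, seed=1234)
#     sample = dataset[0]
#     assert sample['query_tokens'].shape == (288,)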