# test_retrieval_realm.py
# coding=utf-8
# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import shutil
import tempfile
from unittest import TestCase
from unittest.mock import patch

import numpy as np
from datasets import Dataset

from transformers.models.realm.configuration_realm import RealmConfig
from transformers.models.realm.retrieval_realm import _REALM_BLOCK_RECORDS_FILENAME, RealmRetriever
from transformers.models.realm.tokenization_realm import VOCAB_FILES_NAMES, RealmTokenizer


class RealmRetrieverTest(TestCase):
    """Tests for `RealmRetriever` built on a throwaway tokenizer and in-memory block records."""

    def setUp(self):
        """Create a temp directory holding a small tokenizer vocab and an empty block-records folder."""
        self.tmpdirname = tempfile.mkdtemp()
        self.num_block_records = 5

        # Vocabulary for the throwaway Realm tokenizer, written one token per line.
        vocab_tokens = [
            "[UNK]",
            "[CLS]",
            "[SEP]",
            "[PAD]",
            "[MASK]",
            "test",
            "question",
            "this",
            "is",
            "the",
            "first",
            "second",
            "third",
            "fourth",
            "fifth",
            "record",
            "want",
            "##want",
            "##ed",
            "wa",
            "un",
            "runn",
            "##ing",
            ",",
            "low",
            "lowest",
        ]
        realm_tokenizer_path = os.path.join(self.tmpdirname, "realm_tokenizer")
        os.makedirs(realm_tokenizer_path, exist_ok=True)
        self.vocab_file = os.path.join(realm_tokenizer_path, VOCAB_FILES_NAMES["vocab_file"])
        with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer:
            vocab_writer.write("".join(f"{token}\n" for token in vocab_tokens))

        # Target directory used by test_save_load_pretrained.
        realm_block_records_path = os.path.join(self.tmpdirname, "realm_block_records")
        os.makedirs(realm_block_records_path, exist_ok=True)

    def get_tokenizer(self) -> RealmTokenizer:
        """Load the tokenizer from the vocab directory written in `setUp`."""
        return RealmTokenizer.from_pretrained(os.path.join(self.tmpdirname, "realm_tokenizer"))

    def tearDown(self):
        """Remove the temp directory created in `setUp`."""
        shutil.rmtree(self.tmpdirname)

    def get_config(self):
        """Return a `RealmConfig` with `num_block_records` set from the fixture."""
        return RealmConfig(num_block_records=self.num_block_records)

    def get_dummy_dataset(self):
        """Return a two-row `Dataset` with `id`, `question` and `answers` columns."""
        return Dataset.from_dict(
            {
                "id": ["0", "1"],
                "question": ["foo", "bar"],
                "answers": [["Foo", "Bar"], ["Bar"]],
            }
        )

    def get_dummy_block_records(self):
        """Return the dummy block records as an object-dtype array of byte strings."""
        # NOTE(review): six records are returned here while setUp sets
        # num_block_records = 5 -- confirm the mismatch is intentional.
        return np.array(
            [
                b"This is the first record",
                b"This is the second record",
                b"This is the third record",
                b"This is the fourth record",
                b"This is the fifth record",
                b"This is a longer longer longer record",
            ],
            dtype=object,
        )

    def get_dummy_retriever(self):
        """Build a `RealmRetriever` from the dummy block records and the test tokenizer."""
        return RealmRetriever(
            block_records=self.get_dummy_block_records(),
            tokenizer=self.get_tokenizer(),
        )

    def test_retrieve(self):
        """Retrieving blocks 0 and 3 yields two question+block sequences of length 10."""
        config = self.get_config()
        retriever = self.get_dummy_retriever()
        tokenizer = retriever.tokenizer

        retrieved_block_ids = np.array([0, 3], dtype="long")
        question_input_ids = tokenizer(["Test question"]).input_ids
        answer_ids = tokenizer(
            ["the fourth"],
            add_special_tokens=False,
            return_token_type_ids=False,
            return_attention_mask=False,
        ).input_ids
        max_length = config.reader_seq_len

        has_answers, start_pos, end_pos, concat_inputs = retriever(
            retrieved_block_ids, question_input_ids, answer_ids=answer_ids, max_length=max_length, return_tensors="np"
        )

        self.assertEqual(len(has_answers), 2)
        self.assertEqual(len(start_pos), 2)
        self.assertEqual(len(end_pos), 2)
        self.assertEqual(concat_inputs.input_ids.shape, (2, 10))
        self.assertEqual(concat_inputs.attention_mask.shape, (2, 10))
        self.assertEqual(concat_inputs.token_type_ids.shape, (2, 10))
        self.assertEqual(concat_inputs.special_tokens_mask.shape, (2, 10))
        self.assertEqual(
            tokenizer.convert_ids_to_tokens(concat_inputs.input_ids[0]),
            ["[CLS]", "test", "question", "[SEP]", "this", "is", "the", "first", "record", "[SEP]"],
        )
        self.assertEqual(
            tokenizer.convert_ids_to_tokens(concat_inputs.input_ids[1]),
            ["[CLS]", "test", "question", "[SEP]", "this", "is", "the", "fourth", "record", "[SEP]"],
        )

    def test_block_has_answer(self):
        """Check the answer flags and start/end positions returned for blocks 0, 3 and 5."""
        config = self.get_config()
        retriever = self.get_dummy_retriever()
        tokenizer = retriever.tokenizer

        retrieved_block_ids = np.array([0, 3, 5], dtype="long")
        question_input_ids = tokenizer(["Test question"]).input_ids
        answer_ids = tokenizer(
            ["the fourth", "longer longer"],
            add_special_tokens=False,
            return_token_type_ids=False,
            return_attention_mask=False,
        ).input_ids
        max_length = config.reader_seq_len

        has_answers, start_pos, end_pos, _ = retriever(
            retrieved_block_ids, question_input_ids, answer_ids=answer_ids, max_length=max_length, return_tensors="np"
        )

        # The expected position lists use -1 in slots that hold no match.
        self.assertEqual([False, True, True], has_answers)
        self.assertEqual([[-1, -1, -1], [6, -1, -1], [6, 7, 8]], start_pos)
        self.assertEqual([[-1, -1, -1], [7, -1, -1], [7, 8, 9]], end_pos)

    def test_save_load_pretrained(self):
        """Round-trip the retriever through `save_pretrained` / `from_pretrained`, locally and via a mocked hub."""
        save_dir = os.path.join(self.tmpdirname, "realm_block_records")

        retriever = self.get_dummy_retriever()
        retriever.save_pretrained(save_dir)

        # Test local path
        retriever = RealmRetriever.from_pretrained(save_dir)
        self.assertEqual(retriever.block_records[0], b"This is the first record")

        # Test mocked remote path: the patched download returns the file saved above.
        with patch("transformers.models.realm.retrieval_realm.hf_hub_download") as mock_hf_hub_download:
            mock_hf_hub_download.return_value = os.path.join(save_dir, _REALM_BLOCK_RECORDS_FILENAME)
            retriever = RealmRetriever.from_pretrained("google/realm-cc-news-pretrained-openqa")

        self.assertEqual(retriever.block_records[0], b"This is the first record")