# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

15
16
17
import unittest

from transformers import pipeline
18
from transformers.testing_utils import nested_simplify, require_tf, require_torch, slow
19
20
21
22
23
24

from .test_pipelines_common import MonoInputPipelineCommonMixin


# Reference predictions compared against by the @slow result tests below.
# The outer list has one entry per input sentence; each entry lists the
# expected candidates for that sentence in the order the tests compare them.
EXPECTED_FILL_MASK_RESULT = [
    # Candidates for "My name is <mask>"
    [
        {"sequence": "My name is John", "score": 0.00782308354973793, "token": 610, "token_str": " John"},
        {"sequence": "My name is Chris", "score": 0.007475061342120171, "token": 1573, "token_str": " Chris"},
    ],
    # Candidates for "The largest city in France is <mask>"
    [
        {
            "sequence": "The largest city in France is Paris",
            "score": 0.2510891854763031,
            "token": 2201,
            "token_str": " Paris",
        },
        {
            "sequence": "The largest city in France is Lyon",
            "score": 0.21418564021587372,
            "token": 12790,
            "token_str": " Lyon",
        },
    ],
]

# Reference predictions for the single-input pass of the result tests:
# only the first sentence's candidates, wrapped in a one-element list so it
# has the same nesting as EXPECTED_FILL_MASK_RESULT.
EXPECTED_FILL_MASK_TARGET_RESULT = [EXPECTED_FILL_MASK_RESULT[0]]

46
47
48

class FillMaskPipelineTests(MonoInputPipelineCommonMixin, unittest.TestCase):
    """Tests for the ``fill-mask`` pipeline on PyTorch and TensorFlow.

    The class-level attributes configure the shared checks inherited from
    ``MonoInputPipelineCommonMixin``; the ``test_*`` methods add fill-mask
    specific checks (``targets``, ``top_k`` and reference predictions).
    """

    pipeline_task = "fill-mask"
    pipeline_loading_kwargs = {"top_k": 2}
    small_models = ["sshleifer/tiny-distilroberta-base"]  # Models tested without the @slow decorator
    large_models = ["distilroberta-base"]  # Models tested with the @slow decorator
    mandatory_keys = {"sequence", "score", "token"}
    valid_inputs = [
        "My name is <mask>",
        "The largest city in France is <mask>",
    ]
    # NOTE: the trailing commas matter. Without them Python concatenates the
    # adjacent literals into the single string "This is <mask> <mask>This is",
    # so the "no mask token" case would never be exercised on its own.
    invalid_inputs = [
        "This is <mask> <mask>",  # More than 1 mask_token in the input is not supported
        "This is",  # No mask_token is not supported
    ]
    expected_check_keys = ["sequence"]

    def _check_targets_equivalence(self, framework):
        """Check that restricting to the predicted tokens reproduces their scores.

        Runs the first large model on the first valid input, then calls the
        pipeline again with ``targets`` set to the ``token_str`` values it just
        predicted, and asserts both calls return the same list of scores.

        Args:
            framework: Value forwarded as ``framework=`` to ``pipeline``
                (``"pt"`` or ``"tf"`` in the callers below).
        """
        model_name = self.large_models[0]
        unmasker = pipeline(task="fill-mask", model=model_name, tokenizer=model_name, framework=framework)
        unmasked = unmasker(self.valid_inputs[0])
        tokens = [top_mask["token_str"] for top_mask in unmasked]
        scores = [top_mask["score"] for top_mask in unmasked]

        unmasked_targets = unmasker(self.valid_inputs[0], targets=tokens)
        target_scores = [top_mask["score"] for top_mask in unmasked_targets]

        self.assertEqual(scores, target_scores)

    def _assert_result_round(self, unmasker, inputs, targets, expected_results):
        """Run one pass of structure and value checks against reference results.

        Args:
            unmasker: The fill-mask pipeline under test.
            inputs: Masked sentences; the first one is also used for the
                ``targets`` call.
            targets: Target tokens passed to the single-input call.
            expected_results: Reference predictions, one list of candidate
                dicts per entry of ``inputs``.
        """
        # Single input restricted to explicit targets: only the structure is checked.
        mono_result = unmasker(inputs[0], targets=targets)
        self.assertIsInstance(mono_result, list)
        self.assertIsInstance(mono_result[0], dict)
        for mandatory_key in self.mandatory_keys:
            self.assertIn(mandatory_key, mono_result[0])

        # Each input run on its own, without targets: values are compared too.
        multi_result = [unmasker(valid_input) for valid_input in inputs]
        self.assertIsInstance(multi_result, list)
        self.assertIsInstance(multi_result[0], (dict, list))

        for result, expected in zip(multi_result, expected_results):
            for r, e in zip(result, expected):
                self.assertEqual(r["sequence"], e["sequence"])
                self.assertEqual(r["token_str"], e["token_str"])
                self.assertEqual(r["token"], e["token"])
                # Scores are floats: compare to 3 decimal places only.
                self.assertAlmostEqual(r["score"], e["score"], places=3)

        # When the per-input results are lists, only the first input's
        # candidates are checked for the mandatory keys.
        if isinstance(multi_result[0], list):
            multi_result = multi_result[0]

        for result in multi_result:
            for key in self.mandatory_keys:
                self.assertIn(key, result)

        self.assertRaises(Exception, unmasker, [None])

    def _check_fill_mask_results(self, framework):
        """Compare every large model's predictions with the reference results.

        For each model the checks run twice: once over all valid inputs
        against ``EXPECTED_FILL_MASK_RESULT`` and once over the first input
        only against ``EXPECTED_FILL_MASK_TARGET_RESULT``. The input list is
        sliced per call rather than rebound, so every model in
        ``large_models`` sees the full list in its first pass.

        Args:
            framework: Value forwarded as ``framework=`` to ``pipeline``
                (``"pt"`` or ``"tf"`` in the callers below).
        """
        valid_targets = ["ĠPatrick", "ĠClara"]
        for model_name in self.large_models:
            unmasker = pipeline(
                task="fill-mask",
                model=model_name,
                tokenizer=model_name,
                framework=framework,
                top_k=2,
            )
            self._assert_result_round(unmasker, self.valid_inputs, valid_targets, EXPECTED_FILL_MASK_RESULT)
            self._assert_result_round(
                unmasker, self.valid_inputs[:1], valid_targets, EXPECTED_FILL_MASK_TARGET_RESULT
            )

    @require_torch
    def test_torch_fill_mask(self):
        """Smoke-test the call signatures: plain, with targets, with an extra kwarg."""
        valid_inputs = "My name is <mask>"
        unmasker = pipeline(task="fill-mask", model=self.small_models[0])
        outputs = unmasker(valid_inputs)
        self.assertIsInstance(outputs, list)

        # This passes
        outputs = unmasker(valid_inputs, targets=[" Patrick", " Clara"])
        self.assertIsInstance(outputs, list)

        # This used to fail with `cannot mix args and kwargs`
        outputs = unmasker(valid_inputs, something=False)
        self.assertIsInstance(outputs, list)

    @require_torch
    def test_torch_fill_mask_with_targets(self):
        """Valid target lists yield one output per target; invalid ones raise ValueError."""
        valid_inputs = ["My name is <mask>"]
        # ' Sam' will yield a warning but work
        valid_targets = [[" Teven", "ĠPatrick", "ĠClara"], ["ĠSam"], [" Sam"]]
        invalid_targets = [[], [""], ""]
        for model_name in self.small_models:
            unmasker = pipeline(task="fill-mask", model=model_name, tokenizer=model_name, framework="pt")
            for targets in valid_targets:
                outputs = unmasker(valid_inputs, targets=targets)
                self.assertIsInstance(outputs, list)
                self.assertEqual(len(outputs), len(targets))
            for targets in invalid_targets:
                self.assertRaises(ValueError, unmasker, valid_inputs, targets=targets)

    @require_torch
    @slow
    def test_torch_fill_mask_targets_equivalence(self):
        """PyTorch: scores with ``targets`` equal the unrestricted scores."""
        self._check_targets_equivalence("pt")

    @require_torch
    def test_torch_fill_mask_with_targets_and_topk(self):
        """``top_k`` limits the number of results returned for a target list."""
        model_name = self.small_models[0]
        unmasker = pipeline(task="fill-mask", model=model_name, tokenizer=model_name, framework="pt")
        targets = [" Teven", "ĠPatrick", "ĠClara"]
        top_k = 2
        outputs = unmasker("My name is <mask>", targets=targets, top_k=top_k)
        self.assertEqual(
            nested_simplify(outputs),
            [
                {"sequence": "My name is Patrick", "score": 0.0, "token": 3499, "token_str": " Patrick"},
                {"sequence": "My name is Te", "score": 0.0, "token": 2941, "token_str": " Te"},
            ],
        )

    @require_torch
    def test_torch_fill_mask_with_duplicate_targets_and_topk(self):
        """Duplicate targets are collapsed, so fewer than ``top_k`` results may be returned."""
        model_name = self.small_models[0]
        unmasker = pipeline(task="fill-mask", model=model_name, tokenizer=model_name, framework="pt")
        # String duplicates + id duplicates
        targets = [" Teven", "ĠPatrick", "ĠClara", "ĠClara", " Clara"]
        top_k = 10
        outputs = unmasker("My name is <mask>", targets=targets, top_k=top_k)

        # The target list contains duplicates, so we can't output more
        # than them
        self.assertEqual(len(outputs), 3)

    @require_tf
    def test_tf_fill_mask_with_targets(self):
        """TensorFlow: exact outputs for a target list, with and without ``top_k``."""
        valid_inputs = ["My name is <mask>"]
        # ' Teven' will yield a warning but work as " Te"
        invalid_targets = [[], [""], ""]
        unmasker = pipeline(
            task="fill-mask", model=self.small_models[0], tokenizer=self.small_models[0], framework="tf"
        )
        outputs = unmasker(valid_inputs, targets=[" Teven", "ĠPatrick", "ĠClara"])
        self.assertEqual(
            nested_simplify(outputs),
            [
                {"sequence": "My name is Clara", "score": 0.0, "token": 13606, "token_str": " Clara"},
                {"sequence": "My name is Patrick", "score": 0.0, "token": 3499, "token_str": " Patrick"},
                {"sequence": "My name is Te", "score": 0.0, "token": 2941, "token_str": " Te"},
            ],
        )
        # topk
        outputs = unmasker(valid_inputs, targets=[" Teven", "ĠPatrick", "ĠClara"], top_k=2)
        self.assertEqual(
            nested_simplify(outputs),
            [
                {"sequence": "My name is Clara", "score": 0.0, "token": 13606, "token_str": " Clara"},
                {"sequence": "My name is Patrick", "score": 0.0, "token": 3499, "token_str": " Patrick"},
            ],
        )
        for targets in invalid_targets:
            with self.assertRaises(ValueError):
                unmasker(valid_inputs, targets=targets)

    @require_torch
    @slow
    def test_torch_fill_mask_results(self):
        """PyTorch: large-model predictions match the reference results."""
        self._check_fill_mask_results("pt")

    @require_tf
    @slow
    def test_tf_fill_mask_results(self):
        """TensorFlow: large-model predictions match the reference results."""
        self._check_fill_mask_results("tf")

    @require_tf
    @slow
    def test_tf_fill_mask_targets_equivalence(self):
        """TensorFlow: scores with ``targets`` equal the unrestricted scores."""
        self._check_targets_equivalence("tf")