test_pipelines.py 41 KB
Newer Older
1
import unittest
Julien Chaumond's avatar
Julien Chaumond committed
2
from typing import Iterable, List, Optional
Morgan Funtowicz's avatar
Morgan Funtowicz committed
3

4
5
import pytest

Morgan Funtowicz's avatar
Morgan Funtowicz committed
6
from transformers import pipeline
7
from transformers.pipelines import SUPPORTED_TASKS, Conversation, DefaultArgumentHandler, Pipeline
8
from transformers.testing_utils import require_tf, require_tokenizers, require_torch, slow, torch_device
9

Aymeric Augustin's avatar
Aymeric Augustin committed
10

11
# Device index passed as `device=` to `pipeline(...)` by the integration tests:
# -1 when `torch_device` is "cpu", 0 for any other value.
DEFAULT_DEVICE_NUM = 0 if torch_device != "cpu" else -1
12
13
# Inputs fed to the single-column pipeline checks: one bare string and one list of strings.
VALID_INPUTS = ["A simple string", ["list of strings"]]

# Model identifiers used by the tests, grouped by pipeline task.
# Lists prefixed with TF_ are the ones intended for the framework="tf" variants.
NER_FINETUNED_MODELS = ["sshleifer/tiny-dbmdz-bert-large-cased-finetuned-conll03-english"]
TF_NER_FINETUNED_MODELS = ["Narsil/small"]

# xlnet-base-cased disabled for now, since it crashes TF2
FEATURE_EXTRACT_FINETUNED_MODELS = ["sshleifer/tiny-distilbert-base-cased"]
TEXT_CLASSIF_FINETUNED_MODELS = ["sshleifer/tiny-distilbert-base-uncased-finetuned-sst-2-english"]
TEXT_GENERATION_FINETUNED_MODELS = ["sshleifer/tiny-ctrl"]

FILL_MASK_FINETUNED_MODELS = ["sshleifer/tiny-distilroberta-base"]
LARGE_FILL_MASK_FINETUNED_MODELS = ["distilroberta-base"]  # @slow

SUMMARIZATION_FINETUNED_MODELS = ["sshleifer/bart-tiny-random", "patrickvonplaten/t5-tiny-random"]
TF_SUMMARIZATION_FINETUNED_MODELS = ["sshleifer/bart-tiny-random", "patrickvonplaten/t5-tiny-random"]

# Each translation entry is a (model identifier, pipeline task name) pair.
TRANSLATION_FINETUNED_MODELS = [
    ("patrickvonplaten/t5-tiny-random", "translation_en_to_de"),
    ("patrickvonplaten/t5-tiny-random", "translation_en_to_ro"),
]
TF_TRANSLATION_FINETUNED_MODELS = [("patrickvonplaten/t5-tiny-random", "translation_en_to_fr")]

TEXT2TEXT_FINETUNED_MODELS = ["patrickvonplaten/t5-tiny-random"]
TF_TEXT2TEXT_FINETUNED_MODELS = ["patrickvonplaten/t5-tiny-random"]

DIALOGUE_FINETUNED_MODELS = ["microsoft/DialoGPT-medium"]  # @slow
38

39
40
# Expected fill-mask outputs, one inner list per input sentence of the @slow
# "results" tests. Those tests compare only the "sequence" key.
# "\u0120" (Ġ) is written as an escape so the token strings survive re-encoding of this file.
expected_fill_mask_result = [
    [
        {"sequence": "<s>My name is John</s>", "score": 0.00782308354973793, "token": 610, "token_str": "\u0120John"},
        {"sequence": "<s>My name is Chris</s>", "score": 0.007475061342120171, "token": 1573, "token_str": "\u0120Chris"},
    ],
    [
        {"sequence": "<s>The largest city in France is Paris</s>", "score": 0.3185044229030609, "token": 2201},
        {"sequence": "<s>The largest city in France is Lyon</s>", "score": 0.21112334728240967, "token": 12790},
    ],
]
49

50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
# Expected fill-mask outputs when the call is restricted with `targets=`, for the
# first input sentence only. The @slow "results" tests compare only the "sequence" key.
# "\u0120" (Ġ) is written as an escape so the token strings survive re-encoding of this file.
expected_fill_mask_target_result = [
    [
        {
            "sequence": "<s>My name is Patrick</s>",
            "score": 0.004992353264242411,
            "token": 3499,
            "token_str": "\u0120Patrick",
        },
        {
            "sequence": "<s>My name is Clara</s>",
            "score": 0.00019297805556561798,
            "token": 13606,
            "token_str": "\u0120Clara",
        },
    ]
]

67
# Extra keyword arguments forwarded to the summarization pipeline calls in the tests.
SUMMARIZATION_KWARGS = {"num_beams": 2, "min_length": 2, "max_length": 5}
68

69

70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
class DefaultArgumentHandlerTestCase(unittest.TestCase):
    """Check that ``DefaultArgumentHandler`` turns positional and keyword inputs into a list."""

    def setUp(self) -> None:
        self.handler = DefaultArgumentHandler()

    def _assert_list_of_length(self, handled, expected_length):
        """Assert that ``handled`` is a ``list`` containing ``expected_length`` items."""
        # assertIsInstance reports the offending value on failure, unlike assertTrue(isinstance(...)).
        self.assertIsInstance(handled, list)
        self.assertEqual(len(handled), expected_length)

    def test_kwargs_x(self):
        """A single keyword argument (``X`` / ``x``) holding a string or a list of strings."""
        mono_data = {"X": "This is a sample input"}
        self._assert_list_of_length(self.handler(**mono_data), 1)

        multi_data = {"x": ["This is a sample input", "This is a second sample input"]}
        self._assert_list_of_length(self.handler(**multi_data), 2)

    def test_kwargs_data(self):
        """A single ``data`` keyword argument holding a string or a list of strings."""
        mono_data = {"data": "This is a sample input"}
        self._assert_list_of_length(self.handler(**mono_data), 1)

        multi_data = {"data": ["This is a sample input", "This is a second sample input"]}
        self._assert_list_of_length(self.handler(**multi_data), 2)

    def test_multi_kwargs(self):
        """Several keyword arguments at once: the expected length is the total number of strings."""
        mono_data = {"data": "This is a sample input", "X": "This is a sample input 2"}
        self._assert_list_of_length(self.handler(**mono_data), 2)

        multi_data = {
            "data": ["This is a sample input", "This is a second sample input"],
            "test": ["This is a sample input 2", "This is a second sample input 2"],
        }
        self._assert_list_of_length(self.handler(**multi_data), 4)

    def test_args(self):
        """Positional inputs: a bare string, a list passed as one argument, and unpacked strings."""
        mono_data = "This is a sample input"
        self._assert_list_of_length(self.handler(mono_data), 1)

        mono_data = ["This is a sample input"]
        self._assert_list_of_length(self.handler(mono_data), 1)

        multi_data = ["This is a sample input", "This is a second sample input"]
        # The list passed as one positional argument ...
        self._assert_list_of_length(self.handler(multi_data), 2)
        # ... and the same strings unpacked into separate positional arguments.
        self._assert_list_of_length(self.handler(*multi_data), 2)


Morgan Funtowicz's avatar
Morgan Funtowicz committed
142
class MonoColumnInputTestCase(unittest.TestCase):
Julien Chaumond's avatar
Julien Chaumond committed
143
144
145
146
147
    def _test_mono_column_pipeline(
        self,
        nlp: Pipeline,
        valid_inputs: List,
        output_keys: Iterable[str],
        invalid_inputs: Optional[List] = None,
        expected_multi_result: Optional[List] = None,
        expected_check_keys: Optional[List[str]] = None,
        **kwargs,
    ):
        """Run the checks shared by every single-column pipeline test.

        Args:
            nlp: The pipeline under test; it is called directly.
            valid_inputs: Inputs the pipeline must accept. The first one is also
                used on its own for the single-call check.
            output_keys: Keys that must be present in every returned entry.
            invalid_inputs: Object passed as one argument to ``nlp`` and expected
                to make it raise. Defaults to ``[None]``.
            expected_multi_result: Optional expected outputs, paired positionally
                with the results obtained for ``valid_inputs``.
            expected_check_keys: Keys compared between actual and expected outputs;
                only consulted when ``expected_multi_result`` is given.
            **kwargs: Forwarded to every valid call of ``nlp``.
        """
        # Build the default per call instead of sharing one list object across calls.
        if invalid_inputs is None:
            invalid_inputs = [None]

        self.assertIsNotNone(nlp)

        mono_result = nlp(valid_inputs[0], **kwargs)
        self.assertIsInstance(mono_result, list)
        self.assertIsInstance(mono_result[0], (dict, list))

        # Some calls return a list of lists; only the first inner list is inspected then.
        if isinstance(mono_result[0], list):
            mono_result = mono_result[0]

        for key in output_keys:
            self.assertIn(key, mono_result[0])

        multi_result = [nlp(valid_input, **kwargs) for valid_input in valid_inputs]
        self.assertIsInstance(multi_result, list)
        self.assertIsInstance(multi_result[0], (dict, list))

        if expected_multi_result is not None:
            for result, expect in zip(multi_result, expected_multi_result):
                for key in expected_check_keys or []:
                    # Compared as sets: order and duplicates are ignored.
                    self.assertEqual(
                        {o[key] for o in result},
                        {o[key] for o in expect},
                    )

        if isinstance(multi_result[0], list):
            multi_result = multi_result[0]

        for result in multi_result:
            for key in output_keys:
                self.assertIn(key, result)

        # The whole `invalid_inputs` object is handed over in a single call, without **kwargs.
        self.assertRaises(Exception, nlp, invalid_inputs)

186
    @require_torch
187
    def test_torch_sentiment_analysis(self):
        """Run the shared single-column checks on the sentiment-analysis pipeline.

        Covers every model in ``TEXT_CLASSIF_FINETUNED_MODELS``; no ``framework``
        argument is passed to ``pipeline``.
        """
        mandatory_keys = {"label", "score"}
        for model_name in TEXT_CLASSIF_FINETUNED_MODELS:
            nlp = pipeline(task="sentiment-analysis", model=model_name, tokenizer=model_name)
            self._test_mono_column_pipeline(nlp, VALID_INPUTS, mandatory_keys)
Morgan Funtowicz's avatar
Morgan Funtowicz committed
192

193
194
    @require_tf
    def test_tf_sentiment_analysis(self):
        """Run the shared single-column checks on the sentiment-analysis pipeline with ``framework="tf"``.

        Covers every model in ``TEXT_CLASSIF_FINETUNED_MODELS``.
        """
        mandatory_keys = {"label", "score"}
        for model_name in TEXT_CLASSIF_FINETUNED_MODELS:
            nlp = pipeline(task="sentiment-analysis", model=model_name, tokenizer=model_name, framework="tf")
            self._test_mono_column_pipeline(nlp, VALID_INPUTS, mandatory_keys)
Morgan Funtowicz's avatar
Morgan Funtowicz committed
199

200
    @require_torch
201
202
203
    def test_torch_feature_extraction(self):
        """Run the shared single-column checks on the feature-extraction pipeline.

        Covers every model in ``FEATURE_EXTRACT_FINETUNED_MODELS``.
        """
        for model_name in FEATURE_EXTRACT_FINETUNED_MODELS:
            nlp = pipeline(task="feature-extraction", model=model_name, tokenizer=model_name)
            # An empty mapping is passed as the key collection, so no output keys are checked.
            self._test_mono_column_pipeline(nlp, VALID_INPUTS, {})
Morgan Funtowicz's avatar
Morgan Funtowicz committed
205

206
    @require_tf
Julien Chaumond's avatar
Julien Chaumond committed
207
    def test_tf_feature_extraction(self):
        """Run the shared single-column checks on the feature-extraction pipeline with ``framework="tf"``.

        Covers every model in ``FEATURE_EXTRACT_FINETUNED_MODELS``.
        """
        for model_name in FEATURE_EXTRACT_FINETUNED_MODELS:
            nlp = pipeline(task="feature-extraction", model=model_name, tokenizer=model_name, framework="tf")
            # An empty mapping is passed as the key collection, so no output keys are checked.
            self._test_mono_column_pipeline(nlp, VALID_INPUTS, {})
Morgan Funtowicz's avatar
Morgan Funtowicz committed
211

Julien Chaumond's avatar
Julien Chaumond committed
212
    @require_torch
213
214
215
216
217
218
    def test_torch_fill_mask(self):
        """Run the shared single-column checks on the fill-mask pipeline with ``framework="pt"``.

        Covers every model in ``FILL_MASK_FINETUNED_MODELS`` with ``top_k=2``.
        """
        mandatory_keys = {"sequence", "score", "token"}
        valid_inputs = [
            "My name is <mask>",
            "The largest city in France is <mask>",
        ]
        # Keep the comma between the entries: adjacent string literals are
        # concatenated by Python, which would merge the two cases into one string.
        invalid_inputs = [
            "This is <mask> <mask>",  # More than 1 mask_token in the input is not supported
            "This is",  # No mask_token is not supported
        ]
        for model_name in FILL_MASK_FINETUNED_MODELS:
            nlp = pipeline(
                task="fill-mask",
                model=model_name,
                tokenizer=model_name,
                framework="pt",
                top_k=2,
            )
            self._test_mono_column_pipeline(
                nlp, valid_inputs, mandatory_keys, invalid_inputs, expected_check_keys=["sequence"]
            )
234
235
236

    @require_tf
    def test_tf_fill_mask(self):
        """Run the shared single-column checks on the fill-mask pipeline with ``framework="tf"``.

        Covers every model in ``FILL_MASK_FINETUNED_MODELS`` with ``top_k=2``.
        """
        mandatory_keys = {"sequence", "score", "token"}
        valid_inputs = [
            "My name is <mask>",
            "The largest city in France is <mask>",
        ]
        # Keep the comma between the entries: adjacent string literals are
        # concatenated by Python, which would merge the two cases into one string.
        invalid_inputs = [
            "This is <mask> <mask>",  # More than 1 mask_token in the input is not supported
            "This is",  # No mask_token is not supported
        ]
        for model_name in FILL_MASK_FINETUNED_MODELS:
            nlp = pipeline(
                task="fill-mask",
                model=model_name,
                tokenizer=model_name,
                framework="tf",
                top_k=2,
            )
            self._test_mono_column_pipeline(
                nlp, valid_inputs, mandatory_keys, invalid_inputs, expected_check_keys=["sequence"]
            )
257

258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
    @require_torch
    def test_torch_fill_mask_with_targets(self):
        """Check the ``targets`` argument of the fill-mask pipeline with ``framework="pt"``.

        Each valid target list must give a list with one output per target;
        each invalid one must raise ``ValueError``.
        """
        sentences = ["My name is <mask>"]
        accepted_target_sets = [[" Teven", " Patrick", " Clara"], [" Sam"]]
        rejected_target_sets = [[], [""], ""]
        for model_name in FILL_MASK_FINETUNED_MODELS:
            fill_mask = pipeline(task="fill-mask", model=model_name, tokenizer=model_name, framework="pt")
            for target_set in accepted_target_sets:
                predictions = fill_mask(sentences, targets=target_set)
                self.assertIsInstance(predictions, list)
                self.assertEqual(len(predictions), len(target_set))
            for target_set in rejected_target_sets:
                with self.assertRaises(ValueError):
                    fill_mask(sentences, targets=target_set)

    @require_tf
    def test_tf_fill_mask_with_targets(self):
        """Check the ``targets`` argument of the fill-mask pipeline with ``framework="tf"``.

        Each valid target list must give a list with one output per target;
        each invalid one must raise ``ValueError``.
        """
        sentences = ["My name is <mask>"]
        accepted_target_sets = [[" Teven", " Patrick", " Clara"], [" Sam"]]
        rejected_target_sets = [[], [""], ""]
        for model_name in FILL_MASK_FINETUNED_MODELS:
            fill_mask = pipeline(task="fill-mask", model=model_name, tokenizer=model_name, framework="tf")
            for target_set in accepted_target_sets:
                predictions = fill_mask(sentences, targets=target_set)
                self.assertIsInstance(predictions, list)
                self.assertEqual(len(predictions), len(target_set))
            for target_set in rejected_target_sets:
                with self.assertRaises(ValueError):
                    fill_mask(sentences, targets=target_set)

286
287
288
289
290
291
292
    @require_torch
    @slow
    def test_torch_fill_mask_results(self):
        """Compare fill-mask predictions with the recorded expectations (``framework="pt"``).

        Covers every model in ``LARGE_FILL_MASK_FINETUNED_MODELS`` with ``top_k=2``,
        once unrestricted and once with ``targets``. Only "sequence" is compared.
        """
        mandatory_keys = {"sequence", "score", "token"}
        valid_inputs = [
            "My name is <mask>",
            "The largest city in France is <mask>",
        ]
        valid_targets = [" Patrick", " Clara"]
        for model_name in LARGE_FILL_MASK_FINETUNED_MODELS:
            nlp = pipeline(
                task="fill-mask",
                model=model_name,
                tokenizer=model_name,
                framework="pt",
                top_k=2,
            )
            self._test_mono_column_pipeline(
                nlp,
                valid_inputs,
                mandatory_keys,
                expected_multi_result=expected_fill_mask_result,
                expected_check_keys=["sequence"],
            )
            # The targeted expectations exist for the first sentence only.
            self._test_mono_column_pipeline(
                nlp,
                valid_inputs[:1],
                mandatory_keys,
                expected_multi_result=expected_fill_mask_target_result,
                expected_check_keys=["sequence"],
                targets=valid_targets,
            )
Julien Chaumond's avatar
Julien Chaumond committed
318
319

    @require_tf
320
321
    @slow
    def test_tf_fill_mask_results(self):
        """Compare fill-mask predictions with the recorded expectations (``framework="tf"``).

        Covers every model in ``LARGE_FILL_MASK_FINETUNED_MODELS`` with ``top_k=2``,
        once unrestricted and once with ``targets``. Only "sequence" is compared.
        """
        mandatory_keys = {"sequence", "score", "token"}
        valid_inputs = [
            "My name is <mask>",
            "The largest city in France is <mask>",
        ]
        valid_targets = [" Patrick", " Clara"]
        for model_name in LARGE_FILL_MASK_FINETUNED_MODELS:
            nlp = pipeline(task="fill-mask", model=model_name, tokenizer=model_name, framework="tf", top_k=2)
            self._test_mono_column_pipeline(
                nlp,
                valid_inputs,
                mandatory_keys,
                expected_multi_result=expected_fill_mask_result,
                expected_check_keys=["sequence"],
            )
            # The targeted expectations exist for the first sentence only.
            self._test_mono_column_pipeline(
                nlp,
                valid_inputs[:1],
                mandatory_keys,
                expected_multi_result=expected_fill_mask_target_result,
                expected_check_keys=["sequence"],
                targets=valid_targets,
            )
Julien Chaumond's avatar
Julien Chaumond committed
345

346
    @require_torch
347
    @require_tokenizers
348
    def test_torch_summarization(self):
        """Run the shared single-column checks on the summarization pipeline.

        Covers every model in ``SUMMARIZATION_FINETUNED_MODELS``;
        ``SUMMARIZATION_KWARGS`` is forwarded to each valid call.
        """
        invalid_inputs = [4, "<mask>"]
        mandatory_keys = ["summary_text"]
        for model in SUMMARIZATION_FINETUNED_MODELS:
            nlp = pipeline(task="summarization", model=model, tokenizer=model)
            self._test_mono_column_pipeline(
                nlp, VALID_INPUTS, mandatory_keys, invalid_inputs=invalid_inputs, **SUMMARIZATION_KWARGS
            )
356

357
    @require_torch
358
    @slow
359
360
361
362
363
364
365
    def test_integration_torch_summarization(self):
        """Summarize one CNN article with the default summarization pipeline and compare the text exactly.

        No model is passed to ``pipeline``, so whatever default the library picks
        for the task is used; the device comes from ``DEFAULT_DEVICE_NUM``.
        """
        nlp = pipeline(task="summarization", device=DEFAULT_DEVICE_NUM)
        # NOTE(review): in this copy of the file the article literal is split across two
        # physical lines (after "welcomed the development. "). A single-quoted string cannot
        # contain a raw line break -- confirm against the upstream file before relying on it.
        cnn_article = ' (CNN)The Palestinian Authority officially became the 123rd member of the International Criminal Court on Wednesday, a step that gives the court jurisdiction over alleged crimes in Palestinian territories. The formal accession was marked with a ceremony at The Hague, in the Netherlands, where the court is based. The Palestinians signed the ICC\'s founding Rome Statute in January, when they also accepted its jurisdiction over alleged crimes committed "in the occupied Palestinian territory, including East Jerusalem, since June 13, 2014." Later that month, the ICC opened a preliminary examination into the situation in Palestinian territories, paving the way for possible war crimes investigations against Israelis. As members of the court, Palestinians may be subject to counter-charges as well. Israel and the United States, neither of which is an ICC member, opposed the Palestinians\' efforts to join the body. But Palestinian Foreign Minister Riad al-Malki, speaking at Wednesday\'s ceremony, said it was a move toward greater justice. "As Palestine formally becomes a State Party to the Rome Statute today, the world is also a step closer to ending a long era of impunity and injustice," he said, according to an ICC news release. "Indeed, today brings us closer to our shared goals of justice and peace." Judge Kuniko Ozaki, a vice president of the ICC, said acceding to the treaty was just the first step for the Palestinians. "As the Rome Statute today enters into force for the State of Palestine, Palestine acquires all the rights as well as responsibilities that come with being a State Party to the Statute. These are substantive commitments, which cannot be taken lightly," she said. Rights group Human Rights Watch welcomed the development. 
"Governments seeking to penalize Palestine for joining the ICC should immediately end their pressure, and countries that support universal acceptance of the court\'s treaty should speak out to welcome its membership," said Balkees Jarrah, international justice counsel for the group. "What\'s objectionable is the attempts to undermine international justice, not Palestine\'s decision to join a treaty to which over 100 countries around the world are members." In January, when the preliminary ICC examination was opened, Israeli Prime Minister Benjamin Netanyahu described it as an outrage, saying the court was overstepping its boundaries. The United States also said it "strongly" disagreed with the court\'s decision. "As we have said repeatedly, we do not believe that Palestine is a state and therefore we do not believe that it is eligible to join the ICC," the State Department said in a statement. It urged the warring sides to resolve their differences through direct negotiations. "We will continue to oppose actions against Israel at the ICC as counterproductive to the cause of peace," it said. But the ICC begs to differ with the definition of a state for its purposes and refers to the territories as "Palestine." While a preliminary examination is not a formal investigation, it allows the court to review evidence and determine whether to investigate suspects on both sides. Prosecutor Fatou Bensouda said her office would "conduct its analysis in full independence and impartiality." The war between Israel and Hamas militants in Gaza last summer left more than 2,000 people dead. The inquiry will include alleged war crimes committed since June. The International Criminal Court was set up in 2002 to prosecute genocide, crimes against humanity and war crimes. CNN\'s Vasco Cotovio, Kareem Khadder and Faith Karimi contributed to this report.'
        expected_cnn_summary = " The Palestinian Authority becomes the 123rd member of the International Criminal Court . The move gives the court jurisdiction over alleged crimes in Palestinian territories . Israel and the United States opposed the Palestinians' efforts to join the court . Rights group Human Rights Watch welcomes the move, says governments seeking to penalize Palestine should end pressure ."
        result = nlp(cnn_article)
        # Exact string match on the first returned entry.
        self.assertEqual(result[0]["summary_text"], expected_cnn_summary)

366
    @require_tf
367
    @slow
368
369
370
    def test_tf_summarization(self):
        """Run the shared single-column checks on the summarization pipeline with ``framework="tf"``.

        Covers every model in ``TF_SUMMARIZATION_FINETUNED_MODELS``;
        ``SUMMARIZATION_KWARGS`` is forwarded to each valid call.
        """
        invalid_inputs = [4, "<mask>"]
        mandatory_keys = ["summary_text"]
        for model_name in TF_SUMMARIZATION_FINETUNED_MODELS:
            nlp = pipeline(
                task="summarization",
                model=model_name,
                tokenizer=model_name,
                framework="tf",
            )
            self._test_mono_column_pipeline(
                nlp, VALID_INPUTS, mandatory_keys, invalid_inputs=invalid_inputs, **SUMMARIZATION_KWARGS
            )
381
382

    @require_torch
383
    @require_tokenizers
384
    @slow
385
    def test_torch_translation(self):
        """Run the shared single-column checks on each translation pipeline.

        Covers every (model, task) pair in ``TRANSLATION_FINETUNED_MODELS``.
        """
        invalid_inputs = [4, "<mask>"]
        mandatory_keys = ["translation_text"]
        for model_name, task in TRANSLATION_FINETUNED_MODELS:
            nlp = pipeline(task=task, model=model_name, tokenizer=model_name)
            self._test_mono_column_pipeline(
                nlp,
                VALID_INPUTS,
                mandatory_keys,
                invalid_inputs,
            )
396

397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
    @require_torch
    @slow
    def test_default_translations(self):
        """Building a translation pipeline without a model works only for pairs that have a default."""
        # We don't provide a default for this pair
        self.assertRaises(ValueError, pipeline, task="translation_cn_to_ar")

        # but we do for this one
        pipeline(task="translation_en_to_de")

    @require_torch
    def test_translation_on_odd_language(self):
        """A translation task name can be used when a model is passed explicitly; the call must not raise."""
        first_model_name, _ = TRANSLATION_FINETUNED_MODELS[0]
        pipeline(task="translation_cn_to_ar", model=first_model_name)

    @require_torch
    def test_translation_default_language_selection(self):
        """A bare ``"translation"`` task with a model must warn and end up as ``translation_en_to_de``."""
        first_model_name, _ = TRANSLATION_FINETUNED_MODELS[0]
        with pytest.warns(UserWarning, match=r".*translation_en_to_de.*"):
            translator = pipeline(task="translation", model=first_model_name)
        self.assertEqual(translator.task, "translation_en_to_de")

    @require_torch
    def test_translation_with_no_language_no_model_fails(self):
        """A bare ``"translation"`` task with no model must raise ``ValueError``."""
        self.assertRaises(ValueError, pipeline, task="translation")

424
    @require_tf
425
    @slow
426
427
428
    def test_tf_translation(self):
        """Run the shared single-column checks on each translation pipeline with ``framework="tf"``.

        Covers every (model, task) pair in ``TF_TRANSLATION_FINETUNED_MODELS``.
        """
        invalid_inputs = [4, "<mask>"]
        mandatory_keys = ["translation_text"]
        for model, task in TF_TRANSLATION_FINETUNED_MODELS:
            nlp = pipeline(task=task, model=model, tokenizer=model, framework="tf")
            self._test_mono_column_pipeline(nlp, VALID_INPUTS, mandatory_keys, invalid_inputs=invalid_inputs)
432

433
    @require_torch
434
    @require_tokenizers
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
    def test_torch_text2text(self):
        """Run the shared single-column checks on the text2text-generation pipeline.

        Covers every model in ``TEXT2TEXT_FINETUNED_MODELS``.
        """
        required_keys = ["generated_text"]
        rejected_inputs = [4, "<mask>"]
        for checkpoint in TEXT2TEXT_FINETUNED_MODELS:
            generator = pipeline(task="text2text-generation", model=checkpoint, tokenizer=checkpoint)
            self._test_mono_column_pipeline(generator, VALID_INPUTS, required_keys, rejected_inputs)

    @require_tf
    @slow
    def test_tf_text2text(self):
        """Run the shared single-column checks on the text2text-generation pipeline with ``framework="tf"``.

        Covers every model in ``TF_TEXT2TEXT_FINETUNED_MODELS``.
        """
        invalid_inputs = [4, "<mask>"]
        mandatory_keys = ["generated_text"]
        # Use the TF-specific model list, like the other framework="tf" tests that have one.
        for model in TF_TEXT2TEXT_FINETUNED_MODELS:
            nlp = pipeline(task="text2text-generation", model=model, tokenizer=model, framework="tf")
            self._test_mono_column_pipeline(nlp, VALID_INPUTS, mandatory_keys, invalid_inputs=invalid_inputs)

456
    @require_torch
457
458
459
    def test_torch_text_generation(self):
        """Run the shared single-column checks on the text-generation pipeline with ``framework="pt"``.

        Covers every model in ``TEXT_GENERATION_FINETUNED_MODELS``, once with no
        extra arguments and once with a ``prefix``.
        """
        for model_name in TEXT_GENERATION_FINETUNED_MODELS:
            nlp = pipeline(task="text-generation", model=model_name, tokenizer=model_name, framework="pt")
            # An empty mapping is passed as the key collection, so no output keys are checked.
            self._test_mono_column_pipeline(nlp, VALID_INPUTS, {})
            # Kept inside the loop so every model gets the prefix check and
            # `nlp` is never read when the model list is empty.
            self._test_mono_column_pipeline(nlp, VALID_INPUTS, {}, prefix="This is ")
462
463
464

    @require_tf
    def test_tf_text_generation(self):
        """Run the shared single-column checks on the text-generation pipeline with ``framework="tf"``.

        Covers every model in ``TEXT_GENERATION_FINETUNED_MODELS``, once with no
        extra arguments and once with a ``prefix``.
        """
        for model_name in TEXT_GENERATION_FINETUNED_MODELS:
            nlp = pipeline(task="text-generation", model=model_name, tokenizer=model_name, framework="tf")
            # An empty mapping is passed as the key collection, so no output keys are checked.
            self._test_mono_column_pipeline(nlp, VALID_INPUTS, {})
            # Kept inside the loop so every model gets the prefix check and
            # `nlp` is never read when the model list is empty.
            self._test_mono_column_pipeline(nlp, VALID_INPUTS, {}, prefix="This is ")
469

470
    @require_torch
471
    @slow
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
    def test_integration_torch_conversation(self):
        """Drive the default conversational pipeline through a batched round and a follow-up round.

        The replies are compared with exact strings, and each call is expected
        to hand back the same ``Conversation`` objects it was given.
        """
        movie_question = "Going to the movies tonight - any suggestions?"
        book_question = "What's the last book you have read?"
        follow_up_question = "Why do you recommend it?"

        chatbot = pipeline(task="conversational", device=DEFAULT_DEVICE_NUM)
        movie_chat = Conversation(movie_question)
        book_chat = Conversation(book_question)

        # Before any call, neither conversation is expected to hold past user inputs.
        self.assertEqual(len(movie_chat.past_user_inputs), 0)
        self.assertEqual(len(book_chat.past_user_inputs), 0)

        # Round 1: both conversations go through one batched call.
        batch = chatbot([movie_chat, book_chat], do_sample=False, max_length=1000)
        self.assertEqual(batch, [movie_chat, book_chat])
        self.assertEqual(len(batch[0].past_user_inputs), 1)
        self.assertEqual(len(batch[1].past_user_inputs), 1)
        self.assertEqual(len(batch[0].generated_responses), 1)
        self.assertEqual(len(batch[1].generated_responses), 1)
        self.assertEqual(batch[0].past_user_inputs[0], movie_question)
        self.assertEqual(batch[0].generated_responses[0], "The Big Lebowski")
        self.assertEqual(batch[1].past_user_inputs[0], book_question)
        self.assertEqual(batch[1].generated_responses[0], "The Last Question")

        # Round 2: a follow-up on the second conversation, passed on its own.
        book_chat.add_user_input(follow_up_question)
        single = chatbot(book_chat, do_sample=False, max_length=1000)
        self.assertEqual(single, book_chat)
        self.assertEqual(len(single.past_user_inputs), 2)
        self.assertEqual(len(single.generated_responses), 2)
        self.assertEqual(single.past_user_inputs[1], follow_up_question)
        self.assertEqual(single.generated_responses[1], "It's a good book.")

    @require_torch
503
    @slow
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
    def test_integration_torch_conversation_truncated_history(self):
        """Run two rounds of one conversation with ``min_length_for_response=24`` and ``max_length=36``.

        The replies are compared with exact strings, and each call is expected
        to hand back the same ``Conversation`` object it was given.
        """
        opening_question = "Going to the movies tonight - any suggestions?"
        follow_up_question = "Is it an action movie?"

        chatbot = pipeline(task="conversational", min_length_for_response=24, device=DEFAULT_DEVICE_NUM)
        chat = Conversation(opening_question)

        # Before any call, the conversation is expected to hold no past user inputs.
        self.assertEqual(len(chat.past_user_inputs), 0)

        # Round 1
        reply = chatbot(chat, do_sample=False, max_length=36)
        self.assertEqual(reply, chat)
        self.assertEqual(len(reply.past_user_inputs), 1)
        self.assertEqual(len(reply.generated_responses), 1)
        self.assertEqual(reply.past_user_inputs[0], opening_question)
        self.assertEqual(reply.generated_responses[0], "The Big Lebowski")

        # Round 2
        chat.add_user_input(follow_up_question)
        reply = chatbot(chat, do_sample=False, max_length=36)
        self.assertEqual(reply, chat)
        self.assertEqual(len(reply.past_user_inputs), 2)
        self.assertEqual(len(reply.generated_responses), 2)
        self.assertEqual(reply.past_user_inputs[1], follow_up_question)
        self.assertEqual(reply.generated_responses[1], "It's a comedy.")

528
529

# Model identifiers for the question-answering tests.
QA_FINETUNED_MODELS = [
    "sshleifer/tiny-distilbert-base-cased-distilled-squad",
]
530

Morgan Funtowicz's avatar
Morgan Funtowicz committed
531

532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
class ZeroShotClassificationPipelineTests(unittest.TestCase):
    """Exercises the ``zero-shot-classification`` pipeline with PyTorch and TensorFlow."""

    def _test_scores_sum_to_one(self, result):
        """Assert that the entries of ``result["scores"]`` add up to 1.0."""
        # Builtin ``sum`` with a float start value; a local accumulator named
        # ``sum`` would shadow the builtin.
        self.assertAlmostEqual(sum(result["scores"], 0.0), 1.0)

    def _test_zero_shot_pipeline(self, nlp):
        """Check the output structure of ``nlp`` on valid inputs and that invalid inputs raise.

        Args:
            nlp: pipeline under test; it is called as ``nlp(**kwargs)`` with the
                keyword sets defined below.
        """
        output_keys = {"sequence", "labels", "scores"}
        valid_mono_inputs = [
            {"sequences": "Who are you voting for in 2020?", "candidate_labels": "politics"},
            {"sequences": "Who are you voting for in 2020?", "candidate_labels": ["politics"]},
            {"sequences": "Who are you voting for in 2020?", "candidate_labels": "politics, public health"},
            {"sequences": "Who are you voting for in 2020?", "candidate_labels": ["politics", "public health"]},
            {"sequences": ["Who are you voting for in 2020?"], "candidate_labels": "politics"},
            {
                "sequences": "Who are you voting for in 2020?",
                "candidate_labels": "politics",
                "hypothesis_template": "This text is about {}",
            },
        ]
        valid_multi_input = {
            "sequences": ["Who are you voting for in 2020?", "What is the capital of Spain?"],
            "candidate_labels": "politics",
        }
        invalid_inputs = [
            {"sequences": None, "candidate_labels": "politics"},
            {"sequences": "", "candidate_labels": "politics"},
            {"sequences": "Who are you voting for in 2020?", "candidate_labels": None},
            {"sequences": "Who are you voting for in 2020?", "candidate_labels": ""},
            {
                "sequences": "Who are you voting for in 2020?",
                "candidate_labels": "politics",
                "hypothesis_template": None,
            },
            {
                "sequences": "Who are you voting for in 2020?",
                "candidate_labels": "politics",
                "hypothesis_template": "",
            },
            {
                "sequences": "Who are you voting for in 2020?",
                "candidate_labels": "politics",
                "hypothesis_template": "Template without formatting syntax.",
            },
        ]
        self.assertIsNotNone(nlp)

        # Each single-sequence call must return one dict carrying every output key.
        for mono_input in valid_mono_inputs:
            mono_result = nlp(**mono_input)
            self.assertIsInstance(mono_result, dict)
            # The sum-to-one check is only applied when several labels come back.
            if len(mono_result["labels"]) > 1:
                self._test_scores_sum_to_one(mono_result)

            for key in output_keys:
                self.assertIn(key, mono_result)

        # A list of sequences must return one dict per sequence.
        multi_result = nlp(**valid_multi_input)
        self.assertIsInstance(multi_result, list)
        self.assertIsInstance(multi_result[0], dict)
        self.assertEqual(len(multi_result), len(valid_multi_input["sequences"]))

        for result in multi_result:
            for key in output_keys:
                self.assertIn(key, result)

            if len(result["labels"]) > 1:
                self._test_scores_sum_to_one(result)

        for bad_input in invalid_inputs:
            self.assertRaises(Exception, nlp, **bad_input)

    def _test_zero_shot_pipeline_outputs(self, nlp):
        """Compare the outputs of ``nlp`` with reference sequences, labels and scores.

        Non-score fields must match exactly; scores are compared to two decimal
        places.

        Args:
            nlp: pipeline under test, called as ``nlp(**kwargs)``.
        """
        # Shared by the second input and its expected output.
        transformer_abstract = "The dominant sequence transduction models are based on complex recurrent or convolutional neural networks in an encoder-decoder configuration. The best performing models also connect the encoder and decoder through an attention mechanism. We propose a new simple network architecture, the Transformer, based solely on attention mechanisms, dispensing with recurrence and convolutions entirely. Experiments on two machine translation tasks show these models to be superior in quality while being more parallelizable and requiring significantly less time to train. Our model achieves 28.4 BLEU on the WMT 2014 English-to-German translation task, improving over the existing best results, including ensembles by over 2 BLEU. On the WMT 2014 English-to-French translation task, our model establishes a new single-model state-of-the-art BLEU score of 41.8 after training for 3.5 days on eight GPUs, a small fraction of the training costs of the best models from the literature. We show that the Transformer generalizes well to other tasks by applying it successfully to English constituency parsing both with large and limited training data."

        inputs = [
            {
                "sequences": "Who are you voting for in 2020?",
                "candidate_labels": ["politics", "public health", "science"],
            },
            {
                "sequences": transformer_abstract,
                "candidate_labels": ["machine learning", "statistics", "translation", "vision"],
                "multi_class": True,
            },
        ]

        expected_outputs = [
            {
                "sequence": "Who are you voting for in 2020?",
                "labels": ["politics", "public health", "science"],
                "scores": [0.975, 0.015, 0.008],
            },
            {
                "sequence": transformer_abstract,
                "labels": ["translation", "machine learning", "vision", "statistics"],
                "scores": [0.817, 0.712, 0.018, 0.017],
            },
        ]

        for call_kwargs, expected_output in zip(inputs, expected_outputs):
            output = nlp(**call_kwargs)
            for key in output:
                if key == "scores":
                    # zip() stops at the shorter sequence, so compare lengths first;
                    # otherwise a missing or extra score would go unnoticed.
                    self.assertEqual(len(output[key]), len(expected_output[key]))
                    for output_score, expected_score in zip(output[key], expected_output[key]):
                        self.assertAlmostEqual(output_score, expected_score, places=2)
                else:
                    self.assertEqual(output[key], expected_output[key])

    @require_torch
    def test_torch_zero_shot_classification(self):
        # Structural checks for every checkpoint in TEXT_CLASSIF_FINETUNED_MODELS,
        # without forcing a framework.
        for model_name in TEXT_CLASSIF_FINETUNED_MODELS:
            nlp = pipeline(task="zero-shot-classification", model=model_name, tokenizer=model_name)
            self._test_zero_shot_pipeline(nlp)

    @require_tf
    def test_tf_zero_shot_classification(self):
        # Same structural checks with framework="tf".
        for model_name in TEXT_CLASSIF_FINETUNED_MODELS:
            nlp = pipeline(task="zero-shot-classification", model=model_name, tokenizer=model_name, framework="tf")
            self._test_zero_shot_pipeline(nlp)

    @require_torch
    @slow
    def test_torch_zero_shot_outputs(self):
        # Reference-output comparison on roberta-large-mnli.
        nlp = pipeline(task="zero-shot-classification", model="roberta-large-mnli")
        self._test_zero_shot_pipeline_outputs(nlp)

    @require_tf
    @slow
    def test_tf_zero_shot_outputs(self):
        # Reference-output comparison on roberta-large-mnli with framework="tf".
        nlp = pipeline(task="zero-shot-classification", model="roberta-large-mnli", framework="tf")
        self._test_zero_shot_pipeline_outputs(nlp)


664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
class DialoguePipelineTests(unittest.TestCase):
    """Exercises the ``conversational`` pipeline with PyTorch and TensorFlow."""

    def _test_conversation_pipeline(self, nlp):
        """Check return types of ``nlp`` on ``Conversation`` inputs and that bad inputs raise.

        Args:
            nlp: pipeline under test, called with a single ``Conversation`` or a
                list of them.
        """
        valid_inputs = [Conversation("Hi there!"), [Conversation("Hi there!"), Conversation("How are you?")]]
        invalid_inputs = ["Hi there!", Conversation()]
        self.assertIsNotNone(nlp)

        # A single conversation comes back as a Conversation.
        mono_result = nlp(valid_inputs[0])
        self.assertIsInstance(mono_result, Conversation)

        # A list of conversations comes back as a list of Conversation objects.
        multi_result = nlp(valid_inputs[1])
        self.assertIsInstance(multi_result, list)
        self.assertIsInstance(multi_result[0], Conversation)
        # Inactive conversations passed to the pipeline raise a ValueError
        # (the same list was already processed by the call just above).
        self.assertRaises(ValueError, nlp, valid_inputs[1])

        # Each invalid item must raise on its own, and so must the whole list.
        for bad_input in invalid_inputs:
            self.assertRaises(Exception, nlp, bad_input)
        self.assertRaises(Exception, nlp, invalid_inputs)

    @require_torch
    @slow
    def test_torch_conversation(self):
        # Runs the shared checks for every checkpoint in DIALOGUE_FINETUNED_MODELS.
        for model_name in DIALOGUE_FINETUNED_MODELS:
            nlp = pipeline(task="conversational", model=model_name, tokenizer=model_name)
            self._test_conversation_pipeline(nlp)

    @require_tf
    @slow
    def test_tf_conversation(self):
        # Same checks with framework="tf".
        for model_name in DIALOGUE_FINETUNED_MODELS:
            nlp = pipeline(task="conversational", model=model_name, tokenizer=model_name, framework="tf")
            self._test_conversation_pipeline(nlp)


698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
class QAPipelineTests(unittest.TestCase):
    """Exercises the ``question-answering`` pipeline with PyTorch and TensorFlow."""

    def _test_qa_pipeline(self, nlp):
        """Check the output structure of ``nlp`` on valid inputs and that invalid inputs raise.

        Args:
            nlp: pipeline under test, called with one ``{"question", "context"}``
                dict or a list of such dicts.
        """
        output_keys = {"score", "answer", "start", "end"}
        valid_inputs = [
            {"question": "Where was HuggingFace founded ?", "context": "HuggingFace was founded in Paris."},
            {
                "question": "In what field is HuggingFace working ?",
                "context": "HuggingFace is a startup based in New-York founded in Paris which is trying to solve NLP.",
            },
        ]
        invalid_inputs = [
            {"question": "", "context": "This is a test to try empty question edge case"},
            {"question": None, "context": "This is a test to try empty question edge case"},
            {"question": "What is does with empty context ?", "context": ""},
            {"question": "What is does with empty context ?", "context": None},
        ]
        self.assertIsNotNone(nlp)

        # A single question/context pair must produce one dict with every output key.
        mono_result = nlp(valid_inputs[0])
        self.assertIsInstance(mono_result, dict)

        for key in output_keys:
            self.assertIn(key, mono_result)

        # A list of pairs must produce a list of such dicts.
        multi_result = nlp(valid_inputs)
        self.assertIsInstance(multi_result, list)
        self.assertIsInstance(multi_result[0], dict)

        for result in multi_result:
            for key in output_keys:
                self.assertIn(key, result)

        # Each invalid item must raise on its own, and so must the whole list.
        for bad_input in invalid_inputs:
            self.assertRaises(Exception, nlp, bad_input)
        self.assertRaises(Exception, nlp, invalid_inputs)

    @require_torch
    def test_torch_question_answering(self):
        # Runs the shared checks for every checkpoint in QA_FINETUNED_MODELS.
        for model_name in QA_FINETUNED_MODELS:
            nlp = pipeline(task="question-answering", model=model_name, tokenizer=model_name)
            self._test_qa_pipeline(nlp)

    @require_tf
    def test_tf_question_answering(self):
        # Same checks with framework="tf".
        for model_name in QA_FINETUNED_MODELS:
            nlp = pipeline(task="question-answering", model=model_name, tokenizer=model_name, framework="tf")
            self._test_qa_pipeline(nlp)
Lysandre Debut's avatar
Lysandre Debut committed
744
745


746
747
class NerPipelineTests(unittest.TestCase):
    """Exercises the ``ner`` pipeline, with and without entity grouping."""

    def _test_ner_pipeline(
        self,
        nlp: Pipeline,
        output_keys: Iterable[str],
    ):
        """Check the output structure of ``nlp`` and the result of ``nlp.group_entities``.

        Args:
            nlp: NER pipeline under test.
            output_keys: keys that every returned entity dict must contain.
        """
        # Hand-written token-level entities fed directly to ``group_entities``.
        ungrouped_ner_inputs = [
            [
                {"entity": "B-PER", "index": 1, "score": 0.9994944930076599, "word": "Cons"},
                {"entity": "B-PER", "index": 2, "score": 0.8025449514389038, "word": "##uelo"},
                {"entity": "I-PER", "index": 3, "score": 0.9993102550506592, "word": "Ara"},
                {"entity": "I-PER", "index": 4, "score": 0.9993743896484375, "word": "##új"},
                {"entity": "I-PER", "index": 5, "score": 0.9992871880531311, "word": "##o"},
                {"entity": "I-PER", "index": 6, "score": 0.9993029236793518, "word": "No"},
                {"entity": "I-PER", "index": 7, "score": 0.9981776475906372, "word": "##guera"},
                {"entity": "B-PER", "index": 15, "score": 0.9998136162757874, "word": "Andrés"},
                {"entity": "I-PER", "index": 16, "score": 0.999740719795227, "word": "Pas"},
                {"entity": "I-PER", "index": 17, "score": 0.9997414350509644, "word": "##tran"},
                {"entity": "I-PER", "index": 18, "score": 0.9996136426925659, "word": "##a"},
                {"entity": "B-ORG", "index": 28, "score": 0.9989739060401917, "word": "Far"},
                {"entity": "I-ORG", "index": 29, "score": 0.7188422083854675, "word": "##c"},
            ],
            [
                {"entity": "I-PER", "index": 1, "score": 0.9968166351318359, "word": "En"},
                {"entity": "I-PER", "index": 2, "score": 0.9957635998725891, "word": "##zo"},
                {"entity": "I-ORG", "index": 7, "score": 0.9986497163772583, "word": "UN"},
            ],
        ]
        expected_grouped_ner_results = [
            [
                {"entity_group": "B-PER", "score": 0.9710702640669686, "word": "Consuelo Araújo Noguera"},
                {"entity_group": "B-PER", "score": 0.9997273534536362, "word": "Andrés Pastrana"},
                {"entity_group": "B-ORG", "score": 0.8589080572128296, "word": "Farc"},
            ],
            [
                {"entity_group": "I-PER", "score": 0.9962901175022125, "word": "Enzo"},
                {"entity_group": "I-ORG", "score": 0.9986497163772583, "word": "UN"},
            ],
        ]

        self.assertIsNotNone(nlp)

        mono_result = nlp(VALID_INPUTS[0])
        self.assertIsInstance(mono_result, list)
        self.assertIsInstance(mono_result[0], (dict, list))

        # Unwrap one nesting level when the pipeline returns a list of lists.
        if isinstance(mono_result[0], list):
            mono_result = mono_result[0]

        for key in output_keys:
            self.assertIn(key, mono_result[0])

        # One pipeline call per entry of VALID_INPUTS.
        multi_result = [nlp(valid_input) for valid_input in VALID_INPUTS]
        self.assertIsInstance(multi_result, list)
        self.assertIsInstance(multi_result[0], (dict, list))

        if isinstance(multi_result[0], list):
            multi_result = multi_result[0]

        for result in multi_result:
            for key in output_keys:
                self.assertIn(key, result)

        for ungrouped_input, grouped_result in zip(ungrouped_ner_inputs, expected_grouped_ner_results):
            self.assertEqual(nlp.group_entities(ungrouped_input), grouped_result)

    @require_torch
    def test_torch_ner(self):
        # Ungrouped output: every entity carries "entity", "word" and "score".
        mandatory_keys = {"entity", "word", "score"}
        for model_name in NER_FINETUNED_MODELS:
            nlp = pipeline(task="ner", model=model_name, tokenizer=model_name)
            self._test_ner_pipeline(nlp, mandatory_keys)

    @require_torch
    def test_ner_grouped(self):
        # grouped_entities=True: entities carry "entity_group" instead of "entity".
        mandatory_keys = {"entity_group", "word", "score"}
        for model_name in NER_FINETUNED_MODELS:
            nlp = pipeline(task="ner", model=model_name, tokenizer=model_name, grouped_entities=True)
            self._test_ner_pipeline(nlp, mandatory_keys)

    @require_tf
    def test_tf_ner(self):
        # Ungrouped output with framework="tf".
        mandatory_keys = {"entity", "word", "score"}
        for model_name in NER_FINETUNED_MODELS:
            nlp = pipeline(task="ner", model=model_name, tokenizer=model_name, framework="tf")
            self._test_ner_pipeline(nlp, mandatory_keys)

    @require_tf
    def test_tf_ner_grouped(self):
        # Grouped output with framework="tf".
        mandatory_keys = {"entity_group", "word", "score"}
        for model_name in NER_FINETUNED_MODELS:
            nlp = pipeline(task="ner", model=model_name, tokenizer=model_name, framework="tf", grouped_entities=True)
            self._test_ner_pipeline(nlp, mandatory_keys)

    @require_tf
    def test_tf_only_ner(self):
        mandatory_keys = {"entity", "word", "score"}
        for model_name in TF_NER_FINETUNED_MODELS:
            # We don't specify framework='tf' but it gets detected automatically
            nlp = pipeline(task="ner", model=model_name, tokenizer=model_name)
            self._test_ner_pipeline(nlp, mandatory_keys)

849

Lysandre Debut's avatar
Lysandre Debut committed
850
class PipelineCommonTests(unittest.TestCase):
    """Checks that every task in ``SUPPORTED_TASKS`` can be instantiated with defaults."""

    # Task names iterated by the default-loading tests below.
    pipelines = SUPPORTED_TASKS.keys()

    @require_tf
    @slow
    def test_tf_defaults(self):
        # Test that pipelines can be correctly loaded without any argument
        for task in self.pipelines:
            # One sub-test per task so a failure on one task does not hide the others.
            with self.subTest(msg="Testing TF defaults with TF and {}".format(task)):
                pipeline(task, framework="tf")
                pipeline(task)

    @require_torch
    @slow
    def test_pt_defaults(self):
        # Test that pipelines can be correctly loaded without any argument
        for task in self.pipelines:
            # One sub-test per task so a failure on one task does not hide the others.
            with self.subTest(msg="Testing Torch defaults with PyTorch and {}".format(task)):
                pipeline(task, framework="pt")
                pipeline(task)