# coding=utf-8
# Copyright 2018 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import csv
import json
import logging
import os
import pickle
import sys
from abc import ABC, abstractmethod
from contextlib import contextmanager
from itertools import chain
from os.path import abspath, exists
from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Sequence, Tuple, Union

import numpy as np

from .configuration_auto import AutoConfig
from .configuration_utils import PretrainedConfig
from .data import SquadExample, squad_convert_examples_to_features
from .file_utils import is_tf_available, is_torch_available
from .modelcard import ModelCard
from .tokenization_auto import AutoTokenizer
from .tokenization_bert import BasicTokenizer
from .tokenization_utils import PreTrainedTokenizer


if is_tf_available():
    import tensorflow as tf
    from .modeling_tf_auto import (
        TFAutoModel,
        TFAutoModelForSequenceClassification,
        TFAutoModelForQuestionAnswering,
        TFAutoModelForTokenClassification,
        TFAutoModelWithLMHead,
    )

if is_torch_available():
    import torch
    from .modeling_auto import (
        AutoModel,
        AutoModelForSequenceClassification,
        AutoModelForQuestionAnswering,
        AutoModelForTokenClassification,
        AutoModelWithLMHead,
        AutoModelForSeq2SeqLM,
    )

if TYPE_CHECKING:
    from .modeling_utils import PreTrainedModel
    from .modeling_tf_utils import TFPreTrainedModel


logger = logging.getLogger(__name__)


def get_framework(model=None):
    """ Select framework (TensorFlow/PyTorch) to use.
        If both frameworks are installed and no specific model is provided, defaults to using PyTorch.
    """
    if is_tf_available() and is_torch_available() and model is not None and not isinstance(model, str):
        # Both frameworks are available, but the user supplied a model class instance.
        # Try to guess which framework to use from the model's class name.
        framework = "tf" if model.__class__.__name__.startswith("TF") else "pt"
    elif not is_tf_available() and not is_torch_available():
        raise RuntimeError(
            "At least one of TensorFlow 2.0 or PyTorch should be installed. "
            "To install TensorFlow 2.0, read the instructions at https://www.tensorflow.org/install/ "
            "To install PyTorch, read the instructions at https://pytorch.org/."
        )
    else:
        # Default to PyTorch when it is available, otherwise fall back to TensorFlow.
        framework = "pt" if is_torch_available() else "tf"
    return framework
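
# Example of the selection logic above (a minimal sketch; the model names are
# illustrative assumptions, not recommendations):
#
#     get_framework()                     # both installed, no model -> "pt"
#     get_framework("bert-base-uncased")  # a string identifier falls through to the default
#     get_framework(some_tf_model)        # class name starting with "TF" -> "tf"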


class PipelineException(Exception):
    """
    Raised by pipelines when handling __call__
    """

    def __init__(self, task: str, model: str, reason: str):
        super().__init__(reason)

        self.task = task
        self.model = model


class ArgumentHandler(ABC):
    """
    Base interface for handling varargs for each Pipeline
    """

    @abstractmethod
    def __call__(self, *args, **kwargs):
        raise NotImplementedError()


class DefaultArgumentHandler(ArgumentHandler):
    """
    Default varargs argument parser handling parameters for each Pipeline
    """

    @staticmethod
    def handle_kwargs(kwargs: Dict) -> List:
        # Forward the keyword argument values positionally (one or many);
        # list(chain(values)) over a single iterable is simply list(values).
        output = list(kwargs.values())

        return DefaultArgumentHandler.handle_args(output)

    @staticmethod
    def handle_args(args: Sequence[Any]) -> List[str]:

        # Only one argument, let's do case by case
        if len(args) == 1:
            if isinstance(args[0], str):
                return [args[0]]
            elif not isinstance(args[0], list):
                return list(args)
            else:
                return args[0]

        # Multiple arguments (x1, x2, ...)
        elif len(args) > 1:
            if all(isinstance(arg, str) for arg in args):
                return list(args)

            # If not all strings, args should be an iterable of iterables to flatten
            elif isinstance(args, Iterable):
                return list(chain.from_iterable(args))
            else:
                raise ValueError(
                    "Invalid input type {}. Pipeline supports Union[str, Iterable[str]]".format(type(args))
                )
        else:
            return []

    def __call__(self, *args, **kwargs):
        if len(kwargs) > 0 and len(args) > 0:
            raise ValueError("Pipeline cannot handle mixed args and kwargs")

        if len(kwargs) > 0:
            return DefaultArgumentHandler.handle_kwargs(kwargs)
        else:
            return DefaultArgumentHandler.handle_args(args)
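
# Illustrative behaviour of the default handler (a sketch, not a test suite):
#
#     handler = DefaultArgumentHandler()
#     handler("one text")        # -> ["one text"]
#     handler("a", "b")          # -> ["a", "b"]
#     handler(["a", "b"])        # -> ["a", "b"]
#     handler(data=["a", "b"])   # -> ["a", "b"]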


class PipelineDataFormat:
    """
    Base class for all the pipeline supported data format both for reading and writing.
    Supported data formats currently includes:
     - JSON
     - CSV
thomwolf's avatar
thomwolf committed
169
     - stdin/stdout (pipe)

    PipelineDataFormat also includes some utilities for working with multiple columns, like mapping from dataset
    columns to pipeline keyword arguments through the `dataset_kwarg_1=dataset_column_1` format.
    """

    SUPPORTED_FORMATS = ["json", "csv", "pipe"]

    def __init__(
        self, output_path: Optional[str], input_path: Optional[str], column: Optional[str], overwrite=False,
    ):
        self.output_path = output_path
        self.input_path = input_path
        self.column = column.split(",") if column is not None else [""]
        self.is_multi_columns = len(self.column) > 1

        if self.is_multi_columns:
            self.column = [tuple(c.split("=")) if "=" in c else (c, c) for c in self.column]

        if output_path is not None and not overwrite:
            if exists(abspath(self.output_path)):
                raise OSError("{} already exists on disk".format(self.output_path))

        if input_path is not None:
            if not exists(abspath(self.input_path)):
                raise OSError("{} doesnt exist on disk".format(self.input_path))

    @abstractmethod
    def __iter__(self):
        raise NotImplementedError()

    @abstractmethod
    def save(self, data: dict):
        """
        Save the provided data object with the representation for the current `DataFormat`.
        :param data: data to store
        :return:
        """
        raise NotImplementedError()

    def save_binary(self, data: Union[dict, List[dict]]) -> str:
        """
        Save the provided data object as a pickle-formatted binary data on the disk.
        :param data: data to store
        :return: (str) Path where the data has been saved
        """
        path, _ = os.path.splitext(self.output_path)
        binary_path = os.path.extsep.join((path, "pickle"))

        with open(binary_path, "wb+") as f_output:
            pickle.dump(data, f_output)

        return binary_path

    @staticmethod
    def from_str(
        format: str, output_path: Optional[str], input_path: Optional[str], column: Optional[str], overwrite=False,
    ):
        if format == "json":
            return JsonPipelineDataFormat(output_path, input_path, column, overwrite=overwrite)
        elif format == "csv":
            return CsvPipelineDataFormat(output_path, input_path, column, overwrite=overwrite)
        elif format == "pipe":
            return PipedPipelineDataFormat(output_path, input_path, column, overwrite=overwrite)
        else:
            raise KeyError("Unknown reader {} (Available readers are json/csv/pipe)".format(format))


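# Usage sketch for PipelineDataFormat.from_str (the paths and column name here
# are hypothetical placeholders):
#
#     reader = PipelineDataFormat.from_str(
#         "csv", output_path="out.csv", input_path="in.csv", column="text",
#     )
#     for item in reader:
#         ...  # each `item` is the value of the "text" column
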
class CsvPipelineDataFormat(PipelineDataFormat):
    def __init__(
        self, output_path: Optional[str], input_path: Optional[str], column: Optional[str], overwrite=False,
    ):
        super().__init__(output_path, input_path, column, overwrite=overwrite)

    def __iter__(self):
        with open(self.input_path, "r") as f:
            reader = csv.DictReader(f)
            for row in reader:
                if self.is_multi_columns:
                    yield {k: row[c] for k, c in self.column}
                else:
                    yield row[self.column[0]]

    def save(self, data: List[dict]):
        with open(self.output_path, "w") as f:
            if len(data) > 0:
                writer = csv.DictWriter(f, list(data[0].keys()))
                writer.writeheader()
                writer.writerows(data)


class JsonPipelineDataFormat(PipelineDataFormat):
    def __init__(
        self, output_path: Optional[str], input_path: Optional[str], column: Optional[str], overwrite=False,
    ):
        super().__init__(output_path, input_path, column, overwrite=overwrite)

        with open(input_path, "r") as f:
            self._entries = json.load(f)

    def __iter__(self):
        for entry in self._entries:
            if self.is_multi_columns:
                yield {k: entry[c] for k, c in self.column}
            else:
                yield entry[self.column[0]]

    def save(self, data: dict):
        with open(self.output_path, "w") as f:
            json.dump(data, f)


class PipedPipelineDataFormat(PipelineDataFormat):
    """
    Read data from piped input to the python process.
    For multi-column data, columns should be separated by \t

    If columns are provided, then the output will be a dictionary with {column_x: value_x}
    """

    def __iter__(self):
        for line in sys.stdin:
            # Split for multi-columns
            if "\t" in line:
                line = line.split("\t")
                if self.column:
                    # Dictionary to map arguments
                    yield {kwargs: l for (kwargs, _), l in zip(self.column, line)}
                else:
                    yield tuple(line)

            # No dictionary to map arguments
            else:
                yield line

    def save(self, data: dict):
        print(data)

    def save_binary(self, data: Union[dict, List[dict]]) -> str:
        if self.output_path is None:
            raise KeyError(
                "When using piped input on pipeline outputting large object requires an output file path. "
                "Please provide such output path through --output argument."
            )

        return super().save_binary(data)


class _ScikitCompat(ABC):
    """
    Interface layer for the Scikit and Keras compatibility.
    """

    @abstractmethod
    def transform(self, X):
        raise NotImplementedError()

    @abstractmethod
    def predict(self, X):
        raise NotImplementedError()


class Pipeline(_ScikitCompat):
    """
    The Pipeline class is the class from which all pipelines inherit. Refer to this class for methods shared across
    different pipelines.

    Base class implementing pipelined operations.
    Pipeline workflow is defined as a sequence of the following operations:

        Input -> Tokenization -> Model Inference -> Post-Processing (Task dependent) -> Output

    Pipeline supports running on CPU or GPU through the device argument. Users can specify the
    device argument as an integer: -1 means "CPU", >= 0 refers to the CUDA device ordinal.

    Some pipelines, like the FeatureExtractionPipeline ('feature-extraction'), output large
    tensor objects as nested lists. To avoid dumping such large structures as textual data, we
    provide the binary_output constructor argument. If set to True, the output will be stored in
    pickle format.

    Arguments:
        model (:obj:`~transformers.PreTrainedModel` or :obj:`~transformers.TFPreTrainedModel`):
            The model that will be used by the pipeline to make predictions. This needs to be a model inheriting from
            :class:`~transformers.PreTrainedModel` for PyTorch and :class:`~transformers.TFPreTrainedModel` for
            TensorFlow.
        tokenizer (:obj:`~transformers.PreTrainedTokenizer`):
            The tokenizer that will be used by the pipeline to encode data for the model. This object inherits from
            :class:`~transformers.PreTrainedTokenizer`.
        modelcard (:obj:`str` or :class:`~transformers.ModelCard`, `optional`, defaults to :obj:`None`):
            Model card attributed to the model for this pipeline.
        framework (:obj:`str`, `optional`, defaults to :obj:`None`):
            The framework to use, either "pt" for PyTorch or "tf" for TensorFlow. The specified framework must be
            installed.

            If no framework is specified, will default to the one currently installed. If no framework is specified
            and both frameworks are installed, will default to PyTorch.
        args_parser (:class:`~transformers.pipelines.ArgumentHandler`, `optional`, defaults to :obj:`None`):
            Reference to the object in charge of parsing supplied pipeline parameters.
        device (:obj:`int`, `optional`, defaults to :obj:`-1`):
            Device ordinal for CPU/GPU support. Setting this to -1 will leverage CPU, >=0 will run the model
            on the associated CUDA device id.
        binary_output (:obj:`bool`, `optional`, defaults to :obj:`False`):
            Flag indicating if the output of the pipeline should be in a binary format (i.e. pickle) or as raw text.

    Return:
        :obj:`List` or :obj:`Dict`:
        Pipeline returns a list or a dictionary depending on:

         - Whether the user supplied multiple samples
         - Whether the pipeline exposes multiple fields in the output object
    """

    default_input_names = None

    def __init__(
        self,
        model: Union["PreTrainedModel", "TFPreTrainedModel"],
        tokenizer: PreTrainedTokenizer,
        modelcard: Optional[ModelCard] = None,
        framework: Optional[str] = None,
        task: str = "",
        args_parser: ArgumentHandler = None,
        device: int = -1,
        binary_output: bool = False,
    ):

        if framework is None:
            framework = get_framework()

        self.model = model
        self.tokenizer = tokenizer
        self.modelcard = modelcard
        self.framework = framework
        self.device = device if framework == "tf" else torch.device("cpu" if device < 0 else "cuda:{}".format(device))
        self.binary_output = binary_output
        self._args_parser = args_parser or DefaultArgumentHandler()

        # Special handling
        if self.framework == "pt" and self.device.type == "cuda":
            self.model = self.model.to(self.device)

        # Update config with task specific parameters
        task_specific_params = self.model.config.task_specific_params
        if task_specific_params is not None and task in task_specific_params:
            self.model.config.update(task_specific_params.get(task))

    def save_pretrained(self, save_directory):
        """
        Save the pipeline's model and tokenizer to the specified save_directory
        """
        if os.path.isfile(save_directory):
            logger.error("Provided path ({}) should be a directory, not a file".format(save_directory))
            return
        os.makedirs(save_directory, exist_ok=True)

        self.model.save_pretrained(save_directory)
        self.tokenizer.save_pretrained(save_directory)
        if self.modelcard is not None:
            self.modelcard.save_pretrained(save_directory)

    def transform(self, X):
        """
        Scikit / Keras interface to transformers' pipelines. This method will forward to __call__().
        """
        return self(X=X)

    def predict(self, X):
        """
        Scikit / Keras interface to transformers' pipelines. This method will forward to __call__().
        """
        return self(X=X)
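
    # Scikit-learn-style usage sketch (the pipeline object itself acts as the
    # estimator; the task name is illustrative):
    #
    #     clf = pipeline("sentiment-analysis")
    #     clf.predict(["great movie", "terrible plot"])  # same as clf([...])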

    @contextmanager
    def device_placement(self):
        """
        Context Manager allowing tensor allocation on the user-specified device in framework agnostic way.
        example:
            # Explicitly ask for tensor allocation on CUDA device :0
            nlp = pipeline(..., device=0)
            with nlp.device_placement():
                # Every framework specific tensor allocation will be done on the request device
                output = nlp(...)
        Returns:
            Context manager
        """
        if self.framework == "tf":
            with tf.device("/CPU:0" if self.device == -1 else "/device:GPU:{}".format(self.device)):
                yield
        else:
            if self.device.type == "cuda":
                torch.cuda.set_device(self.device)

            yield

    def ensure_tensor_on_device(self, **inputs):
        """
        Ensure PyTorch tensors are on the specified device.
        :param inputs:
        :return:
        """
        return {name: tensor.to(self.device) for name, tensor in inputs.items()}

    def _parse_and_tokenize(self, *args, padding=True, add_special_tokens=True, **kwargs):
        """
        Parse arguments and tokenize
        """
        # Parse arguments
        inputs = self._args_parser(*args, **kwargs)
        inputs = self.tokenizer(
            inputs, add_special_tokens=add_special_tokens, return_tensors=self.framework, padding=padding,
        )

        return inputs

    def __call__(self, *args, **kwargs):
        inputs = self._parse_and_tokenize(*args, **kwargs)
        return self._forward(inputs)

    def _forward(self, inputs, return_tensors=False):
        """
        Internal framework specific forward dispatching.
        Args:
            inputs: dict holding all the keyword arguments required by the model forward method.
            return_tensors: Whether to return native framework (pt/tf) tensors rather than a numpy array.
        Returns:
            Numpy array
        """
        # Encode for forward
        with self.device_placement():
            if self.framework == "tf":
                # TODO trace model
                predictions = self.model(inputs.data, training=False)[0]
            else:
                with torch.no_grad():
                    inputs = self.ensure_tensor_on_device(**inputs)
                    predictions = self.model(**inputs)[0].cpu()

        if return_tensors:
            return predictions
        else:
            return predictions.numpy()


class FeatureExtractionPipeline(Pipeline):
    """
    Feature extraction pipeline using no model head. This pipeline extracts the hidden states from the base transformer,
    which can be used as features in downstream tasks.

    This feature extraction pipeline can currently be loaded from the :func:`~transformers.pipeline` method using
    the following task identifier(s):

    - "feature-extraction", for extracting features of a sequence.

    All models may be used for this pipeline. See a list of all models, including community-contributed models on
    `huggingface.co/models <https://huggingface.co/models>`__.

    Arguments:
        model (:obj:`~transformers.PreTrainedModel` or :obj:`~transformers.TFPreTrainedModel`):
            The model that will be used by the pipeline to make predictions. This needs to be a model inheriting from
            :class:`~transformers.PreTrainedModel` for PyTorch and :class:`~transformers.TFPreTrainedModel` for
            TensorFlow.
        tokenizer (:obj:`~transformers.PreTrainedTokenizer`):
            The tokenizer that will be used by the pipeline to encode data for the model. This object inherits from
            :class:`~transformers.PreTrainedTokenizer`.
        modelcard (:obj:`str` or :class:`~transformers.ModelCard`, `optional`, defaults to :obj:`None`):
            Model card attributed to the model for this pipeline.
        framework (:obj:`str`, `optional`, defaults to :obj:`None`):
            The framework to use, either "pt" for PyTorch or "tf" for TensorFlow. The specified framework must be
            installed.

            If no framework is specified, will default to the one currently installed. If no framework is specified
            and both frameworks are installed, will default to PyTorch.
        args_parser (:class:`~transformers.pipelines.ArgumentHandler`, `optional`, defaults to :obj:`None`):
            Reference to the object in charge of parsing supplied pipeline parameters.
        device (:obj:`int`, `optional`, defaults to :obj:`-1`):
            Device ordinal for CPU/GPU support. Setting this to -1 will leverage CPU, >=0 will run the model
            on the associated CUDA device id.
    """

    def __init__(
        self,
        model: Union["PreTrainedModel", "TFPreTrainedModel"],
        tokenizer: PreTrainedTokenizer,
        modelcard: Optional[ModelCard] = None,
        framework: Optional[str] = None,
        args_parser: ArgumentHandler = None,
        device: int = -1,
        task: str = "",
    ):
        super().__init__(
            model=model,
            tokenizer=tokenizer,
            modelcard=modelcard,
            framework=framework,
            args_parser=args_parser,
            device=device,
            binary_output=True,
            task=task,
        )

    def __call__(self, *args, **kwargs):
        return super().__call__(*args, **kwargs).tolist()
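
# Usage sketch for FeatureExtractionPipeline (a minimal example; the model
# resolved by `pipeline` and the hidden size are assumptions):
#
#     nlp = pipeline("feature-extraction")
#     features = nlp("Transformers are great")
#     # -> nested list with shape [batch, sequence_length, hidden_size]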


class TextGenerationPipeline(Pipeline):
    """
    Language generation pipeline using any ModelWithLMHead head. This pipeline predicts the words that will follow a specified text prompt.

    This language generation pipeline can currently be loaded from the :func:`~transformers.pipeline` method using
    the following task identifier(s):

    - "text-generation", for generating text from a specified prompt.

    The models that this pipeline can use are models that have been trained with an autoregressive language modeling objective,
    which includes the uni-directional models in the library (e.g. gpt2).
    See the list of available community models on
    `huggingface.co/models <https://huggingface.co/models?search=&filter=lm-head>`__.
    """

    # Padding text to help Transformer-XL and XLNet with short prompts as proposed by Aman Rusia
    # in https://github.com/rusiaaman/XLNet-gen#methodology
    # and https://medium.com/@amanrusia/xlnet-speaks-comparison-to-gpt-2-ea1a4e9ba39e
    PADDING_TEXT = """In 1991, the remains of Russian Tsar Nicholas II and his family
    (except for Alexei and Maria) are discovered.
    The voice of Nicholas's young son, Tsarevich Alexei Nikolaevich, narrates the
    remainder of the story. 1883 Western Siberia,
    a young Grigori Rasputin is asked by his father and a group of men to perform magic.
    Rasputin has a vision and denounces one of the men as a horse thief. Although his
    father initially slaps him for making such an accusation, Rasputin watches as the
    man is chased outside and beaten. Twenty years later, Rasputin sees a vision of
    the Virgin Mary, prompting him to become a priest. Rasputin quickly becomes famous,
    with people, even a bishop, begging for his blessing. """

    ALLOWED_MODELS = [
        "XLNetLMHeadModel",
        "TransfoXLLMHeadModel",
        "ReformerModelWithLMHead",
        "GPT2LMHeadModel",
        "OpenAIGPTLMHeadModel",
        "CTRLLMHeadModel",
        "TFXLNetLMHeadModel",
        "TFTransfoXLLMHeadModel",
        "TFGPT2LMHeadModel",
        "TFOpenAIGPTLMHeadModel",
        "TFCTRLLMHeadModel",
    ]

    def __call__(
        self, *args, return_tensors=False, return_text=True, clean_up_tokenization_spaces=False, **generate_kwargs
    ):
        if self.model.__class__.__name__ not in self.ALLOWED_MODELS:
            raise NotImplementedError(
                "Generation is currently not supported for {}. Please select a model from {} for generation.".format(
                    self.model.__class__.__name__, self.ALLOWED_MODELS
                )
            )

        text_inputs = self._args_parser(*args)

        results = []
        for prompt_text in text_inputs:
            # Manage correct placement of the tensors
            with self.device_placement():
                if self.model.__class__.__name__ in ["XLNetLMHeadModel", "TransfoXLLMHeadModel"]:
                    # For XLNet and TransformerXL we add an article to the prompt to give more state to the model.
                    padding_text = self.PADDING_TEXT + self.tokenizer.eos_token
                    padding = self._parse_and_tokenize(padding_text, padding=False, add_special_tokens=False)
                    # This impacts max_length and min_length argument that need adjusting.
                    padding_length = padding["input_ids"].shape[-1]
                    if "max_length" in generate_kwargs and generate_kwargs["max_length"] is not None:
                        generate_kwargs["max_length"] += padding_length
                    if "min_length" in generate_kwargs and generate_kwargs["min_length"] is not None:
                        generate_kwargs["min_length"] += padding_length

                    inputs = self._parse_and_tokenize(
                        padding_text + prompt_text, padding=False, add_special_tokens=False
                    )
                else:
                    inputs = self._parse_and_tokenize(prompt_text, padding=False, add_special_tokens=False)

                # set input_ids to None to allow empty prompt
                if inputs["input_ids"].shape[-1] == 0:
                    inputs["input_ids"] = None
                    inputs["attention_mask"] = None

                if self.framework == "pt" and inputs["input_ids"] is not None:
                    inputs = self.ensure_tensor_on_device(**inputs)

                input_ids = inputs["input_ids"]

                # Ensure that batch size = 1 (batch generation not allowed for now)
                assert (
                    input_ids is None or input_ids.shape[0] == 1
                ), "Batch generation is currently not supported. See https://github.com/huggingface/transformers/issues/3021 for more information."

                output_sequences = self.model.generate(input_ids=input_ids, **generate_kwargs)  # BS x SL

            result = []
            for generated_sequence in output_sequences:
                generated_sequence = generated_sequence.numpy().tolist()
                record = {}
                if return_tensors:
                    record["generated_token_ids"] = generated_sequence
                if return_text:
                    # Decode text
                    text = self.tokenizer.decode(
                        generated_sequence,
                        skip_special_tokens=True,
                        clean_up_tokenization_spaces=clean_up_tokenization_spaces,
                    )

                    # Remove the PADDING prompt from the sequence if an XLNet or Transfo-XL model is used
                    if input_ids is None:
                        prompt_length = 0
                    else:
                        prompt_length = len(
                            self.tokenizer.decode(
                                input_ids[0],
                                skip_special_tokens=True,
                                clean_up_tokenization_spaces=clean_up_tokenization_spaces,
                            )
                        )

                    record["generated_text"] = prompt_text + text[prompt_length:]

                result.append(record)
            results += [result]

        if len(results) == 1:
            return results[0]

        return results
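
# Usage sketch for TextGenerationPipeline (generate_kwargs such as max_length
# are forwarded to model.generate; the prompt and output are illustrative):
#
#     nlp = pipeline("text-generation")
#     nlp("Once upon a time,", max_length=30)
#     # -> [{"generated_text": "Once upon a time, ..."}]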


class TextClassificationPipeline(Pipeline):
    """
    Text classification pipeline using ModelForSequenceClassification head. See the
    `sequence classification usage <../usage.html#sequence-classification>`__ examples for more information.

    This text classification pipeline can currently be loaded from the :func:`~transformers.pipeline` method using
    the following task identifier(s):

    - "sentiment-analysis", for classifying sequences according to positive or negative sentiments.

    The models that this pipeline can use are models that have been fine-tuned on a sequence classification task.
    See the up-to-date list of available models on
    `huggingface.co/models <https://huggingface.co/models?filter=text-classification>`__.

    Arguments:
        model (:obj:`~transformers.PreTrainedModel` or :obj:`~transformers.TFPreTrainedModel`):
            The model that will be used by the pipeline to make predictions. This needs to be a model inheriting from
            :class:`~transformers.PreTrainedModel` for PyTorch and :class:`~transformers.TFPreTrainedModel` for
            TensorFlow.
        tokenizer (:obj:`~transformers.PreTrainedTokenizer`):
            The tokenizer that will be used by the pipeline to encode data for the model. This object inherits from
            :class:`~transformers.PreTrainedTokenizer`.
        modelcard (:obj:`str` or :class:`~transformers.ModelCard`, `optional`, defaults to :obj:`None`):
            Model card attributed to the model for this pipeline.
        framework (:obj:`str`, `optional`, defaults to :obj:`None`):
            The framework to use, either "pt" for PyTorch or "tf" for TensorFlow. The specified framework must be
            installed.

            If no framework is specified, will default to the one currently installed. If no framework is specified
            and both frameworks are installed, will default to PyTorch.
        args_parser (:class:`~transformers.pipelines.ArgumentHandler`, `optional`, defaults to :obj:`None`):
            Reference to the object in charge of parsing supplied pipeline parameters.
        device (:obj:`int`, `optional`, defaults to :obj:`-1`):
            Device ordinal for CPU/GPU support. Setting this to -1 will leverage CPU, >=0 will run the model
            on the associated CUDA device id.
    """

    def __init__(self, return_all_scores: bool = False, **kwargs):
        super().__init__(**kwargs)

        self.return_all_scores = return_all_scores

    def __call__(self, *args, **kwargs):
        outputs = super().__call__(*args, **kwargs)
        scores = np.exp(outputs) / np.exp(outputs).sum(-1, keepdims=True)
        if self.return_all_scores:
            return [
                [{"label": self.model.config.id2label[i], "score": score.item()} for i, score in enumerate(item)]
                for item in scores
            ]
        else:
            return [
                {"label": self.model.config.id2label[item.argmax()], "score": item.max().item()} for item in scores
            ]
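
# Usage sketch for TextClassificationPipeline (label names and scores depend
# on the underlying model and are shown here as assumptions):
#
#     nlp = pipeline("sentiment-analysis")
#     nlp("I love this movie!")
#     # -> [{"label": "POSITIVE", "score": 0.99...}]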


class FillMaskPipeline(Pipeline):
    """
    Masked language modeling prediction pipeline using ModelWithLMHead head. See the
    `masked language modeling usage <../usage.html#masked-language-modeling>`__ examples for more information.

    This mask filling pipeline can currently be loaded from the :func:`~transformers.pipeline` method using
    the following task identifier(s):

    - "fill-mask", for predicting masked tokens in a sequence.

    The models that this pipeline can use are models that have been trained with a masked language modeling objective,
    which includes the bi-directional models in the library.
    See the up-to-date list of available models on
    `huggingface.co/models <https://huggingface.co/models?filter=lm-head>`__.

    Arguments:
        model (:obj:`~transformers.PreTrainedModel` or :obj:`~transformers.TFPreTrainedModel`):
            The model that will be used by the pipeline to make predictions. This needs to be a model inheriting from
            :class:`~transformers.PreTrainedModel` for PyTorch and :class:`~transformers.TFPreTrainedModel` for
            TensorFlow.
        tokenizer (:obj:`~transformers.PreTrainedTokenizer`):
            The tokenizer that will be used by the pipeline to encode data for the model. This object inherits from
            :class:`~transformers.PreTrainedTokenizer`.
        modelcard (:obj:`str` or :class:`~transformers.ModelCard`, `optional`, defaults to :obj:`None`):
            Model card attributed to the model for this pipeline.
        framework (:obj:`str`, `optional`, defaults to :obj:`None`):
            The framework to use, either "pt" for PyTorch or "tf" for TensorFlow. The specified framework must be
            installed.

            If no framework is specified, will default to the one currently installed. If no framework is specified
            and both frameworks are installed, will default to PyTorch.
        args_parser (:class:`~transformers.pipelines.ArgumentHandler`, `optional`, defaults to :obj:`None`):
            Reference to the object in charge of parsing supplied pipeline parameters.
        device (:obj:`int`, `optional`, defaults to :obj:`-1`):
            Device ordinal for CPU/GPU support. Setting this to -1 will leverage CPU, >=0 will run the model
            on the associated CUDA device id.
    """

    def __init__(
        self,
        model: Union["PreTrainedModel", "TFPreTrainedModel"],
        tokenizer: PreTrainedTokenizer,
        modelcard: Optional[ModelCard] = None,
        framework: Optional[str] = None,
        args_parser: ArgumentHandler = None,
        device: int = -1,
        topk=5,
        task: str = "",
    ):
        super().__init__(
            model=model,
            tokenizer=tokenizer,
            modelcard=modelcard,
            framework=framework,
            args_parser=args_parser,
            device=device,
            binary_output=True,
            task=task,
        )

        self.topk = topk

    def ensure_exactly_one_mask_token(self, masked_index: np.ndarray):
        numel = np.prod(masked_index.shape)
        if numel > 1:
            raise PipelineException(
                "fill-mask",
                self.model.base_model_prefix,
                f"More than one mask_token ({self.tokenizer.mask_token}) is not supported",
            )
        elif numel < 1:
            raise PipelineException(
                "fill-mask",
                self.model.base_model_prefix,
                f"No mask_token ({self.tokenizer.mask_token}) found on the input",
            )

    def __call__(self, *args, **kwargs):
        inputs = self._parse_and_tokenize(*args, **kwargs)
        outputs = self._forward(inputs, return_tensors=True)

        results = []
        batch_size = outputs.shape[0] if self.framework == "tf" else outputs.size(0)

        for i in range(batch_size):
            input_ids = inputs["input_ids"][i]
            result = []

            if self.framework == "tf":
                masked_index = tf.where(input_ids == self.tokenizer.mask_token_id).numpy()

                # Fill mask pipeline supports only one ${mask_token} per sample
                self.ensure_exactly_one_mask_token(masked_index)

                logits = outputs[i, masked_index.item(), :]
                probs = tf.nn.softmax(logits)
                topk = tf.math.top_k(probs, k=self.topk)
                values, predictions = topk.values.numpy(), topk.indices.numpy()
            else:
                masked_index = (input_ids == self.tokenizer.mask_token_id).nonzero()

                # Fill mask pipeline supports only one ${mask_token} per sample
                self.ensure_exactly_one_mask_token(masked_index.numpy())

                logits = outputs[i, masked_index.item(), :]
                probs = logits.softmax(dim=0)
                values, predictions = probs.topk(self.topk)

            for v, p in zip(values.tolist(), predictions.tolist()):
                tokens = input_ids.numpy()
                tokens[masked_index] = p
                # Filter padding out:
                tokens = tokens[np.where(tokens != self.tokenizer.pad_token_id)]
                result.append(
                    {
                        "sequence": self.tokenizer.decode(tokens),
                        "score": v,
                        "token": p,
                        "token_str": self.tokenizer.convert_ids_to_tokens(p),
                    }
                )

            # Append
            results += [result]

        if len(results) == 1:
            return results[0]
        return results
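
# Usage sketch for FillMaskPipeline (the mask token depends on the tokenizer;
# "[MASK]" below assumes a BERT-style model):
#
#     nlp = pipeline("fill-mask")
#     nlp("Paris is the [MASK] of France.")
#     # -> topk candidates, each with "sequence", "score", "token", "token_str"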


class TokenClassificationPipeline(Pipeline):
    """
    Named Entity Recognition pipeline using ModelForTokenClassification head. See the
    `named entity recognition usage <../usage.html#named-entity-recognition>`__ examples for more information.

    This token recognition pipeline can currently be loaded from the :func:`~transformers.pipeline` method using
    the following task identifier(s):

    - "ner", for predicting the classes of tokens in a sequence: person, organisation, location or miscellaneous.

    The models that this pipeline can use are models that have been fine-tuned on a token classification task.
    See the up-to-date list of available models on
    `huggingface.co/models <https://huggingface.co/models?filter=token-classification>`__.

    Arguments:
        model (:obj:`~transformers.PreTrainedModel` or :obj:`~transformers.TFPreTrainedModel`):
            The model that will be used by the pipeline to make predictions. This needs to be a model inheriting from
            :class:`~transformers.PreTrainedModel` for PyTorch and :class:`~transformers.TFPreTrainedModel` for
            TensorFlow.
        tokenizer (:obj:`~transformers.PreTrainedTokenizer`):
            The tokenizer that will be used by the pipeline to encode data for the model. This object inherits from
            :class:`~transformers.PreTrainedTokenizer`.
        modelcard (:obj:`str` or :class:`~transformers.ModelCard`, `optional`, defaults to :obj:`None`):
            Model card attributed to the model for this pipeline.
        framework (:obj:`str`, `optional`, defaults to :obj:`None`):
            The framework to use, either "pt" for PyTorch or "tf" for TensorFlow. The specified framework must be
            installed.

            If no framework is specified, will default to the one currently installed. If no framework is specified
            and both frameworks are installed, will default to PyTorch.
        args_parser (:class:`~transformers.pipelines.ArgumentHandler`, `optional`, defaults to :obj:`None`):
            Reference to the object in charge of parsing supplied pipeline parameters.
        device (:obj:`int`, `optional`, defaults to :obj:`-1`):
            Device ordinal for CPU/GPU support. Setting this to -1 will leverage CPU, >=0 will run the model
            on the associated CUDA device id.
    """

    default_input_names = "sequences"

    def __init__(
        self,
        model: Union["PreTrainedModel", "TFPreTrainedModel"],
        tokenizer: PreTrainedTokenizer,
        modelcard: Optional[ModelCard] = None,
        framework: Optional[str] = None,
        args_parser: ArgumentHandler = None,
        device: int = -1,
        binary_output: bool = False,
        ignore_labels=["O"],
        task: str = "",
        grouped_entities: bool = False,
    ):
        super().__init__(
            model=model,
            tokenizer=tokenizer,
            modelcard=modelcard,
            framework=framework,
            args_parser=args_parser,
            device=device,
            binary_output=binary_output,
            task=task,
        )

        self._basic_tokenizer = BasicTokenizer(do_lower_case=False)
        self.ignore_labels = ignore_labels
        self.grouped_entities = grouped_entities

    def __call__(self, *args, **kwargs):
        inputs = self._args_parser(*args, **kwargs)
        answers = []
        for sentence in inputs:

            # Manage correct placement of the tensors
            with self.device_placement():

                tokens = self.tokenizer(
                    sentence, return_attention_mask=False, return_tensors=self.framework, truncation=True,
                )

                # Forward
                if self.framework == "tf":
                    entities = self.model(tokens.data)[0][0].numpy()
                    input_ids = tokens["input_ids"].numpy()[0]
                else:
                    with torch.no_grad():
                        tokens = self.ensure_tensor_on_device(**tokens)
                        entities = self.model(**tokens)[0][0].cpu().numpy()
                        input_ids = tokens["input_ids"].cpu().numpy()[0]

            score = np.exp(entities) / np.exp(entities).sum(-1, keepdims=True)
            labels_idx = score.argmax(axis=-1)

            entities = []
            entity_groups = []
            entity_group_disagg = []
            # Filter to labels not in `self.ignore_labels`
            filtered_labels_idx = [
                (idx, label_idx)
                for idx, label_idx in enumerate(labels_idx)
                if self.model.config.id2label[label_idx] not in self.ignore_labels
            ]

            for idx, label_idx in filtered_labels_idx:

                entity = {
                    "word": self.tokenizer.convert_ids_to_tokens(int(input_ids[idx])),
                    "score": score[idx][label_idx].item(),
                    "entity": self.model.config.id2label[label_idx],
                    "index": idx,
                }
                last_idx, _ = filtered_labels_idx[-1]
                if self.grouped_entities:
                    if not entity_group_disagg:
                        entity_group_disagg += [entity]
                        if idx == last_idx:
                            entity_groups += [self.group_entities(entity_group_disagg)]
                        continue

                    # If the current entity is similar and adjacent to the previous entity, append it to the disaggregated entity group
                    if (
                        entity["entity"] == entity_group_disagg[-1]["entity"]
                        and entity["index"] == entity_group_disagg[-1]["index"] + 1
                    ):
                        entity_group_disagg += [entity]
                        # Group the entities at the last entity
                        if idx == last_idx:
                            entity_groups += [self.group_entities(entity_group_disagg)]
                    # If the current entity is different from the previous entity, aggregate the disaggregated entity group
                    else:
                        entity_groups += [self.group_entities(entity_group_disagg)]
                        entity_group_disagg = [entity]

                entities += [entity]

            # Ensure if an entity is the latest one in the sequence it gets appended to the output
            if len(entity_group_disagg) > 0:
                entity_groups.append(self.group_entities(entity_group_disagg))

            # Append
            if self.grouped_entities:
                answers += [entity_groups]
            else:
                answers += [entities]

        if len(answers) == 1:
            return answers[0]
        return answers

    def group_entities(self, entities):
        """
        Returns grouped entities.
        """
        # Use the label of the last token in the group as the group label
        entity = entities[-1]["entity"]
        # Average the scores of all tokens in the group
        score = np.mean([token["score"] for token in entities])
        tokens = [token["word"] for token in entities]

        entity_group = {
            "entity_group": entity,
            "score": score,
            "word": self.tokenizer.convert_tokens_to_string(tokens),
        }
        return entity_group


NerPipeline = TokenClassificationPipeline
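
# Usage sketch for the token classification / NER pipeline (the entity labels
# depend on the fine-tuned model and are illustrative):
#
#     nlp = pipeline("ner", grouped_entities=True)
#     nlp("Hugging Face is based in New York City.")
#     # -> [{"entity_group": ..., "score": ..., "word": ...}, ...]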


class QuestionAnsweringArgumentHandler(ArgumentHandler):
    """
    QuestionAnsweringPipeline requires the user to provide multiple arguments (i.e. question & context) to be mapped
    to internal SquadExample / SquadFeature structures.

    QuestionAnsweringArgumentHandler manages all the possible ways to create a SquadExample from the command-line
    supplied arguments.
    """

    def __call__(self, *args, **kwargs):
        # Positional args: handling is essentially the same as for X and data, so forward to avoid duplication
        if args is not None and len(args) > 0:
            if len(args) == 1:
                kwargs["X"] = args[0]
            else:
                kwargs["X"] = list(args)

        # Generic compatibility with sklearn and Keras
        # Batched data
        if "X" in kwargs or "data" in kwargs:
            inputs = kwargs["X"] if "X" in kwargs else kwargs["data"]

            if isinstance(inputs, dict):
                inputs = [inputs]
            else:
                # Copy to avoid overriding arguments
                inputs = [i for i in inputs]

            for i, item in enumerate(inputs):
                if isinstance(item, dict):
                    if any(k not in item for k in ["question", "context"]):
                        raise KeyError("You need to provide a dictionary with keys {question:..., context:...}")

                    inputs[i] = QuestionAnsweringPipeline.create_sample(**item)

                elif not isinstance(item, SquadExample):
                    raise ValueError(
                        "{} argument needs to be of type (list[SquadExample | dict], SquadExample, dict)".format(
                            "X" if "X" in kwargs else "data"
                        )
                    )

            # Tabular input
        elif "question" in kwargs and "context" in kwargs:
            if isinstance(kwargs["question"], str):
                kwargs["question"] = [kwargs["question"]]

            if isinstance(kwargs["context"], str):
                kwargs["context"] = [kwargs["context"]]

            inputs = [
                QuestionAnsweringPipeline.create_sample(q, c) for q, c in zip(kwargs["question"], kwargs["context"])
            ]
        else:
            raise ValueError("Unknown arguments {}".format(kwargs))

        if not isinstance(inputs, list):
            inputs = [inputs]

        return inputs
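
# The handler accepts several equivalent call shapes (a sketch; the texts are
# placeholders):
#
#     handler = QuestionAnsweringArgumentHandler()
#     handler(question="Who?", context="Some context.")
#     handler({"question": "Who?", "context": "Some context."})
#     handler(X=[{"question": "Who?", "context": "Some context."}])
#     # each returns a list of SquadExample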


class QuestionAnsweringPipeline(Pipeline):
    """
    Question Answering pipeline using a ModelForQuestionAnswering head. See the
    `question answering usage <../usage.html#question-answering>`__ examples for more information.

    This question answering pipeline can currently be loaded from the :func:`~transformers.pipeline` method using
    the following task identifier(s):

    - "question-answering", for answering questions given a context.

    The models that this pipeline can use are models that have been fine-tuned on a question answering task.
    See the up-to-date list of available models on
    `huggingface.co/models <https://huggingface.co/models?filter=question-answering>`__.
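
    Usage (a minimal sketch; the exact score and offsets depend on the checkpoint)::

        nlp = pipeline("question-answering")
        nlp(question="What is the capital of France?", context="The capital of France is Paris.")
        # -> {'answer': 'Paris', 'score': ..., 'start': ..., 'end': ...}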

    Arguments:
        model (:obj:`~transformers.PreTrainedModel` or :obj:`~transformers.TFPreTrainedModel`):
            The model that will be used by the pipeline to make predictions. This needs to be a model inheriting from
            :class:`~transformers.PreTrainedModel` for PyTorch and :class:`~transformers.TFPreTrainedModel` for
            TensorFlow.
        tokenizer (:obj:`~transformers.PreTrainedTokenizer`):
            The tokenizer that will be used by the pipeline to encode data for the model. This object inherits from
            :class:`~transformers.PreTrainedTokenizer`.
        modelcard (:obj:`str` or :class:`~transformers.ModelCard`, `optional`, defaults to :obj:`None`):
            Model card attributed to the model for this pipeline.
        framework (:obj:`str`, `optional`, defaults to :obj:`None`):
            The framework to use, either "pt" for PyTorch or "tf" for TensorFlow. The specified framework must be
            installed.

            If no framework is specified, will default to the one currently installed. If no framework is specified
            and both frameworks are installed, will default to PyTorch.
        args_parser (:class:`~transformers.pipelines.ArgumentHandler`, `optional`, defaults to :obj:`None`):
            Reference to the object in charge of parsing supplied pipeline parameters.
        device (:obj:`int`, `optional`, defaults to :obj:`-1`):
            Device ordinal for CPU/GPU supports. Setting this to -1 will leverage CPU, >=0 will run the model
            on the associated CUDA device id.
    """

    default_input_names = "question,context"

    def __init__(
        self,
        model: Union["PreTrainedModel", "TFPreTrainedModel"],
        tokenizer: PreTrainedTokenizer,
        modelcard: Optional[ModelCard] = None,
        framework: Optional[str] = None,
        device: int = -1,
        task: str = "",
        **kwargs
    ):
        super().__init__(
            model=model,
            tokenizer=tokenizer,
            modelcard=modelcard,
            framework=framework,
            args_parser=QuestionAnsweringArgumentHandler(),
            device=device,
            task=task,
            **kwargs,
        )

    @staticmethod
    def create_sample(
        question: Union[str, List[str]], context: Union[str, List[str]]
    ) -> Union[SquadExample, List[SquadExample]]:
        """
        QuestionAnsweringPipeline leverages the SquadExample/SquadFeatures internally.
        This helper method encapsulates all the logic for converting question(s) and context(s) to SquadExample(s).
        We currently support extractive question answering.
        Arguments:
             question: (str, List[str]) The question(s) to ask of the associated context(s)
             context: (str, List[str]) The context(s) in which to look for the answer.

        Returns:
            SquadExample initialized with the corresponding question and context.
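
        Example (a minimal sketch)::

            sample = QuestionAnsweringPipeline.create_sample(
                question="Who maintains this library?", context="This library is maintained by HuggingFace."
            )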
        """
        if isinstance(question, list):
            return [SquadExample(None, q, c, None, None, None) for q, c in zip(question, context)]
        else:
            return SquadExample(None, question, context, None, None, None)

    def __call__(self, *args, **kwargs):
        """
        Args:
            We support multiple use-cases, the following being mutually exclusive:
            X: sequence of SquadExample
            data: sequence of SquadExample
            question: (str, List[str]), batch of question(s) to map along with context
            context: (str, List[str]), batch of context(s) associated with the provided question keyword argument
        Returns:
            dict: {'answer': str, 'score': float, 'start': int, 'end': int}
            answer: the textual answer in the initial context
            score: the score the model assigned to the answer
            start: the character index in the original string corresponding to the beginning of the answer's span
            end: the character index in the original string corresponding to the end of the answer's span
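
        Example (a minimal sketch; score and offsets depend on the model)::

            nlp = pipeline("question-answering")
            nlp(question="Where does Sarah live?", context="Sarah lives in London.")
            # -> {'answer': 'London', 'score': ..., 'start': ..., 'end': ...}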
        """
        # Set default values
        kwargs.setdefault("topk", 1)
        kwargs.setdefault("doc_stride", 128)
        kwargs.setdefault("max_answer_len", 15)
        kwargs.setdefault("max_seq_len", 384)
        kwargs.setdefault("max_question_len", 64)
        kwargs.setdefault("handle_impossible_answer", False)

        if kwargs["topk"] < 1:
            raise ValueError("topk parameter should be >= 1 (got {})".format(kwargs["topk"]))
        if kwargs["max_answer_len"] < 1:
            raise ValueError("max_answer_len parameter should be >= 1 (got {})".format(kwargs["max_answer_len"]))

        # Convert inputs to features
        examples = self._args_parser(*args, **kwargs)
        features_list = [
            squad_convert_examples_to_features(
                examples=[example],
                tokenizer=self.tokenizer,
                max_seq_length=kwargs["max_seq_len"],
                doc_stride=kwargs["doc_stride"],
                max_query_length=kwargs["max_question_len"],
                is_training=False,
                tqdm_enabled=False,
            )
            for example in examples
        ]
        all_answers = []
        for features, example in zip(features_list, examples):
            model_input_names = self.tokenizer.model_input_names + ["input_ids"]
            fw_args = {k: [feature.__dict__[k] for feature in features] for k in model_input_names}

            # Manage tensor allocation on correct device
            with self.device_placement():
                if self.framework == "tf":
                    fw_args = {k: tf.constant(v) for (k, v) in fw_args.items()}
                    start, end = self.model(fw_args)
                    start, end = start.numpy(), end.numpy()
                else:
                    with torch.no_grad():
                        # Retrieve the score for the context tokens only (removing question tokens)
                        fw_args = {k: torch.tensor(v, device=self.device) for (k, v) in fw_args.items()}
                        start, end = self.model(**fw_args)
                        start, end = start.cpu().numpy(), end.cpu().numpy()

            min_null_score = 1000000  # large and positive
            answers = []
            for (feature, start_, end_) in zip(features, start, end):
                # Normalize logits and spans to retrieve the answer
                start_ = np.exp(start_) / np.sum(np.exp(start_))
                end_ = np.exp(end_) / np.sum(np.exp(end_))

                # Mask padding and question
                start_, end_ = (
                    start_ * np.abs(np.array(feature.p_mask) - 1),
                    end_ * np.abs(np.array(feature.p_mask) - 1),
                )

                if kwargs["handle_impossible_answer"]:
                    min_null_score = min(min_null_score, (start_[0] * end_[0]).item())

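                # Mask the first token (assumed here to be the CLS token) so it
                # cannot be selected as part of a predicted answer span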
                start_[0] = end_[0] = 0

                starts, ends, scores = self.decode(start_, end_, kwargs["topk"], kwargs["max_answer_len"])
                char_to_word = np.array(example.char_to_word_offset)

                # Convert the answer (tokens) back to the original text
                answers += [
                    {
                        "score": score.item(),
                        "start": np.where(char_to_word == feature.token_to_orig_map[s])[0][0].item(),
                        "end": np.where(char_to_word == feature.token_to_orig_map[e])[0][-1].item(),
                        "answer": " ".join(
                            example.doc_tokens[feature.token_to_orig_map[s] : feature.token_to_orig_map[e] + 1]
                        ),
                    }
                    for s, e, score in zip(starts, ends, scores)
                ]

            if kwargs["handle_impossible_answer"]:
                answers.append({"score": min_null_score, "start": 0, "end": 0, "answer": ""})

            answers = sorted(answers, key=lambda x: x["score"], reverse=True)[: kwargs["topk"]]
            all_answers += answers

        if len(all_answers) == 1:
            return all_answers[0]
        return all_answers

    def decode(self, start: np.ndarray, end: np.ndarray, topk: int, max_answer_len: int) -> Tuple:
        """
        Take the output of any QuestionAnswering head and generate probabilities for each span to be
        the actual answer.
        In addition, filter out some unwanted/impossible cases, such as an answer length greater than
        max_answer_len or an answer end position located before the start position.
        The method supports returning the k best answers through the topk argument.

        Args:
            start: numpy array, holding individual start probabilities for each token
            end: numpy array, holding individual end probabilities for each token
            topk: int, indicates how many possible answer span(s) to extract from the model's output
            max_answer_len: int, maximum size of the answer to extract from the model's output
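
        A hand-checked sketch of the decoding (probabilities chosen by hand; `nlp` is
        assumed to be a QuestionAnsweringPipeline instance)::

            start = np.array([0.1, 0.7, 0.2])
            end = np.array([0.1, 0.2, 0.7])
            starts, ends, scores = nlp.decode(start, end, topk=1, max_answer_len=2)
            # starts == [1], ends == [2]: the span over tokens 1..2 maximizes start[s] * end[e]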
        """
        # Ensure we have batch axis
        if start.ndim == 1:
            start = start[None]

        if end.ndim == 1:
            end = end[None]

        # Compute the score of each tuple(start, end) to be the real answer
        outer = np.matmul(np.expand_dims(start, -1), np.expand_dims(end, 1))

        # Remove candidates with end < start or end - start > max_answer_len
        candidates = np.tril(np.triu(outer), max_answer_len - 1)

        # Inspired by Chen et al. (https://github.com/facebookresearch/DrQA)
        scores_flat = candidates.flatten()
        if topk == 1:
            idx_sort = [np.argmax(scores_flat)]
        elif len(scores_flat) < topk:
            idx_sort = np.argsort(-scores_flat)
        else:
            idx = np.argpartition(-scores_flat, topk)[0:topk]
            idx_sort = idx[np.argsort(-scores_flat[idx])]

        start, end = np.unravel_index(idx_sort, candidates.shape)[1:]
        return start, end, candidates[0, start, end]

    def span_to_answer(self, text: str, start: int, end: int):
        """
        When decoding from token probabilities, this method maps token indexes to the actual words in
        the initial context.

        Args:
            text: str, the actual context to extract the answer from
            start: int, starting answer token index
            end: int, ending answer token index

        Returns:
            dict: {'answer': str, 'start': int, 'end': int}
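
        A minimal sketch (assuming a tokenizer that yields exactly one token per
        whitespace-separated word, and `nlp` a QuestionAnsweringPipeline instance)::

            nlp.span_to_answer("My name is Sarah", start=3, end=3)
            # -> {'answer': 'Sarah', 'start': 11, 'end': 16}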
        """
        words = []
        token_idx = char_start_idx = char_end_idx = chars_idx = 0

        for i, word in enumerate(text.split(" ")):
            token = self.tokenizer.tokenize(word)

            # Append words if they are in the span
            if start <= token_idx <= end:
                if token_idx == start:
                    char_start_idx = chars_idx

                if token_idx == end:
                    char_end_idx = chars_idx + len(word)

                words += [word]

            # Stop if we went over the end of the answer
            if token_idx > end:
                break

            # Append the subtokenization length to the running index
            token_idx += len(token)
            chars_idx += len(word) + 1

        # Join text with spaces
        return {
            "answer": " ".join(words),
            "start": max(0, char_start_idx),
            "end": min(len(text), char_end_idx),
        }


class SummarizationPipeline(Pipeline):
    """
    Summarize news articles and other documents

    Usage::

        # use bart in pytorch
        summarizer = pipeline("summarization")
        summarizer("Sam Shleifer writes the best docstring examples in the whole world.", min_length=5, max_length=20)

        # use t5 in tf
        summarizer = pipeline("summarization", model="t5-base", tokenizer="t5-base", framework="tf")
        summarizer("Sam Shleifer writes the best docstring examples in the whole world.", min_length=5, max_length=20)

    The models that this pipeline can use are models that have been fine-tuned on a summarization task,
    currently '`bart-large-cnn`', '`t5-small`', '`t5-base`', '`t5-large`', '`t5-3b`' and '`t5-11b`'.
    See the up-to-date list of available models on
    `huggingface.co/models <https://huggingface.co/models?filter=summarization>`__.

    Arguments:
        model (:obj:`str` or :obj:`~transformers.PreTrainedModel` or :obj:`~transformers.TFPreTrainedModel`, `optional`, defaults to :obj:`None`):
            The model that will be used by the pipeline to make predictions. This can be :obj:`None`, a string
            checkpoint identifier or an actual pre-trained model inheriting from
            :class:`~transformers.PreTrainedModel` for PyTorch and :class:`~transformers.TFPreTrainedModel` for
            TensorFlow.

            If :obj:`None`, the default of the pipeline will be loaded.
        tokenizer (:obj:`str` or :obj:`~transformers.PreTrainedTokenizer`, `optional`, defaults to :obj:`None`):
            The tokenizer that will be used by the pipeline to encode data for the model. This can be :obj:`None`,
            a string checkpoint identifier or an actual pre-trained tokenizer inheriting from
            :class:`~transformers.PreTrainedTokenizer`.

            If :obj:`None`, the default of the pipeline will be loaded.
        modelcard (:obj:`str` or :class:`~transformers.ModelCard`, `optional`, defaults to :obj:`None`):
            Model card attributed to the model for this pipeline.
        framework (:obj:`str`, `optional`, defaults to :obj:`None`):
            The framework to use, either "pt" for PyTorch or "tf" for TensorFlow. The specified framework must be
            installed.

            If no framework is specified, will default to the one currently installed. If no framework is specified
            and both frameworks are installed, will default to PyTorch.
        args_parser (:class:`~transformers.pipelines.ArgumentHandler`, `optional`, defaults to :obj:`None`):
            Reference to the object in charge of parsing supplied pipeline parameters.
        device (:obj:`int`, `optional`, defaults to :obj:`-1`):
            Device ordinal for CPU/GPU supports. Setting this to -1 will leverage CPU, >=0 will run the model
            on the associated CUDA device id.
    """

    def __init__(self, **kwargs):
        kwargs.update(task="summarization")
        super().__init__(**kwargs)

    def __call__(
        self, *documents, return_tensors=False, return_text=True, clean_up_tokenization_spaces=False, **generate_kwargs
    ):
        r"""
        Args:
            *documents: (list of strings) articles to be summarized
            return_text: (bool, default=True) whether to add a decoded "summary_text" to each result
            return_tensors: (bool, default=False) whether to add the raw "summary_token_ids" to each result

            clean_up_tokenization_spaces: (`optional`) bool whether to clean up the potential extra spaces in the text output
            **generate_kwargs: extra kwargs passed to `self.model.generate`_

        Returns:
            list of dicts with 'summary_text' and/or 'summary_token_ids' for each document_to_summarize

        .. _`self.model.generate`:
            https://huggingface.co/transformers/model_doc/bart.html#transformers.BartForConditionalGeneration.generate

        """
        assert return_tensors or return_text, "You must specify return_tensors=True or return_text=True"
        assert len(documents) > 0, "Please provide a document to summarize"

        if self.framework == "tf" and "BartForConditionalGeneration" in self.model.__class__.__name__:
            raise NotImplementedError(
                "Tensorflow is not yet supported for Bart. Please consider using T5, e.g. `t5-base`"
            )

        prefix = self.model.config.prefix if self.model.config.prefix is not None else ""

        if isinstance(documents[0], list):
            assert (
                self.tokenizer.pad_token_id is not None
            ), "Please make sure that the tokenizer has a pad_token_id when using a batch input"

            documents = ([prefix + document for document in documents[0]],)
            padding = True

        elif isinstance(documents[0], str):
            documents = (prefix + documents[0],)
            padding = False
        else:
            raise ValueError(
                " `documents[0]`: {} has the wrong format. It should be either of type `str` or type `list`".format(
                    documents[0]
                )
            )

        with self.device_placement():
            inputs = self._parse_and_tokenize(*documents, padding=padding)

            if self.framework == "pt":
                inputs = self.ensure_tensor_on_device(**inputs)
                input_length = inputs["input_ids"].shape[-1]
            elif self.framework == "tf":
                input_length = tf.shape(inputs["input_ids"])[-1].numpy()

            min_length = generate_kwargs.get("min_length", self.model.config.min_length)
            if input_length < min_length // 2:
                logger.warning(
                    "Your min_length is set to {}, but your input_length is only {}. You might consider decreasing min_length manually, e.g. summarizer('...', min_length=10)".format(
                        min_length, input_length
                    )
                )

            max_length = generate_kwargs.get("max_length", self.model.config.max_length)
            if input_length < max_length:
                logger.warning(
                    "Your max_length is set to {}, but your input_length is only {}. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=50)".format(
                        max_length, input_length
                    )
                )

            summaries = self.model.generate(
                inputs["input_ids"], attention_mask=inputs["attention_mask"], **generate_kwargs,
            )

            results = []
            for summary in summaries:
                record = {}
                if return_tensors:
                    record["summary_token_ids"] = summary
                if return_text:
                    record["summary_text"] = self.tokenizer.decode(
                        summary, skip_special_tokens=True, clean_up_tokenization_spaces=clean_up_tokenization_spaces,
                    )
                results.append(record)
            return results


class TranslationPipeline(Pipeline):
    """
    Translates from one language to another.

    Usage::
        en_fr_translator = pipeline("translation_en_to_fr")
        en_fr_translator("How old are you?")

    The models that this pipeline can use are models that have been fine-tuned on a translation task,
    currently: "t5-small", "t5-base", "t5-large", "t5-3b", "t5-11b"
    See the up-to-date list of available models on
    `huggingface.co/models <https://huggingface.co/models?filter=translation>`__.

    Arguments:
        model (:obj:`str` or :obj:`~transformers.PreTrainedModel` or :obj:`~transformers.TFPreTrainedModel`, `optional`, defaults to :obj:`None`):
            The model that will be used by the pipeline to make predictions. This can be :obj:`None`, a string
            checkpoint identifier or an actual pre-trained model inheriting from
            :class:`~transformers.PreTrainedModel` for PyTorch and :class:`~transformers.TFPreTrainedModel` for
            TensorFlow.
            If :obj:`None`, the default of the pipeline will be loaded.
        tokenizer (:obj:`str` or :obj:`~transformers.PreTrainedTokenizer`, `optional`, defaults to :obj:`None`):
            The tokenizer that will be used by the pipeline to encode data for the model. This can be :obj:`None`,
            a string checkpoint identifier or an actual pre-trained tokenizer inheriting from
            :class:`~transformers.PreTrainedTokenizer`.
            If :obj:`None`, the default of the pipeline will be loaded.
        modelcard (:obj:`str` or :class:`~transformers.ModelCard`, `optional`, defaults to :obj:`None`):
            Model card attributed to the model for this pipeline.
        framework (:obj:`str`, `optional`, defaults to :obj:`None`):
            The framework to use, either "pt" for PyTorch or "tf" for TensorFlow. The specified framework must be
            installed.
            If no framework is specified, will default to the one currently installed. If no framework is specified
            and both frameworks are installed, will default to PyTorch.
        args_parser (:class:`~transformers.pipelines.ArgumentHandler`, `optional`, defaults to :obj:`None`):
            Reference to the object in charge of parsing supplied pipeline parameters.
        device (:obj:`int`, `optional`, defaults to :obj:`-1`):
            Device ordinal for CPU/GPU supports. Setting this to -1 will leverage CPU, >=0 will run the model
            on the associated CUDA device id.
    """

    def __call__(
        self, *args, return_tensors=False, return_text=True, clean_up_tokenization_spaces=False, **generate_kwargs
    ):
        r"""
        Args:
            *args: (list of strings) texts to be translated
            return_text: (bool, default=True) whether to add a decoded "translation_text" to each result
            return_tensors: (bool, default=False) whether to add the raw "translation_token_ids" to each result

            **generate_kwargs: extra kwargs passed to `self.model.generate`_

        Returns:
            list of dicts with 'translation_text' and/or 'translation_token_ids' for each text_to_translate
        .. _`self.model.generate`:
            https://huggingface.co/transformers/model_doc/bart.html#transformers.BartForConditionalGeneration.generate
        """
        assert return_tensors or return_text, "You must specify return_tensors=True or return_text=True"

        prefix = self.model.config.prefix if self.model.config.prefix is not None else ""

        if isinstance(args[0], list):
            assert (
                self.tokenizer.pad_token_id is not None
            ), "Please make sure that the tokenizer has a pad_token_id when using a batch input"
            args = ([prefix + text for text in args[0]],)
            padding = True

        elif isinstance(args[0], str):
            args = (prefix + args[0],)
            padding = False
        else:
            raise ValueError(
                " `args[0]`: {} has the wrong format. It should be either of type `str` or type `list`".format(
                    args[0]
                )
            )

        with self.device_placement():
            inputs = self._parse_and_tokenize(*args, padding=padding)

            if self.framework == "pt":
                inputs = self.ensure_tensor_on_device(**inputs)
                input_length = inputs["input_ids"].shape[-1]

            elif self.framework == "tf":
                input_length = tf.shape(inputs["input_ids"])[-1].numpy()

            max_length = generate_kwargs.get("max_length", self.model.config.max_length)
            if input_length > 0.9 * max_length:
                logger.warning(
                    "Your input_length: {} is bigger than 0.9 * max_length: {}. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)".format(
                        input_length, max_length
                    )
                )

            translations = self.model.generate(
                inputs["input_ids"], attention_mask=inputs["attention_mask"], **generate_kwargs,
            )
            results = []
            for translation in translations:
                record = {}
                if return_tensors:
                    record["translation_token_ids"] = translation
                if return_text:
                    record["translation_text"] = self.tokenizer.decode(
                        translation,
                        skip_special_tokens=True,
                        clean_up_tokenization_spaces=clean_up_tokenization_spaces,
                    )
                results.append(record)
            return results


# Register all the supported tasks here
SUPPORTED_TASKS = {
    "feature-extraction": {
        "impl": FeatureExtractionPipeline,
        "tf": TFAutoModel if is_tf_available() else None,
        "pt": AutoModel if is_torch_available() else None,
        "default": {"model": {"pt": "distilbert-base-cased", "tf": "distilbert-base-cased"}},
    },
    "sentiment-analysis": {
        "impl": TextClassificationPipeline,
        "tf": TFAutoModelForSequenceClassification if is_tf_available() else None,
        "pt": AutoModelForSequenceClassification if is_torch_available() else None,
        "default": {
            "model": {
                "pt": "distilbert-base-uncased-finetuned-sst-2-english",
                "tf": "distilbert-base-uncased-finetuned-sst-2-english",
            },
        },
    },
    "ner": {
        "impl": TokenClassificationPipeline,
        "tf": TFAutoModelForTokenClassification if is_tf_available() else None,
        "pt": AutoModelForTokenClassification if is_torch_available() else None,
        "default": {
            "model": {
                "pt": "dbmdz/bert-large-cased-finetuned-conll03-english",
                "tf": "dbmdz/bert-large-cased-finetuned-conll03-english",
            },
        },
    },
    "question-answering": {
        "impl": QuestionAnsweringPipeline,
        "tf": TFAutoModelForQuestionAnswering if is_tf_available() else None,
        "pt": AutoModelForQuestionAnswering if is_torch_available() else None,
        "default": {
            "model": {"pt": "distilbert-base-cased-distilled-squad", "tf": "distilbert-base-cased-distilled-squad"},
        },
    },
    "fill-mask": {
        "impl": FillMaskPipeline,
        "tf": TFAutoModelWithLMHead if is_tf_available() else None,
        "pt": AutoModelWithLMHead if is_torch_available() else None,
        "default": {"model": {"pt": "distilroberta-base", "tf": "distilroberta-base"}},
    },
    "summarization": {
        "impl": SummarizationPipeline,
        "tf": TFAutoModelWithLMHead if is_tf_available() else None,
        "pt": AutoModelForSeq2SeqLM if is_torch_available() else None,
        "default": {"model": {"pt": "sshleifer/distilbart-cnn-12-6", "tf": "t5-small"}},
    },
    "translation_en_to_fr": {
        "impl": TranslationPipeline,
        "tf": TFAutoModelWithLMHead if is_tf_available() else None,
        "pt": AutoModelWithLMHead if is_torch_available() else None,
        "default": {"model": {"pt": "t5-base", "tf": "t5-base"}},
    },
    "translation_en_to_de": {
        "impl": TranslationPipeline,
        "tf": TFAutoModelWithLMHead if is_tf_available() else None,
        "pt": AutoModelWithLMHead if is_torch_available() else None,
        "default": {"model": {"pt": "t5-base", "tf": "t5-base"}},
    },
    "translation_en_to_ro": {
        "impl": TranslationPipeline,
        "tf": TFAutoModelWithLMHead if is_tf_available() else None,
        "pt": AutoModelWithLMHead if is_torch_available() else None,
        "default": {"model": {"pt": "t5-base", "tf": "t5-base"}},
    },
    "text-generation": {
        "impl": TextGenerationPipeline,
        "tf": TFAutoModelWithLMHead if is_tf_available() else None,
        "pt": AutoModelWithLMHead if is_torch_available() else None,
        "default": {"model": {"pt": "gpt2", "tf": "gpt2"}},
    },
}
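
# A purely illustrative sketch (not an official API): since SUPPORTED_TASKS is a
# plain dict, a custom task could be registered before calling `pipeline(...)`,
# assuming `MyPipeline` is a hypothetical Pipeline subclass and "my-checkpoint"
# a hypothetical model identifier:
#
#     SUPPORTED_TASKS["my-task"] = {
#         "impl": MyPipeline,
#         "tf": TFAutoModel if is_tf_available() else None,
#         "pt": AutoModel if is_torch_available() else None,
#         "default": {"model": {"pt": "my-checkpoint", "tf": "my-checkpoint"}},
#     }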


def pipeline(
    task: str,
    model: Optional = None,
    config: Optional[Union[str, PretrainedConfig]] = None,
    tokenizer: Optional[Union[str, PreTrainedTokenizer]] = None,
    framework: Optional[str] = None,
    **kwargs
) -> Pipeline:
    """
    Utility factory method to build a pipeline.

    Pipelines are made of:

        - A Tokenizer instance in charge of mapping raw textual input to tokens
        - A Model instance
        - Some (optional) post processing for enhancing the model's output


    Args:
        task (:obj:`str`):
            The task defining which pipeline will be returned. Currently accepted tasks are:

            - "feature-extraction": will return a :class:`~transformers.FeatureExtractionPipeline`
            - "sentiment-analysis": will return a :class:`~transformers.TextClassificationPipeline`
            - "ner": will return a :class:`~transformers.TokenClassificationPipeline`
            - "question-answering": will return a :class:`~transformers.QuestionAnsweringPipeline`
            - "fill-mask": will return a :class:`~transformers.FillMaskPipeline`
            - "summarization": will return a :class:`~transformers.SummarizationPipeline`
            - "translation_xx_to_yy": will return a :class:`~transformers.TranslationPipeline`
            - "text-generation": will return a :class:`~transformers.TextGenerationPipeline`
        model (:obj:`str` or :obj:`~transformers.PreTrainedModel` or :obj:`~transformers.TFPreTrainedModel`, `optional`, defaults to :obj:`None`):
            The model that will be used by the pipeline to make predictions. This can be :obj:`None`,
            a model identifier or an actual pre-trained model inheriting from
            :class:`~transformers.PreTrainedModel` for PyTorch and :class:`~transformers.TFPreTrainedModel` for
            TensorFlow.

            If :obj:`None`, the default for this pipeline will be loaded.
        config (:obj:`str` or :obj:`~transformers.PretrainedConfig`, `optional`, defaults to :obj:`None`):
            The configuration that will be used by the pipeline to instantiate the model. This can be :obj:`None`,
            a model identifier or an actual pre-trained model configuration inheriting from
            :class:`~transformers.PretrainedConfig`.

            If :obj:`None`, the default for this pipeline will be loaded.
        tokenizer (:obj:`str` or :obj:`~transformers.PreTrainedTokenizer`, `optional`, defaults to :obj:`None`):
            The tokenizer that will be used by the pipeline to encode data for the model. This can be :obj:`None`,
            a model identifier or an actual pre-trained tokenizer inheriting from
            :class:`~transformers.PreTrainedTokenizer`.

            If :obj:`None`, the default for this pipeline will be loaded.
        framework (:obj:`str`, `optional`, defaults to :obj:`None`):
            The framework to use, either "pt" for PyTorch or "tf" for TensorFlow. The specified framework must be
            installed.

            If no framework is specified, will default to the one currently installed. If no framework is specified
            and both frameworks are installed, will default to PyTorch.

    Returns:
        :class:`~transformers.Pipeline`: Class inheriting from :class:`~transformers.Pipeline`, according to
        the task.

    Examples::

        from transformers import pipeline, AutoModelForTokenClassification, AutoTokenizer

        # Sentiment analysis pipeline
        pipeline('sentiment-analysis')

        # Question answering pipeline, specifying the checkpoint identifier
        pipeline('question-answering', model='distilbert-base-cased-distilled-squad', tokenizer='bert-base-cased')

        # Named entity recognition pipeline, passing in a specific model and tokenizer
        model = AutoModelForTokenClassification.from_pretrained("dbmdz/bert-large-cased-finetuned-conll03-english")
        tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
        pipeline('ner', model=model, tokenizer=tokenizer)
    """
    # Retrieve the task
    if task not in SUPPORTED_TASKS:
        raise KeyError("Unknown task {}, available tasks are {}".format(task, list(SUPPORTED_TASKS.keys())))

    framework = framework or get_framework(model)

    targeted_task = SUPPORTED_TASKS[task]
    task_class, model_class = targeted_task["impl"], targeted_task[framework]

    # Use default model/config/tokenizer for the task if no model is provided
    if model is None:
        model = targeted_task["default"]["model"][framework]

    # Try to infer tokenizer from model or config name (if provided as str)
    if tokenizer is None:
        if isinstance(model, str):
            tokenizer = model
        elif isinstance(config, str):
            tokenizer = config
        else:
            # Impossible to guess which tokenizer to use here
            raise Exception(
                "Impossible to guess which tokenizer to use. "
                "Please provide a PreTrainedTokenizer class or a path/identifier to a pretrained tokenizer."
            )

    modelcard = None
    # Try to infer modelcard from model or config name (if provided as str)
    if isinstance(model, str):
        modelcard = model
    elif isinstance(config, str):
        modelcard = config

    # Instantiate tokenizer if needed
    if isinstance(tokenizer, (str, tuple)):
        if isinstance(tokenizer, tuple):
            # For tuple we have (tokenizer name, {kwargs})
            tokenizer = AutoTokenizer.from_pretrained(tokenizer[0], **tokenizer[1])
        else:
            tokenizer = AutoTokenizer.from_pretrained(tokenizer)

    # Instantiate config if needed
    if isinstance(config, str):
        config = AutoConfig.from_pretrained(config)

    # Instantiate modelcard if needed
    if isinstance(modelcard, str):
        modelcard = ModelCard.from_pretrained(modelcard)

    # Instantiate model if needed
    if isinstance(model, str):
        # Handle transparent TF/PT model conversion
        model_kwargs = {}
        if framework == "pt" and model.endswith(".h5"):
            model_kwargs["from_tf"] = True
            logger.warning(
                "Model might be a TensorFlow model (ending with `.h5`) but TensorFlow is not available. "
                "Trying to load the model with PyTorch."
            )
        elif framework == "tf" and model.endswith(".bin"):
            model_kwargs["from_pt"] = True
            logger.warning(
                "Model might be a PyTorch model (ending with `.bin`) but PyTorch is not available. "
                "Trying to load the model with Tensorflow."
            )
        model = model_class.from_pretrained(model, config=config, **model_kwargs)

    return task_class(model=model, tokenizer=tokenizer, modelcard=modelcard, framework=framework, task=task, **kwargs)