# coding=utf-8
# Copyright 2018 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import csv
import json
import logging
import os
import pickle
import sys
from abc import ABC, abstractmethod
from contextlib import contextmanager
from itertools import chain
from os.path import abspath, exists
from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Sequence, Tuple, Union

import numpy as np

from .configuration_auto import AutoConfig
from .configuration_utils import PretrainedConfig
from .data import SquadExample, squad_convert_examples_to_features
from .file_utils import is_tf_available, is_torch_available
from .modelcard import ModelCard
from .tokenization_auto import AutoTokenizer
from .tokenization_bert import BasicTokenizer
from .tokenization_utils import PreTrainedTokenizer


if is_tf_available():
    import tensorflow as tf
    from .modeling_tf_auto import (
        TFAutoModel,
        TFAutoModelForSequenceClassification,
        TFAutoModelForQuestionAnswering,
        TFAutoModelForTokenClassification,
        TFAutoModelWithLMHead,
    )

if is_torch_available():
    import torch
    from .modeling_auto import (
        AutoModel,
        AutoModelForSequenceClassification,
        AutoModelForQuestionAnswering,
        AutoModelForTokenClassification,
        AutoModelWithLMHead,
    )


if TYPE_CHECKING:
    from .modeling_utils import PreTrainedModel
    from .modeling_tf_utils import TFPreTrainedModel


logger = logging.getLogger(__name__)


def get_framework(model=None):
    """ Select framework (TensorFlow/PyTorch) to use.
        If both frameworks are installed and no specific model is provided, defaults to using PyTorch.
    """
    if is_tf_available() and is_torch_available() and model is not None and not isinstance(model, str):
        # Both frameworks are available but the user supplied a model class instance.
        # Try to guess which framework to use from the model classname
        framework = "tf" if model.__class__.__name__.startswith("TF") else "pt"
    elif not is_tf_available() and not is_torch_available():
        raise RuntimeError(
            "At least one of TensorFlow 2.0 or PyTorch should be installed. "
            "To install TensorFlow 2.0, read the instructions at https://www.tensorflow.org/install/ "
            "To install PyTorch, read the instructions at https://pytorch.org/."
        )
    else:
        # framework = 'tf' if is_tf_available() else 'pt'
        framework = "pt" if is_torch_available() else "tf"

    return framework


class ArgumentHandler(ABC):
    """
    Base interface for handling varargs for each Pipeline
    """

    @abstractmethod
    def __call__(self, *args, **kwargs):
        raise NotImplementedError()


class DefaultArgumentHandler(ArgumentHandler):
    """
    Default varargs argument parser handling parameters for each Pipeline
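
    Accepted call shapes, as a rough sketch (everything is normalized to a list):
        handler("single input")      -> ["single input"]
        handler(["a", "b"])          -> ["a", "b"]
        handler(data=["a", "b"])     -> ["a", "b"]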
    """

    @staticmethod
    def handle_kwargs(kwargs: Dict) -> List:
        if len(kwargs) == 1:
            output = list(kwargs.values())
        else:
            output = list(chain(kwargs.values()))

        return DefaultArgumentHandler.handle_args(output)

    @staticmethod
    def handle_args(args: Sequence[Any]) -> List[str]:

        # Only one argument, let's do case by case
        if len(args) == 1:
            if isinstance(args[0], str):
                return [args[0]]
            elif not isinstance(args[0], list):
                return list(args)
            else:
                return args[0]

        # Multiple arguments (x1, x2, ...)
        elif len(args) > 1:
            if all([isinstance(arg, str) for arg in args]):
                return list(args)

            # If not an instance of list, then it should be an instance of Iterable
            elif isinstance(args, Iterable):
                return list(chain.from_iterable(chain(args)))
            else:
                raise ValueError(
                    "Invalid input type {}. Pipeline supports Union[str, Iterable[str]]".format(type(args))
                )
        else:
            return []

    def __call__(self, *args, **kwargs):
        if len(kwargs) > 0 and len(args) > 0:
            raise ValueError("Pipeline cannot handle mixed args and kwargs")

        if len(kwargs) > 0:
            return DefaultArgumentHandler.handle_kwargs(kwargs)
        else:
            return DefaultArgumentHandler.handle_args(args)


class PipelineDataFormat:
    """
    Base class for all the pipeline-supported data formats, both for reading and writing.
    Supported data formats currently include:
     - JSON
     - CSV
     - stdin/stdout (pipe)

    PipelineDataFormat also includes some utilities to work with multi-column formats, like mapping dataset columns
    to pipeline keyword arguments through the `dataset_kwarg_1=dataset_column_1` format.
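
    For example (an illustrative value), `column="question=q,context=c"` maps the dataset columns `q` and `c` to
    the pipeline keyword arguments `question` and `context`.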
    """

    SUPPORTED_FORMATS = ["json", "csv", "pipe"]

    def __init__(
        self, output_path: Optional[str], input_path: Optional[str], column: Optional[str], overwrite=False,
    ):
        self.output_path = output_path
        self.input_path = input_path
        self.column = column.split(",") if column is not None else [""]
        self.is_multi_columns = len(self.column) > 1

        if self.is_multi_columns:
            self.column = [tuple(c.split("=")) if "=" in c else (c, c) for c in self.column]

        if output_path is not None and not overwrite:
            if exists(abspath(self.output_path)):
                raise OSError("{} already exists on disk".format(self.output_path))

        if input_path is not None:
            if not exists(abspath(self.input_path)):
                raise OSError("{} doesn't exist on disk".format(self.input_path))

    @abstractmethod
    def __iter__(self):
        raise NotImplementedError()

    @abstractmethod
    def save(self, data: dict):
        """
        Save the provided data object with the representation for the current `DataFormat`.
        :param data: data to store
        :return:
        """
        raise NotImplementedError()

    def save_binary(self, data: Union[dict, List[dict]]) -> str:
        """
        Save the provided data object as pickle-formatted binary data on disk.
        :param data: data to store
        :return: (str) Path where the data has been saved
        """
        path, _ = os.path.splitext(self.output_path)
        binary_path = os.path.extsep.join((path, "pickle"))

        with open(binary_path, "wb+") as f_output:
            pickle.dump(data, f_output)

        return binary_path

    @staticmethod
    def from_str(
        format: str, output_path: Optional[str], input_path: Optional[str], column: Optional[str], overwrite=False,
    ):
        if format == "json":
            return JsonPipelineDataFormat(output_path, input_path, column, overwrite=overwrite)
        elif format == "csv":
            return CsvPipelineDataFormat(output_path, input_path, column, overwrite=overwrite)
        elif format == "pipe":
            return PipedPipelineDataFormat(output_path, input_path, column, overwrite=overwrite)
        else:
            raise KeyError("Unknown reader {} (Available readers are json/csv/pipe)".format(format))


class CsvPipelineDataFormat(PipelineDataFormat):
    def __init__(
        self, output_path: Optional[str], input_path: Optional[str], column: Optional[str], overwrite=False,
    ):
        super().__init__(output_path, input_path, column, overwrite=overwrite)

    def __iter__(self):
        with open(self.input_path, "r") as f:
            reader = csv.DictReader(f)
            for row in reader:
                if self.is_multi_columns:
                    yield {k: row[c] for k, c in self.column}
                else:
                    yield row[self.column[0]]

    def save(self, data: List[dict]):
        with open(self.output_path, "w") as f:
            if len(data) > 0:
                writer = csv.DictWriter(f, list(data[0].keys()))
                writer.writeheader()
                writer.writerows(data)


class JsonPipelineDataFormat(PipelineDataFormat):
    def __init__(
        self, output_path: Optional[str], input_path: Optional[str], column: Optional[str], overwrite=False,
    ):
        super().__init__(output_path, input_path, column, overwrite=overwrite)

        with open(input_path, "r") as f:
            self._entries = json.load(f)

    def __iter__(self):
        for entry in self._entries:
            if self.is_multi_columns:
                yield {k: entry[c] for k, c in self.column}
            else:
                yield entry[self.column[0]]

    def save(self, data: dict):
        with open(self.output_path, "w") as f:
            json.dump(data, f)


class PipedPipelineDataFormat(PipelineDataFormat):
    """
    Read data from piped input to the python process.
    For multi-column data, columns should be separated by \t

    If columns are provided, then the output will be a dictionary with {column_x: value_x}
    """

    def __iter__(self):
        for line in sys.stdin:
            # Split for multi-columns
            if "\t" in line:
                line = line.split("\t")
                if self.column:
                    # Dictionary to map arguments
                    yield {kwargs: l for (kwargs, _), l in zip(self.column, line)}
                else:
                    yield tuple(line)

            # No dictionary to map arguments
            else:
                yield line

    def save(self, data: dict):
        print(data)

    def save_binary(self, data: Union[dict, List[dict]]) -> str:
        if self.output_path is None:
            raise KeyError(
                "Using piped input with a pipeline that outputs large objects requires an output file path. "
                "Please provide such an output path through the --output argument."
            )

        return super().save_binary(data)


class _ScikitCompat(ABC):
    """
    Interface layer for the Scikit and Keras compatibility.
    """

    @abstractmethod
    def transform(self, X):
        raise NotImplementedError()

    @abstractmethod
    def predict(self, X):
        raise NotImplementedError()


class Pipeline(_ScikitCompat):
    """
    The Pipeline class is the class from which all pipelines inherit. Refer to this class for methods shared across
    different pipelines.

    Base class implementing pipelined operations.
    Pipeline workflow is defined as a sequence of the following operations:

        Input -> Tokenization -> Model Inference -> Post-Processing (Task dependent) -> Output

    Pipeline supports running on CPU or GPU through the device argument. Users can specify the
    device argument as an integer, -1 meaning "CPU", >= 0 referring to the CUDA device ordinal.

    Some pipelines, for instance FeatureExtractionPipeline ('feature-extraction'), output large
    tensor objects as nested lists. To avoid dumping such large structures as textual data, we
    provide the binary_output constructor argument. If set to True, the output will be stored in
    pickle format.
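
    A minimal usage sketch (assumes the `pipeline` factory from this module and a default model download):

        nlp = pipeline("sentiment-analysis", device=-1)  # run on CPU
        nlp("This is a great library!")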

    Arguments:
        model (:obj:`~transformers.PreTrainedModel` or :obj:`~transformers.TFPreTrainedModel`):
            The model that will be used by the pipeline to make predictions. This needs to be a model inheriting from
            :class:`~transformers.PreTrainedModel` for PyTorch and :class:`~transformers.TFPreTrainedModel` for
            TensorFlow.
        tokenizer (:obj:`~transformers.PreTrainedTokenizer`):
            The tokenizer that will be used by the pipeline to encode data for the model. This object inherits from
            :class:`~transformers.PreTrainedTokenizer`.
        modelcard (:obj:`str` or :class:`~transformers.ModelCard`, `optional`, defaults to :obj:`None`):
            Model card attributed to the model for this pipeline.
        framework (:obj:`str`, `optional`, defaults to :obj:`None`):
            The framework to use, either "pt" for PyTorch or "tf" for TensorFlow. The specified framework must be
            installed.

            If no framework is specified, will default to the one currently installed. If no framework is specified
            and both frameworks are installed, will default to PyTorch.
        args_parser (:class:`~transformers.pipelines.ArgumentHandler`, `optional`, defaults to :obj:`None`):
            Reference to the object in charge of parsing supplied pipeline parameters.
        device (:obj:`int`, `optional`, defaults to :obj:`-1`):
            Device ordinal for CPU/GPU support. Setting this to -1 will leverage CPU, >=0 will run the model
            on the associated CUDA device id.
        binary_output (:obj:`bool`, `optional`, defaults to :obj:`False`):
            Flag indicating if the output of the pipeline should be in a binary format (i.e. pickle) or as raw text.

    Return:
        :obj:`List` or :obj:`Dict`:
        Pipeline returns a list or dictionary depending on:

         - Whether the user supplied multiple samples
         - Whether the pipeline exposes multiple fields in the output object
    """

    default_input_names = None

    def __init__(
        self,
        model: Union["PreTrainedModel", "TFPreTrainedModel"],
        tokenizer: PreTrainedTokenizer,
        modelcard: Optional[ModelCard] = None,
        framework: Optional[str] = None,
        task: str = "",
        args_parser: ArgumentHandler = None,
        device: int = -1,
        binary_output: bool = False,
    ):
        if framework is None:
            framework = get_framework()

        self.model = model
        self.tokenizer = tokenizer
        self.modelcard = modelcard
        self.framework = framework
        self.device = device if framework == "tf" else torch.device("cpu" if device < 0 else "cuda:{}".format(device))
        self.binary_output = binary_output
        self._args_parser = args_parser or DefaultArgumentHandler()

        # Special handling
        if self.framework == "pt" and self.device.type == "cuda":
            self.model = self.model.to(self.device)

        # Update config with task specific parameters
        task_specific_params = self.model.config.task_specific_params
        if task_specific_params is not None and task in task_specific_params:
            self.model.config.update(task_specific_params.get(task))

    def save_pretrained(self, save_directory):
        """
        Save the pipeline's model and tokenizer to the specified save_directory
        """
        if not os.path.isdir(save_directory):
            logger.error("Provided path ({}) should be a directory".format(save_directory))
            return

        self.model.save_pretrained(save_directory)
        self.tokenizer.save_pretrained(save_directory)
        if self.modelcard is not None:
            self.modelcard.save_pretrained(save_directory)

    def transform(self, X):
        """
        Scikit / Keras interface to transformers' pipelines. This method will forward to __call__().
        """
        return self(X=X)

    def predict(self, X):
        """
        Scikit / Keras interface to transformers' pipelines. This method will forward to __call__().
        """
        return self(X=X)

    @contextmanager
    def device_placement(self):
        """
        Context Manager allowing tensor allocation on the user-specified device in a framework-agnostic way.
        example:
            # Explicitly ask for tensor allocation on CUDA device :0
            nlp = pipeline(..., device=0)
            with nlp.device_placement():
                # Every framework specific tensor allocation will be done on the request device
                output = nlp(...)
        Returns:
            Context manager
        """
        if self.framework == "tf":
            with tf.device("/CPU:0" if self.device == -1 else "/device:GPU:{}".format(self.device)):
                yield
        else:
            if self.device.type == "cuda":
                torch.cuda.set_device(self.device)

            yield

    def ensure_tensor_on_device(self, **inputs):
        """
        Ensure PyTorch tensors are on the specified device.
        :param inputs:
        :return:
        """
        return {name: tensor.to(self.device) for name, tensor in inputs.items()}

    def _parse_and_tokenize(self, *args, pad_to_max_length=True, add_special_tokens=True, **kwargs):
        """
        Parse arguments and tokenize
        """
        # Parse arguments
        inputs = self._args_parser(*args, **kwargs)
        inputs = self.tokenizer.batch_encode_plus(
            inputs,
            add_special_tokens=add_special_tokens,
            return_tensors=self.framework,
            pad_to_max_length=pad_to_max_length,
        )

        return inputs

    def __call__(self, *args, **kwargs):
        inputs = self._parse_and_tokenize(*args, **kwargs)
        return self._forward(inputs)

    def _forward(self, inputs, return_tensors=False):
        """
        Internal framework specific forward dispatching.
        Args:
            inputs: dict holding all the keyword arguments required by the model forward method.
            return_tensors: Whether to return native framework (pt/tf) tensors rather than numpy arrays.
        Returns:
            Numpy array
        """
        # Encode for forward
        with self.device_placement():
            if self.framework == "tf":
                # TODO trace model
                predictions = self.model(inputs.data, training=False)[0]
            else:
                with torch.no_grad():
                    inputs = self.ensure_tensor_on_device(**inputs)
                    predictions = self.model(**inputs)[0].cpu()

        if return_tensors:
            return predictions
        else:
            return predictions.numpy()


class FeatureExtractionPipeline(Pipeline):
    """
    Feature extraction pipeline using no model head. This pipeline extracts the hidden states from the base transformer,
    which can be used as features in downstream tasks.

    This feature extraction pipeline can currently be loaded from the :func:`~transformers.pipeline` method using
    the following task identifier(s):

    - "feature-extraction", for extracting features of a sequence.

    All models may be used for this pipeline. See a list of all models, including community-contributed models on
    `huggingface.co/models <https://huggingface.co/models>`__.
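
    A minimal usage sketch (assumes a default model download):

        nlp = pipeline("feature-extraction")
        features = nlp("Hello world")  # nested list with one hidden-state vector per token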

    Arguments:
        model (:obj:`~transformers.PreTrainedModel` or :obj:`~transformers.TFPreTrainedModel`):
            The model that will be used by the pipeline to make predictions. This needs to be a model inheriting from
            :class:`~transformers.PreTrainedModel` for PyTorch and :class:`~transformers.TFPreTrainedModel` for
            TensorFlow.
        tokenizer (:obj:`~transformers.PreTrainedTokenizer`):
            The tokenizer that will be used by the pipeline to encode data for the model. This object inherits from
            :class:`~transformers.PreTrainedTokenizer`.
        modelcard (:obj:`str` or :class:`~transformers.ModelCard`, `optional`, defaults to :obj:`None`):
            Model card attributed to the model for this pipeline.
        framework (:obj:`str`, `optional`, defaults to :obj:`None`):
            The framework to use, either "pt" for PyTorch or "tf" for TensorFlow. The specified framework must be
            installed.

            If no framework is specified, will default to the one currently installed. If no framework is specified
            and both frameworks are installed, will default to PyTorch.
        args_parser (:class:`~transformers.pipelines.ArgumentHandler`, `optional`, defaults to :obj:`None`):
            Reference to the object in charge of parsing supplied pipeline parameters.
        device (:obj:`int`, `optional`, defaults to :obj:`-1`):
            Device ordinal for CPU/GPU support. Setting this to -1 will leverage CPU, >=0 will run the model
            on the associated CUDA device id.
    """

    def __init__(
        self,
        model: Union["PreTrainedModel", "TFPreTrainedModel"],
        tokenizer: PreTrainedTokenizer,
        modelcard: Optional[ModelCard] = None,
        framework: Optional[str] = None,
        args_parser: ArgumentHandler = None,
        device: int = -1,
        task: str = "",
    ):
        super().__init__(
            model=model,
            tokenizer=tokenizer,
            modelcard=modelcard,
            framework=framework,
            args_parser=args_parser,
            device=device,
            binary_output=True,
            task=task,
        )

    def __call__(self, *args, **kwargs):
        return super().__call__(*args, **kwargs).tolist()


class TextGenerationPipeline(Pipeline):
    """
    Language generation pipeline using any ModelWithLMHead head. This pipeline predicts the words that will follow a specified text prompt.

    This language generation pipeline can currently be loaded from the :func:`~transformers.pipeline` method using
    the following task identifier(s):

    - "text-generation", for generating text from a specified prompt.

    The models that this pipeline can use are models that have been trained with an autoregressive language modeling objective,
    which includes the uni-directional models in the library (e.g. gpt2).
    See the list of available community models on
    `huggingface.co/models <https://huggingface.co/models?search=&filter=lm-head>`__.
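
    A minimal usage sketch (assumes a default model download; `max_length` is a standard `generate` kwarg):

        nlp = pipeline("text-generation")
        nlp("In this course, we will teach you how to", max_length=30)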
    """

    # Padding text to help Transformer-XL and XLNet with short prompts as proposed by Aman Rusia
    # in https://github.com/rusiaaman/XLNet-gen#methodology
    # and https://medium.com/@amanrusia/xlnet-speaks-comparison-to-gpt-2-ea1a4e9ba39e

    PADDING_TEXT = """In 1991, the remains of Russian Tsar Nicholas II and his family
    (except for Alexei and Maria) are discovered.
    The voice of Nicholas's young son, Tsarevich Alexei Nikolaevich, narrates the
    remainder of the story. 1883 Western Siberia,
    a young Grigori Rasputin is asked by his father and a group of men to perform magic.
    Rasputin has a vision and denounces one of the men as a horse thief. Although his
    father initially slaps him for making such an accusation, Rasputin watches as the
    man is chased outside and beaten. Twenty years later, Rasputin sees a vision of
    the Virgin Mary, prompting him to become a priest. Rasputin quickly becomes famous,
    with people, even a bishop, begging for his blessing. <eod> </s> <eos>"""

    ALLOWED_MODELS = [
        "XLNetLMHeadModel",
        "TransfoXLLMHeadModel",
        "ReformerModelWithLMHead",
        "GPT2LMHeadModel",
        "OpenAIGPTLMHeadModel",
        "CTRLLMHeadModel",
        "TFXLNetLMHeadModel",
        "TFTransfoXLLMHeadModel",
        "TFGPT2LMHeadModel",
        "TFOpenAIGPTLMHeadModel",
        "TFCTRLLMHeadModel",
    ]

    def __call__(
        self, *args, return_tensors=False, return_text=True, clean_up_tokenization_spaces=False, **generate_kwargs
    ):
        if self.model.__class__.__name__ not in self.ALLOWED_MODELS:
            raise NotImplementedError(
                "Generation is currently not supported for {}. Please select a model from {} for generation.".format(
                    self.model.__class__.__name__, self.ALLOWED_MODELS
                )
            )

        text_inputs = self._args_parser(*args)

        results = []
        for prompt_text in text_inputs:
            # Manage correct placement of the tensors
            with self.device_placement():
                if self.model.__class__.__name__ in ["XLNetLMHeadModel", "TransfoXLLMHeadModel"]:
                    inputs = self._parse_and_tokenize(
                        self.PADDING_TEXT + prompt_text, pad_to_max_length=False, add_special_tokens=False
                    )
                else:
                    inputs = self._parse_and_tokenize(prompt_text, pad_to_max_length=False, add_special_tokens=False)

                # set input_ids to None to allow empty prompt
                if inputs["input_ids"].shape[-1] == 0:
                    inputs["input_ids"] = None
                    inputs["attention_mask"] = None

                if self.framework == "pt" and inputs["input_ids"] is not None:
                    inputs = self.ensure_tensor_on_device(**inputs)

                input_ids = inputs["input_ids"]

                # Ensure that batch size = 1 (batch generation not allowed for now)
                assert (
                    input_ids is None or input_ids.shape[0] == 1
                ), "Batch generation is currently not supported. See https://github.com/huggingface/transformers/issues/3021 for more information."

                output_sequences = self.model.generate(input_ids=input_ids, **generate_kwargs)  # BS x SL

            result = []
            for generated_sequence in output_sequences:
                generated_sequence = generated_sequence.numpy().tolist()
                record = {}
                if return_tensors:
                    record["generated_token_ids"] = generated_sequence
                if return_text:
                    # Decode text
                    text = self.tokenizer.decode(
                        generated_sequence,
                        skip_special_tokens=True,
                        clean_up_tokenization_spaces=clean_up_tokenization_spaces,
                    )

                    # Remove PADDING prompt of the sequence if XLNet or Transfo-XL model is used
                    if input_ids is None:
                        prompt_length = 0
                    else:
                        prompt_length = len(
                            self.tokenizer.decode(
                                input_ids[0],
                                skip_special_tokens=True,
                                clean_up_tokenization_spaces=clean_up_tokenization_spaces,
                            )
                        )

                    record["generated_text"] = prompt_text + text[prompt_length:]

                result.append(record)
            results += [result]

        if len(results) == 1:
            return results[0]

        return results


class TextClassificationPipeline(Pipeline):
    """
    Text classification pipeline using ModelForSequenceClassification head. See the
    `sequence classification usage <../usage.html#sequence-classification>`__ examples for more information.

    This text classification pipeline can currently be loaded from the :func:`~transformers.pipeline` method using
    the following task identifier(s):

    - "sentiment-analysis", for classifying sequences according to positive or negative sentiments.

    The models that this pipeline can use are models that have been fine-tuned on a sequence classification task.
    See the up-to-date list of available models on
    `huggingface.co/models <https://huggingface.co/models?filter=text-classification>`__.
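
    A minimal usage sketch (assumes a default model download; actual labels depend on the model):

        nlp = pipeline("sentiment-analysis")
        nlp("This movie was great!")  # e.g. [{"label": "POSITIVE", "score": 0.99}]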

    Arguments:
        model (:obj:`~transformers.PreTrainedModel` or :obj:`~transformers.TFPreTrainedModel`):
            The model that will be used by the pipeline to make predictions. This needs to be a model inheriting from
            :class:`~transformers.PreTrainedModel` for PyTorch and :class:`~transformers.TFPreTrainedModel` for
            TensorFlow.
        tokenizer (:obj:`~transformers.PreTrainedTokenizer`):
            The tokenizer that will be used by the pipeline to encode data for the model. This object inherits from
            :class:`~transformers.PreTrainedTokenizer`.
        modelcard (:obj:`str` or :class:`~transformers.ModelCard`, `optional`, defaults to :obj:`None`):
            Model card attributed to the model for this pipeline.
        framework (:obj:`str`, `optional`, defaults to :obj:`None`):
            The framework to use, either "pt" for PyTorch or "tf" for TensorFlow. The specified framework must be
            installed.

            If no framework is specified, will default to the one currently installed. If no framework is specified
            and both frameworks are installed, will default to PyTorch.
        args_parser (:class:`~transformers.pipelines.ArgumentHandler`, `optional`, defaults to :obj:`None`):
            Reference to the object in charge of parsing supplied pipeline parameters.
        device (:obj:`int`, `optional`, defaults to :obj:`-1`):
            Device ordinal for CPU/GPU support. Setting this to -1 will leverage CPU, >=0 will run the model
            on the associated CUDA device id.
    """

    def __init__(self, return_all_scores: bool = False, **kwargs):
        super().__init__(**kwargs)

        self.return_all_scores = return_all_scores

    def __call__(self, *args, **kwargs):
        outputs = super().__call__(*args, **kwargs)
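        # Normalize the output logits into probabilities with a softmax over the last axis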
        scores = np.exp(outputs) / np.exp(outputs).sum(-1, keepdims=True)
        if self.return_all_scores:
            return [
                [{"label": self.model.config.id2label[i], "score": score.item()} for i, score in enumerate(item)]
                for item in scores
            ]
        else:
            return [
                {"label": self.model.config.id2label[item.argmax()], "score": item.max().item()} for item in scores
            ]


class FillMaskPipeline(Pipeline):
    """
    Masked language modeling prediction pipeline using ModelWithLMHead head. See the
    `masked language modeling usage <../usage.html#masked-language-modeling>`__ examples for more information.

    This mask filling pipeline can currently be loaded from the :func:`~transformers.pipeline` method using
    the following task identifier(s):

    - "fill-mask", for predicting masked tokens in a sequence.

    The models that this pipeline can use are models that have been trained with a masked language modeling objective,
    which includes the bi-directional models in the library.
    See the up-to-date list of available models on
    `huggingface.co/models <https://huggingface.co/models?filter=lm-head>`__.
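
    A minimal usage sketch (assumes a default model download; the mask token depends on the tokenizer):

        nlp = pipeline("fill-mask")
        nlp("HuggingFace is creating a {} that the community uses.".format(nlp.tokenizer.mask_token))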

    Arguments:
        model (:obj:`~transformers.PreTrainedModel` or :obj:`~transformers.TFPreTrainedModel`):
            The model that will be used by the pipeline to make predictions. This needs to be a model inheriting from
            :class:`~transformers.PreTrainedModel` for PyTorch and :class:`~transformers.TFPreTrainedModel` for
            TensorFlow.
        tokenizer (:obj:`~transformers.PreTrainedTokenizer`):
            The tokenizer that will be used by the pipeline to encode data for the model. This object inherits from
            :class:`~transformers.PreTrainedTokenizer`.
        modelcard (:obj:`str` or :class:`~transformers.ModelCard`, `optional`, defaults to :obj:`None`):
            Model card attributed to the model for this pipeline.
        framework (:obj:`str`, `optional`, defaults to :obj:`None`):
            The framework to use, either "pt" for PyTorch or "tf" for TensorFlow. The specified framework must be
            installed.

            If no framework is specified, will default to the one currently installed. If no framework is specified
            and both frameworks are installed, will default to PyTorch.
        args_parser (:class:`~transformers.pipelines.ArgumentHandler`, `optional`, defaults to :obj:`None`):
            Reference to the object in charge of parsing supplied pipeline parameters.
        device (:obj:`int`, `optional`, defaults to :obj:`-1`):
            Device ordinal for CPU/GPU support. Setting this to -1 will leverage CPU, >=0 will run the model
            on the associated CUDA device id.
    """

    def __init__(
        self,
        model: Union["PreTrainedModel", "TFPreTrainedModel"],
        tokenizer: PreTrainedTokenizer,
        modelcard: Optional[ModelCard] = None,
        framework: Optional[str] = None,
        args_parser: ArgumentHandler = None,
        device: int = -1,
        topk=5,
        task: str = "",
    ):
        super().__init__(
            model=model,
            tokenizer=tokenizer,
            modelcard=modelcard,
            framework=framework,
            args_parser=args_parser,
            device=device,
            binary_output=True,
            task=task,
        )

        self.topk = topk

    def __call__(self, *args, **kwargs):
        inputs = self._parse_and_tokenize(*args, **kwargs)
        outputs = self._forward(inputs, return_tensors=True)

        results = []
        batch_size = outputs.shape[0] if self.framework == "tf" else outputs.size(0)

        for i in range(batch_size):
            input_ids = inputs["input_ids"][i]
            result = []

            if self.framework == "tf":
                masked_index = tf.where(input_ids == self.tokenizer.mask_token_id).numpy().item()
                logits = outputs[i, masked_index, :]
                probs = tf.nn.softmax(logits)
                topk = tf.math.top_k(probs, k=self.topk)
                values, predictions = topk.values.numpy(), topk.indices.numpy()
            else:
                masked_index = (input_ids == self.tokenizer.mask_token_id).nonzero().item()
                logits = outputs[i, masked_index, :]
                probs = logits.softmax(dim=0)
                values, predictions = probs.topk(self.topk)

            for v, p in zip(values.tolist(), predictions.tolist()):
                tokens = input_ids.numpy()
                tokens[masked_index] = p
                # Filter padding out:
                tokens = tokens[np.where(tokens != self.tokenizer.pad_token_id)]
                result.append(
                    {
                        "sequence": self.tokenizer.decode(tokens),
                        "score": v,
                        "token": p,
                        "token_str": self.tokenizer.convert_ids_to_tokens(p),
                    }
                )

            # Append
            results += [result]

        if len(results) == 1:
            return results[0]
        return results


class TokenClassificationPipeline(Pipeline):
    """
    Named Entity Recognition pipeline using ModelForTokenClassification head. See the
    `named entity recognition usage <../usage.html#named-entity-recognition>`__ examples for more information.

    This token recognition pipeline can currently be loaded from the :func:`~transformers.pipeline` method using
    the following task identifier(s):

    - "ner", for predicting the classes of tokens in a sequence: person, organisation, location or miscellaneous.

    The models that this pipeline can use are models that have been fine-tuned on a token classification task.
    See the up-to-date list of available models on
    `huggingface.co/models <https://huggingface.co/models?filter=token-classification>`__.
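
    A minimal usage sketch (assumes a default model download; entity labels depend on the model):

        nlp = pipeline("ner")
        nlp("My name is Wolfgang and I live in Berlin")  # list of dicts with "word", "score", "entity", "index"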

    Arguments:
        model (:obj:`~transformers.PreTrainedModel` or :obj:`~transformers.TFPreTrainedModel`):
            The model that will be used by the pipeline to make predictions. This needs to be a model inheriting from
            :class:`~transformers.PreTrainedModel` for PyTorch and :class:`~transformers.TFPreTrainedModel` for
            TensorFlow.
        tokenizer (:obj:`~transformers.PreTrainedTokenizer`):
            The tokenizer that will be used by the pipeline to encode data for the model. This object inherits from
            :class:`~transformers.PreTrainedTokenizer`.
        modelcard (:obj:`str` or :class:`~transformers.ModelCard`, `optional`, defaults to :obj:`None`):
            Model card attributed to the model for this pipeline.
        framework (:obj:`str`, `optional`, defaults to :obj:`None`):
            The framework to use, either "pt" for PyTorch or "tf" for TensorFlow. The specified framework must be
            installed.

            If no framework is specified, will default to the one currently installed. If no framework is specified
            and both frameworks are installed, will default to PyTorch.
        args_parser (:class:`~transformers.pipelines.ArgumentHandler`, `optional`, defaults to :obj:`None`):
            Reference to the object in charge of parsing supplied pipeline parameters.
        device (:obj:`int`, `optional`, defaults to :obj:`-1`):
            Device ordinal for CPU/GPU support. Setting this to -1 will leverage CPU, >=0 will run the model
            on the associated CUDA device id.
    """

    default_input_names = "sequences"

    def __init__(
        self,
    def __init__(
        self,
        model: Union["PreTrainedModel", "TFPreTrainedModel"],
        tokenizer: PreTrainedTokenizer,
        modelcard: Optional[ModelCard] = None,
        framework: Optional[str] = None,
        args_parser: ArgumentHandler = None,
        device: int = -1,
        binary_output: bool = False,
        ignore_labels=["O"],
        task: str = "",
        grouped_entities: bool = False,
    ):
        super().__init__(
            model=model,
            tokenizer=tokenizer,
            modelcard=modelcard,
            framework=framework,
            args_parser=args_parser,
            device=device,
            binary_output=binary_output,
            task=task,
        )

        self._basic_tokenizer = BasicTokenizer(do_lower_case=False)
        self.ignore_labels = ignore_labels
        self.grouped_entities = grouped_entities

    def __call__(self, *args, **kwargs):
        inputs = self._args_parser(*args, **kwargs)
        answers = []
        for sentence in inputs:

            # Manage correct placement of the tensors
            with self.device_placement():

                tokens = self.tokenizer.encode_plus(
                    sentence,
                    return_attention_mask=False,
                    return_tensors=self.framework,
                    max_length=self.tokenizer.max_len,
                )

                # Forward
                if self.framework == "tf":
                    entities = self.model(tokens.data)[0][0].numpy()
                    input_ids = tokens["input_ids"].numpy()[0]
                else:
                    with torch.no_grad():
                        tokens = self.ensure_tensor_on_device(**tokens)
                        entities = self.model(**tokens)[0][0].cpu().numpy()
                        input_ids = tokens["input_ids"].cpu().numpy()[0]

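            # Normalize token-level logits into probabilities with a softmax over the label axis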
            score = np.exp(entities) / np.exp(entities).sum(-1, keepdims=True)
            labels_idx = score.argmax(axis=-1)

            entities = []
            entity_groups = []
            entity_group_disagg = []
            # Filter to labels not in `self.ignore_labels`
            filtered_labels_idx = [
                (idx, label_idx)
                for idx, label_idx in enumerate(labels_idx)
                if self.model.config.id2label[label_idx] not in self.ignore_labels
            ]

            for idx, label_idx in filtered_labels_idx:

                entity = {
                    "word": self.tokenizer.convert_ids_to_tokens(int(input_ids[idx])),
                    "score": score[idx][label_idx].item(),
                    "entity": self.model.config.id2label[label_idx],
                    "index": idx,
                }
                last_idx, _ = filtered_labels_idx[-1]
                if self.grouped_entities:
                    if not entity_group_disagg:
                        entity_group_disagg += [entity]
                        if idx == last_idx:
                            entity_groups += [self.group_entities(entity_group_disagg)]
                        continue

                    # If the current entity is similar and adjacent to the previous entity, append it to the disaggregated entity group
                    if (
                        entity["entity"] == entity_group_disagg[-1]["entity"]
                        and entity["index"] == entity_group_disagg[-1]["index"] + 1
                    ):
                        entity_group_disagg += [entity]
                        # Group the entities at the last entity
                        if idx == last_idx:
                            entity_groups += [self.group_entities(entity_group_disagg)]
                    # If the current entity is different from the previous entity, aggregate the disaggregated entity group
                    else:
                        entity_groups += [self.group_entities(entity_group_disagg)]
                        entity_group_disagg = [entity]

                entities += [entity]

            # Append
            if self.grouped_entities:
                answers += [entity_groups]
            else:
                answers += [entities]

        if len(answers) == 1:
            return answers[0]
        return answers

    def group_entities(self, entities):
        """
        Returns grouped entities
        """
        # Get the last entity in the entity group
        entity = entities[-1]["entity"]
        scores = np.mean([entity["score"] for entity in entities])
        tokens = [entity["word"] for entity in entities]

        entity_group = {
            "entity_group": entity,
            "score": np.mean(scores),
            "word": self.tokenizer.convert_tokens_to_string(tokens),
        }
        return entity_group


NerPipeline = TokenClassificationPipeline


class QuestionAnsweringArgumentHandler(ArgumentHandler):
    """
    QuestionAnsweringPipeline requires the user to provide multiple arguments (i.e. question & context) to be mapped
    to internal SquadExample / SquadFeature structures.

    QuestionAnsweringArgumentHandler manages all the possible ways to create a SquadExample from the command-line
    supplied arguments.
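
    Accepted input shapes, as a rough sketch:
        nlp(question="...", context="...")
        nlp({"question": "...", "context": "..."})
        nlp(X=[{"question": "...", "context": "..."}, ...])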
    """

    def __call__(self, *args, **kwargs):
        # Position args, handling is sensibly the same as X and data, so forwarding to avoid duplicating
        if args is not None and len(args) > 0:
            if len(args) == 1:
                kwargs["X"] = args[0]
            else:
                kwargs["X"] = list(args)

        # Generic compatibility with sklearn and Keras
        # Batched data
        if "X" in kwargs or "data" in kwargs:
            inputs = kwargs["X"] if "X" in kwargs else kwargs["data"]

            if isinstance(inputs, dict):
                inputs = [inputs]
            else:
                # Copy to avoid overriding arguments
                inputs = [i for i in inputs]

            for i, item in enumerate(inputs):
                if isinstance(item, dict):
                    if any(k not in item for k in ["question", "context"]):
                        raise KeyError("You need to provide a dictionary with keys {question:..., context:...}")

                    inputs[i] = QuestionAnsweringPipeline.create_sample(**item)

                elif not isinstance(item, SquadExample):
                    raise ValueError(
                        "{} argument needs to be of type (list[SquadExample | dict], SquadExample, dict)".format(
                            "X" if "X" in kwargs else "data"
                        )
                    )

        # Tabular input
        elif "question" in kwargs and "context" in kwargs:
            if isinstance(kwargs["question"], str):
                kwargs["question"] = [kwargs["question"]]

            if isinstance(kwargs["context"], str):
                kwargs["context"] = [kwargs["context"]]

            inputs = [
                QuestionAnsweringPipeline.create_sample(q, c) for q, c in zip(kwargs["question"], kwargs["context"])
            ]
        else:
            raise ValueError("Unknown arguments {}".format(kwargs))

        if not isinstance(inputs, list):
            inputs = [inputs]

        return inputs


class QuestionAnsweringPipeline(Pipeline):
    """
    Question Answering pipeline using ModelForQuestionAnswering head. See the
    `question answering usage <../usage.html#question-answering>`__ examples for more information.

    This question answering pipeline can currently be loaded from the :func:`~transformers.pipeline` method using
    the following task identifier(s):

    - "question-answering", for answering questions given a context.

    The models that this pipeline can use are models that have been fine-tuned on a question answering task.
    See the up-to-date list of available models on
    `huggingface.co/models <https://huggingface.co/models?filter=question-answering>`__.
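
    A minimal usage sketch (assumes a default model download; the example strings are illustrative):

        nlp = pipeline("question-answering")
        nlp(question="What is my name?", context="My name is Sylvain.")
        # -> {"answer": ..., "score": ..., "start": ..., "end": ...}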

    Arguments:
        model (:obj:`~transformers.PreTrainedModel` or :obj:`~transformers.TFPreTrainedModel`):
            The model that will be used by the pipeline to make predictions. This needs to be a model inheriting from
            :class:`~transformers.PreTrainedModel` for PyTorch and :class:`~transformers.TFPreTrainedModel` for
            TensorFlow.
        tokenizer (:obj:`~transformers.PreTrainedTokenizer`):
            The tokenizer that will be used by the pipeline to encode data for the model. This object inherits from
            :class:`~transformers.PreTrainedTokenizer`.
        modelcard (:obj:`str` or :class:`~transformers.ModelCard`, `optional`, defaults to :obj:`None`):
            Model card attributed to the model for this pipeline.
        framework (:obj:`str`, `optional`, defaults to :obj:`None`):
            The framework to use, either "pt" for PyTorch or "tf" for TensorFlow. The specified framework must be
            installed.

            If no framework is specified, will default to the one currently installed. If no framework is specified
            and both frameworks are installed, will default to PyTorch.
        args_parser (:class:`~transformers.pipelines.ArgumentHandler`, `optional`, defaults to :obj:`None`):
            Reference to the object in charge of parsing supplied pipeline parameters.
        device (:obj:`int`, `optional`, defaults to :obj:`-1`):
            Device ordinal for CPU/GPU support. Setting this to -1 will leverage CPU, >=0 will run the model
            on the associated CUDA device id.
    """

    default_input_names = "question,context"

    def __init__(
        self,
1117
1118
        model: Union["PreTrainedModel", "TFPreTrainedModel"],
        tokenizer: PreTrainedTokenizer,
1119
        modelcard: Optional[ModelCard] = None,
1120
1121
        framework: Optional[str] = None,
        device: int = -1,
1122
        task: str = "",
1123
1124
1125
1126
1127
1128
1129
1130
1131
        **kwargs
    ):
        super().__init__(
            model=model,
            tokenizer=tokenizer,
            modelcard=modelcard,
            framework=framework,
            args_parser=QuestionAnsweringArgumentHandler(),
            device=device,
1132
            task=task,
1133
            **kwargs,
1134
        )
thomwolf's avatar
thomwolf committed
1135

    @staticmethod
    def create_sample(
        question: Union[str, List[str]], context: Union[str, List[str]]
    ) -> Union[SquadExample, List[SquadExample]]:
        """
        QuestionAnsweringPipeline leverages the SquadExample/SquadFeatures internally.
        This helper method encapsulates all the logic for converting question(s) and context(s) to SquadExample(s).
        We currently support extractive question answering.

        Arguments:
             question: (str, List[str]) The question(s) to ask given the associated context(s).
             context: (str, List[str]) The context(s) in which to look for the answer.

        Returns:
            SquadExample initialized with the corresponding question and context.
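
        Examples::

            # A single (question, context) pair yields one SquadExample
            QuestionAnsweringPipeline.create_sample("Who?", "Someone did it.")

            # Lists of questions/contexts yield a list of SquadExample (zipped pairwise)
            QuestionAnsweringPipeline.create_sample(["Who?", "When?"], ["Someone did it.", "It happened in 2020."])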
        """
        if isinstance(question, list):
            return [SquadExample(None, q, c, None, None, None) for q, c in zip(question, context)]
        else:
            return SquadExample(None, question, context, None, None, None)

    def __call__(self, *args, **kwargs):
        """
        Args:
            We support multiple use-cases, the following are exclusive:
            X: sequence of SquadExample
            data: sequence of SquadExample
            question: (str, List[str]), batch of question(s) to map along with the context
            context: (str, List[str]), batch of context(s) associated with the provided question keyword argument
        Returns:
            dict: {'answer': str, 'score': float, 'start': int, 'end': int}
            answer: the textual answer in the initial context
            score: the probability assigned by the model to the answer
            start: the character index in the original string corresponding to the beginning of the answer's span
            end: the character index in the original string corresponding to the end of the answer's span
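
        Examples::

            # A minimal sketch; `nlp` is a QuestionAnsweringPipeline instance.
            # Both calls below are equivalent:
            nlp(question="Who wrote it?", context="It was written by Jane.")
            nlp(X=[SquadExample(None, "Who wrote it?", "It was written by Jane.", None, None, None)])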
        """
        # Set default values
        kwargs.setdefault("topk", 1)
        kwargs.setdefault("doc_stride", 128)
        kwargs.setdefault("max_answer_len", 15)
        kwargs.setdefault("max_seq_len", 384)
        kwargs.setdefault("max_question_len", 64)
        kwargs.setdefault("handle_impossible_answer", False)

        if kwargs["topk"] < 1:
            raise ValueError("topk parameter should be >= 1 (got {})".format(kwargs["topk"]))

        if kwargs["max_answer_len"] < 1:
            raise ValueError("max_answer_len parameter should be >= 1 (got {})".format(kwargs["max_answer_len"]))

        # Convert inputs to features
        examples = self._args_parser(*args, **kwargs)
        features_list = [
            squad_convert_examples_to_features(
                [example],
                self.tokenizer,
                kwargs["max_seq_len"],
                kwargs["doc_stride"],
                kwargs["max_question_len"],
                False,  # is_training
                tqdm_enabled=False,
            )
            for example in examples
        ]
        all_answers = []
        for features, example in zip(features_list, examples):
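            # Gather, for each feature, the inputs expected by the model (tokenizer-specific names plus input_ids)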
            model_input_names = self.tokenizer.model_input_names + ["input_ids"]
            fw_args = {k: [feature.__dict__[k] for feature in features] for k in model_input_names}

            # Manage tensor allocation on correct device
            with self.device_placement():
                if self.framework == "tf":
                    fw_args = {k: tf.constant(v) for (k, v) in fw_args.items()}
                    start, end = self.model(fw_args)
                    start, end = start.numpy(), end.numpy()
                else:
                    with torch.no_grad():
                        # Retrieve the score for the context tokens only (removing question tokens)
                        fw_args = {k: torch.tensor(v, device=self.device) for (k, v) in fw_args.items()}
                        start, end = self.model(**fw_args)
                        start, end = start.cpu().numpy(), end.cpu().numpy()

            min_null_score = 1000000  # large and positive
            answers = []
            for (feature, start_, end_) in zip(features, start, end):
                # Normalize logits with a softmax to get start/end probabilities for each token
                start_ = np.exp(start_) / np.sum(np.exp(start_))
                end_ = np.exp(end_) / np.sum(np.exp(end_))

                # Mask padding and question
                start_, end_ = (
                    start_ * np.abs(np.array(feature.p_mask) - 1),
                    end_ * np.abs(np.array(feature.p_mask) - 1),
                )

                if kwargs["handle_impossible_answer"]:
                    min_null_score = min(min_null_score, (start_[0] * end_[0]).item())

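                # Zero out the first token (CLS for BERT-like models) so it is never picked as the answer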
                start_[0] = end_[0] = 0

                starts, ends, scores = self.decode(start_, end_, kwargs["topk"], kwargs["max_answer_len"])
                char_to_word = np.array(example.char_to_word_offset)

                # Convert the answer (tokens) back to the original text
                answers += [
                    {
                        "score": score.item(),
                        "start": np.where(char_to_word == feature.token_to_orig_map[s])[0][0].item(),
                        "end": np.where(char_to_word == feature.token_to_orig_map[e])[0][-1].item(),
                        "answer": " ".join(
                            example.doc_tokens[feature.token_to_orig_map[s] : feature.token_to_orig_map[e] + 1]
                        ),
                    }
                    for s, e, score in zip(starts, ends, scores)
                ]

            if kwargs["handle_impossible_answer"]:
                answers.append({"score": min_null_score, "start": 0, "end": 0, "answer": ""})

            answers = sorted(answers, key=lambda x: x["score"], reverse=True)[: kwargs["topk"]]
            all_answers += answers

        if len(all_answers) == 1:
            return all_answers[0]
        return all_answers

    def decode(self, start: np.ndarray, end: np.ndarray, topk: int, max_answer_len: int) -> Tuple:
        """
        Takes the output of any QuestionAnswering head and generates probabilities for each span to be
        the actual answer.
        In addition, it filters out some unwanted/impossible cases like answers longer than max_answer_len
        or answer spans that end before they start.
        The method supports returning the k best answers through the topk argument.

        Args:
            start: numpy array, holding individual start probabilities for each token
            end: numpy array, holding individual end probabilities for each token
            topk: int, indicates how many possible answer span(s) to extract from the model's output
            max_answer_len: int, maximum size of the answer to extract from the model's output
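
        Example::

            # A worked sketch with 3 tokens, topk=1 and max_answer_len=2: the score of a
            # candidate span (i, j) is start[i] * end[j], kept only when 0 <= j - i < max_answer_len.
            start = np.array([0.1, 0.7, 0.2])
            end = np.array([0.1, 0.2, 0.7])
            # The best span is (1, 2) with score 0.7 * 0.7 = 0.49, so decode returns
            # (array([1]), array([2]), array([0.49]))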
        """
        # Ensure we have batch axis
        if start.ndim == 1:
            start = start[None]

        if end.ndim == 1:
            end = end[None]

        # Compute the score of each tuple(start, end) to be the real answer
        outer = np.matmul(np.expand_dims(start, -1), np.expand_dims(end, 1))

        # Remove candidates with end < start or with a span longer than max_answer_len
        candidates = np.tril(np.triu(outer), max_answer_len - 1)

        # Inspired by Chen et al. (https://github.com/facebookresearch/DrQA)
        scores_flat = candidates.flatten()
        if topk == 1:
            idx_sort = [np.argmax(scores_flat)]
        elif len(scores_flat) < topk:
            idx_sort = np.argsort(-scores_flat)
        else:
            idx = np.argpartition(-scores_flat, topk)[0:topk]
            idx_sort = idx[np.argsort(-scores_flat[idx])]

        start, end = np.unravel_index(idx_sort, candidates.shape)[1:]
        return start, end, candidates[0, start, end]

    def span_to_answer(self, text: str, start: int, end: int):
        """
        When decoding from token probabilities, this method maps token indexes to actual words in
        the initial context.

        Args:
            text: str, the actual context to extract the answer from
            start: int, starting answer token index
            end: int, ending answer token index

        Returns:
            dict: {'answer': str, 'start': int, 'end': int}
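
        Example::

            # A minimal sketch, assuming a tokenizer that maps each word to a single token:
            #   span_to_answer("huggingface pipelines rock", start=1, end=1)
            # returns {'answer': 'pipelines', 'start': 12, 'end': 21}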
        """
        words = []
        token_idx = char_start_idx = char_end_idx = chars_idx = 0

        for i, word in enumerate(text.split(" ")):
            token = self.tokenizer.tokenize(word)

            # Append words if they are in the span
            if start <= token_idx <= end:
                if token_idx == start:
                    char_start_idx = chars_idx

                if token_idx == end:
                    char_end_idx = chars_idx + len(word)

                words += [word]

            # Stop if we went over the end of the answer
            if token_idx > end:
                break

            # Append the subtokenization length to the running index
            token_idx += len(token)
            chars_idx += len(word) + 1

        # Join text with spaces
        return {
            "answer": " ".join(words),
            "start": max(0, char_start_idx),
            "end": min(len(text), char_end_idx),
        }


class SummarizationPipeline(Pipeline):
    """
    Summarize news articles and other documents

    Usage::

        # use bart in pytorch
        summarizer = pipeline("summarization")
        summarizer("Sam Shleifer writes the best docstring examples in the whole world.", min_length=5, max_length=20)

        # use t5 in tf
        summarizer = pipeline("summarization", model="t5-base", tokenizer="t5-base", framework="tf")
        summarizer("Sam Shleifer writes the best docstring examples in the whole world.", min_length=5, max_length=20)

    The models that this pipeline can use are models that have been fine-tuned on a summarization task,
    currently: '`bart-large-cnn`', '`t5-small`', '`t5-base`', '`t5-large`', '`t5-3b`', '`t5-11b`'.
    See the up-to-date list of available models on
    `huggingface.co/models <https://huggingface.co/models?filter=summarization>`__.

    Arguments:
        model (:obj:`str` or :obj:`~transformers.PreTrainedModel` or :obj:`~transformers.TFPreTrainedModel`, `optional`, defaults to :obj:`None`):
            The model that will be used by the pipeline to make predictions. This can be :obj:`None`, a string
            checkpoint identifier or an actual pre-trained model inheriting from
            :class:`~transformers.PreTrainedModel` for PyTorch and :class:`~transformers.TFPreTrainedModel` for
            TensorFlow.

            If :obj:`None`, the default of the pipeline will be loaded.
        tokenizer (:obj:`str` or :obj:`~transformers.PreTrainedTokenizer`, `optional`, defaults to :obj:`None`):
            The tokenizer that will be used by the pipeline to encode data for the model. This can be :obj:`None`,
            a string checkpoint identifier or an actual pre-trained tokenizer inheriting from
            :class:`~transformers.PreTrainedTokenizer`.

            If :obj:`None`, the default of the pipeline will be loaded.
        modelcard (:obj:`str` or :class:`~transformers.ModelCard`, `optional`, defaults to :obj:`None`):
            Model card attributed to the model for this pipeline.
        framework (:obj:`str`, `optional`, defaults to :obj:`None`):
            The framework to use, either "pt" for PyTorch or "tf" for TensorFlow. The specified framework must be
            installed.

            If no framework is specified, will default to the one currently installed. If no framework is specified
            and both frameworks are installed, will default to PyTorch.
        args_parser (:class:`~transformers.pipelines.ArgumentHandler`, `optional`, defaults to :obj:`None`):
            Reference to the object in charge of parsing supplied pipeline parameters.
        device (:obj:`int`, `optional`, defaults to :obj:`-1`):
            Device ordinal for CPU/GPU support. Setting this to -1 will run the model on the CPU, while >=0 will
            run it on the associated CUDA device id.
    """

    def __call__(
        self, *documents, return_tensors=False, return_text=True, clean_up_tokenization_spaces=False, **generate_kwargs
    ):
        r"""
        Args:
            *documents: (list of strings) articles to be summarized
            return_text: (bool, default=True) whether to add a decoded "summary_text" to each result
            return_tensors: (bool, default=False) whether to add the raw "summary_token_ids" to each result

            clean_up_tokenization_spaces: (bool, `optional`) whether to clean up the potential extra spaces in the decoded text
            **generate_kwargs: extra kwargs passed to `self.model.generate`_

        Returns:
            list of dicts with 'summary_text' and/or 'summary_token_ids' for each document_to_summarize

        .. _`self.model.generate`:
            https://huggingface.co/transformers/model_doc/bart.html#transformers.BartForConditionalGeneration.generate

        """
        assert return_tensors or return_text, "You must specify return_tensors=True or return_text=True"
        assert len(documents) > 0, "Please provide a document to summarize"

        if self.framework == "tf" and "BartForConditionalGeneration" in self.model.__class__.__name__:
            raise NotImplementedError(
                "TensorFlow is not yet supported for Bart. Please consider using T5, e.g. `t5-base`"
            )

        prefix = self.model.config.prefix if self.model.config.prefix is not None else ""

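        # A list as first argument is treated as a batch: prepend the model's task prefix (if any) to each document and pad them together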
        if isinstance(documents[0], list):
            assert (
                self.tokenizer.pad_token_id is not None
            ), "Please make sure that the tokenizer has a pad_token_id when using a batch input"

            documents = ([prefix + document for document in documents[0]],)
            pad_to_max_length = True

        elif isinstance(documents[0], str):
            documents = (prefix + documents[0],)
            pad_to_max_length = False
        else:
            raise ValueError(
                " `documents[0]`: {} has the wrong format. It should be either of type `str` or type `list`".format(
                    documents[0]
                )
            )

        with self.device_placement():
            inputs = self._parse_and_tokenize(*documents, pad_to_max_length=pad_to_max_length)

            if self.framework == "pt":
                inputs = self.ensure_tensor_on_device(**inputs)
                input_length = inputs["input_ids"].shape[-1]
            elif self.framework == "tf":
                input_length = tf.shape(inputs["input_ids"])[-1].numpy()

            min_length = generate_kwargs.get("min_length", self.model.config.min_length)
            if input_length < min_length // 2:
                logger.warning(
                    "Your min_length is set to {}, but your input_length is only {}. You might consider decreasing min_length manually, e.g. summarizer('...', min_length=10)".format(
                        min_length, input_length
                    )
                )

            max_length = generate_kwargs.get("max_length", self.model.config.max_length)
            if input_length < max_length:
                logger.warning(
                    "Your max_length is set to {}, but your input_length is only {}. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=50)".format(
                        max_length, input_length
                    )
                )

            summaries = self.model.generate(
                inputs["input_ids"], attention_mask=inputs["attention_mask"], **generate_kwargs,
            )

            results = []
            for summary in summaries:
                record = {}
                if return_tensors:
                    record["summary_token_ids"] = summary
                if return_text:
                    record["summary_text"] = self.tokenizer.decode(
                        summary, skip_special_tokens=True, clean_up_tokenization_spaces=clean_up_tokenization_spaces,
                    )
                results.append(record)
            return results


class TranslationPipeline(Pipeline):
    """
    Translates from one language to another.

    Usage::

        en_fr_translator = pipeline("translation_en_to_fr")
        en_fr_translator("How old are you?")

    The models that this pipeline can use are models that have been fine-tuned on a translation task,
    currently: "t5-small", "t5-base", "t5-large", "t5-3b", "t5-11b"
    See the up-to-date list of available models on
    `huggingface.co/models <https://huggingface.co/models?filter=translation>`__.

    Arguments:
        model (:obj:`str` or :obj:`~transformers.PreTrainedModel` or :obj:`~transformers.TFPreTrainedModel`, `optional`, defaults to :obj:`None`):
            The model that will be used by the pipeline to make predictions. This can be :obj:`None`, a string
            checkpoint identifier or an actual pre-trained model inheriting from
            :class:`~transformers.PreTrainedModel` for PyTorch and :class:`~transformers.TFPreTrainedModel` for
            TensorFlow.
            If :obj:`None`, the default of the pipeline will be loaded.
        tokenizer (:obj:`str` or :obj:`~transformers.PreTrainedTokenizer`, `optional`, defaults to :obj:`None`):
            The tokenizer that will be used by the pipeline to encode data for the model. This can be :obj:`None`,
            a string checkpoint identifier or an actual pre-trained tokenizer inheriting from
            :class:`~transformers.PreTrainedTokenizer`.
            If :obj:`None`, the default of the pipeline will be loaded.
        modelcard (:obj:`str` or :class:`~transformers.ModelCard`, `optional`, defaults to :obj:`None`):
            Model card attributed to the model for this pipeline.
        framework (:obj:`str`, `optional`, defaults to :obj:`None`):
            The framework to use, either "pt" for PyTorch or "tf" for TensorFlow. The specified framework must be
            installed.
            If no framework is specified, will default to the one currently installed. If no framework is specified
            and both frameworks are installed, will default to PyTorch.
        args_parser (:class:`~transformers.pipelines.ArgumentHandler`, `optional`, defaults to :obj:`None`):
            Reference to the object in charge of parsing supplied pipeline parameters.
        device (:obj:`int`, `optional`, defaults to :obj:`-1`):
            Device ordinal for CPU/GPU support. Setting this to -1 will run the model on the CPU, while >=0 will
            run it on the associated CUDA device id.
    """

    def __call__(
        self, *args, return_tensors=False, return_text=True, clean_up_tokenization_spaces=False, **generate_kwargs
    ):
        r"""
        Args:
            *args: (list of strings) texts to be translated
            return_text: (bool, default=True) whether to add a decoded "translation_text" to each result
            return_tensors: (bool, default=False) whether to add the raw "translation_token_ids" to each result

            **generate_kwargs: extra kwargs passed to `self.model.generate`_

        Returns:
            list of dicts with 'translation_text' and/or 'translation_token_ids' for each text_to_translate

        .. _`self.model.generate`:
            https://huggingface.co/transformers/model_doc/bart.html#transformers.BartForConditionalGeneration.generate
        """
        assert return_tensors or return_text, "You must specify return_tensors=True or return_text=True"

        prefix = self.model.config.prefix if self.model.config.prefix is not None else ""

        if isinstance(args[0], list):
            assert (
                self.tokenizer.pad_token_id is not None
            ), "Please make sure that the tokenizer has a pad_token_id when using a batch input"
            args = ([prefix + text for text in args[0]],)
            pad_to_max_length = True

        elif isinstance(args[0], str):
            args = (prefix + args[0],)
            pad_to_max_length = False
        else:
            raise ValueError(
                " `args[0]`: {} has the wrong format. It should be either of type `str` or type `list`".format(
                    args[0]
                )
            )

        with self.device_placement():
            inputs = self._parse_and_tokenize(*args, pad_to_max_length=pad_to_max_length)

            if self.framework == "pt":
                inputs = self.ensure_tensor_on_device(**inputs)
                input_length = inputs["input_ids"].shape[-1]

            elif self.framework == "tf":
                input_length = tf.shape(inputs["input_ids"])[-1].numpy()

            max_length = generate_kwargs.get("max_length", self.model.config.max_length)
            if input_length > 0.9 * max_length:
                logger.warning(
                    "Your input_length: {} is bigger than 0.9 * max_length: {}. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)".format(
                        input_length, max_length
                    )
                )

            translations = self.model.generate(
                inputs["input_ids"], attention_mask=inputs["attention_mask"], **generate_kwargs,
            )
            results = []
            for translation in translations:
                record = {}
                if return_tensors:
                    record["translation_token_ids"] = translation
                if return_text:
                    record["translation_text"] = self.tokenizer.decode(
                        translation,
                        skip_special_tokens=True,
                        clean_up_tokenization_spaces=clean_up_tokenization_spaces,
                    )
                results.append(record)
            return results


# Register all the supported tasks here
SUPPORTED_TASKS = {
    "feature-extraction": {
        "impl": FeatureExtractionPipeline,
        "tf": TFAutoModel if is_tf_available() else None,
        "pt": AutoModel if is_torch_available() else None,
        "default": {"model": {"pt": "distilbert-base-cased", "tf": "distilbert-base-cased"}},
    },
    "sentiment-analysis": {
        "impl": TextClassificationPipeline,
        "tf": TFAutoModelForSequenceClassification if is_tf_available() else None,
        "pt": AutoModelForSequenceClassification if is_torch_available() else None,
        "default": {
            "model": {
                "pt": "distilbert-base-uncased-finetuned-sst-2-english",
                "tf": "distilbert-base-uncased-finetuned-sst-2-english",
            },
        },
    },
    "ner": {
        "impl": TokenClassificationPipeline,
        "tf": TFAutoModelForTokenClassification if is_tf_available() else None,
        "pt": AutoModelForTokenClassification if is_torch_available() else None,
        "default": {
            "model": {
                "pt": "dbmdz/bert-large-cased-finetuned-conll03-english",
                "tf": "dbmdz/bert-large-cased-finetuned-conll03-english",
            },
        },
    },
    "question-answering": {
        "impl": QuestionAnsweringPipeline,
        "tf": TFAutoModelForQuestionAnswering if is_tf_available() else None,
        "pt": AutoModelForQuestionAnswering if is_torch_available() else None,
        "default": {
            "model": {"pt": "distilbert-base-cased-distilled-squad", "tf": "distilbert-base-cased-distilled-squad"},
        },
    },
    "fill-mask": {
        "impl": FillMaskPipeline,
        "tf": TFAutoModelWithLMHead if is_tf_available() else None,
        "pt": AutoModelWithLMHead if is_torch_available() else None,
        "default": {"model": {"pt": "distilroberta-base", "tf": "distilroberta-base"}},
    },
    "summarization": {
        "impl": SummarizationPipeline,
        "tf": TFAutoModelWithLMHead if is_tf_available() else None,
        "pt": AutoModelWithLMHead if is_torch_available() else None,
        "default": {"model": {"pt": "facebook/bart-large-cnn", "tf": "t5-small"}},
    },
    "translation_en_to_fr": {
        "impl": TranslationPipeline,
        "tf": TFAutoModelWithLMHead if is_tf_available() else None,
        "pt": AutoModelWithLMHead if is_torch_available() else None,
        "default": {"model": {"pt": "t5-base", "tf": "t5-base"}},
    },
    "translation_en_to_de": {
        "impl": TranslationPipeline,
        "tf": TFAutoModelWithLMHead if is_tf_available() else None,
        "pt": AutoModelWithLMHead if is_torch_available() else None,
        "default": {"model": {"pt": "t5-base", "tf": "t5-base"}},
    },
    "translation_en_to_ro": {
        "impl": TranslationPipeline,
        "tf": TFAutoModelWithLMHead if is_tf_available() else None,
        "pt": AutoModelWithLMHead if is_torch_available() else None,
        "default": {"model": {"pt": "t5-base", "tf": "t5-base"}},
    },
    "text-generation": {
        "impl": TextGenerationPipeline,
        "tf": TFAutoModelWithLMHead if is_tf_available() else None,
        "pt": AutoModelWithLMHead if is_torch_available() else None,
        "default": {"model": {"pt": "gpt2", "tf": "gpt2"}},
    },
}


def pipeline(
    task: str,
    model: Optional[Union[str, "PreTrainedModel", "TFPreTrainedModel"]] = None,
    config: Optional[Union[str, PretrainedConfig]] = None,
    tokenizer: Optional[Union[str, PreTrainedTokenizer]] = None,
    framework: Optional[str] = None,
    **kwargs
) -> Pipeline:
    """
    Utility factory method to build a pipeline.

    Pipelines are made of:

        - A Tokenizer instance in charge of mapping raw textual input to tokens
        - A Model instance
        - Some (optional) post-processing steps for enhancing the model's output


    Args:
        task (:obj:`str`):
            The task defining which pipeline will be returned. Currently accepted tasks are:

            - "feature-extraction": will return a :class:`~transformers.FeatureExtractionPipeline`
            - "sentiment-analysis": will return a :class:`~transformers.TextClassificationPipeline`
            - "ner": will return a :class:`~transformers.TokenClassificationPipeline`
            - "question-answering": will return a :class:`~transformers.QuestionAnsweringPipeline`
            - "fill-mask": will return a :class:`~transformers.FillMaskPipeline`
            - "summarization": will return a :class:`~transformers.SummarizationPipeline`
            - "translation_xx_to_yy": will return a :class:`~transformers.TranslationPipeline`
            - "text-generation": will return a :class:`~transformers.TextGenerationPipeline`
        model (:obj:`str` or :obj:`~transformers.PreTrainedModel` or :obj:`~transformers.TFPreTrainedModel`, `optional`, defaults to :obj:`None`):
            The model that will be used by the pipeline to make predictions. This can be :obj:`None`,
            a model identifier or an actual pre-trained model inheriting from
            :class:`~transformers.PreTrainedModel` for PyTorch and :class:`~transformers.TFPreTrainedModel` for
            TensorFlow.

            If :obj:`None`, the default for this pipeline will be loaded.
        config (:obj:`str` or :obj:`~transformers.PretrainedConfig`, `optional`, defaults to :obj:`None`):
            The configuration that will be used by the pipeline to instantiate the model. This can be :obj:`None`,
            a model identifier or an actual pre-trained model configuration inheriting from
            :class:`~transformers.PretrainedConfig`.

            If :obj:`None`, the default for this pipeline will be loaded.
        tokenizer (:obj:`str` or :obj:`~transformers.PreTrainedTokenizer`, `optional`, defaults to :obj:`None`):
            The tokenizer that will be used by the pipeline to encode data for the model. This can be :obj:`None`,
            a model identifier or an actual pre-trained tokenizer inheriting from
            :class:`~transformers.PreTrainedTokenizer`.

            If :obj:`None`, the default for this pipeline will be loaded.
        framework (:obj:`str`, `optional`, defaults to :obj:`None`):
            The framework to use, either "pt" for PyTorch or "tf" for TensorFlow. The specified framework must be
            installed.

            If no framework is specified, will default to the one currently installed. If no framework is specified
            and both frameworks are installed, will default to PyTorch.

    Returns:
        :class:`~transformers.Pipeline`: Class inheriting from :class:`~transformers.Pipeline`, according to
        the task.

    Examples::

        from transformers import pipeline, AutoModelForTokenClassification, AutoTokenizer

        # Sentiment analysis pipeline
        pipeline('sentiment-analysis')

        # Question answering pipeline, specifying the checkpoint identifier
        pipeline('question-answering', model='distilbert-base-cased-distilled-squad', tokenizer='bert-base-cased')

        # Named entity recognition pipeline, passing in a specific model and tokenizer
        model = AutoModelForTokenClassification.from_pretrained("dbmdz/bert-large-cased-finetuned-conll03-english")
        tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
        pipeline('ner', model=model, tokenizer=tokenizer)
    """
    # Retrieve the task
    if task not in SUPPORTED_TASKS:
        raise KeyError("Unknown task {}, available tasks are {}".format(task, list(SUPPORTED_TASKS.keys())))

    framework = framework or get_framework(model)

    targeted_task = SUPPORTED_TASKS[task]
    task_class, model_class = targeted_task["impl"], targeted_task[framework]

    # Use default model/config/tokenizer for the task if no model is provided
    if model is None:
        model = targeted_task["default"]["model"][framework]

    # Try to infer tokenizer from model or config name (if provided as str)
    if tokenizer is None:
        if isinstance(model, str):
            tokenizer = model
        elif isinstance(config, str):
            tokenizer = config
        else:
            # Impossible to guess which tokenizer to use here
            raise Exception(
                "Impossible to guess which tokenizer to use. "
                "Please provide a PreTrainedTokenizer class or a path/identifier to a pretrained tokenizer."
            )

    modelcard = None
    # Try to infer modelcard from model or config name (if provided as str)
    if isinstance(model, str):
        modelcard = model
    elif isinstance(config, str):
        modelcard = config

    # Instantiate tokenizer if needed
    if isinstance(tokenizer, (str, tuple)):
        if isinstance(tokenizer, tuple):
            # For tuple we have (tokenizer name, {kwargs})
            tokenizer = AutoTokenizer.from_pretrained(tokenizer[0], **tokenizer[1])
        else:
            tokenizer = AutoTokenizer.from_pretrained(tokenizer)

    # Instantiate config if needed
    if isinstance(config, str):
        config = AutoConfig.from_pretrained(config)

    # Instantiate modelcard if needed
    if isinstance(modelcard, str):
        modelcard = ModelCard.from_pretrained(modelcard)

    # Instantiate model if needed
    if isinstance(model, str):
        # Handle transparent TF/PT model conversion
        model_kwargs = {}
        if framework == "pt" and model.endswith(".h5"):
            model_kwargs["from_tf"] = True
            logger.warning(
                "Model might be a TensorFlow model (ending with `.h5`) but TensorFlow is not available. "
                "Trying to load the model with PyTorch."
            )
        elif framework == "tf" and model.endswith(".bin"):
            model_kwargs["from_pt"] = True
            logger.warning(
                "Model might be a PyTorch model (ending with `.bin`) but PyTorch is not available. "
                "Trying to load the model with TensorFlow."
            )
        model = model_class.from_pretrained(model, config=config, **model_kwargs)

    return task_class(model=model, tokenizer=tokenizer, modelcard=modelcard, framework=framework, task=task, **kwargs)