# coding=utf-8
# Copyright 2018 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import csv
import json
import logging
import os
import pickle
import sys
from abc import ABC, abstractmethod
from contextlib import contextmanager
from itertools import chain
from os.path import abspath, exists
from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Sequence, Tuple, Union

import numpy as np

from .configuration_auto import AutoConfig
from .configuration_utils import PretrainedConfig
from .data import SquadExample, squad_convert_examples_to_features
from .file_utils import is_tf_available, is_torch_available
from .modelcard import ModelCard
from .tokenization_auto import AutoTokenizer
from .tokenization_bert import BasicTokenizer
from .tokenization_utils import PreTrainedTokenizer


if is_tf_available():
    import tensorflow as tf
    from .modeling_tf_auto import (
        TFAutoModel,
        TFAutoModelForSequenceClassification,
        TFAutoModelForQuestionAnswering,
        TFAutoModelForTokenClassification,
        TFAutoModelWithLMHead,
    )

if is_torch_available():
    import torch
    from .modeling_auto import (
        AutoModel,
        AutoModelForSequenceClassification,
        AutoModelForQuestionAnswering,
        AutoModelForTokenClassification,
        AutoModelWithLMHead,
        AutoModelForSeq2SeqLM,
    )

if TYPE_CHECKING:
    from .modeling_utils import PreTrainedModel
    from .modeling_tf_utils import TFPreTrainedModel


logger = logging.getLogger(__name__)


def get_framework(model=None):
    """ Select framework (TensorFlow/PyTorch) to use.
        If both frameworks are installed and no specific model is provided, defaults to using PyTorch.
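
        Example (an illustrative sketch; assumes at least one backend is installed)::

            get_framework()       # -> "pt" if PyTorch is available, otherwise "tf"
            get_framework(model)  # -> "tf" for an instantiated TF* model, "pt" otherwise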
    """
    if is_tf_available() and is_torch_available() and model is not None and not isinstance(model, str):
        # Both frameworks are available but the user supplied a model class instance.
        # Try to guess which framework to use from the model class name
        framework = "tf" if model.__class__.__name__.startswith("TF") else "pt"
    elif not is_tf_available() and not is_torch_available():
        raise RuntimeError(
            "At least one of TensorFlow 2.0 or PyTorch should be installed. "
            "To install TensorFlow 2.0, read the instructions at https://www.tensorflow.org/install/ "
            "To install PyTorch, read the instructions at https://pytorch.org/."
        )
    else:
        # framework = 'tf' if is_tf_available() else 'pt'
        framework = "pt" if is_torch_available() else "tf"
    return framework


class ArgumentHandler(ABC):
    """
    Base interface for handling varargs for each Pipeline
    """

    @abstractmethod
    def __call__(self, *args, **kwargs):
        raise NotImplementedError()


class DefaultArgumentHandler(ArgumentHandler):
    """
    Default varargs argument parser handling parameters for each Pipeline
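
    For example (illustrative), each of the following calls normalizes to ``["a", "b"]``::

        handler = DefaultArgumentHandler()
        handler(["a", "b"])
        handler("a", "b")
        handler(data=["a", "b"])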
    """

    @staticmethod
    def handle_kwargs(kwargs: Dict) -> List:
        if len(kwargs) == 1:
            output = list(kwargs.values())
        else:
            output = list(chain(kwargs.values()))

        return DefaultArgumentHandler.handle_args(output)

    @staticmethod
    def handle_args(args: Sequence[Any]) -> List[str]:

        # Only one argument, let's do case by case
        if len(args) == 1:
            if isinstance(args[0], str):
                return [args[0]]
            elif not isinstance(args[0], list):
                return list(args)
            else:
                return args[0]

        # Multiple arguments (x1, x2, ...)
        elif len(args) > 1:
            if all(isinstance(arg, str) for arg in args):
                return list(args)

            # If not a list, then it should be an instance of Iterable
            elif isinstance(args, Iterable):
                return list(chain.from_iterable(chain(args)))
            else:
                raise ValueError(
                    "Invalid input type {}. Pipeline supports Union[str, Iterable[str]]".format(type(args))
                )
        else:
            return []

    def __call__(self, *args, **kwargs):
        if len(kwargs) > 0 and len(args) > 0:
            raise ValueError("Pipeline cannot handle mixed args and kwargs")

        if len(kwargs) > 0:
            return DefaultArgumentHandler.handle_kwargs(kwargs)
        else:
            return DefaultArgumentHandler.handle_args(args)


class PipelineDataFormat:
    """
    Base class for all the data formats supported by pipelines, both for reading and writing.
    Supported data formats currently include:
     - JSON
     - CSV
     - stdin/stdout (pipe)

    PipelineDataFormat also includes some utilities to work with multi-column inputs, like mapping dataset
    columns to pipeline keyword arguments through the `dataset_kwarg_1=dataset_column_1` format.
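
    For example (an illustrative sketch; the file and column names are hypothetical), mapping the
    dataset columns `q` and `ctx` to the `question` and `context` pipeline keyword arguments::

        data_format = PipelineDataFormat.from_str(
            "csv", output_path="out.csv", input_path="in.csv", column="question=q,context=ctx",
        )
        for item in data_format:
            print(item)  # {'question': ..., 'context': ...}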
    """

    SUPPORTED_FORMATS = ["json", "csv", "pipe"]

    def __init__(
        self, output_path: Optional[str], input_path: Optional[str], column: Optional[str], overwrite=False,
    ):
        self.output_path = output_path
        self.input_path = input_path
        self.column = column.split(",") if column is not None else [""]
        self.is_multi_columns = len(self.column) > 1

        if self.is_multi_columns:
            self.column = [tuple(c.split("=")) if "=" in c else (c, c) for c in self.column]

        if output_path is not None and not overwrite:
            if exists(abspath(self.output_path)):
                raise OSError("{} already exists on disk".format(self.output_path))

        if input_path is not None:
            if not exists(abspath(self.input_path)):
                raise OSError("{} doesn't exist on disk".format(self.input_path))

    @abstractmethod
    def __iter__(self):
        raise NotImplementedError()

    @abstractmethod
    def save(self, data: dict):
        """
        Save the provided data object with the representation for the current `DataFormat`.
        :param data: data to store
        :return:
        """
        raise NotImplementedError()

    def save_binary(self, data: Union[dict, List[dict]]) -> str:
        """
        Save the provided data object as a pickle-formatted binary data on the disk.
        :param data: data to store
        :return: (str) Path where the data has been saved
        """
        path, _ = os.path.splitext(self.output_path)
        binary_path = os.path.extsep.join((path, "pickle"))

        with open(binary_path, "wb+") as f_output:
            pickle.dump(data, f_output)

        return binary_path

    @staticmethod
    def from_str(
        format: str, output_path: Optional[str], input_path: Optional[str], column: Optional[str], overwrite=False,
    ):
        if format == "json":
            return JsonPipelineDataFormat(output_path, input_path, column, overwrite=overwrite)
        elif format == "csv":
            return CsvPipelineDataFormat(output_path, input_path, column, overwrite=overwrite)
        elif format == "pipe":
            return PipedPipelineDataFormat(output_path, input_path, column, overwrite=overwrite)
        else:
            raise KeyError("Unknown reader {} (Available readers are json/csv/pipe)".format(format))


class CsvPipelineDataFormat(PipelineDataFormat):
    def __init__(
        self, output_path: Optional[str], input_path: Optional[str], column: Optional[str], overwrite=False,
    ):
        super().__init__(output_path, input_path, column, overwrite=overwrite)

    def __iter__(self):
        with open(self.input_path, "r") as f:
            reader = csv.DictReader(f)
            for row in reader:
                if self.is_multi_columns:
                    yield {k: row[c] for k, c in self.column}
                else:
                    yield row[self.column[0]]

    def save(self, data: List[dict]):
        with open(self.output_path, "w") as f:
            if len(data) > 0:
                writer = csv.DictWriter(f, list(data[0].keys()))
                writer.writeheader()
                writer.writerows(data)


class JsonPipelineDataFormat(PipelineDataFormat):
    def __init__(
        self, output_path: Optional[str], input_path: Optional[str], column: Optional[str], overwrite=False,
    ):
        super().__init__(output_path, input_path, column, overwrite=overwrite)

        with open(input_path, "r") as f:
            self._entries = json.load(f)

    def __iter__(self):
        for entry in self._entries:
            if self.is_multi_columns:
                yield {k: entry[c] for k, c in self.column}
            else:
                yield entry[self.column[0]]

    def save(self, data: dict):
        with open(self.output_path, "w") as f:
            json.dump(data, f)


class PipedPipelineDataFormat(PipelineDataFormat):
    """
    Read data from piped input to the python process.
    For multi-column data, columns should be separated by \t

    If columns are provided, then the output will be a dictionary with {column_x: value_x}
    """

    def __iter__(self):
        for line in sys.stdin:
            # Split for multi-columns
            if "\t" in line:

                line = line.split("\t")
                if self.column:
                    # Dictionary to map arguments
                    yield {kwargs: l for (kwargs, _), l in zip(self.column, line)}
                else:
                    yield tuple(line)

            # No dictionary to map arguments
            else:
                yield line

    def save(self, data: dict):
        print(data)

    def save_binary(self, data: Union[dict, List[dict]]) -> str:
        if self.output_path is None:
            raise KeyError(
                "When using piped input on a pipeline outputting large objects, an output file path is required. "
                "Please provide such an output path through the --output argument."
            )

        return super().save_binary(data)


class _ScikitCompat(ABC):
    """
    Interface layer for scikit-learn and Keras compatibility.
    """

    @abstractmethod
    def transform(self, X):
        raise NotImplementedError()

    @abstractmethod
    def predict(self, X):
        raise NotImplementedError()


class Pipeline(_ScikitCompat):
    """
    The Pipeline class is the class from which all pipelines inherit. Refer to this class for methods shared across
    different pipelines.

    Base class implementing pipelined operations.
    Pipeline workflow is defined as a sequence of the following operations:

        Input -> Tokenization -> Model Inference -> Post-Processing (Task dependent) -> Output

    Pipeline supports running on CPU or GPU through the device argument. Users can specify the
    device argument as an integer: -1 means "CPU", while >= 0 refers to the CUDA device ordinal.

    Some pipelines, like the FeatureExtractionPipeline ('feature-extraction'), output large
    tensor objects as nested lists. To avoid dumping such large structures as textual data, we
    provide the binary_output constructor argument. If set to True, the output will be stored in
    the pickle format.
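
    Example (an illustrative sketch; :func:`~transformers.pipeline` picks a default model for the task)::

        from transformers import pipeline

        nlp = pipeline("sentiment-analysis", device=-1)  # device=-1 runs on CPU
        nlp("We are very happy to include pipeline into the transformers repository.")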

    Arguments:
        model (:obj:`~transformers.PreTrainedModel` or :obj:`~transformers.TFPreTrainedModel`):
            The model that will be used by the pipeline to make predictions. This needs to be a model inheriting from
            :class:`~transformers.PreTrainedModel` for PyTorch and :class:`~transformers.TFPreTrainedModel` for
            TensorFlow.
        tokenizer (:obj:`~transformers.PreTrainedTokenizer`):
            The tokenizer that will be used by the pipeline to encode data for the model. This object inherits from
            :class:`~transformers.PreTrainedTokenizer`.
        modelcard (:obj:`str` or :class:`~transformers.ModelCard`, `optional`, defaults to :obj:`None`):
            Model card attributed to the model for this pipeline.
        framework (:obj:`str`, `optional`, defaults to :obj:`None`):
            The framework to use, either "pt" for PyTorch or "tf" for TensorFlow. The specified framework must be
            installed.

            If no framework is specified, will default to the one currently installed. If no framework is specified
            and both frameworks are installed, will default to PyTorch.
        args_parser (:class:`~transformers.pipelines.ArgumentHandler`, `optional`, defaults to :obj:`None`):
            Reference to the object in charge of parsing supplied pipeline parameters.
        device (:obj:`int`, `optional`, defaults to :obj:`-1`):
            Device ordinal for CPU/GPU support. Setting this to -1 will leverage CPU, >=0 will run the model
            on the associated CUDA device id.
        binary_output (:obj:`bool`, `optional`, defaults to :obj:`False`):
            Flag indicating if the output of the pipeline should be in a binary format (i.e. pickle) or as raw text.

    Return:
        :obj:`List` or :obj:`Dict`:
        Pipeline returns list or dictionary depending on:

         - Whether the user supplied multiple samples
         - Whether the pipeline exposes multiple fields in the output object
    """

    default_input_names = None

    def __init__(
        self,
        model: Union["PreTrainedModel", "TFPreTrainedModel"],
        tokenizer: PreTrainedTokenizer,
        modelcard: Optional[ModelCard] = None,
        framework: Optional[str] = None,
        task: str = "",
        args_parser: ArgumentHandler = None,
        device: int = -1,
        binary_output: bool = False,
    ):

        if framework is None:
            framework = get_framework()

        self.model = model
        self.tokenizer = tokenizer
        self.modelcard = modelcard
        self.framework = framework
        self.device = device if framework == "tf" else torch.device("cpu" if device < 0 else "cuda:{}".format(device))
        self.binary_output = binary_output
        self._args_parser = args_parser or DefaultArgumentHandler()

        # Special handling
        if self.framework == "pt" and self.device.type == "cuda":
            self.model = self.model.to(self.device)

        # Update config with task specific parameters
        task_specific_params = self.model.config.task_specific_params
        if task_specific_params is not None and task in task_specific_params:
            self.model.config.update(task_specific_params.get(task))

    def save_pretrained(self, save_directory):
        """
        Save the pipeline's model and tokenizer to the specified save_directory
        """
        if os.path.isfile(save_directory):
            logger.error("Provided path ({}) should be a directory, not a file".format(save_directory))
            return
        os.makedirs(save_directory, exist_ok=True)

        self.model.save_pretrained(save_directory)
        self.tokenizer.save_pretrained(save_directory)
        if self.modelcard is not None:
            self.modelcard.save_pretrained(save_directory)

    def transform(self, X):
        """
        Scikit / Keras interface to transformers' pipelines. This method will forward to __call__().
        """
        return self(X=X)

    def predict(self, X):
        """
        Scikit / Keras interface to transformers' pipelines. This method will forward to __call__().
        """
        return self(X=X)

    @contextmanager
    def device_placement(self):
        """
        Context Manager allowing tensor allocation on the user-specified device in a framework-agnostic way.
        example:
            # Explicitly ask for tensor allocation on CUDA device :0
            nlp = pipeline(..., device=0)
            with nlp.device_placement():
                # Every framework specific tensor allocation will be done on the requested device
                output = nlp(...)
        Returns:
            Context manager
        """
        if self.framework == "tf":
            with tf.device("/CPU:0" if self.device == -1 else "/device:GPU:{}".format(self.device)):
                yield
        else:
            if self.device.type == "cuda":
                torch.cuda.set_device(self.device)

            yield

    def ensure_tensor_on_device(self, **inputs):
        """
        Ensure PyTorch tensors are on the specified device.
        :param inputs:
        :return:
        """
        return {name: tensor.to(self.device) for name, tensor in inputs.items()}

    def _parse_and_tokenize(self, *args, padding=True, add_special_tokens=True, **kwargs):
        """
        Parse arguments and tokenize
        """
        # Parse arguments
        inputs = self._args_parser(*args, **kwargs)
        inputs = self.tokenizer(
            inputs, add_special_tokens=add_special_tokens, return_tensors=self.framework, padding=padding,
        )

        return inputs

    def __call__(self, *args, **kwargs):
        inputs = self._parse_and_tokenize(*args, **kwargs)
        return self._forward(inputs)

    def _forward(self, inputs, return_tensors=False):
        """
        Internal framework specific forward dispatching.
        Args:
            inputs: dict holding all the keyword arguments required by the model forward method.
            return_tensors: Whether to return native framework (pt/tf) tensors rather than numpy array.
        Returns:
            Numpy array
        """
        # Encode for forward
        with self.device_placement():
            if self.framework == "tf":
                # TODO trace model
                predictions = self.model(inputs.data, training=False)[0]
            else:
                with torch.no_grad():
                    inputs = self.ensure_tensor_on_device(**inputs)
                    predictions = self.model(**inputs)[0].cpu()

        if return_tensors:
            return predictions
        else:
            return predictions.numpy()
499
500
501


class FeatureExtractionPipeline(Pipeline):
    """
    Feature extraction pipeline using no model head. This pipeline extracts the hidden states from the base transformer,
    which can be used as features in downstream tasks.

    This feature extraction pipeline can currently be loaded from the :func:`~transformers.pipeline` method using
    the following task identifier(s):

    - "feature-extraction", for extracting features of a sequence.

    All models may be used for this pipeline. See a list of all models, including community-contributed models on
    `huggingface.co/models <https://huggingface.co/models>`__.
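
    Example (an illustrative sketch; a default model is downloaded on first use)::

        from transformers import pipeline

        nlp = pipeline("feature-extraction")
        features = nlp("Hello world")  # nested list of floats, one vector per token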

    Arguments:
        model (:obj:`~transformers.PreTrainedModel` or :obj:`~transformers.TFPreTrainedModel`):
            The model that will be used by the pipeline to make predictions. This needs to be a model inheriting from
            :class:`~transformers.PreTrainedModel` for PyTorch and :class:`~transformers.TFPreTrainedModel` for
            TensorFlow.
        tokenizer (:obj:`~transformers.PreTrainedTokenizer`):
            The tokenizer that will be used by the pipeline to encode data for the model. This object inherits from
            :class:`~transformers.PreTrainedTokenizer`.
        modelcard (:obj:`str` or :class:`~transformers.ModelCard`, `optional`, defaults to :obj:`None`):
            Model card attributed to the model for this pipeline.
        framework (:obj:`str`, `optional`, defaults to :obj:`None`):
            The framework to use, either "pt" for PyTorch or "tf" for TensorFlow. The specified framework must be
            installed.

            If no framework is specified, will default to the one currently installed. If no framework is specified
            and both frameworks are installed, will default to PyTorch.
        args_parser (:class:`~transformers.pipelines.ArgumentHandler`, `optional`, defaults to :obj:`None`):
            Reference to the object in charge of parsing supplied pipeline parameters.
        device (:obj:`int`, `optional`, defaults to :obj:`-1`):
            Device ordinal for CPU/GPU support. Setting this to -1 will leverage CPU, >=0 will run the model
            on the associated CUDA device id.
    """

    def __init__(
        self,
        model: Union["PreTrainedModel", "TFPreTrainedModel"],
        tokenizer: PreTrainedTokenizer,
        modelcard: Optional[ModelCard] = None,
        framework: Optional[str] = None,
        args_parser: ArgumentHandler = None,
        device: int = -1,
        task: str = "",
    ):
        super().__init__(
            model=model,
            tokenizer=tokenizer,
            modelcard=modelcard,
            framework=framework,
            args_parser=args_parser,
            device=device,
            binary_output=True,
            task=task,
        )

    def __call__(self, *args, **kwargs):
        return super().__call__(*args, **kwargs).tolist()


class TextGenerationPipeline(Pipeline):
    """
    Language generation pipeline using any ModelWithLMHead head. This pipeline predicts the words that will follow a specified text prompt.

    This language generation pipeline can currently be loaded from the :func:`~transformers.pipeline` method using
    the following task identifier(s):

    - "text-generation", for generating text from a specified prompt.

    The models that this pipeline can use are models that have been trained with an autoregressive language modeling objective,
    which includes the uni-directional models in the library (e.g. gpt2).
    See the list of available community models on
    `huggingface.co/models <https://huggingface.co/models?search=&filter=lm-head>`__.
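
    Example (an illustrative sketch; extra keyword arguments are forwarded to the model's generate() method)::

        from transformers import pipeline

        nlp = pipeline("text-generation")
        nlp("Once upon a time,", max_length=30)  # [{'generated_text': '...'}]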
    """

    # Padding text to help Transformer-XL and XLNet with short prompts as proposed by Aman Rusia
    # in https://github.com/rusiaaman/XLNet-gen#methodology
    # and https://medium.com/@amanrusia/xlnet-speaks-comparison-to-gpt-2-ea1a4e9ba39e

    PADDING_TEXT = """In 1991, the remains of Russian Tsar Nicholas II and his family
    (except for Alexei and Maria) are discovered.
    The voice of Nicholas's young son, Tsarevich Alexei Nikolaevich, narrates the
    remainder of the story. 1883 Western Siberia,
    a young Grigori Rasputin is asked by his father and a group of men to perform magic.
    Rasputin has a vision and denounces one of the men as a horse thief. Although his
    father initially slaps him for making such an accusation, Rasputin watches as the
    man is chased outside and beaten. Twenty years later, Rasputin sees a vision of
    the Virgin Mary, prompting him to become a priest. Rasputin quickly becomes famous,
    with people, even a bishop, begging for his blessing. """

    ALLOWED_MODELS = [
        "XLNetLMHeadModel",
        "TransfoXLLMHeadModel",
        "ReformerModelWithLMHead",
        "GPT2LMHeadModel",
        "OpenAIGPTLMHeadModel",
        "CTRLLMHeadModel",
        "TFXLNetLMHeadModel",
        "TFTransfoXLLMHeadModel",
        "TFGPT2LMHeadModel",
        "TFOpenAIGPTLMHeadModel",
        "TFCTRLLMHeadModel",
    ]

    def __call__(
        self, *args, return_tensors=False, return_text=True, clean_up_tokenization_spaces=False, **generate_kwargs
    ):
        if self.model.__class__.__name__ not in self.ALLOWED_MODELS:
            raise NotImplementedError(
                "Generation is currently not supported for {}. Please select a model from {} for generation.".format(
                    self.model.__class__.__name__, self.ALLOWED_MODELS
                )
            )

        text_inputs = self._args_parser(*args)

        results = []
        for prompt_text in text_inputs:
            # Manage correct placement of the tensors
            with self.device_placement():
                if self.model.__class__.__name__ in ["XLNetLMHeadModel", "TransfoXLLMHeadModel"]:
                    # For XLNet and TransformerXL we add an article to the prompt to give more state to the model.
                    padding_text = self.PADDING_TEXT + self.tokenizer.eos_token
                    padding = self._parse_and_tokenize(padding_text, padding=False, add_special_tokens=False)
                    # This impacts the max_length and min_length arguments, which need adjusting.
                    padding_length = padding["input_ids"].shape[-1]
                    if "max_length" in generate_kwargs and generate_kwargs["max_length"] is not None:
                        generate_kwargs["max_length"] += padding_length
                    if "min_length" in generate_kwargs and generate_kwargs["min_length"] is not None:
                        generate_kwargs["min_length"] += padding_length

                    inputs = self._parse_and_tokenize(
                        padding_text + prompt_text, padding=False, add_special_tokens=False
                    )
                else:
                    inputs = self._parse_and_tokenize(prompt_text, padding=False, add_special_tokens=False)

                # set input_ids to None to allow empty prompt
                if inputs["input_ids"].shape[-1] == 0:
                    inputs["input_ids"] = None
                    inputs["attention_mask"] = None

                if self.framework == "pt" and inputs["input_ids"] is not None:
                    inputs = self.ensure_tensor_on_device(**inputs)

                input_ids = inputs["input_ids"]

                # Ensure that batch size = 1 (batch generation not allowed for now)
                assert (
                    input_ids is None or input_ids.shape[0] == 1
                ), "Batch generation is currently not supported. See https://github.com/huggingface/transformers/issues/3021 for more information."

                output_sequences = self.model.generate(input_ids=input_ids, **generate_kwargs)  # BS x SL

            result = []
            for generated_sequence in output_sequences:
                generated_sequence = generated_sequence.numpy().tolist()
                record = {}
                if return_tensors:
                    record["generated_token_ids"] = generated_sequence
                if return_text:
                    # Decode text
                    text = self.tokenizer.decode(
                        generated_sequence,
                        skip_special_tokens=True,
                        clean_up_tokenization_spaces=clean_up_tokenization_spaces,
                    )

                    # Remove PADDING prompt of the sequence if XLNet or Transfo-XL model is used
                    if input_ids is None:
                        prompt_length = 0
                    else:
                        prompt_length = len(
                            self.tokenizer.decode(
                                input_ids[0],
                                skip_special_tokens=True,
                                clean_up_tokenization_spaces=clean_up_tokenization_spaces,
                            )
                        )

                    record["generated_text"] = prompt_text + text[prompt_length:]

                result.append(record)
            results += [result]

        if len(results) == 1:
            return results[0]

        return results


class TextClassificationPipeline(Pipeline):
    """
    Text classification pipeline using ModelForSequenceClassification head. See the
    `sequence classification usage <../usage.html#sequence-classification>`__ examples for more information.

    This text classification pipeline can currently be loaded from the :func:`~transformers.pipeline` method using
    the following task identifier(s):

    - "sentiment-analysis", for classifying sequences according to positive or negative sentiments.

    The models that this pipeline can use are models that have been fine-tuned on a sequence classification task.
    See the up-to-date list of available models on
    `huggingface.co/models <https://huggingface.co/models?filter=text-classification>`__.
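
    Example (an illustrative sketch; the exact labels depend on the model)::

        from transformers import pipeline

        nlp = pipeline("sentiment-analysis")
        nlp("I love this movie!")  # [{'label': 'POSITIVE', 'score': 0.99...}]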

    Arguments:
        model (:obj:`~transformers.PreTrainedModel` or :obj:`~transformers.TFPreTrainedModel`):
            The model that will be used by the pipeline to make predictions. This needs to be a model inheriting from
            :class:`~transformers.PreTrainedModel` for PyTorch and :class:`~transformers.TFPreTrainedModel` for
            TensorFlow.
        tokenizer (:obj:`~transformers.PreTrainedTokenizer`):
            The tokenizer that will be used by the pipeline to encode data for the model. This object inherits from
            :class:`~transformers.PreTrainedTokenizer`.
        modelcard (:obj:`str` or :class:`~transformers.ModelCard`, `optional`, defaults to :obj:`None`):
            Model card attributed to the model for this pipeline.
        framework (:obj:`str`, `optional`, defaults to :obj:`None`):
            The framework to use, either "pt" for PyTorch or "tf" for TensorFlow. The specified framework must be
            installed.

            If no framework is specified, will default to the one currently installed. If no framework is specified
            and both frameworks are installed, will default to PyTorch.
        args_parser (:class:`~transformers.pipelines.ArgumentHandler`, `optional`, defaults to :obj:`None`):
            Reference to the object in charge of parsing supplied pipeline parameters.
        device (:obj:`int`, `optional`, defaults to :obj:`-1`):
            Device ordinal for CPU/GPU support. Setting this to -1 will leverage CPU, >=0 will run the model
            on the associated CUDA device id.
    """

    def __init__(self, return_all_scores: bool = False, **kwargs):
        super().__init__(**kwargs)

        self.return_all_scores = return_all_scores

    def __call__(self, *args, **kwargs):
        outputs = super().__call__(*args, **kwargs)
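        # Softmax over the logits to turn them into per-class probabilities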
        scores = np.exp(outputs) / np.exp(outputs).sum(-1, keepdims=True)
        if self.return_all_scores:
            return [
                [{"label": self.model.config.id2label[i], "score": score.item()} for i, score in enumerate(item)]
                for item in scores
            ]
        else:
            return [
                {"label": self.model.config.id2label[item.argmax()], "score": item.max().item()} for item in scores
            ]


class FillMaskPipeline(Pipeline):
    """
    Masked language modeling prediction pipeline using ModelWithLMHead head. See the
    `masked language modeling usage <../usage.html#masked-language-modeling>`__ examples for more information.

    This mask filling pipeline can currently be loaded from the :func:`~transformers.pipeline` method using
    the following task identifier(s):

    - "fill-mask", for predicting masked tokens in a sequence.

    The models that this pipeline can use are models that have been trained with a masked language modeling objective,
    which includes the bi-directional models in the library.
    See the up-to-date list of available models on
    `huggingface.co/models <https://huggingface.co/models?filter=lm-head>`__.
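
    Example (an illustrative sketch; the mask token depends on the tokenizer)::

        from transformers import pipeline

        nlp = pipeline("fill-mask")
        nlp("The capital of France is {}.".format(nlp.tokenizer.mask_token))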

    Arguments:
        model (:obj:`~transformers.PreTrainedModel` or :obj:`~transformers.TFPreTrainedModel`):
            The model that will be used by the pipeline to make predictions. This needs to be a model inheriting from
            :class:`~transformers.PreTrainedModel` for PyTorch and :class:`~transformers.TFPreTrainedModel` for
            TensorFlow.
        tokenizer (:obj:`~transformers.PreTrainedTokenizer`):
            The tokenizer that will be used by the pipeline to encode data for the model. This object inherits from
            :class:`~transformers.PreTrainedTokenizer`.
        modelcard (:obj:`str` or :class:`~transformers.ModelCard`, `optional`, defaults to :obj:`None`):
            Model card attributed to the model for this pipeline.
        framework (:obj:`str`, `optional`, defaults to :obj:`None`):
            The framework to use, either "pt" for PyTorch or "tf" for TensorFlow. The specified framework must be
            installed.

            If no framework is specified, will default to the one currently installed. If no framework is specified
            and both frameworks are installed, will default to PyTorch.
        args_parser (:class:`~transformers.pipelines.ArgumentHandler`, `optional`, defaults to :obj:`None`):
            Reference to the object in charge of parsing supplied pipeline parameters.
        device (:obj:`int`, `optional`, defaults to :obj:`-1`):
            Device ordinal for CPU/GPU support. Setting this to -1 will leverage CPU, >=0 will run the model
            on the associated CUDA device id.
    """

    def __init__(
        self,
        model: Union["PreTrainedModel", "TFPreTrainedModel"],
        tokenizer: PreTrainedTokenizer,
        modelcard: Optional[ModelCard] = None,
        framework: Optional[str] = None,
        args_parser: ArgumentHandler = None,
        device: int = -1,
        topk=5,
        task: str = "",
    ):
        super().__init__(
            model=model,
            tokenizer=tokenizer,
            modelcard=modelcard,
            framework=framework,
            args_parser=args_parser,
            device=device,
            binary_output=True,
            task=task,
        )

        self.topk = topk

    def __call__(self, *args, **kwargs):
        inputs = self._parse_and_tokenize(*args, **kwargs)
        outputs = self._forward(inputs, return_tensors=True)

        results = []
        batch_size = outputs.shape[0] if self.framework == "tf" else outputs.size(0)

        for i in range(batch_size):
            input_ids = inputs["input_ids"][i]
            result = []

            if self.framework == "tf":
                masked_index = tf.where(input_ids == self.tokenizer.mask_token_id).numpy().item()
                logits = outputs[i, masked_index, :]
                probs = tf.nn.softmax(logits)
                topk = tf.math.top_k(probs, k=self.topk)
                values, predictions = topk.values.numpy(), topk.indices.numpy()
            else:
                masked_index = (input_ids == self.tokenizer.mask_token_id).nonzero().item()

                logits = outputs[i, masked_index, :]
                probs = logits.softmax(dim=0)
                values, predictions = probs.topk(self.topk)

            for v, p in zip(values.tolist(), predictions.tolist()):
                tokens = input_ids.numpy()
                tokens[masked_index] = p
                # Filter padding out:
                tokens = tokens[np.where(tokens != self.tokenizer.pad_token_id)]
                result.append(
                    {
                        "sequence": self.tokenizer.decode(tokens),
                        "score": v,
                        "token": p,
                        "token_str": self.tokenizer.convert_ids_to_tokens(p),
                    }
                )

            # Append
            results += [result]

        if len(results) == 1:
            return results[0]
        return results


class TokenClassificationPipeline(Pipeline):
    """
    Named Entity Recognition pipeline using ModelForTokenClassification head. See the
    `named entity recognition usage <../usage.html#named-entity-recognition>`__ examples for more information.

    This token recognition pipeline can currently be loaded from the :func:`~transformers.pipeline` method using
    the following task identifier(s):

    - "ner", for predicting the classes of tokens in a sequence: person, organisation, location or miscellaneous.

    The models that this pipeline can use are models that have been fine-tuned on a token classification task.
    See the up-to-date list of available models on
    `huggingface.co/models <https://huggingface.co/models?filter=token-classification>`__.
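
    Example (an illustrative sketch; each token prediction is returned as a dict)::

        from transformers import pipeline

        nlp = pipeline("ner")
        nlp("My name is Wolfgang and I live in Berlin.")
        # [{'word': ..., 'score': ..., 'entity': ..., 'index': ...}, ...]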

    Arguments:
        model (:obj:`~transformers.PreTrainedModel` or :obj:`~transformers.TFPreTrainedModel`):
            The model that will be used by the pipeline to make predictions. This needs to be a model inheriting from
            :class:`~transformers.PreTrainedModel` for PyTorch and :class:`~transformers.TFPreTrainedModel` for
            TensorFlow.
        tokenizer (:obj:`~transformers.PreTrainedTokenizer`):
            The tokenizer that will be used by the pipeline to encode data for the model. This object inherits from
            :class:`~transformers.PreTrainedTokenizer`.
        modelcard (:obj:`str` or :class:`~transformers.ModelCard`, `optional`, defaults to :obj:`None`):
            Model card attributed to the model for this pipeline.
        framework (:obj:`str`, `optional`, defaults to :obj:`None`):
            The framework to use, either "pt" for PyTorch or "tf" for TensorFlow. The specified framework must be
            installed.

            If no framework is specified, will default to the one currently installed. If no framework is specified
            and both frameworks are installed, will default to PyTorch.
        args_parser (:class:`~transformers.pipelines.ArgumentHandler`, `optional`, defaults to :obj:`None`):
            Reference to the object in charge of parsing supplied pipeline parameters.
        device (:obj:`int`, `optional`, defaults to :obj:`-1`):
            Device ordinal for CPU/GPU support. Setting this to -1 will leverage CPU, >=0 will run the model
            on the associated CUDA device id.
    """

    default_input_names = "sequences"

    def __init__(
        self,
        model: Union["PreTrainedModel", "TFPreTrainedModel"],
        tokenizer: PreTrainedTokenizer,
        modelcard: Optional[ModelCard] = None,
        framework: Optional[str] = None,
        args_parser: ArgumentHandler = None,
        device: int = -1,
        binary_output: bool = False,
        ignore_labels=["O"],
        task: str = "",
        grouped_entities: bool = False,
    ):
        super().__init__(
            model=model,
            tokenizer=tokenizer,
            modelcard=modelcard,
            framework=framework,
            args_parser=args_parser,
            device=device,
            binary_output=binary_output,
            task=task,
        )

        self._basic_tokenizer = BasicTokenizer(do_lower_case=False)
        self.ignore_labels = ignore_labels
        self.grouped_entities = grouped_entities

    def __call__(self, *args, **kwargs):
        inputs = self._args_parser(*args, **kwargs)
        answers = []
        for sentence in inputs:

            # Manage correct placement of the tensors
            with self.device_placement():

                tokens = self.tokenizer(
                    sentence, return_attention_mask=False, return_tensors=self.framework, truncation=True,
                )

                # Forward
                if self.framework == "tf":
                    entities = self.model(tokens.data)[0][0].numpy()
                    input_ids = tokens["input_ids"].numpy()[0]
                else:
                    with torch.no_grad():
                        tokens = self.ensure_tensor_on_device(**tokens)
                        entities = self.model(**tokens)[0][0].cpu().numpy()
                        input_ids = tokens["input_ids"].cpu().numpy()[0]

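            # Softmax over the logits to obtain per-token class probabilities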
            score = np.exp(entities) / np.exp(entities).sum(-1, keepdims=True)
            labels_idx = score.argmax(axis=-1)

            entities = []
            entity_groups = []
            entity_group_disagg = []
            # Filter to labels not in `self.ignore_labels`
            filtered_labels_idx = [
                (idx, label_idx)
                for idx, label_idx in enumerate(labels_idx)
                if self.model.config.id2label[label_idx] not in self.ignore_labels
            ]

            for idx, label_idx in filtered_labels_idx:

                entity = {
                    "word": self.tokenizer.convert_ids_to_tokens(int(input_ids[idx])),
                    "score": score[idx][label_idx].item(),
                    "entity": self.model.config.id2label[label_idx],
                    "index": idx,
                }
                last_idx, _ = filtered_labels_idx[-1]
                if self.grouped_entities:
                    if not entity_group_disagg:
                        entity_group_disagg += [entity]
                        if idx == last_idx:
                            entity_groups += [self.group_entities(entity_group_disagg)]
                        continue

                    # If the current entity is similar and adjacent to the previous entity, append it to the disaggregated entity group
                    if (
                        entity["entity"] == entity_group_disagg[-1]["entity"]
                        and entity["index"] == entity_group_disagg[-1]["index"] + 1
                    ):
                        entity_group_disagg += [entity]
                        # Group the entities at the last entity
                        if idx == last_idx:
                            entity_groups += [self.group_entities(entity_group_disagg)]
                    # If the current entity is different from the previous entity, aggregate the disaggregated entity group
                    else:
                        entity_groups += [self.group_entities(entity_group_disagg)]
                        entity_group_disagg = [entity]
                        # If the differing entity is also the last one, flush it as its own group
                        if idx == last_idx:
                            entity_groups += [self.group_entities(entity_group_disagg)]

                entities += [entity]

            # Append
            if self.grouped_entities:
                answers += [entity_groups]
            else:
                answers += [entities]

        if len(answers) == 1:
            return answers[0]
        return answers

    def group_entities(self, entities):
        """
        Returns a single grouped entity: averaged score and merged word over the given run of entities
        """
        # Get the last entity in the entity group
        entity = entities[-1]["entity"]
        scores = [entity["score"] for entity in entities]
        tokens = [entity["word"] for entity in entities]

        entity_group = {
            "entity_group": entity,
            "score": np.mean(scores),
            "word": self.tokenizer.convert_tokens_to_string(tokens),
        }
        return entity_group


NerPipeline = TokenClassificationPipeline


class QuestionAnsweringArgumentHandler(ArgumentHandler):
    """
    QuestionAnsweringPipeline requires the user to provide multiple arguments (i.e. question & context) to be mapped
    to internal SquadExample / SquadFeature structures.

    QuestionAnsweringArgumentHandler manages all the possible ways to create a SquadExample from the command-line
    supplied arguments.
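
    Accepted input shapes (illustrative)::

        handler = QuestionAnsweringArgumentHandler()
        handler({"question": "...", "context": "..."})
        handler(X=[{"question": "...", "context": "..."}])
        handler(question="...", context="...")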
    """

    def __call__(self, *args, **kwargs):
        # Position args, handling is sensibly the same as X and data, so forwarding to avoid duplicating
        if args is not None and len(args) > 0:
            if len(args) == 1:
                kwargs["X"] = args[0]
            else:
                kwargs["X"] = list(args)

        # Generic compatibility with sklearn and Keras
        # Batched data
        if "X" in kwargs or "data" in kwargs:
            inputs = kwargs["X"] if "X" in kwargs else kwargs["data"]

            if isinstance(inputs, dict):
                inputs = [inputs]
            else:
                # Copy to avoid overriding arguments
                inputs = [i for i in inputs]

            for i, item in enumerate(inputs):
                if isinstance(item, dict):
                    if any(k not in item for k in ["question", "context"]):
                        raise KeyError("You need to provide a dictionary with keys {question:..., context:...}")

                    inputs[i] = QuestionAnsweringPipeline.create_sample(**item)

                elif not isinstance(item, SquadExample):
                    raise ValueError(
                        "{} argument needs to be of type (list[SquadExample | dict], SquadExample, dict)".format(
                            "X" if "X" in kwargs else "data"
                        )
                    )

            # Tabular input
        elif "question" in kwargs and "context" in kwargs:
            if isinstance(kwargs["question"], str):
                kwargs["question"] = [kwargs["question"]]

            if isinstance(kwargs["context"], str):
                kwargs["context"] = [kwargs["context"]]

            inputs = [
                QuestionAnsweringPipeline.create_sample(q, c) for q, c in zip(kwargs["question"], kwargs["context"])
            ]
        else:
            raise ValueError("Unknown arguments {}".format(kwargs))

        if not isinstance(inputs, list):
            inputs = [inputs]

        return inputs


class QuestionAnsweringPipeline(Pipeline):
    """
    Question Answering pipeline using ModelForQuestionAnswering head. See the
    `question answering usage <../usage.html#question-answering>`__ examples for more information.

    This question answering pipeline can currently be loaded from the :func:`~transformers.pipeline` method using
    the following task identifier(s):

    - "question-answering", for answering questions given a context.

    The models that this pipeline can use are models that have been fine-tuned on a question answering task.
    See the up-to-date list of available models on
    `huggingface.co/models <https://huggingface.co/models?filter=question-answering>`__.
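
    Example (an illustrative sketch)::

        from transformers import pipeline

        nlp = pipeline("question-answering")
        nlp(question="Where do I live?", context="My name is Wolfgang and I live in Berlin.")
        # {'answer': ..., 'score': ..., 'start': ..., 'end': ...}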

    Arguments:
        model (:obj:`~transformers.PreTrainedModel` or :obj:`~transformers.TFPreTrainedModel`):
            The model that will be used by the pipeline to make predictions. This needs to be a model inheriting from
            :class:`~transformers.PreTrainedModel` for PyTorch and :class:`~transformers.TFPreTrainedModel` for
            TensorFlow.
        tokenizer (:obj:`~transformers.PreTrainedTokenizer`):
            The tokenizer that will be used by the pipeline to encode data for the model. This object inherits from
            :class:`~transformers.PreTrainedTokenizer`.
        modelcard (:obj:`str` or :class:`~transformers.ModelCard`, `optional`, defaults to :obj:`None`):
            Model card attributed to the model for this pipeline.
        framework (:obj:`str`, `optional`, defaults to :obj:`None`):
            The framework to use, either "pt" for PyTorch or "tf" for TensorFlow. The specified framework must be
            installed.

            If no framework is specified, will default to the one currently installed. If no framework is specified
            and both frameworks are installed, will default to PyTorch.
        args_parser (:class:`~transformers.pipelines.ArgumentHandler`, `optional`, defaults to :obj:`None`):
            Reference to the object in charge of parsing supplied pipeline parameters.
        device (:obj:`int`, `optional`, defaults to :obj:`-1`):
            Device ordinal for CPU/GPU supports. Setting this to -1 will leverage CPU, >=0 will run the model
            on the associated CUDA device id.
Morgan Funtowicz's avatar
Morgan Funtowicz committed
1118
1119
    """

    default_input_names = "question,context"

    def __init__(
        self,
        model: Union["PreTrainedModel", "TFPreTrainedModel"],
        tokenizer: PreTrainedTokenizer,
        modelcard: Optional[ModelCard] = None,
        framework: Optional[str] = None,
        device: int = -1,
        task: str = "",
        **kwargs
    ):
        super().__init__(
            model=model,
            tokenizer=tokenizer,
            modelcard=modelcard,
            framework=framework,
            args_parser=QuestionAnsweringArgumentHandler(),
            device=device,
            task=task,
            **kwargs,
        )

    @staticmethod
    def create_sample(
        question: Union[str, List[str]], context: Union[str, List[str]]
    ) -> Union[SquadExample, List[SquadExample]]:
        """
        QuestionAnsweringPipeline leverages the SquadExample/SquadFeatures internally.
        This helper method encapsulate all the logic for converting question(s) and context(s) to SquadExample(s).
        We currently support extractive question answering.
Morgan Funtowicz's avatar
Morgan Funtowicz committed
1151
        Arguments:
1152
1153
             question: (str, List[str]) The question to be ask for the associated context
             context: (str, List[str]) The context in which we will look for the answer.
Morgan Funtowicz's avatar
Morgan Funtowicz committed
1154
1155
1156

        Returns:
            SquadExample initialized with the corresponding question and context.
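
        Example (an illustrative sketch; the texts are hypothetical)::

            sample = QuestionAnsweringPipeline.create_sample(
                question="Where do I live?", context="I live in Berlin."
            )
            # `sample` is a single SquadExample; passing lists returns a list of SquadExample instead.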
        """
        if isinstance(question, list):
            return [SquadExample(None, q, c, None, None, None) for q, c in zip(question, context)]
        else:
            return SquadExample(None, question, context, None, None, None)

    def __call__(self, *args, **kwargs):
        """
        Args:
            We support multiple use-cases; the following are mutually exclusive:
            X: sequence of SquadExample
            data: sequence of SquadExample
            question: (str, List[str]), batch of question(s) to map along with the context
            context: (str, List[str]), batch of context(s) associated with the provided question keyword argument
        Returns:
            dict: {'answer': str, 'score': float, 'start': int, 'end': int}
            answer: the textual answer in the initial context
            score: the score the model assigned to the current answer
            start: the character index in the original string corresponding to the beginning of the answer's span
            end: the character index in the original string corresponding to the end of the answer's span
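
        Example (a minimal sketch; the texts are hypothetical)::

            nlp = pipeline("question-answering")
            nlp(question="Where do I live?", context="I live in Berlin.")
            nlp(question=["Where do I live?", "Who am I?"], context=["I live in Berlin.", "I am Wolfgang."])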
        """
        # Set default values
        kwargs.setdefault("topk", 1)
        kwargs.setdefault("doc_stride", 128)
        kwargs.setdefault("max_answer_len", 15)
        kwargs.setdefault("max_seq_len", 384)
        kwargs.setdefault("max_question_len", 64)
        kwargs.setdefault("handle_impossible_answer", False)

        if kwargs["topk"] < 1:
            raise ValueError("topk parameter should be >= 1 (got {})".format(kwargs["topk"]))

        if kwargs["max_answer_len"] < 1:
            raise ValueError("max_answer_len parameter should be >= 1 (got {})".format(kwargs["max_answer_len"]))

        # Convert inputs to features
        examples = self._args_parser(*args, **kwargs)
        features_list = [
            squad_convert_examples_to_features(
                examples=[example],
                tokenizer=self.tokenizer,
                max_seq_length=kwargs["max_seq_len"],
                doc_stride=kwargs["doc_stride"],
                max_query_length=kwargs["max_question_len"],
                is_training=False,
                tqdm_enabled=False,
            )
            for example in examples
        ]
        all_answers = []
        for features, example in zip(features_list, examples):
            model_input_names = self.tokenizer.model_input_names + ["input_ids"]
            fw_args = {k: [feature.__dict__[k] for feature in features] for k in model_input_names}

            # Manage tensor allocation on correct device
            with self.device_placement():
                if self.framework == "tf":
                    fw_args = {k: tf.constant(v) for (k, v) in fw_args.items()}
                    start, end = self.model(fw_args)
                    start, end = start.numpy(), end.numpy()
                else:
                    with torch.no_grad():
                        # Retrieve the score for the context tokens only (removing question tokens)
                        fw_args = {k: torch.tensor(v, device=self.device) for (k, v) in fw_args.items()}
                        start, end = self.model(**fw_args)
                        start, end = start.cpu().numpy(), end.cpu().numpy()

            min_null_score = 1000000  # large and positive
            answers = []
            for (feature, start_, end_) in zip(features, start, end):
                # Normalize logits and spans to retrieve the answer
                start_ = np.exp(start_) / np.sum(np.exp(start_))
                end_ = np.exp(end_) / np.sum(np.exp(end_))

                # Mask padding and question
                start_, end_ = (
                    start_ * np.abs(np.array(feature.p_mask) - 1),
                    end_ * np.abs(np.array(feature.p_mask) - 1),
                )

                if kwargs["handle_impossible_answer"]:
                    min_null_score = min(min_null_score, (start_[0] * end_[0]).item())

                # Mask the first (CLS) token so it is never selected as a span boundary
                start_[0] = end_[0] = 0

                starts, ends, scores = self.decode(start_, end_, kwargs["topk"], kwargs["max_answer_len"])
                char_to_word = np.array(example.char_to_word_offset)

                # Convert the answer (tokens) back to the original text
                answers += [
                    {
                        "score": score.item(),
                        "start": np.where(char_to_word == feature.token_to_orig_map[s])[0][0].item(),
                        "end": np.where(char_to_word == feature.token_to_orig_map[e])[0][-1].item(),
                        "answer": " ".join(
                            example.doc_tokens[feature.token_to_orig_map[s] : feature.token_to_orig_map[e] + 1]
                        ),
                    }
                    for s, e, score in zip(starts, ends, scores)
                ]

            if kwargs["handle_impossible_answer"]:
                answers.append({"score": min_null_score, "start": 0, "end": 0, "answer": ""})

            answers = sorted(answers, key=lambda x: x["score"], reverse=True)[: kwargs["topk"]]
            all_answers += answers

        if len(all_answers) == 1:
            return all_answers[0]
        return all_answers

    def decode(self, start: np.ndarray, end: np.ndarray, topk: int, max_answer_len: int) -> Tuple:
        """
        Takes the output of any QuestionAnswering head and generates probabilities for each span to be
        the actual answer.
        In addition, it filters out some unwanted/impossible cases, such as the answer length being greater than
        max_answer_len or the answer end position being before the start position.
        The method supports returning the k best answers through the topk argument.

        Args:
            start: numpy array, holding individual start probabilities for each token
            end: numpy array, holding individual end probabilities for each token
            topk: int, indicates how many possible answer span(s) to extract from the model's output
            max_answer_len: int, maximum size of the answer to extract from the model's output
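
        Example (a toy sketch with made-up probabilities; `qa` stands for a hypothetical
        QuestionAnsweringPipeline instance)::

            start = np.array([0.01, 0.95, 0.04])  # token 1 is the most likely start
            end = np.array([0.01, 0.04, 0.95])    # token 2 is the most likely end
            starts, ends, scores = qa.decode(start, end, topk=1, max_answer_len=15)
            # -> starts == [1], ends == [2], scores == [start[1] * end[2]]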
        """
        # Ensure we have batch axis
        if start.ndim == 1:
            start = start[None]

        if end.ndim == 1:
            end = end[None]

        # Compute the score of each tuple(start, end) to be the real answer
        outer = np.matmul(np.expand_dims(start, -1), np.expand_dims(end, 1))

        # Remove candidate with end < start and end - start > max_answer_len
        candidates = np.tril(np.triu(outer), max_answer_len - 1)

        #  Inspired by Chen & al. (https://github.com/facebookresearch/DrQA)
        scores_flat = candidates.flatten()
        if topk == 1:
            idx_sort = [np.argmax(scores_flat)]
        elif len(scores_flat) < topk:
            idx_sort = np.argsort(-scores_flat)
        else:
            idx = np.argpartition(-scores_flat, topk)[0:topk]
            idx_sort = idx[np.argsort(-scores_flat[idx])]

        start, end = np.unravel_index(idx_sort, candidates.shape)[1:]
        return start, end, candidates[0, start, end]

    def span_to_answer(self, text: str, start: int, end: int):
        """
        When decoding from token probabilities, this method maps token indexes back to the actual words in
        the initial context.

        Args:
            text: str, the actual context to extract the answer from
            start: int, starting answer token index
            end: int, ending answer token index

        Returns:
            dict: {'answer': str, 'start': int, 'end': int}
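
        Example (illustrative; assumes each whitespace-separated word maps to exactly one token,
        and `qa` stands for a hypothetical QuestionAnsweringPipeline instance)::

            qa.span_to_answer("I live in Berlin", start=3, end=3)
            # -> {'answer': 'Berlin', 'start': 10, 'end': 16}  (character offsets into the text)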
        """
        words = []
        token_idx = char_start_idx = char_end_idx = chars_idx = 0

        for i, word in enumerate(text.split(" ")):
            token = self.tokenizer.tokenize(word)

            # Append words if they are in the span
            if start <= token_idx <= end:
                if token_idx == start:
                    char_start_idx = chars_idx

                if token_idx == end:
                    char_end_idx = chars_idx + len(word)

                words += [word]

            # Stop if we went over the end of the answer
            if token_idx > end:
                break

            # Append the subtokenization length to the running index
            token_idx += len(token)
            chars_idx += len(word) + 1

        # Join text with spaces
        return {
            "answer": " ".join(words),
            "start": max(0, char_start_idx),
            "end": min(len(text), char_end_idx),
        }


class SummarizationPipeline(Pipeline):
    """
    Summarize news articles and other documents

    Usage::

        # use bart in pytorch
        summarizer = pipeline("summarization")
        summarizer("Sam Shleifer writes the best docstring examples in the whole world.", min_length=5, max_length=20)

        # use t5 in tf
        summarizer = pipeline("summarization", model="t5-base", tokenizer="t5-base", framework="tf")
        summarizer("Sam Shleifer writes the best docstring examples in the whole world.", min_length=5, max_length=20)

    The models that this pipeline can use are models that have been fine-tuned on a summarization task,
    which currently includes '`bart-large-cnn`', '`t5-small`', '`t5-base`', '`t5-large`', '`t5-3b`', '`t5-11b`'.
    See the up-to-date list of available models on
    `huggingface.co/models <https://huggingface.co/models?filter=summarization>`__.

    Arguments:
        model (:obj:`str` or :obj:`~transformers.PreTrainedModel` or :obj:`~transformers.TFPreTrainedModel`, `optional`, defaults to :obj:`None`):
            The model that will be used by the pipeline to make predictions. This can be :obj:`None`, a string
            checkpoint identifier or an actual pre-trained model inheriting from
            :class:`~transformers.PreTrainedModel` for PyTorch and :class:`~transformers.TFPreTrainedModel` for
            TensorFlow.

            If :obj:`None`, the default of the pipeline will be loaded.
        tokenizer (:obj:`str` or :obj:`~transformers.PreTrainedTokenizer`, `optional`, defaults to :obj:`None`):
            The tokenizer that will be used by the pipeline to encode data for the model. This can be :obj:`None`,
            a string checkpoint identifier or an actual pre-trained tokenizer inheriting from
            :class:`~transformers.PreTrainedTokenizer`.

            If :obj:`None`, the default of the pipeline will be loaded.
        modelcard (:obj:`str` or :class:`~transformers.ModelCard`, `optional`, defaults to :obj:`None`):
            Model card attributed to the model for this pipeline.
        framework (:obj:`str`, `optional`, defaults to :obj:`None`):
            The framework to use, either "pt" for PyTorch or "tf" for TensorFlow. The specified framework must be
            installed.

            If no framework is specified, will default to the one currently installed. If no framework is specified
            and both frameworks are installed, will default to PyTorch.
        args_parser (:class:`~transformers.pipelines.ArgumentHandler`, `optional`, defaults to :obj:`None`):
            Reference to the object in charge of parsing supplied pipeline parameters.
        device (:obj:`int`, `optional`, defaults to :obj:`-1`):
            Device ordinal for CPU/GPU supports. Setting this to -1 will leverage CPU, >=0 will run the model
            on the associated CUDA device id.
    """

    def __init__(self, **kwargs):
        kwargs.update(task="summarization")
        super().__init__(**kwargs)

    def __call__(
        self, *documents, return_tensors=False, return_text=True, clean_up_tokenization_spaces=False, **generate_kwargs
    ):
        r"""
        Args:
            *documents: (list of strings) articles to be summarized
            return_text: (bool, default=True) whether to add a decoded "summary_text" to each result
            return_tensors: (bool, default=False) whether to return the raw "summary_token_ids" to each result

            clean_up_tokenization_spaces: (`optional`) bool whether to clean up the potential extra spaces in the decoded output
            **generate_kwargs: extra kwargs passed to `self.model.generate`_

        Returns:
            list of dicts with 'summary_text' and/or 'summary_token_ids' for each document_to_summarize

        .. _`self.model.generate`:
            https://huggingface.co/transformers/model_doc/bart.html#transformers.BartForConditionalGeneration.generate
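
        Example (a minimal sketch; the input text is hypothetical)::

            summarizer = pipeline("summarization")
            summarizer("A very long news article...", min_length=5, max_length=20)
            # -> [{'summary_text': '...'}]; add return_tensors=True to also get 'summary_token_ids'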

        """
        assert return_tensors or return_text, "You must specify return_tensors=True or return_text=True"
        assert len(documents) > 0, "Please provide a document to summarize"

        if self.framework == "tf" and "BartForConditionalGeneration" in self.model.__class__.__name__:
            raise NotImplementedError(
                "Tensorflow is not yet supported for Bart. Please consider using T5, e.g. `t5-base`"
            )

        prefix = self.model.config.prefix if self.model.config.prefix is not None else ""

        if isinstance(documents[0], list):
            assert (
                self.tokenizer.pad_token_id is not None
            ), "Please make sure that the tokenizer has a pad_token_id when using a batch input"

            documents = ([prefix + document for document in documents[0]],)
            padding = True

        elif isinstance(documents[0], str):
            documents = (prefix + documents[0],)
            padding = False
        else:
            raise ValueError(
                " `documents[0]`: {} has the wrong format. It should be of type `str` or type `list`".format(
                    documents[0]
                )
            )

        with self.device_placement():
            inputs = self._parse_and_tokenize(*documents, padding=padding)

            if self.framework == "pt":
                inputs = self.ensure_tensor_on_device(**inputs)
                input_length = inputs["input_ids"].shape[-1]
            elif self.framework == "tf":
                input_length = tf.shape(inputs["input_ids"])[-1].numpy()

            min_length = generate_kwargs.get("min_length", self.model.config.min_length)
            if input_length < min_length // 2:
                logger.warning(
                    "Your min_length is set to {}, but your input_length is only {}. You might consider decreasing min_length manually, e.g. summarizer('...', min_length=10)".format(
                        min_length, input_length
                    )
                )

            max_length = generate_kwargs.get("max_length", self.model.config.max_length)
            if input_length < max_length:
                logger.warning(
                    "Your max_length is set to {}, but your input_length is only {}. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=50)".format(
                        max_length, input_length
                    )
                )

            summaries = self.model.generate(
                inputs["input_ids"], attention_mask=inputs["attention_mask"], **generate_kwargs,
            )

            results = []
            for summary in summaries:
                record = {}
                if return_tensors:
                    record["summary_token_ids"] = summary
                if return_text:
                    record["summary_text"] = self.tokenizer.decode(
                        summary, skip_special_tokens=True, clean_up_tokenization_spaces=clean_up_tokenization_spaces,
                    )
                results.append(record)
            return results


class TranslationPipeline(Pipeline):
    """
    Translates from one language to another.

    Usage::
        en_fr_translator = pipeline("translation_en_to_fr")
        en_fr_translator("How old are you?")

    The models that this pipeline can use are models that have been fine-tuned on a translation task,
    currently: "t5-small", "t5-base", "t5-large", "t5-3b", "t5-11b".
    See the up-to-date list of available models on
    `huggingface.co/models <https://huggingface.co/models?filter=translation>`__.

    Arguments:
        model (:obj:`str` or :obj:`~transformers.PreTrainedModel` or :obj:`~transformers.TFPreTrainedModel`, `optional`, defaults to :obj:`None`):
            The model that will be used by the pipeline to make predictions. This can be :obj:`None`, a string
            checkpoint identifier or an actual pre-trained model inheriting from
            :class:`~transformers.PreTrainedModel` for PyTorch and :class:`~transformers.TFPreTrainedModel` for
            TensorFlow.
            If :obj:`None`, the default of the pipeline will be loaded.
        tokenizer (:obj:`str` or :obj:`~transformers.PreTrainedTokenizer`, `optional`, defaults to :obj:`None`):
            The tokenizer that will be used by the pipeline to encode data for the model. This can be :obj:`None`,
            a string checkpoint identifier or an actual pre-trained tokenizer inheriting from
            :class:`~transformers.PreTrainedTokenizer`.
            If :obj:`None`, the default of the pipeline will be loaded.
        modelcard (:obj:`str` or :class:`~transformers.ModelCard`, `optional`, defaults to :obj:`None`):
            Model card attributed to the model for this pipeline.
        framework (:obj:`str`, `optional`, defaults to :obj:`None`):
            The framework to use, either "pt" for PyTorch or "tf" for TensorFlow. The specified framework must be
            installed.
            If no framework is specified, will default to the one currently installed. If no framework is specified
            and both frameworks are installed, will default to PyTorch.
        args_parser (:class:`~transformers.pipelines.ArgumentHandler`, `optional`, defaults to :obj:`None`):
            Reference to the object in charge of parsing supplied pipeline parameters.
        device (:obj:`int`, `optional`, defaults to :obj:`-1`):
            Device ordinal for CPU/GPU supports. Setting this to -1 will leverage CPU, >=0 will run the model
            on the associated CUDA device id.
    """

    def __call__(
        self, *args, return_tensors=False, return_text=True, clean_up_tokenization_spaces=False, **generate_kwargs
    ):
        r"""
        Args:
            *args: (list of strings) texts to be translated
            return_text: (bool, default=True) whether to add a decoded "translation_text" to each result
            return_tensors: (bool, default=False) whether to return the raw "translation_token_ids" to each result

            **generate_kwargs: extra kwargs passed to `self.model.generate`_

        Returns:
            list of dicts with 'translation_text' and/or 'translation_token_ids' for each text_to_translate
        .. _`self.model.generate`:
            https://huggingface.co/transformers/model_doc/bart.html#transformers.BartForConditionalGeneration.generate
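
        Example (a minimal sketch; the input sentences are hypothetical)::

            translator = pipeline("translation_en_to_de")
            translator(["How old are you?", "Where do you live?"])
            # -> one dict per input, e.g. [{'translation_text': '...'}, {'translation_text': '...'}]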
        """
        assert return_tensors or return_text, "You must specify return_tensors=True or return_text=True"

        prefix = self.model.config.prefix if self.model.config.prefix is not None else ""

        if isinstance(args[0], list):
            assert (
                self.tokenizer.pad_token_id is not None
            ), "Please make sure that the tokenizer has a pad_token_id when using a batch input"
            args = ([prefix + text for text in args[0]],)
            padding = True

        elif isinstance(args[0], str):
            args = (prefix + args[0],)
            padding = False
        else:
            raise ValueError(
                " `args[0]`: {} has the wrong format. It should be of type `str` or type `list`".format(
                    args[0]
                )
            )

        with self.device_placement():
            inputs = self._parse_and_tokenize(*args, padding=padding)

            if self.framework == "pt":
                inputs = self.ensure_tensor_on_device(**inputs)
                input_length = inputs["input_ids"].shape[-1]

            elif self.framework == "tf":
                input_length = tf.shape(inputs["input_ids"])[-1].numpy()

            max_length = generate_kwargs.get("max_length", self.model.config.max_length)
            if input_length > 0.9 * max_length:
                logger.warning(
                    "Your input_length: {} is bigger than 0.9 * max_length: {}. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)".format(
                        input_length, max_length
                    )
                )

            translations = self.model.generate(
                inputs["input_ids"], attention_mask=inputs["attention_mask"], **generate_kwargs,
            )
            results = []
            for translation in translations:
                record = {}
                if return_tensors:
                    record["translation_token_ids"] = translation
                if return_text:
                    record["translation_text"] = self.tokenizer.decode(
                        translation,
                        skip_special_tokens=True,
                        clean_up_tokenization_spaces=clean_up_tokenization_spaces,
                    )
                results.append(record)
            return results


# Register all the supported tasks here
SUPPORTED_TASKS = {
    "feature-extraction": {
        "impl": FeatureExtractionPipeline,
        "tf": TFAutoModel if is_tf_available() else None,
        "pt": AutoModel if is_torch_available() else None,
        "default": {"model": {"pt": "distilbert-base-cased", "tf": "distilbert-base-cased"}},
    },
    "sentiment-analysis": {
        "impl": TextClassificationPipeline,
        "tf": TFAutoModelForSequenceClassification if is_tf_available() else None,
        "pt": AutoModelForSequenceClassification if is_torch_available() else None,
        "default": {
            "model": {
                "pt": "distilbert-base-uncased-finetuned-sst-2-english",
                "tf": "distilbert-base-uncased-finetuned-sst-2-english",
            },
        },
    },
    "ner": {
        "impl": TokenClassificationPipeline,
        "tf": TFAutoModelForTokenClassification if is_tf_available() else None,
        "pt": AutoModelForTokenClassification if is_torch_available() else None,
        "default": {
            "model": {
                "pt": "dbmdz/bert-large-cased-finetuned-conll03-english",
                "tf": "dbmdz/bert-large-cased-finetuned-conll03-english",
            },
        },
    },
    "question-answering": {
        "impl": QuestionAnsweringPipeline,
        "tf": TFAutoModelForQuestionAnswering if is_tf_available() else None,
        "pt": AutoModelForQuestionAnswering if is_torch_available() else None,
        "default": {
            "model": {"pt": "distilbert-base-cased-distilled-squad", "tf": "distilbert-base-cased-distilled-squad"},
        },
    },
    "fill-mask": {
        "impl": FillMaskPipeline,
        "tf": TFAutoModelWithLMHead if is_tf_available() else None,
        "pt": AutoModelWithLMHead if is_torch_available() else None,
        "default": {"model": {"pt": "distilroberta-base", "tf": "distilroberta-base"}},
    },
    "summarization": {
        "impl": SummarizationPipeline,
        "tf": TFAutoModelWithLMHead if is_tf_available() else None,
        "pt": AutoModelForSeq2SeqLM if is_torch_available() else None,
        "default": {"model": {"pt": "sshleifer/distilbart-cnn-12-6", "tf": "t5-small"}},
    },
    "translation_en_to_fr": {
        "impl": TranslationPipeline,
        "tf": TFAutoModelWithLMHead if is_tf_available() else None,
        "pt": AutoModelWithLMHead if is_torch_available() else None,
        "default": {"model": {"pt": "t5-base", "tf": "t5-base"}},
    },
    "translation_en_to_de": {
        "impl": TranslationPipeline,
        "tf": TFAutoModelWithLMHead if is_tf_available() else None,
        "pt": AutoModelWithLMHead if is_torch_available() else None,
        "default": {"model": {"pt": "t5-base", "tf": "t5-base"}},
    },
    "translation_en_to_ro": {
        "impl": TranslationPipeline,
        "tf": TFAutoModelWithLMHead if is_tf_available() else None,
        "pt": AutoModelWithLMHead if is_torch_available() else None,
        "default": {"model": {"pt": "t5-base", "tf": "t5-base"}},
    },
    "text-generation": {
        "impl": TextGenerationPipeline,
        "tf": TFAutoModelWithLMHead if is_tf_available() else None,
        "pt": AutoModelWithLMHead if is_torch_available() else None,
        "default": {"model": {"pt": "gpt2", "tf": "gpt2"}},
    },
}
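
# Example (illustrative): the implementation class and the per-framework default
# checkpoint for a task can be looked up directly in the registry above, e.g.:
#   SUPPORTED_TASKS["summarization"]["impl"]                    -> SummarizationPipeline
#   SUPPORTED_TASKS["summarization"]["default"]["model"]["pt"]  -> "sshleifer/distilbart-cnn-12-6"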


def pipeline(
    task: str,
    model: Optional[Union[str, "PreTrainedModel", "TFPreTrainedModel"]] = None,
    config: Optional[Union[str, PretrainedConfig]] = None,
    tokenizer: Optional[Union[str, PreTrainedTokenizer]] = None,
    framework: Optional[str] = None,
    **kwargs
) -> Pipeline:
    """
    Utility factory method to build a pipeline.

    Pipelines are made of:

        - A Tokenizer instance in charge of mapping raw textual input to tokens
        - A Model instance
        - Some (optional) post processing for enhancing the model's output


    Args:
        task (:obj:`str`):
            The task defining which pipeline will be returned. Currently accepted tasks are:

            - "feature-extraction": will return a :class:`~transformers.FeatureExtractionPipeline`
            - "sentiment-analysis": will return a :class:`~transformers.TextClassificationPipeline`
            - "ner": will return a :class:`~transformers.TokenClassificationPipeline`
            - "question-answering": will return a :class:`~transformers.QuestionAnsweringPipeline`
            - "fill-mask": will return a :class:`~transformers.FillMaskPipeline`
            - "summarization": will return a :class:`~transformers.SummarizationPipeline`
            - "translation_xx_to_yy": will return a :class:`~transformers.TranslationPipeline`
            - "text-generation": will return a :class:`~transformers.TextGenerationPipeline`
        model (:obj:`str` or :obj:`~transformers.PreTrainedModel` or :obj:`~transformers.TFPreTrainedModel`, `optional`, defaults to :obj:`None`):
            The model that will be used by the pipeline to make predictions. This can be :obj:`None`,
            a model identifier or an actual pre-trained model inheriting from
            :class:`~transformers.PreTrainedModel` for PyTorch and :class:`~transformers.TFPreTrainedModel` for
            TensorFlow.

            If :obj:`None`, the default for this pipeline will be loaded.
        config (:obj:`str` or :obj:`~transformers.PretrainedConfig`, `optional`, defaults to :obj:`None`):
            The configuration that will be used by the pipeline to instantiate the model. This can be :obj:`None`,
            a model identifier or an actual pre-trained model configuration inheriting from
            :class:`~transformers.PretrainedConfig`.

            If :obj:`None`, the default for this pipeline will be loaded.
        tokenizer (:obj:`str` or :obj:`~transformers.PreTrainedTokenizer`, `optional`, defaults to :obj:`None`):
            The tokenizer that will be used by the pipeline to encode data for the model. This can be :obj:`None`,
            a model identifier or an actual pre-trained tokenizer inheriting from
            :class:`~transformers.PreTrainedTokenizer`.

            If :obj:`None`, the default for this pipeline will be loaded.
        framework (:obj:`str`, `optional`, defaults to :obj:`None`):
            The framework to use, either "pt" for PyTorch or "tf" for TensorFlow. The specified framework must be
            installed.

            If no framework is specified, will default to the one currently installed. If no framework is specified
            and both frameworks are installed, will default to PyTorch.

    Returns:
        :class:`~transformers.Pipeline`: Class inheriting from :class:`~transformers.Pipeline`, according to
        the task.

    Examples::

        from transformers import pipeline, AutoModelForTokenClassification, AutoTokenizer

        # Sentiment analysis pipeline
        pipeline('sentiment-analysis')

        # Question answering pipeline, specifying the checkpoint identifier
        pipeline('question-answering', model='distilbert-base-cased-distilled-squad', tokenizer='bert-base-cased')

        # Named entity recognition pipeline, passing in a specific model and tokenizer
        model = AutoModelForTokenClassification.from_pretrained("dbmdz/bert-large-cased-finetuned-conll03-english")
        tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
        pipeline('ner', model=model, tokenizer=tokenizer)
    """
    # Retrieve the task
    if task not in SUPPORTED_TASKS:
        raise KeyError("Unknown task {}, available tasks are {}".format(task, list(SUPPORTED_TASKS.keys())))

    framework = framework or get_framework(model)

    targeted_task = SUPPORTED_TASKS[task]
    task_class, model_class = targeted_task["impl"], targeted_task[framework]

    # Use default model/config/tokenizer for the task if no model is provided
    if model is None:
        model = targeted_task["default"]["model"][framework]

    # Try to infer tokenizer from model or config name (if provided as str)
    if tokenizer is None:
        if isinstance(model, str):
            tokenizer = model
        elif isinstance(config, str):
            tokenizer = config
        else:
            # Impossible to guess the right tokenizer here
            raise Exception(
                "Impossible to guess which tokenizer to use. "
                "Please provide a PreTrainedTokenizer class or a path/identifier to a pretrained tokenizer."
            )

    modelcard = None
    # Try to infer modelcard from model or config name (if provided as str)
    if isinstance(model, str):
        modelcard = model
    elif isinstance(config, str):
        modelcard = config

    # Instantiate tokenizer if needed
    if isinstance(tokenizer, (str, tuple)):
        if isinstance(tokenizer, tuple):
            # For tuple we have (tokenizer name, {kwargs})
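            # e.g. tokenizer=("t5-base", {"use_fast": False})  (illustrative values; the kwargs
            # dict is forwarded as-is to AutoTokenizer.from_pretrained)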
            tokenizer = AutoTokenizer.from_pretrained(tokenizer[0], **tokenizer[1])
        else:
            tokenizer = AutoTokenizer.from_pretrained(tokenizer)

    # Instantiate config if needed
    if isinstance(config, str):
        config = AutoConfig.from_pretrained(config)

    # Instantiate modelcard if needed
    if isinstance(modelcard, str):
        modelcard = ModelCard.from_pretrained(modelcard)

    # Instantiate model if needed
    if isinstance(model, str):
        # Handle transparent TF/PT model conversion
        model_kwargs = {}
        if framework == "pt" and model.endswith(".h5"):
            model_kwargs["from_tf"] = True
            logger.warning(
                "Model might be a TensorFlow model (ending with `.h5`) but TensorFlow is not available. "
                "Trying to load the model with PyTorch."
            )
        elif framework == "tf" and model.endswith(".bin"):
            model_kwargs["from_pt"] = True
            logger.warning(
                "Model might be a PyTorch model (ending with `.bin`) but PyTorch is not available. "
                "Trying to load the model with TensorFlow."
            )
        model = model_class.from_pretrained(model, config=config, **model_kwargs)

    return task_class(model=model, tokenizer=tokenizer, modelcard=modelcard, framework=framework, task=task, **kwargs)