# coding=utf-8
# Copyright 2018 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import csv
import json
import logging
import os
import pickle
import sys
from abc import ABC, abstractmethod
from contextlib import contextmanager
from itertools import chain
from os.path import abspath, exists
from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Sequence, Tuple, Union

import numpy as np

from .configuration_auto import AutoConfig
from .configuration_utils import PretrainedConfig
from .data import SquadExample, squad_convert_examples_to_features
from .file_utils import is_tf_available, is_torch_available
from .modelcard import ModelCard
from .tokenization_auto import AutoTokenizer
from .tokenization_bert import BasicTokenizer
from .tokenization_utils import PreTrainedTokenizer


if is_tf_available():
    import tensorflow as tf
    from .modeling_tf_auto import (
        TFAutoModel,
        TFAutoModelForSequenceClassification,
        TFAutoModelForQuestionAnswering,
        TFAutoModelForTokenClassification,
        TFAutoModelWithLMHead,
    )

if is_torch_available():
    import torch
    from .modeling_auto import (
        AutoModel,
        AutoModelForSequenceClassification,
        AutoModelForQuestionAnswering,
        AutoModelForTokenClassification,
        AutoModelWithLMHead,
        AutoModelForSeq2SeqLM,
    )

if TYPE_CHECKING:
    from .modeling_utils import PreTrainedModel
    from .modeling_tf_utils import TFPreTrainedModel


logger = logging.getLogger(__name__)


def get_framework(model=None):
    """ Select framework (TensorFlow/PyTorch) to use.
        If both frameworks are installed and no specific model is provided, defaults to using PyTorch.
    """
    if is_tf_available() and is_torch_available() and model is not None and not isinstance(model, str):
        # Both frameworks are available but the user supplied a model class instance.
        # Try to guess which framework to use from the model classname
        framework = "tf" if model.__class__.__name__.startswith("TF") else "pt"
    elif not is_tf_available() and not is_torch_available():
        raise RuntimeError(
            "At least one of TensorFlow 2.0 or PyTorch should be installed. "
            "To install TensorFlow 2.0, read the instructions at https://www.tensorflow.org/install/ "
            "To install PyTorch, read the instructions at https://pytorch.org/."
        )
    else:
        # framework = 'tf' if is_tf_available() else 'pt'
        framework = "pt" if is_torch_available() else "tf"
    return framework
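
# Usage sketch for get_framework (illustrative only; `tf_model` is a
# hypothetical TF* model instance, and at least one backend is assumed installed):
#
#     get_framework()          # -> "pt" when PyTorch is available, else "tf"
#     get_framework(tf_model)  # -> "tf" for instances whose class name starts with "TF"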


class PipelineException(Exception):
    """
    Raised by pipelines when handling __call__
    """

    def __init__(self, task: str, model: str, reason: str):
        super().__init__(reason)

        self.task = task
        self.model = model


class ArgumentHandler(ABC):
    """
    Base interface for handling varargs for each Pipeline
    """

    @abstractmethod
    def __call__(self, *args, **kwargs):
        raise NotImplementedError()


class DefaultArgumentHandler(ArgumentHandler):
    """
    Default varargs argument parser handling parameters for each Pipeline
    """

    @staticmethod
    def handle_kwargs(kwargs: Dict) -> List:
        if len(kwargs) == 1:
            output = list(kwargs.values())
        else:
            output = list(chain(kwargs.values()))

        return DefaultArgumentHandler.handle_args(output)

    @staticmethod
    def handle_args(args: Sequence[Any]) -> List[str]:

        # Only one argument, let's do case by case
        if len(args) == 1:
            if isinstance(args[0], str):
                return [args[0]]
            elif not isinstance(args[0], list):
                return list(args)
            else:
                return args[0]

        # Multiple arguments (x1, x2, ...)
        elif len(args) > 1:
            if all([isinstance(arg, str) for arg in args]):
                return list(args)

            # If not a list, then it should be an instance of an iterable
            elif isinstance(args, Iterable):
                return list(chain.from_iterable(chain(args)))
            else:
                raise ValueError(
                    "Invalid input type {}. Pipeline supports Union[str, Iterable[str]]".format(type(args))
                )
        else:
            return []

    def __call__(self, *args, **kwargs):
        if len(kwargs) > 0 and len(args) > 0:
            raise ValueError("Pipeline cannot handle mixed args and kwargs")

        if len(kwargs) > 0:
            return DefaultArgumentHandler.handle_kwargs(kwargs)
        else:
            return DefaultArgumentHandler.handle_args(args)
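
# Normalization sketch for DefaultArgumentHandler (illustrative values only):
#
#     handler = DefaultArgumentHandler()
#     handler("Hello")              # -> ["Hello"]
#     handler(["Hello", "World"])   # -> ["Hello", "World"]
#     handler(data=["Hello"])       # -> ["Hello"]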


class PipelineDataFormat:
    """
    Base class for all the pipeline supported data format both for reading and writing.
    Supported data formats currently include:
     - JSON
     - CSV
     - stdin/stdout (pipe)

    PipelineDataFormat also includes some utilities to work with multi-column data, like mapping from dataset columns
    to pipeline keyword arguments through the `dataset_kwarg_1=dataset_column_1` format.
    """

    SUPPORTED_FORMATS = ["json", "csv", "pipe"]

    def __init__(
        self, output_path: Optional[str], input_path: Optional[str], column: Optional[str], overwrite=False,
    ):
        self.output_path = output_path
        self.input_path = input_path
        self.column = column.split(",") if column is not None else [""]
        self.is_multi_columns = len(self.column) > 1

        if self.is_multi_columns:
            self.column = [tuple(c.split("=")) if "=" in c else (c, c) for c in self.column]

        if output_path is not None and not overwrite:
            if exists(abspath(self.output_path)):
                raise OSError("{} already exists on disk".format(self.output_path))

        if input_path is not None:
            if not exists(abspath(self.input_path)):
                raise OSError("{} doesn't exist on disk".format(self.input_path))

    @abstractmethod
    def __iter__(self):
        raise NotImplementedError()

    @abstractmethod
    def save(self, data: dict):
        """
        Save the provided data object with the representation for the current `DataFormat`.
        :param data: data to store
        :return:
        """
        raise NotImplementedError()

    def save_binary(self, data: Union[dict, List[dict]]) -> str:
        """
        Save the provided data object as pickle-formatted binary data on disk.
        :param data: data to store
        :return: (str) Path where the data has been saved
        """
        path, _ = os.path.splitext(self.output_path)
        binary_path = os.path.extsep.join((path, "pickle"))

        with open(binary_path, "wb+") as f_output:
            pickle.dump(data, f_output)

        return binary_path

    @staticmethod
    def from_str(
        format: str, output_path: Optional[str], input_path: Optional[str], column: Optional[str], overwrite=False,
    ):
        if format == "json":
            return JsonPipelineDataFormat(output_path, input_path, column, overwrite=overwrite)
        elif format == "csv":
            return CsvPipelineDataFormat(output_path, input_path, column, overwrite=overwrite)
        elif format == "pipe":
            return PipedPipelineDataFormat(output_path, input_path, column, overwrite=overwrite)
        else:
            raise KeyError("Unknown reader {} (Available readers are json/csv/pipe)".format(format))
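
# Usage sketch for PipelineDataFormat.from_str (hypothetical file names; the
# output file must not already exist unless overwrite=True is passed):
#
#     fmt = PipelineDataFormat.from_str("csv", "out.csv", "in.csv", "text")
#     for sample in fmt:  # yields the "text" column of each row in in.csv
#         ...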


class CsvPipelineDataFormat(PipelineDataFormat):
    def __init__(
        self, output_path: Optional[str], input_path: Optional[str], column: Optional[str], overwrite=False,
    ):
        super().__init__(output_path, input_path, column, overwrite=overwrite)

    def __iter__(self):
        with open(self.input_path, "r") as f:
            reader = csv.DictReader(f)
            for row in reader:
                if self.is_multi_columns:
                    yield {k: row[c] for k, c in self.column}
                else:
                    yield row[self.column[0]]

    def save(self, data: List[dict]):
        with open(self.output_path, "w") as f:
            if len(data) > 0:
                writer = csv.DictWriter(f, list(data[0].keys()))
                writer.writeheader()
                writer.writerows(data)


class JsonPipelineDataFormat(PipelineDataFormat):
    def __init__(
        self, output_path: Optional[str], input_path: Optional[str], column: Optional[str], overwrite=False,
    ):
        super().__init__(output_path, input_path, column, overwrite=overwrite)

        with open(input_path, "r") as f:
            self._entries = json.load(f)

    def __iter__(self):
        for entry in self._entries:
            if self.is_multi_columns:
                yield {k: entry[c] for k, c in self.column}
            else:
                yield entry[self.column[0]]

    def save(self, data: dict):
        with open(self.output_path, "w") as f:
            json.dump(data, f)


class PipedPipelineDataFormat(PipelineDataFormat):
    """
    Read data from piped input to the python process.
    For multi-column data, columns should be separated by \t

    If columns are provided, then the output will be a dictionary with {column_x: value_x}
    """

    def __iter__(self):
        for line in sys.stdin:
            # Split for multi-columns
            if "\t" in line:
                line = line.split("\t")
                if self.column:
                    # Dictionary to map arguments
                    yield {kwargs: l for (kwargs, _), l in zip(self.column, line)}
                else:
                    yield tuple(line)

            # No dictionary to map arguments
            else:
                yield line

    def save(self, data: dict):
        print(data)

    def save_binary(self, data: Union[dict, List[dict]]) -> str:
        if self.output_path is None:
            raise KeyError(
                "When using piped input on pipeline outputting large object requires an output file path. "
                "Please provide such output path through --output argument."
            )

        return super().save_binary(data)
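
# Usage sketch (assumes samples arrive on stdin, one per line; `nlp` is a
# hypothetical pipeline instance):
#
#     fmt = PipedPipelineDataFormat(output_path=None, input_path=None, column=None)
#     for sample in fmt:          # reads sys.stdin
#         fmt.save(nlp(sample))   # prints each result to stdout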


class _ScikitCompat(ABC):
    """
    Interface layer for the Scikit and Keras compatibility.
    """

    @abstractmethod
    def transform(self, X):
        raise NotImplementedError()

    @abstractmethod
    def predict(self, X):
        raise NotImplementedError()


class Pipeline(_ScikitCompat):
    """
    The Pipeline class is the class from which all pipelines inherit. Refer to this class for methods shared across
    different pipelines.

    Base class implementing pipelined operations.
    Pipeline workflow is defined as a sequence of the following operations:

        Input -> Tokenization -> Model Inference -> Post-Processing (Task dependent) -> Output

    Pipeline supports running on CPU or GPU through the device argument. Users can specify the
    device argument as an integer: -1 means "CPU", while >= 0 refers to the CUDA device ordinal.

    Some pipelines, like FeatureExtractionPipeline ('feature-extraction'), output large
    tensor objects as nested lists. In order to avoid dumping such large structures as textual data we
    provide the binary_output constructor argument. If set to True, the output will be stored in the
    pickle format.

    Arguments:
        model (:obj:`~transformers.PreTrainedModel` or :obj:`~transformers.TFPreTrainedModel`):
            The model that will be used by the pipeline to make predictions. This needs to be a model inheriting from
            :class:`~transformers.PreTrainedModel` for PyTorch and :class:`~transformers.TFPreTrainedModel` for
            TensorFlow.
        tokenizer (:obj:`~transformers.PreTrainedTokenizer`):
            The tokenizer that will be used by the pipeline to encode data for the model. This object inherits from
            :class:`~transformers.PreTrainedTokenizer`.
        modelcard (:obj:`str` or :class:`~transformers.ModelCard`, `optional`, defaults to :obj:`None`):
            Model card attributed to the model for this pipeline.
        framework (:obj:`str`, `optional`, defaults to :obj:`None`):
            The framework to use, either "pt" for PyTorch or "tf" for TensorFlow. The specified framework must be
            installed.

            If no framework is specified, will default to the one currently installed. If no framework is specified
            and both frameworks are installed, will default to PyTorch.
        args_parser (:class:`~transformers.pipelines.ArgumentHandler`, `optional`, defaults to :obj:`None`):
            Reference to the object in charge of parsing supplied pipeline parameters.
        device (:obj:`int`, `optional`, defaults to :obj:`-1`):
            Device ordinal for CPU/GPU supports. Setting this to -1 will leverage CPU, >=0 will run the model
            on the associated CUDA device id.
        binary_output (:obj:`bool`, `optional`, defaults to :obj:`False`):
            Flag indicating if the output the pipeline should happen in a binary format (i.e. pickle) or as raw text.

    Return:
        :obj:`List` or :obj:`Dict`:
        Pipeline returns list or dictionary depending on:

         - Whether the user supplied multiple samples
         - Whether the pipeline exposes multiple fields in the output object
    """

    default_input_names = None

    def __init__(
        self,
        model: Union["PreTrainedModel", "TFPreTrainedModel"],
        tokenizer: PreTrainedTokenizer,
        modelcard: Optional[ModelCard] = None,
        framework: Optional[str] = None,
        task: str = "",
        args_parser: ArgumentHandler = None,
        device: int = -1,
        binary_output: bool = False,
    ):

        if framework is None:
            framework = get_framework()

        self.model = model
        self.tokenizer = tokenizer
        self.modelcard = modelcard
        self.framework = framework
        self.device = device if framework == "tf" else torch.device("cpu" if device < 0 else "cuda:{}".format(device))
        self.binary_output = binary_output
        self._args_parser = args_parser or DefaultArgumentHandler()

        # Special handling
        if self.framework == "pt" and self.device.type == "cuda":
            self.model = self.model.to(self.device)

        # Update config with task specific parameters
        task_specific_params = self.model.config.task_specific_params
        if task_specific_params is not None and task in task_specific_params:
            self.model.config.update(task_specific_params.get(task))

    def save_pretrained(self, save_directory):
        """
        Save the pipeline's model and tokenizer to the specified save_directory
        """
        if os.path.isfile(save_directory):
            logger.error("Provided path ({}) should be a directory, not a file".format(save_directory))
            return
        os.makedirs(save_directory, exist_ok=True)

        self.model.save_pretrained(save_directory)
        self.tokenizer.save_pretrained(save_directory)
        if self.modelcard is not None:
            self.modelcard.save_pretrained(save_directory)

    def transform(self, X):
        """
        Scikit / Keras interface to transformers' pipelines. This method will forward to __call__().
        """
        return self(X=X)

    def predict(self, X):
        """
        Scikit / Keras interface to transformers' pipelines. This method will forward to __call__().
        """
        return self(X=X)

    @contextmanager
    def device_placement(self):
        """
        Context Manager allowing tensor allocation on the user-specified device in a framework-agnostic way.
        example:
            # Explicitly ask for tensor allocation on CUDA device :0
            nlp = pipeline(..., device=0)
            with nlp.device_placement():
                # Every framework specific tensor allocation will be done on the request device
                output = nlp(...)
        Returns:
            Context manager
        """
        if self.framework == "tf":
            with tf.device("/CPU:0" if self.device == -1 else "/device:GPU:{}".format(self.device)):
                yield
        else:
            if self.device.type == "cuda":
                torch.cuda.set_device(self.device)

            yield

    def ensure_tensor_on_device(self, **inputs):
        """
        Ensure PyTorch tensors are on the specified device.
        :param inputs:
        :return:
        """
        return {name: tensor.to(self.device) for name, tensor in inputs.items()}

    def _parse_and_tokenize(self, *args, padding=True, add_special_tokens=True, **kwargs):
        """
        Parse arguments and tokenize
        """
        # Parse arguments
        inputs = self._args_parser(*args, **kwargs)
        inputs = self.tokenizer(
            inputs, add_special_tokens=add_special_tokens, return_tensors=self.framework, padding=padding,
        )

        return inputs

    def __call__(self, *args, **kwargs):
        inputs = self._parse_and_tokenize(*args, **kwargs)
        return self._forward(inputs)

    def _forward(self, inputs, return_tensors=False):
        """
        Internal framework specific forward dispatching.
        Args:
            inputs: dict holding all the keyword arguments required by the model forward method.
            return_tensors: Whether to return native framework (pt/tf) tensors rather than numpy array.
        Returns:
            Numpy array
        """
        # Encode for forward
        with self.device_placement():
            if self.framework == "tf":
                # TODO trace model
                predictions = self.model(inputs.data, training=False)[0]
            else:
                with torch.no_grad():
                    inputs = self.ensure_tensor_on_device(**inputs)
                    predictions = self.model(**inputs)[0].cpu()

        if return_tensors:
            return predictions
        else:
            return predictions.numpy()
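
# Call-flow sketch for any concrete Pipeline subclass (`nlp` is a hypothetical
# instance built by the `pipeline()` factory defined later in this module):
#
#     nlp("Some input text")     # __call__: tokenize -> _forward -> post-process
#     nlp.transform(["a", "b"])  # scikit-learn style alias for __call__
#     nlp.predict(["a", "b"])    # Keras style alias for __call__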


class FeatureExtractionPipeline(Pipeline):
    """
    Feature extraction pipeline using Model head. This pipeline extracts the hidden states from the base transformer,
    which can be used as features in downstream tasks.

    This feature extraction pipeline can currently be loaded from the :func:`~transformers.pipeline` method using
    the following task identifier(s):

    - "feature-extraction", for extracting features of a sequence.

    All models may be used for this pipeline. See a list of all models, including community-contributed models on
    `huggingface.co/models <https://huggingface.co/models>`__.

    Arguments:
        model (:obj:`~transformers.PreTrainedModel` or :obj:`~transformers.TFPreTrainedModel`):
            The model that will be used by the pipeline to make predictions. This needs to be a model inheriting from
            :class:`~transformers.PreTrainedModel` for PyTorch and :class:`~transformers.TFPreTrainedModel` for
            TensorFlow.
        tokenizer (:obj:`~transformers.PreTrainedTokenizer`):
            The tokenizer that will be used by the pipeline to encode data for the model. This object inherits from
            :class:`~transformers.PreTrainedTokenizer`.
        modelcard (:obj:`str` or :class:`~transformers.ModelCard`, `optional`, defaults to :obj:`None`):
            Model card attributed to the model for this pipeline.
        framework (:obj:`str`, `optional`, defaults to :obj:`None`):
            The framework to use, either "pt" for PyTorch or "tf" for TensorFlow. The specified framework must be
            installed.

            If no framework is specified, will default to the one currently installed. If no framework is specified
            and both frameworks are installed, will default to PyTorch.
        args_parser (:class:`~transformers.pipelines.ArgumentHandler`, `optional`, defaults to :obj:`None`):
            Reference to the object in charge of parsing supplied pipeline parameters.
        device (:obj:`int`, `optional`, defaults to :obj:`-1`):
            Device ordinal for CPU/GPU supports. Setting this to -1 will leverage CPU, >=0 will run the model
            on the associated CUDA device id.
    """

    def __init__(
        self,
        model: Union["PreTrainedModel", "TFPreTrainedModel"],
        tokenizer: PreTrainedTokenizer,
        modelcard: Optional[ModelCard] = None,
        framework: Optional[str] = None,
        args_parser: ArgumentHandler = None,
        device: int = -1,
        task: str = "",
    ):
        super().__init__(
            model=model,
            tokenizer=tokenizer,
            modelcard=modelcard,
            framework=framework,
            args_parser=args_parser,
            device=device,
            binary_output=True,
            task=task,
        )

    def __call__(self, *args, **kwargs):
        return super().__call__(*args, **kwargs).tolist()
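
# Usage sketch (assumes the `pipeline()` factory defined later in this module
# and a downloadable default checkpoint):
#
#     extractor = pipeline("feature-extraction")
#     features = extractor("We are very happy.")
#     # nested lists: len(features[0]) == number of tokens,
#     # len(features[0][0]) == hidden size of the model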


class TextGenerationPipeline(Pipeline):
    """
    Language generation pipeline using any ModelWithLMHead head. This pipeline predicts the words that will follow a specified text prompt.

    This language generation pipeline can currently be loaded from the :func:`~transformers.pipeline` method using
    the following task identifier(s):

    - "text-generation", for generating text from a specified prompt.

    The models that this pipeline can use are models that have been trained with an autoregressive language modeling objective,
    which includes the uni-directional models in the library (e.g. gpt2).
    See the list of available community models on
    `huggingface.co/models <https://huggingface.co/models?search=&filter=lm-head>`__.
    """

    # Padding text to help Transformer-XL and XLNet with short prompts as proposed by Aman Rusia
    # in https://github.com/rusiaaman/XLNet-gen#methodology
    # and https://medium.com/@amanrusia/xlnet-speaks-comparison-to-gpt-2-ea1a4e9ba39e
    PADDING_TEXT = """In 1991, the remains of Russian Tsar Nicholas II and his family
    (except for Alexei and Maria) are discovered.
    The voice of Nicholas's young son, Tsarevich Alexei Nikolaevich, narrates the
    remainder of the story. 1883 Western Siberia,
    a young Grigori Rasputin is asked by his father and a group of men to perform magic.
    Rasputin has a vision and denounces one of the men as a horse thief. Although his
    father initially slaps him for making such an accusation, Rasputin watches as the
    man is chased outside and beaten. Twenty years later, Rasputin sees a vision of
    the Virgin Mary, prompting him to become a priest. Rasputin quickly becomes famous,
602
    with people, even a bishop, begging for his blessing. """

    ALLOWED_MODELS = [
        "XLNetLMHeadModel",
        "TransfoXLLMHeadModel",
        "ReformerModelWithLMHead",
        "GPT2LMHeadModel",
        "OpenAIGPTLMHeadModel",
        "CTRLLMHeadModel",
        "TFXLNetLMHeadModel",
        "TFTransfoXLLMHeadModel",
        "TFGPT2LMHeadModel",
        "TFOpenAIGPTLMHeadModel",
        "TFCTRLLMHeadModel",
    ]

    # overriding _parse_and_tokenize to allow for unusual language-modeling tokenizer arguments

    def _parse_and_tokenize(self, *args, padding=True, add_special_tokens=True, **kwargs):
        """
        Parse arguments and tokenize
        """
        # Parse arguments
        if self.model.__class__.__name__ in ["TransfoXLLMHeadModel"]:
            tokenizer_kwargs = {"add_space_before_punct_symbol": True}
        else:
            tokenizer_kwargs = {}
        inputs = self._args_parser(*args, **kwargs)
        inputs = self.tokenizer(
            inputs,
            add_special_tokens=add_special_tokens,
            return_tensors=self.framework,
            padding=padding,
            **tokenizer_kwargs,
        )

        return inputs

    def __call__(
        self, *args, return_tensors=False, return_text=True, clean_up_tokenization_spaces=False, **generate_kwargs
    ):
        if self.model.__class__.__name__ not in self.ALLOWED_MODELS:
            raise NotImplementedError(
                "Generation is currently not supported for {}. Please select a model from {} for generation.".format(
                    self.model.__class__.__name__, self.ALLOWED_MODELS
                )
            )

        text_inputs = self._args_parser(*args)

        results = []
        for prompt_text in text_inputs:
            # Manage correct placement of the tensors
            with self.device_placement():
                if self.model.__class__.__name__ in ["XLNetLMHeadModel", "TransfoXLLMHeadModel"]:
                    # For XLNet and TransformerXL we add an article to the prompt to give more state to the model.
                    padding_text = self.PADDING_TEXT + self.tokenizer.eos_token
                    padding = self._parse_and_tokenize(padding_text, padding=False, add_special_tokens=False)
                    # This impacts max_length and min_length argument that need adjusting.
                    padding_length = padding["input_ids"].shape[-1]
                    if "max_length" in generate_kwargs and generate_kwargs["max_length"] is not None:
                        generate_kwargs["max_length"] += padding_length
                    if "min_length" in generate_kwargs and generate_kwargs["min_length"] is not None:
                        generate_kwargs["min_length"] += padding_length

                    inputs = self._parse_and_tokenize(
                        padding_text + prompt_text, padding=False, add_special_tokens=False
                    )
                else:
                    inputs = self._parse_and_tokenize(prompt_text, padding=False, add_special_tokens=False)

                # set input_ids to None to allow empty prompt
                if inputs["input_ids"].shape[-1] == 0:
                    inputs["input_ids"] = None
                    inputs["attention_mask"] = None

                if self.framework == "pt" and inputs["input_ids"] is not None:
                    inputs = self.ensure_tensor_on_device(**inputs)

                input_ids = inputs["input_ids"]

                # Ensure that batch size = 1 (batch generation not allowed for now)
                assert (
                    input_ids is None or input_ids.shape[0] == 1
                ), "Batch generation is currently not supported. See https://github.com/huggingface/transformers/issues/3021 for more information."

                output_sequences = self.model.generate(input_ids=input_ids, **generate_kwargs)  # BS x SL

            result = []
            for generated_sequence in output_sequences:
                generated_sequence = generated_sequence.numpy().tolist()
                record = {}
                if return_tensors:
                    record["generated_token_ids"] = generated_sequence
                if return_text:
                    # Decode text
                    text = self.tokenizer.decode(
                        generated_sequence,
                        skip_special_tokens=True,
                        clean_up_tokenization_spaces=clean_up_tokenization_spaces,
                    )

                    # Remove PADDING prompt of the sequence if XLNet or Transfo-XL model is used
                    if input_ids is None:
                        prompt_length = 0
                    else:
                        prompt_length = len(
                            self.tokenizer.decode(
                                input_ids[0],
                                skip_special_tokens=True,
                                clean_up_tokenization_spaces=clean_up_tokenization_spaces,
                            )
                        )

                    record["generated_text"] = prompt_text + text[prompt_length:]

                result.append(record)
            results += [result]

        if len(results) == 1:
            return results[0]

        return results
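
# Usage sketch (assumes the `pipeline()` factory defined later in this module
# and a downloadable checkpoint such as "gpt2"):
#
#     generator = pipeline("text-generation", model="gpt2")
#     generator("Once upon a time,", max_length=30)
#     # -> [{"generated_text": "Once upon a time, ..."}]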


class TextClassificationPipeline(Pipeline):
    """
    Text classification pipeline using ModelForSequenceClassification head. See the
    `sequence classification usage <../usage.html#sequence-classification>`__ examples for more information.

    This text classification pipeline can currently be loaded from the :func:`~transformers.pipeline` method using
    the following task identifier(s):

    - "sentiment-analysis", for classifying sequences according to positive or negative sentiments.

    The models that this pipeline can use are models that have been fine-tuned on a sequence classification task.
    See the up-to-date list of available models on
    `huggingface.co/models <https://huggingface.co/models?filter=text-classification>`__.

    Arguments:
        model (:obj:`~transformers.PreTrainedModel` or :obj:`~transformers.TFPreTrainedModel`):
            The model that will be used by the pipeline to make predictions. This needs to be a model inheriting from
            :class:`~transformers.PreTrainedModel` for PyTorch and :class:`~transformers.TFPreTrainedModel` for
            TensorFlow.
        tokenizer (:obj:`~transformers.PreTrainedTokenizer`):
            The tokenizer that will be used by the pipeline to encode data for the model. This object inherits from
            :class:`~transformers.PreTrainedTokenizer`.
        modelcard (:obj:`str` or :class:`~transformers.ModelCard`, `optional`, defaults to :obj:`None`):
            Model card attributed to the model for this pipeline.
        framework (:obj:`str`, `optional`, defaults to :obj:`None`):
            The framework to use, either "pt" for PyTorch or "tf" for TensorFlow. The specified framework must be
            installed.

            If no framework is specified, will default to the one currently installed. If no framework is specified
            and both frameworks are installed, will default to PyTorch.
        args_parser (:class:`~transformers.pipelines.ArgumentHandler`, `optional`, defaults to :obj:`None`):
            Reference to the object in charge of parsing supplied pipeline parameters.
        device (:obj:`int`, `optional`, defaults to :obj:`-1`):
            Device ordinal for CPU/GPU supports. Setting this to -1 will leverage CPU, >=0 will run the model
            on the associated CUDA device id.
    """

    def __init__(self, return_all_scores: bool = False, **kwargs):
        super().__init__(**kwargs)

        self.return_all_scores = return_all_scores

    def __call__(self, *args, **kwargs):
        outputs = super().__call__(*args, **kwargs)
        scores = np.exp(outputs) / np.exp(outputs).sum(-1, keepdims=True)
        if self.return_all_scores:
            return [
                [{"label": self.model.config.id2label[i], "score": score.item()} for i, score in enumerate(item)]
                for item in scores
            ]
        else:
            return [
                {"label": self.model.config.id2label[item.argmax()], "score": item.max().item()} for item in scores
            ]
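
# Usage sketch (assumes the `pipeline()` factory defined later in this module
# and its default English sentiment model):
#
#     classifier = pipeline("sentiment-analysis")
#     classifier("I love this movie!")
#     # -> [{"label": "POSITIVE", "score": 0.99...}]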


class FillMaskPipeline(Pipeline):
    """
    Masked language modeling prediction pipeline using ModelWithLMHead head. See the
    `masked language modeling usage <../usage.html#masked-language-modeling>`__ examples for more information.

    This mask filling pipeline can currently be loaded from the :func:`~transformers.pipeline` method using
    the following task identifier(s):

    - "fill-mask", for predicting masked tokens in a sequence.

    The models that this pipeline can use are models that have been trained with a masked language modeling objective,
    which includes the bi-directional models in the library.
    See the up-to-date list of available models on
    `huggingface.co/models <https://huggingface.co/models?filter=lm-head>`__.

    Arguments:
        model (:obj:`~transformers.PreTrainedModel` or :obj:`~transformers.TFPreTrainedModel`):
            The model that will be used by the pipeline to make predictions. This needs to be a model inheriting from
            :class:`~transformers.PreTrainedModel` for PyTorch and :class:`~transformers.TFPreTrainedModel` for
            TensorFlow.
        tokenizer (:obj:`~transformers.PreTrainedTokenizer`):
            The tokenizer that will be used by the pipeline to encode data for the model. This object inherits from
            :class:`~transformers.PreTrainedTokenizer`.
        modelcard (:obj:`str` or :class:`~transformers.ModelCard`, `optional`, defaults to :obj:`None`):
            Model card attributed to the model for this pipeline.
        framework (:obj:`str`, `optional`, defaults to :obj:`None`):
            The framework to use, either "pt" for PyTorch or "tf" for TensorFlow. The specified framework must be
            installed.

            If no framework is specified, will default to the one currently installed. If no framework is specified
            and both frameworks are installed, will default to PyTorch.
        args_parser (:class:`~transformers.pipelines.ArgumentHandler`, `optional`, defaults to :obj:`None`):
            Reference to the object in charge of parsing supplied pipeline parameters.
        device (:obj:`int`, `optional`, defaults to :obj:`-1`):
            Device ordinal for CPU/GPU supports. Setting this to -1 will leverage CPU, >=0 will run the model
            on the associated CUDA device id.
    """

    def __init__(
        self,
        model: Union["PreTrainedModel", "TFPreTrainedModel"],
        tokenizer: PreTrainedTokenizer,
        modelcard: Optional[ModelCard] = None,
        framework: Optional[str] = None,
        args_parser: ArgumentHandler = None,
        device: int = -1,
        topk=5,
        task: str = "",
    ):
        super().__init__(
            model=model,
            tokenizer=tokenizer,
            modelcard=modelcard,
            framework=framework,
            args_parser=args_parser,
            device=device,
            binary_output=True,
            task=task,
        )

        self.topk = topk

    def ensure_exactly_one_mask_token(self, masked_index: np.ndarray):
        numel = np.prod(masked_index.shape)
        if numel > 1:
            raise PipelineException(
                "fill-mask",
                self.model.base_model_prefix,
                f"More than one mask_token ({self.tokenizer.mask_token}) is not supported",
            )
        elif numel < 1:
            raise PipelineException(
                "fill-mask",
                self.model.base_model_prefix,
                f"No mask_token ({self.tokenizer.mask_token}) found on the input",
            )

    def __call__(self, *args, **kwargs):
        inputs = self._parse_and_tokenize(*args, **kwargs)
        outputs = self._forward(inputs, return_tensors=True)

        results = []
        batch_size = outputs.shape[0] if self.framework == "tf" else outputs.size(0)

        for i in range(batch_size):
            input_ids = inputs["input_ids"][i]
            result = []

            if self.framework == "tf":
                masked_index = tf.where(input_ids == self.tokenizer.mask_token_id).numpy()

                # Fill mask pipeline supports only one ${mask_token} per sample
                self.ensure_exactly_one_mask_token(masked_index)

                logits = outputs[i, masked_index.item(), :]
                probs = tf.nn.softmax(logits)
                topk = tf.math.top_k(probs, k=self.topk)
                values, predictions = topk.values.numpy(), topk.indices.numpy()
            else:
                masked_index = (input_ids == self.tokenizer.mask_token_id).nonzero()

                # Fill mask pipeline supports only one ${mask_token} per sample
                self.ensure_exactly_one_mask_token(masked_index.numpy())

                logits = outputs[i, masked_index.item(), :]
                probs = logits.softmax(dim=0)
                values, predictions = probs.topk(self.topk)

            for v, p in zip(values.tolist(), predictions.tolist()):
                tokens = input_ids.numpy()
                tokens[masked_index] = p
                # Filter padding out:
                tokens = tokens[np.where(tokens != self.tokenizer.pad_token_id)]
                result.append(
                    {
                        "sequence": self.tokenizer.decode(tokens),
                        "score": v,
                        "token": p,
                        "token_str": self.tokenizer.convert_ids_to_tokens(p),
                    }
                )

            # Append
            results += [result]

        if len(results) == 1:
            return results[0]
        return results
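
# Usage sketch (assumes the `pipeline()` factory defined later in this module
# and its default masked-LM checkpoint):
#
#     nlp = pipeline("fill-mask")
#     nlp(f"HuggingFace is creating a {nlp.tokenizer.mask_token} that the community uses.")
#     # -> topk list of {"sequence", "score", "token", "token_str"} dicts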


class TokenClassificationPipeline(Pipeline):
    """
    Named Entity Recognition pipeline using ModelForTokenClassification head. See the
    `named entity recognition usage <../usage.html#named-entity-recognition>`__ examples for more information.

    This token recognition pipeline can currently be loaded from the :func:`~transformers.pipeline` method using
    the following task identifier(s):

    - "ner", for predicting the classes of tokens in a sequence: person, organisation, location or miscellaneous.

    The models that this pipeline can use are models that have been fine-tuned on a token classification task.
    See the up-to-date list of available models on
    `huggingface.co/models <https://huggingface.co/models?filter=token-classification>`__.

    Arguments:
        model (:obj:`~transformers.PreTrainedModel` or :obj:`~transformers.TFPreTrainedModel`):
            The model that will be used by the pipeline to make predictions. This needs to be a model inheriting from
            :class:`~transformers.PreTrainedModel` for PyTorch and :class:`~transformers.TFPreTrainedModel` for
            TensorFlow.
        tokenizer (:obj:`~transformers.PreTrainedTokenizer`):
            The tokenizer that will be used by the pipeline to encode data for the model. This object inherits from
            :class:`~transformers.PreTrainedTokenizer`.
        modelcard (:obj:`str` or :class:`~transformers.ModelCard`, `optional`, defaults to :obj:`None`):
            Model card attributed to the model for this pipeline.
        framework (:obj:`str`, `optional`, defaults to :obj:`None`):
            The framework to use, either "pt" for PyTorch or "tf" for TensorFlow. The specified framework must be
            installed.

            If no framework is specified, will default to the one currently installed. If no framework is specified
            and both frameworks are installed, will default to PyTorch.
        args_parser (:class:`~transformers.pipelines.ArgumentHandler`, `optional`, defaults to :obj:`None`):
            Reference to the object in charge of parsing supplied pipeline parameters.
        device (:obj:`int`, `optional`, defaults to :obj:`-1`):
            Device ordinal for CPU/GPU supports. Setting this to -1 will leverage CPU, >=0 will run the model
            on the associated CUDA device id.
    """

    default_input_names = "sequences"

    def __init__(
        self,
        model: Union["PreTrainedModel", "TFPreTrainedModel"],
        tokenizer: PreTrainedTokenizer,
        modelcard: Optional[ModelCard] = None,
        framework: Optional[str] = None,
        args_parser: ArgumentHandler = None,
        device: int = -1,
        binary_output: bool = False,
        ignore_labels=["O"],
        task: str = "",
        grouped_entities: bool = False,
    ):
        super().__init__(
            model=model,
            tokenizer=tokenizer,
            modelcard=modelcard,
            framework=framework,
            args_parser=args_parser,
            device=device,
            binary_output=binary_output,
            task=task,
        )

        self._basic_tokenizer = BasicTokenizer(do_lower_case=False)
        self.ignore_labels = ignore_labels
        self.grouped_entities = grouped_entities

    def __call__(self, *args, **kwargs):
        inputs = self._args_parser(*args, **kwargs)
        answers = []
        for sentence in inputs:

            # Manage correct placement of the tensors
            with self.device_placement():

                tokens = self.tokenizer(
                    sentence, return_attention_mask=False, return_tensors=self.framework, truncation=True,
                )

                # Forward
                if self.framework == "tf":
                    entities = self.model(tokens.data)[0][0].numpy()
                    input_ids = tokens["input_ids"].numpy()[0]
                else:
                    with torch.no_grad():
                        tokens = self.ensure_tensor_on_device(**tokens)
                        entities = self.model(**tokens)[0][0].cpu().numpy()
                        input_ids = tokens["input_ids"].cpu().numpy()[0]

            score = np.exp(entities) / np.exp(entities).sum(-1, keepdims=True)
            labels_idx = score.argmax(axis=-1)

            entities = []
            entity_groups = []
            entity_group_disagg = []
            # Filter to labels not in `self.ignore_labels`
            filtered_labels_idx = [
                (idx, label_idx)
                for idx, label_idx in enumerate(labels_idx)
                if self.model.config.id2label[label_idx] not in self.ignore_labels
            ]

            for idx, label_idx in filtered_labels_idx:

                entity = {
                    "word": self.tokenizer.convert_ids_to_tokens(int(input_ids[idx])),
                    "score": score[idx][label_idx].item(),
                    "entity": self.model.config.id2label[label_idx],
                    "index": idx,
                }
                last_idx, _ = filtered_labels_idx[-1]
                if self.grouped_entities:
                    if not entity_group_disagg:
                        entity_group_disagg += [entity]
                        if idx == last_idx:
                            entity_groups += [self.group_entities(entity_group_disagg)]
                        continue

                    # If the current entity is similar and adjacent to the previous entity, append it to the disaggregated entity group
                    if (
                        entity["entity"] == entity_group_disagg[-1]["entity"]
                        and entity["index"] == entity_group_disagg[-1]["index"] + 1
                    ):
                        entity_group_disagg += [entity]
                        # Group the entities at the last entity
                        if idx == last_idx:
                            entity_groups += [self.group_entities(entity_group_disagg)]
                    # If the current entity is different from the previous entity, aggregate the disaggregated entity group
                    else:
                        entity_groups += [self.group_entities(entity_group_disagg)]
                        entity_group_disagg = [entity]

                entities += [entity]

            # Ensure if an entity is the latest one in the sequence it gets appended to the output
            if len(entity_group_disagg) > 0:
                entity_groups.append(self.group_entities(entity_group_disagg))

            # Append
            if self.grouped_entities:
                answers += [entity_groups]
            else:
                answers += [entities]

        if len(answers) == 1:
            return answers[0]
        return answers

    def group_entities(self, entities):
        """
        Returns grouped entities
        """
        # Get the last entity in the entity group
        entity = entities[-1]["entity"]
        scores = np.mean([entity["score"] for entity in entities])
        tokens = [entity["word"] for entity in entities]

        entity_group = {
            "entity_group": entity,
            "score": np.mean(scores),
            "word": self.tokenizer.convert_tokens_to_string(tokens),
        }
        return entity_group


NerPipeline = TokenClassificationPipeline
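
# Usage sketch (assumes the `pipeline()` factory defined later in this module;
# the example output is illustrative):
#
#     ner = pipeline("ner", grouped_entities=True)
#     ner("Hugging Face is based in New York City.")
#     # -> e.g. [{"entity_group": "ORG", "score": ..., "word": "Hugging Face"}, ...]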


class QuestionAnsweringArgumentHandler(ArgumentHandler):
    """
    QuestionAnsweringPipeline requires the user to provide multiple arguments (i.e. question & context) to be mapped
    to internal SquadExample / SquadFeature structures.

    QuestionAnsweringArgumentHandler manages all the possible ways to create a SquadExample from the command-line supplied
    arguments.
    """

    def __call__(self, *args, **kwargs):
        # Positional args: handling is essentially the same as X and data, so forward to avoid duplicating
        if args is not None and len(args) > 0:
            if len(args) == 1:
                kwargs["X"] = args[0]
            else:
                kwargs["X"] = list(args)

        # Generic compatibility with sklearn and Keras
        # Batched data
        if "X" in kwargs or "data" in kwargs:
            inputs = kwargs["X"] if "X" in kwargs else kwargs["data"]

            if isinstance(inputs, dict):
                inputs = [inputs]
            else:
                # Copy to avoid overriding arguments
                inputs = [i for i in inputs]

            for i, item in enumerate(inputs):
                if isinstance(item, dict):
                    if any(k not in item for k in ["question", "context"]):
                        raise KeyError("You need to provide a dictionary with keys {question:..., context:...}")

                    inputs[i] = QuestionAnsweringPipeline.create_sample(**item)

                elif not isinstance(item, SquadExample):
                    raise ValueError(
                        "{} argument needs to be of type (list[SquadExample | dict], SquadExample, dict)".format(
                            "X" if "X" in kwargs else "data"
                        )
                    )

            # Tabular input
        elif "question" in kwargs and "context" in kwargs:
            if isinstance(kwargs["question"], str):
                kwargs["question"] = [kwargs["question"]]

            if isinstance(kwargs["context"], str):
                kwargs["context"] = [kwargs["context"]]

            inputs = [
                QuestionAnsweringPipeline.create_sample(q, c) for q, c in zip(kwargs["question"], kwargs["context"])
            ]
        else:
            raise ValueError("Unknown arguments {}".format(kwargs))

        if not isinstance(inputs, list):
            inputs = [inputs]

        return inputs


class QuestionAnsweringPipeline(Pipeline):
    """
    Question Answering pipeline using ModelForQuestionAnswering head. See the
    `question answering usage <../usage.html#question-answering>`__ examples for more information.

    This question answering pipeline can currently be loaded from the :func:`~transformers.pipeline` method using
    the following task identifier(s):

    - "question-answering", for answering questions given a context.

    The models that this pipeline can use are models that have been fine-tuned on a question answering task.
    See the up-to-date list of available models on
    `huggingface.co/models <https://huggingface.co/models?filter=question-answering>`__.
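
    Usage sketch (illustrative; the exact answers and scores depend on the checkpoint)::

        nlp = pipeline("question-answering")
        nlp(question="What does this pipeline extract?", context="This pipeline extracts answer spans from a context.")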

    Arguments:
        model (:obj:`~transformers.PreTrainedModel` or :obj:`~transformers.TFPreTrainedModel`):
            The model that will be used by the pipeline to make predictions. This needs to be a model inheriting from
            :class:`~transformers.PreTrainedModel` for PyTorch and :class:`~transformers.TFPreTrainedModel` for
            TensorFlow.
        tokenizer (:obj:`~transformers.PreTrainedTokenizer`):
            The tokenizer that will be used by the pipeline to encode data for the model. This object inherits from
            :class:`~transformers.PreTrainedTokenizer`.
        modelcard (:obj:`str` or :class:`~transformers.ModelCard`, `optional`, defaults to :obj:`None`):
            Model card attributed to the model for this pipeline.
        framework (:obj:`str`, `optional`, defaults to :obj:`None`):
            The framework to use, either "pt" for PyTorch or "tf" for TensorFlow. The specified framework must be
            installed.

            If no framework is specified, will default to the one currently installed. If no framework is specified
            and both frameworks are installed, will default to PyTorch.
        args_parser (:class:`~transformers.pipelines.ArgumentHandler`, `optional`, defaults to :obj:`None`):
            Reference to the object in charge of parsing supplied pipeline parameters.
        device (:obj:`int`, `optional`, defaults to :obj:`-1`):
            Device ordinal for CPU/GPU supports. Setting this to -1 will leverage CPU, >=0 will run the model
            on the associated CUDA device id.
    """

    default_input_names = "question,context"

    def __init__(
        self,
        model: Union["PreTrainedModel", "TFPreTrainedModel"],
        tokenizer: PreTrainedTokenizer,
        modelcard: Optional[ModelCard] = None,
        framework: Optional[str] = None,
        device: int = -1,
        task: str = "",
        **kwargs
    ):
        super().__init__(
            model=model,
            tokenizer=tokenizer,
            modelcard=modelcard,
            framework=framework,
            args_parser=QuestionAnsweringArgumentHandler(),
            device=device,
            task=task,
            **kwargs,
        )

    @staticmethod
    def create_sample(
        question: Union[str, List[str]], context: Union[str, List[str]]
    ) -> Union[SquadExample, List[SquadExample]]:
        """
        QuestionAnsweringPipeline leverages the SquadExample/SquadFeatures internally.
        This helper method encapsulates all the logic for converting question(s) and context(s) to SquadExample(s).
        We currently support extractive question answering.
        Arguments:
            question: (str, List[str]) The question(s) to ask about the associated context
            context: (str, List[str]) The context(s) in which to look for the answer.

        Returns:
            SquadExample initialized with the corresponding question and context.
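
        Example (illustrative)::

            QuestionAnsweringPipeline.create_sample(
                question=["Who?", "When?"], context=["First context.", "Second context."]
            )  # -> a list of two SquadExample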
        """
        if isinstance(question, list):
            return [SquadExample(None, q, c, None, None, None) for q, c in zip(question, context)]
        else:
            return SquadExample(None, question, context, None, None, None)

    def __call__(self, *args, **kwargs):
        """
        Args:
            We support multiple use-cases, the following being mutually exclusive:
            X: sequence of SquadExample
            data: sequence of SquadExample
            question: (str, List[str]), batch of question(s) to map along with context
            context: (str, List[str]), batch of context(s) associated with the provided question keyword argument
        Returns:
            dict: {'answer': str, 'score': float, 'start': int, 'end': int}
            answer: the textual answer in the initial context
            score: the probability the model assigns to the answer
            start: the character index in the original string corresponding to the beginning of the answer's span
            end: the character index in the original string corresponding to the end of the answer's span
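
        Example (illustrative; the actual score depends on the checkpoint)::

            nlp = pipeline("question-answering")
            nlp(question="Who maintains this library?", context="This library is maintained by Hugging Face.")
            # -> dict with keys "answer", "score", "start", "end"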
        """
        # Set default values
        kwargs.setdefault("topk", 1)
        kwargs.setdefault("doc_stride", 128)
        kwargs.setdefault("max_answer_len", 15)
        kwargs.setdefault("max_seq_len", 384)
        kwargs.setdefault("max_question_len", 64)
        kwargs.setdefault("handle_impossible_answer", False)

        if kwargs["topk"] < 1:
            raise ValueError("topk parameter should be >= 1 (got {})".format(kwargs["topk"]))

        if kwargs["max_answer_len"] < 1:
            raise ValueError("max_answer_len parameter should be >= 1 (got {})".format(kwargs["max_answer_len"]))

        # Convert inputs to features
        examples = self._args_parser(*args, **kwargs)
        features_list = [
            squad_convert_examples_to_features(
                examples=[example],
                tokenizer=self.tokenizer,
                max_seq_length=kwargs["max_seq_len"],
                doc_stride=kwargs["doc_stride"],
                max_query_length=kwargs["max_question_len"],
                is_training=False,
                tqdm_enabled=False,
            )
            for example in examples
        ]
        all_answers = []
        for features, example in zip(features_list, examples):
            model_input_names = self.tokenizer.model_input_names + ["input_ids"]
            fw_args = {k: [feature.__dict__[k] for feature in features] for k in model_input_names}

            # Manage tensor allocation on correct device
            with self.device_placement():
                if self.framework == "tf":
                    fw_args = {k: tf.constant(v) for (k, v) in fw_args.items()}
                    start, end = self.model(fw_args)
                    start, end = start.numpy(), end.numpy()
                else:
                    with torch.no_grad():
                        # Retrieve the score for the context tokens only (removing question tokens)
                        fw_args = {k: torch.tensor(v, device=self.device) for (k, v) in fw_args.items()}
                        start, end = self.model(**fw_args)
                        start, end = start.cpu().numpy(), end.cpu().numpy()

            min_null_score = 1000000  # large and positive
            answers = []
            for (feature, start_, end_) in zip(features, start, end):
                # Normalize logits and spans to retrieve the answer
                start_ = np.exp(start_) / np.sum(np.exp(start_))
                end_ = np.exp(end_) / np.sum(np.exp(end_))

                # Mask padding and question
                start_, end_ = (
                    start_ * np.abs(np.array(feature.p_mask) - 1),
                    end_ * np.abs(np.array(feature.p_mask) - 1),
                )

                if kwargs["handle_impossible_answer"]:
                    min_null_score = min(min_null_score, (start_[0] * end_[0]).item())

                # Mask the CLS token so it cannot be picked as part of the answer span
                start_[0] = end_[0] = 0

                starts, ends, scores = self.decode(start_, end_, kwargs["topk"], kwargs["max_answer_len"])
                char_to_word = np.array(example.char_to_word_offset)

                # Convert the answer (tokens) back to the original text
                answers += [
                    {
                        "score": score.item(),
                        "start": np.where(char_to_word == feature.token_to_orig_map[s])[0][0].item(),
                        "end": np.where(char_to_word == feature.token_to_orig_map[e])[0][-1].item(),
                        "answer": " ".join(
                            example.doc_tokens[feature.token_to_orig_map[s] : feature.token_to_orig_map[e] + 1]
                        ),
                    }
                    for s, e, score in zip(starts, ends, scores)
                ]

            if kwargs["handle_impossible_answer"]:
                answers.append({"score": min_null_score, "start": 0, "end": 0, "answer": ""})

            answers = sorted(answers, key=lambda x: x["score"], reverse=True)[: kwargs["topk"]]
            all_answers += answers

        if len(all_answers) == 1:
            return all_answers[0]
        return all_answers

    def decode(self, start: np.ndarray, end: np.ndarray, topk: int, max_answer_len: int) -> Tuple:
        """
        Takes the output of any QuestionAnswering head and generates probabilities for each span to be
        the actual answer.
        In addition, it filters out some unwanted/impossible cases, such as answers longer than
        max_answer_len or answers whose end position comes before their start position.
        The method supports outputting the k best answers through the topk argument.

        Args:
            start: numpy array, holding individual start probabilities for each token
            end: numpy array, holding individual end probabilities for each token
            topk: int, indicates how many possible answer span(s) to extract from the model's output
            max_answer_len: int, maximum size of the answer to extract from the model's output
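
        Example (illustrative numbers)::

            start = np.array([0.1, 0.6, 0.3])
            end = np.array([0.2, 0.1, 0.7])
            # outer[i, j] = start[i] * end[j]; only spans with i <= j < i + max_answer_len are kept,
            # so decode picks span (1, 2) with score 0.6 * 0.7 = 0.42 when topk == 1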
        """
        # Ensure we have batch axis
        if start.ndim == 1:
            start = start[None]

        if end.ndim == 1:
            end = end[None]

        # Compute the score of each tuple(start, end) to be the real answer
        outer = np.matmul(np.expand_dims(start, -1), np.expand_dims(end, 1))

        # Remove candidates with end < start or with a span longer than max_answer_len
        candidates = np.tril(np.triu(outer), max_answer_len - 1)

        # Inspired by Chen et al. (https://github.com/facebookresearch/DrQA)
        scores_flat = candidates.flatten()
        if topk == 1:
            idx_sort = [np.argmax(scores_flat)]
        elif len(scores_flat) < topk:
            idx_sort = np.argsort(-scores_flat)
        else:
            idx = np.argpartition(-scores_flat, topk)[0:topk]
            idx_sort = idx[np.argsort(-scores_flat[idx])]

        start, end = np.unravel_index(idx_sort, candidates.shape)[1:]
        return start, end, candidates[0, start, end]

    def span_to_answer(self, text: str, start: int, end: int):
        """
        When decoding from token probabilities, this method maps token indexes back to actual words in
        the initial context.

        Args:
            text: str, the actual context to extract the answer from
            start: int, starting answer token index
            end: int, ending answer token index

        Returns:
            dict: {'answer': str, 'start': int, 'end': int}
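
        Example (illustrative; assumes each word maps to exactly one token)::

            nlp.span_to_answer("the quick brown fox", start=1, end=2)
            # -> {"answer": "quick brown", "start": 4, "end": 15}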
        """
        words = []
        token_idx = char_start_idx = char_end_idx = chars_idx = 0

        for i, word in enumerate(text.split(" ")):
            token = self.tokenizer.tokenize(word)

            # Append words if they are in the span
            if start <= token_idx <= end:
                if token_idx == start:
                    char_start_idx = chars_idx

                if token_idx == end:
                    char_end_idx = chars_idx + len(word)

                words += [word]

            # Stop if we went over the end of the answer
            if token_idx > end:
                break

            # Append the subtokenization length to the running index
            token_idx += len(token)
            chars_idx += len(word) + 1

        # Join text with spaces
        return {
            "answer": " ".join(words),
            "start": max(0, char_start_idx),
            "end": min(len(text), char_end_idx),
        }


class SummarizationPipeline(Pipeline):
    """
    Summarize news articles and other documents

    Usage::

        # use bart in pytorch
        summarizer = pipeline("summarization")
        summarizer("Sam Shleifer writes the best docstring examples in the whole world.", min_length=5, max_length=20)

        # use t5 in tf
        summarizer = pipeline("summarization", model="t5-base", tokenizer="t5-base", framework="tf")
        summarizer("Sam Shleifer writes the best docstring examples in the whole world.", min_length=5, max_length=20)

    The models that this pipeline can use are models that have been fine-tuned on a summarization task,
    currently '`bart-large-cnn`', '`t5-small`', '`t5-base`', '`t5-large`', '`t5-3b`', '`t5-11b`'.
    See the up-to-date list of available models on
    `huggingface.co/models <https://huggingface.co/models?filter=summarization>`__.

    Arguments:
        model (:obj:`str` or :obj:`~transformers.PreTrainedModel` or :obj:`~transformers.TFPreTrainedModel`, `optional`, defaults to :obj:`None`):
            The model that will be used by the pipeline to make predictions. This can be :obj:`None`, a string
            checkpoint identifier or an actual pre-trained model inheriting from
            :class:`~transformers.PreTrainedModel` for PyTorch and :class:`~transformers.TFPreTrainedModel` for
            TensorFlow.

            If :obj:`None`, the default of the pipeline will be loaded.
        tokenizer (:obj:`str` or :obj:`~transformers.PreTrainedTokenizer`, `optional`, defaults to :obj:`None`):
            The tokenizer that will be used by the pipeline to encode data for the model. This can be :obj:`None`,
            a string checkpoint identifier or an actual pre-trained tokenizer inheriting from
            :class:`~transformers.PreTrainedTokenizer`.

            If :obj:`None`, the default of the pipeline will be loaded.
        modelcard (:obj:`str` or :class:`~transformers.ModelCard`, `optional`, defaults to :obj:`None`):
            Model card attributed to the model for this pipeline.
        framework (:obj:`str`, `optional`, defaults to :obj:`None`):
            The framework to use, either "pt" for PyTorch or "tf" for TensorFlow. The specified framework must be
            installed.

            If no framework is specified, will default to the one currently installed. If no framework is specified
            and both frameworks are installed, will default to PyTorch.
        args_parser (:class:`~transformers.pipelines.ArgumentHandler`, `optional`, defaults to :obj:`None`):
            Reference to the object in charge of parsing supplied pipeline parameters.
        device (:obj:`int`, `optional`, defaults to :obj:`-1`):
            Device ordinal for CPU/GPU supports. Setting this to -1 will leverage CPU, >=0 will run the model
            on the associated CUDA device id.
    """

    def __init__(self, **kwargs):
        kwargs.update(task="summarization")
        super().__init__(**kwargs)

    def __call__(
        self, *documents, return_tensors=False, return_text=True, clean_up_tokenization_spaces=False, **generate_kwargs
    ):
        r"""
        Args:
            *documents: (list of strings) articles to be summarized
            return_text: (bool, default=True) whether to add a decoded "summary_text" to each result
            return_tensors: (bool, default=False) whether to add the raw "summary_token_ids" to each result

            clean_up_tokenization_spaces: (`optional`) bool whether to clean up the potential extra spaces in the output
            **generate_kwargs: extra kwargs passed to `self.model.generate`_

        Returns:
            list of dicts with 'summary_text' and/or 'summary_token_ids' for each document_to_summarize

        .. _`self.model.generate`:
            https://huggingface.co/transformers/model_doc/bart.html#transformers.BartForConditionalGeneration.generate
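
        Example (illustrative; extra generate kwargs such as num_beams are forwarded as-is)::

            summarizer = pipeline("summarization")
            summarizer("A very long article ...", min_length=5, max_length=20, num_beams=4)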

        """
        assert return_tensors or return_text, "You must specify return_tensors=True or return_text=True"
        assert len(documents) > 0, "Please provide a document to summarize"

        if self.framework == "tf" and "BartForConditionalGeneration" in self.model.__class__.__name__:
            raise NotImplementedError(
                "TensorFlow is not yet supported for Bart. Please consider using T5, e.g. `t5-base`"
            )

        prefix = self.model.config.prefix if self.model.config.prefix is not None else ""

        if isinstance(documents[0], list):
            assert (
                self.tokenizer.pad_token_id is not None
            ), "Please make sure that the tokenizer has a pad_token_id when using a batch input"

            documents = ([prefix + document for document in documents[0]],)
            padding = True

        elif isinstance(documents[0], str):
            documents = (prefix + documents[0],)
            padding = False
        else:
            raise ValueError(
                "`documents[0]`: {} has the wrong format. It should be either of type `str` or of type `list`".format(
                    documents[0]
                )
            )

        with self.device_placement():
            inputs = self._parse_and_tokenize(*documents, padding=padding)

            if self.framework == "pt":
                inputs = self.ensure_tensor_on_device(**inputs)
                input_length = inputs["input_ids"].shape[-1]
            elif self.framework == "tf":
                input_length = tf.shape(inputs["input_ids"])[-1].numpy()

            min_length = generate_kwargs.get("min_length", self.model.config.min_length)
            if input_length < min_length // 2:
                logger.warning(
                    "Your min_length is set to {}, but your input_length is only {}. You might consider decreasing min_length manually, e.g. summarizer('...', min_length=10)".format(
                        min_length, input_length
                    )
                )

            max_length = generate_kwargs.get("max_length", self.model.config.max_length)
            if input_length < max_length:
                logger.warning(
                    "Your max_length is set to {}, but your input_length is only {}. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=50)".format(
                        max_length, input_length
                    )
                )

            summaries = self.model.generate(
                inputs["input_ids"], attention_mask=inputs["attention_mask"], **generate_kwargs,
            )

            results = []
            for summary in summaries:
                record = {}
                if return_tensors:
                    record["summary_token_ids"] = summary
                if return_text:
                    record["summary_text"] = self.tokenizer.decode(
                        summary, skip_special_tokens=True, clean_up_tokenization_spaces=clean_up_tokenization_spaces,
                    )
                results.append(record)
            return results


class TranslationPipeline(Pipeline):
    """
    Translates from one language to another.

    Usage::

        en_fr_translator = pipeline("translation_en_to_fr")
        en_fr_translator("How old are you?")

    The models that this pipeline can use are models that have been fine-tuned on a translation task,
    currently: "t5-small", "t5-base", "t5-large", "t5-3b", "t5-11b".
    See the up-to-date list of available models on
    `huggingface.co/models <https://huggingface.co/models?filter=translation>`__.

    Arguments:
        model (:obj:`str` or :obj:`~transformers.PreTrainedModel` or :obj:`~transformers.TFPreTrainedModel`, `optional`, defaults to :obj:`None`):
            The model that will be used by the pipeline to make predictions. This can be :obj:`None`, a string
            checkpoint identifier or an actual pre-trained model inheriting from
            :class:`~transformers.PreTrainedModel` for PyTorch and :class:`~transformers.TFPreTrainedModel` for
            TensorFlow.
            If :obj:`None`, the default of the pipeline will be loaded.
        tokenizer (:obj:`str` or :obj:`~transformers.PreTrainedTokenizer`, `optional`, defaults to :obj:`None`):
            The tokenizer that will be used by the pipeline to encode data for the model. This can be :obj:`None`,
            a string checkpoint identifier or an actual pre-trained tokenizer inheriting from
            :class:`~transformers.PreTrainedTokenizer`.
            If :obj:`None`, the default of the pipeline will be loaded.
        modelcard (:obj:`str` or :class:`~transformers.ModelCard`, `optional`, defaults to :obj:`None`):
            Model card attributed to the model for this pipeline.
        framework (:obj:`str`, `optional`, defaults to :obj:`None`):
            The framework to use, either "pt" for PyTorch or "tf" for TensorFlow. The specified framework must be
            installed.
            If no framework is specified, will default to the one currently installed. If no framework is specified
            and both frameworks are installed, will default to PyTorch.
        args_parser (:class:`~transformers.pipelines.ArgumentHandler`, `optional`, defaults to :obj:`None`):
            Reference to the object in charge of parsing supplied pipeline parameters.
        device (:obj:`int`, `optional`, defaults to :obj:`-1`):
            Device ordinal for CPU/GPU supports. Setting this to -1 will leverage CPU, >=0 will run the model
            on the associated CUDA device id.
    """

    def __call__(
        self, *args, return_tensors=False, return_text=True, clean_up_tokenization_spaces=False, **generate_kwargs
    ):
        r"""
        Args:
            *args: (list of strings) texts to be translated
            return_text: (bool, default=True) whether to add a decoded "translation_text" to each result
            return_tensors: (bool, default=False) whether to add the raw "translation_token_ids" to each result

            **generate_kwargs: extra kwargs passed to `self.model.generate`_

        Returns:
            list of dicts with 'translation_text' and/or 'translation_token_ids' for each text_to_translate

        .. _`self.model.generate`:
            https://huggingface.co/transformers/model_doc/bart.html#transformers.BartForConditionalGeneration.generate
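
        Example (illustrative)::

            en_fr_translator = pipeline("translation_en_to_fr")
            en_fr_translator("How old are you?", max_length=40)
            # -> [{"translation_text": "..."}]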
        """
        assert return_tensors or return_text, "You must specify return_tensors=True or return_text=True"

        prefix = self.model.config.prefix if self.model.config.prefix is not None else ""

        if isinstance(args[0], list):
            assert (
                self.tokenizer.pad_token_id is not None
            ), "Please make sure that the tokenizer has a pad_token_id when using a batch input"
            args = ([prefix + text for text in args[0]],)
            padding = True

        elif isinstance(args[0], str):
            args = (prefix + args[0],)
            padding = False
        else:
            raise ValueError(
                "`args[0]`: {} has the wrong format. It should be either of type `str` or of type `list`".format(
                    args[0]
                )
            )

        with self.device_placement():
            inputs = self._parse_and_tokenize(*args, padding=padding)

            if self.framework == "pt":
                inputs = self.ensure_tensor_on_device(**inputs)
                input_length = inputs["input_ids"].shape[-1]

            elif self.framework == "tf":
                input_length = tf.shape(inputs["input_ids"])[-1].numpy()

            max_length = generate_kwargs.get("max_length", self.model.config.max_length)
            if input_length > 0.9 * max_length:
                logger.warning(
                    "Your input_length: {} is bigger than 0.9 * max_length: {}. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)".format(
                        input_length, max_length
                    )
                )

            translations = self.model.generate(
                inputs["input_ids"], attention_mask=inputs["attention_mask"], **generate_kwargs,
            )
            results = []
            for translation in translations:
                record = {}
                if return_tensors:
                    record["translation_token_ids"] = translation
                if return_text:
                    record["translation_text"] = self.tokenizer.decode(
                        translation,
                        skip_special_tokens=True,
                        clean_up_tokenization_spaces=clean_up_tokenization_spaces,
                    )
                results.append(record)
            return results


# Register all the supported tasks here
SUPPORTED_TASKS = {
    "feature-extraction": {
        "impl": FeatureExtractionPipeline,
        "tf": TFAutoModel if is_tf_available() else None,
        "pt": AutoModel if is_torch_available() else None,
        "default": {"model": {"pt": "distilbert-base-cased", "tf": "distilbert-base-cased"}},
    },
    "sentiment-analysis": {
        "impl": TextClassificationPipeline,
        "tf": TFAutoModelForSequenceClassification if is_tf_available() else None,
        "pt": AutoModelForSequenceClassification if is_torch_available() else None,
        "default": {
            "model": {
                "pt": "distilbert-base-uncased-finetuned-sst-2-english",
                "tf": "distilbert-base-uncased-finetuned-sst-2-english",
            },
        },
    },
    "ner": {
        "impl": TokenClassificationPipeline,
        "tf": TFAutoModelForTokenClassification if is_tf_available() else None,
        "pt": AutoModelForTokenClassification if is_torch_available() else None,
        "default": {
            "model": {
                "pt": "dbmdz/bert-large-cased-finetuned-conll03-english",
                "tf": "dbmdz/bert-large-cased-finetuned-conll03-english",
            },
        },
    },
    "question-answering": {
        "impl": QuestionAnsweringPipeline,
        "tf": TFAutoModelForQuestionAnswering if is_tf_available() else None,
        "pt": AutoModelForQuestionAnswering if is_torch_available() else None,
        "default": {
            "model": {"pt": "distilbert-base-cased-distilled-squad", "tf": "distilbert-base-cased-distilled-squad"},
        },
    },
    "fill-mask": {
        "impl": FillMaskPipeline,
        "tf": TFAutoModelWithLMHead if is_tf_available() else None,
        "pt": AutoModelWithLMHead if is_torch_available() else None,
        "default": {"model": {"pt": "distilroberta-base", "tf": "distilroberta-base"}},
    },
    "summarization": {
        "impl": SummarizationPipeline,
        "tf": TFAutoModelWithLMHead if is_tf_available() else None,
        "pt": AutoModelForSeq2SeqLM if is_torch_available() else None,
        "default": {"model": {"pt": "sshleifer/distilbart-cnn-12-6", "tf": "t5-small"}},
    },
    "translation_en_to_fr": {
        "impl": TranslationPipeline,
        "tf": TFAutoModelWithLMHead if is_tf_available() else None,
        "pt": AutoModelWithLMHead if is_torch_available() else None,
        "default": {"model": {"pt": "t5-base", "tf": "t5-base"}},
    },
    "translation_en_to_de": {
        "impl": TranslationPipeline,
        "tf": TFAutoModelWithLMHead if is_tf_available() else None,
        "pt": AutoModelWithLMHead if is_torch_available() else None,
        "default": {"model": {"pt": "t5-base", "tf": "t5-base"}},
    },
    "translation_en_to_ro": {
        "impl": TranslationPipeline,
        "tf": TFAutoModelWithLMHead if is_tf_available() else None,
        "pt": AutoModelWithLMHead if is_torch_available() else None,
        "default": {"model": {"pt": "t5-base", "tf": "t5-base"}},
    },
    "text-generation": {
        "impl": TextGenerationPipeline,
        "tf": TFAutoModelWithLMHead if is_tf_available() else None,
        "pt": AutoModelWithLMHead if is_torch_available() else None,
        "default": {"model": {"pt": "gpt2", "tf": "gpt2"}},
    },
}
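
# Illustrative lookups (not executed; shapes follow the registry above): each entry maps a
# task name to its implementation class, per-framework auto-model classes and a default checkpoint.
#
#     SUPPORTED_TASKS["ner"]["impl"]                    # -> TokenClassificationPipeline
#     SUPPORTED_TASKS["ner"]["default"]["model"]["pt"]  # -> "dbmdz/bert-large-cased-finetuned-conll03-english"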


def pipeline(
    task: str,
    model: Optional[Union[str, "PreTrainedModel", "TFPreTrainedModel"]] = None,
    config: Optional[Union[str, PretrainedConfig]] = None,
    tokenizer: Optional[Union[str, PreTrainedTokenizer]] = None,
    framework: Optional[str] = None,
    **kwargs
) -> Pipeline:
    """
    Utility factory method to build a pipeline.

    Pipelines are made of:

        - A Tokenizer instance in charge of mapping raw textual input to tokens
        - A Model instance
        - Some (optional) post processing for enhancing model's output


    Args:
        task (:obj:`str`):
            The task defining which pipeline will be returned. Currently accepted tasks are:

            - "feature-extraction": will return a :class:`~transformers.FeatureExtractionPipeline`
            - "sentiment-analysis": will return a :class:`~transformers.TextClassificationPipeline`
            - "ner": will return a :class:`~transformers.TokenClassificationPipeline`
            - "question-answering": will return a :class:`~transformers.QuestionAnsweringPipeline`
            - "fill-mask": will return a :class:`~transformers.FillMaskPipeline`
            - "summarization": will return a :class:`~transformers.SummarizationPipeline`
            - "translation_xx_to_yy": will return a :class:`~transformers.TranslationPipeline`
            - "text-generation": will return a :class:`~transformers.TextGenerationPipeline`
        model (:obj:`str` or :obj:`~transformers.PreTrainedModel` or :obj:`~transformers.TFPreTrainedModel`, `optional`, defaults to :obj:`None`):
            The model that will be used by the pipeline to make predictions. This can be :obj:`None`,
            a model identifier or an actual pre-trained model inheriting from
            :class:`~transformers.PreTrainedModel` for PyTorch and :class:`~transformers.TFPreTrainedModel` for
            TensorFlow.

            If :obj:`None`, the default for this pipeline will be loaded.
        config (:obj:`str` or :obj:`~transformers.PretrainedConfig`, `optional`, defaults to :obj:`None`):
            The configuration that will be used by the pipeline to instantiate the model. This can be :obj:`None`,
            a model identifier or an actual pre-trained model configuration inheriting from
            :class:`~transformers.PretrainedConfig`.

            If :obj:`None`, the default for this pipeline will be loaded.
        tokenizer (:obj:`str` or :obj:`~transformers.PreTrainedTokenizer`, `optional`, defaults to :obj:`None`):
            The tokenizer that will be used by the pipeline to encode data for the model. This can be :obj:`None`,
            a model identifier or an actual pre-trained tokenizer inheriting from
            :class:`~transformers.PreTrainedTokenizer`.

            If :obj:`None`, the default for this pipeline will be loaded.
        framework (:obj:`str`, `optional`, defaults to :obj:`None`):
            The framework to use, either "pt" for PyTorch or "tf" for TensorFlow. The specified framework must be
            installed.

            If no framework is specified, will default to the one currently installed. If no framework is specified
            and both frameworks are installed, will default to PyTorch.

    Returns:
        :class:`~transformers.Pipeline`: Class inheriting from :class:`~transformers.Pipeline`, according to
        the task.

    Examples::

        from transformers import pipeline, AutoModelForTokenClassification, AutoTokenizer

        # Sentiment analysis pipeline
        pipeline('sentiment-analysis')

        # Question answering pipeline, specifying the checkpoint identifier
        pipeline('question-answering', model='distilbert-base-cased-distilled-squad', tokenizer='bert-base-cased')

        # Named entity recognition pipeline, passing in a specific model and tokenizer
        model = AutoModelForTokenClassification.from_pretrained("dbmdz/bert-large-cased-finetuned-conll03-english")
        tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
        pipeline('ner', model=model, tokenizer=tokenizer)
    """
    # Retrieve the task
    if task not in SUPPORTED_TASKS:
        raise KeyError("Unknown task {}, available tasks are {}".format(task, list(SUPPORTED_TASKS.keys())))

    framework = framework or get_framework(model)

    targeted_task = SUPPORTED_TASKS[task]
    task_class, model_class = targeted_task["impl"], targeted_task[framework]

    # Use default model/config/tokenizer for the task if no model is provided
    if model is None:
        model = targeted_task["default"]["model"][framework]

    # Try to infer tokenizer from model or config name (if provided as str)
    if tokenizer is None:
        if isinstance(model, str):
            tokenizer = model
        elif isinstance(config, str):
            tokenizer = config
        else:
            # Impossible to guess what the right tokenizer is here
            raise Exception(
                "Impossible to guess which tokenizer to use. "
                "Please provide a PreTrainedTokenizer class or a path/identifier to a pretrained tokenizer."
            )

    modelcard = None
    # Try to infer modelcard from model or config name (if provided as str)
    if isinstance(model, str):
        modelcard = model
    elif isinstance(config, str):
        modelcard = config

    # Instantiate tokenizer if needed
    if isinstance(tokenizer, (str, tuple)):
        if isinstance(tokenizer, tuple):
            # For tuple we have (tokenizer name, {kwargs})
            tokenizer = AutoTokenizer.from_pretrained(tokenizer[0], **tokenizer[1])
        else:
            tokenizer = AutoTokenizer.from_pretrained(tokenizer)

    # Instantiate config if needed
    if isinstance(config, str):
        config = AutoConfig.from_pretrained(config)

    # Instantiate modelcard if needed
    if isinstance(modelcard, str):
        modelcard = ModelCard.from_pretrained(modelcard)

    # Instantiate model if needed
    if isinstance(model, str):
        # Handle transparent TF/PT model conversion
        model_kwargs = {}
        if framework == "pt" and model.endswith(".h5"):
            model_kwargs["from_tf"] = True
            logger.warning(
                "Model might be a TensorFlow model (ending with `.h5`). "
                "Trying to load the model with PyTorch."
            )
        elif framework == "tf" and model.endswith(".bin"):
            model_kwargs["from_pt"] = True
            logger.warning(
                "Model might be a PyTorch model (ending with `.bin`). "
                "Trying to load the model with TensorFlow."
            )
        model = model_class.from_pretrained(model, config=config, **model_kwargs)

    return task_class(model=model, tokenizer=tokenizer, modelcard=modelcard, framework=framework, task=task, **kwargs)