# coding=utf-8
# Copyright 2018 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import csv
import json
import logging
import os
import pickle
import sys
from abc import ABC, abstractmethod
from contextlib import contextmanager
from os.path import abspath, exists
from typing import List, Optional, Tuple, Union

import numpy as np

from .configuration_auto import ALL_PRETRAINED_CONFIG_ARCHIVE_MAP, AutoConfig
from .configuration_utils import PretrainedConfig
from .data import SquadExample, squad_convert_examples_to_features
from .file_utils import is_tf_available, is_torch_available
from .modelcard import ModelCard
from .tokenization_auto import AutoTokenizer
from .tokenization_bert import BasicTokenizer
from .tokenization_utils import PreTrainedTokenizer


if is_tf_available():
    import tensorflow as tf
    from .modeling_tf_auto import (
        TFAutoModel,
        TFAutoModelForSequenceClassification,
        TFAutoModelForQuestionAnswering,
        TFAutoModelForTokenClassification,
        TFAutoModelWithLMHead,
    )

if is_torch_available():
    import torch
    from .modeling_auto import (
        AutoModel,
        AutoModelForSequenceClassification,
        AutoModelForQuestionAnswering,
        AutoModelForTokenClassification,
        AutoModelWithLMHead,
    )


logger = logging.getLogger(__name__)


def get_framework(model=None):
    """ Select framework (TensorFlow/PyTorch) to use.
        If both frameworks are installed and no specific model is provided, defaults to using PyTorch.
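
        Example (an illustrative sketch; `tf_model` stands for any instantiated TF model)::

            get_framework()                   # "pt" if PyTorch is installed, else "tf"
            get_framework("bert-base-cased")  # string identifiers follow the same default logic
            get_framework(tf_model)           # "tf" when the model's class name starts with "TF"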
    """
    if is_tf_available() and is_torch_available() and model is not None and not isinstance(model, str):
        # Both frameworks are available but the user supplied a model class instance.
        # Try to guess which framework to use from the model classname
        framework = "tf" if model.__class__.__name__.startswith("TF") else "pt"
    elif not is_tf_available() and not is_torch_available():
        raise RuntimeError(
            "At least one of TensorFlow 2.0 or PyTorch should be installed. "
            "To install TensorFlow 2.0, read the instructions at https://www.tensorflow.org/install/ "
            "To install PyTorch, read the instructions at https://pytorch.org/."
        )
    else:
        # framework = 'tf' if is_tf_available() else 'pt'
        framework = "pt" if is_torch_available() else "tf"
    return framework


class ArgumentHandler(ABC):
    """
    Base interface for handling varargs for each Pipeline
    """

    @abstractmethod
    def __call__(self, *args, **kwargs):
        raise NotImplementedError()


class DefaultArgumentHandler(ArgumentHandler):
    """
    Default varargs argument parser handling parameters for each Pipeline
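
    For example (illustrative, with `handler = DefaultArgumentHandler()`): `handler("hello")` and
    `handler(["hello"])` both normalize to `["hello"]`, while `handler(X=data)` or `handler(data=data)`
    returns `data` unchanged.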
    """

    def __call__(self, *args, **kwargs):
        if "X" in kwargs:
            return kwargs["X"]
        elif "data" in kwargs:
            return kwargs["data"]
        elif len(args) == 1:
            if isinstance(args[0], list):
                return args[0]
            else:
                return [args[0]]
        elif len(args) > 1:
            return list(args)
        raise ValueError("Unable to infer the format of the provided data (X=, data=, ...)")


class PipelineDataFormat:
    """
    Base class for all the data formats supported by pipelines, both for reading and writing.
    Supported data formats currently include:
     - JSON
     - CSV
     - stdin/stdout (pipe)

    PipelineDataFormat also includes some utilities to work with multi-column data, like mapping dataset
    columns to pipeline keyword arguments through the `dataset_kwarg_1=dataset_column_1` format.
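
    For example (an illustrative sketch; file names are placeholders), reading a CSV whose `q` and
    `ctx` columns should be mapped onto the `question` and `context` pipeline keyword arguments::

        data_format = PipelineDataFormat.from_str(
            "csv", output_path="out.csv", input_path="in.csv", column="question=q,context=ctx",
        )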
    """

    SUPPORTED_FORMATS = ["json", "csv", "pipe"]

    def __init__(
        self, output_path: Optional[str], input_path: Optional[str], column: Optional[str], overwrite=False,
    ):
        self.output_path = output_path
        self.input_path = input_path
        self.column = column.split(",") if column is not None else [""]
        self.is_multi_columns = len(self.column) > 1

        if self.is_multi_columns:
            self.column = [tuple(c.split("=")) if "=" in c else (c, c) for c in self.column]

        if output_path is not None and not overwrite:
            if exists(abspath(self.output_path)):
                raise OSError("{} already exists on disk".format(self.output_path))

        if input_path is not None:
            if not exists(abspath(self.input_path)):
                raise OSError("{} doesn't exist on disk".format(self.input_path))

    @abstractmethod
    def __iter__(self):
        raise NotImplementedError()

    @abstractmethod
    def save(self, data: dict):
        """
        Save the provided data object with the representation for the current `DataFormat`.
        :param data: data to store
        :return:
        """
        raise NotImplementedError()

    def save_binary(self, data: Union[dict, List[dict]]) -> str:
        """
        Save the provided data object as pickle-formatted binary data on disk.
        :param data: data to store
        :return: (str) Path where the data has been saved
        """
        path, _ = os.path.splitext(self.output_path)
        binary_path = os.path.extsep.join((path, "pickle"))

        with open(binary_path, "wb+") as f_output:
            pickle.dump(data, f_output)

        return binary_path

    @staticmethod
    def from_str(
        format: str, output_path: Optional[str], input_path: Optional[str], column: Optional[str], overwrite=False,
    ):
        if format == "json":
            return JsonPipelineDataFormat(output_path, input_path, column, overwrite=overwrite)
        elif format == "csv":
            return CsvPipelineDataFormat(output_path, input_path, column, overwrite=overwrite)
        elif format == "pipe":
            return PipedPipelineDataFormat(output_path, input_path, column, overwrite=overwrite)
        else:
            raise KeyError("Unknown reader {} (Available readers are json/csv/pipe)".format(format))


class CsvPipelineDataFormat(PipelineDataFormat):
    def __init__(
        self, output_path: Optional[str], input_path: Optional[str], column: Optional[str], overwrite=False,
    ):
        super().__init__(output_path, input_path, column, overwrite=overwrite)

    def __iter__(self):
        with open(self.input_path, "r") as f:
            reader = csv.DictReader(f)
            for row in reader:
                if self.is_multi_columns:
                    yield {k: row[c] for k, c in self.column}
                else:
                    yield row[self.column[0]]

    def save(self, data: List[dict]):
        with open(self.output_path, "w") as f:
            if len(data) > 0:
                writer = csv.DictWriter(f, list(data[0].keys()))
                writer.writeheader()
                writer.writerows(data)


class JsonPipelineDataFormat(PipelineDataFormat):
    def __init__(
        self, output_path: Optional[str], input_path: Optional[str], column: Optional[str], overwrite=False,
    ):
        super().__init__(output_path, input_path, column, overwrite=overwrite)

        with open(input_path, "r") as f:
            self._entries = json.load(f)

    def __iter__(self):
        for entry in self._entries:
            if self.is_multi_columns:
                yield {k: entry[c] for k, c in self.column}
            else:
                yield entry[self.column[0]]

    def save(self, data: dict):
        with open(self.output_path, "w") as f:
            json.dump(data, f)


class PipedPipelineDataFormat(PipelineDataFormat):
    """
    Read data from piped input to the python process.
    For multi-column data, columns should be separated by \t

    If columns are provided, then the output will be a dictionary with {column_x: value_x}
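
    For example (illustrative), piping the line "What is this?\tSome context" with
    column="question,context" yields {"question": "What is this?", "context": "Some context"}.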
    """

    def __iter__(self):
        for line in sys.stdin:
            # Split for multi-columns
            if "\t" in line:

                line = line.split("\t")
                if self.column:
                    # Dictionary to map arguments
                    yield {kwargs: l for (kwargs, _), l in zip(self.column, line)}
                else:
                    yield tuple(line)

            # No dictionary to map arguments
            else:
                yield line

    def save(self, data: dict):
        print(data)

    def save_binary(self, data: Union[dict, List[dict]]) -> str:
        if self.output_path is None:
            raise KeyError(
                "When using piped input, a pipeline outputting a large object requires an output file path. "
                "Please provide such an output path through the --output argument."
            )

        return super().save_binary(data)


class _ScikitCompat(ABC):
    """
    Interface layer for Scikit-learn and Keras compatibility.
    """

    @abstractmethod
    def transform(self, X):
        raise NotImplementedError()

    @abstractmethod
    def predict(self, X):
        raise NotImplementedError()


class Pipeline(_ScikitCompat):
    """
    The Pipeline class is the class from which all pipelines inherit. Refer to this class for methods shared across
    different pipelines.

    Base class implementing pipelined operations.
    Pipeline workflow is defined as a sequence of the following operations:
        Input -> Tokenization -> Model Inference -> Post-Processing (Task dependent) -> Output

    Pipeline supports running on CPU or GPU through the device argument. Users can specify the
    device argument as an integer: -1 means "CPU", while >= 0 refers to the CUDA device ordinal.

    Some pipelines, like for instance FeatureExtractionPipeline ('feature-extraction'), output large
    tensor objects as nested lists. In order to avoid dumping such a large structure as textual data, we
    provide the binary_output constructor argument. If set to True, the output will be stored in
    pickle format.
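
    Usage::

        # A minimal sketch: the task name is resolved by the `pipeline` factory defined in this module
        nlp = pipeline("sentiment-analysis", device=0)  # device=0 runs on the first CUDA device, -1 on CPU
        nlp("This is a test")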

    Arguments:
        model (:obj:`~transformers.PreTrainedModel` or :obj:`~transformers.TFPreTrainedModel`):
            The model that will be used by the pipeline to make predictions. This needs to be a model inheriting from
304
            :class:`~transformers.PreTrainedModel` for PyTorch and :class:`~transformers.TFPreTrainedModel` for
            TensorFlow.
        tokenizer (:obj:`~transformers.PreTrainedTokenizer`):
            The tokenizer that will be used by the pipeline to encode data for the model. This object inherits from
            :class:`~transformers.PreTrainedTokenizer`.
        modelcard (:obj:`str` or :class:`~transformers.ModelCard`, `optional`, defaults to :obj:`None`):
            Model card attributed to the model for this pipeline.
        framework (:obj:`str`, `optional`, defaults to :obj:`None`):
            The framework to use, either "pt" for PyTorch or "tf" for TensorFlow. The specified framework must be
            installed.

            If no framework is specified, will default to the one currently installed. If no framework is specified
            and both frameworks are installed, will default to PyTorch.
        args_parser (:class:`~transformers.pipelines.ArgumentHandler`, `optional`, defaults to :obj:`None`):
            Reference to the object in charge of parsing supplied pipeline parameters.
        device (:obj:`int`, `optional`, defaults to :obj:`-1`):
            Device ordinal for CPU/GPU support. Setting this to -1 will leverage the CPU, >=0 will run the model
            on the associated CUDA device id.
        binary_output (:obj:`bool`, `optional`, defaults to :obj:`False`):
            Flag indicating whether the output of the pipeline should be in a binary format (i.e. pickle) or as raw text.

    Return:
        :obj:`List` or :obj:`Dict`:
        Pipeline returns a list or a dictionary depending on:

         - Whether the user supplied multiple samples
         - Whether the pipeline exposes multiple fields in the output object
    """

    default_input_names = None

    def __init__(
        self,
        model: Union["PreTrainedModel", "TFPreTrainedModel"],
        tokenizer: PreTrainedTokenizer,
        modelcard: Optional[ModelCard] = None,
        framework: Optional[str] = None,
        task: str = "",
        args_parser: ArgumentHandler = None,
        device: int = -1,
        binary_output: bool = False,
    ):

        if framework is None:
            framework = get_framework()

        self.model = model
        self.tokenizer = tokenizer
        self.modelcard = modelcard
        self.framework = framework
        self.device = device if framework == "tf" else torch.device("cpu" if device < 0 else "cuda:{}".format(device))
        self.binary_output = binary_output
        self._args_parser = args_parser or DefaultArgumentHandler()

        # Special handling
        if self.framework == "pt" and self.device.type == "cuda":
            self.model = self.model.to(self.device)

        # Update config with task specific parameters
        task_specific_params = self.model.config.task_specific_params
        if task_specific_params is not None and task in task_specific_params:
            self.model.config.update(task_specific_params.get(task))

    def save_pretrained(self, save_directory):
        """
        Save the pipeline's model and tokenizer to the specified save_directory
        """
        if not os.path.isdir(save_directory):
            logger.error("Provided path ({}) should be a directory".format(save_directory))
            return

        self.model.save_pretrained(save_directory)
        self.tokenizer.save_pretrained(save_directory)
        if self.modelcard is not None:
            self.modelcard.save_pretrained(save_directory)

    def transform(self, X):
        """
        Scikit / Keras interface to transformers' pipelines. This method will forward to __call__().
        """
        return self(X=X)

    def predict(self, X):
        """
        Scikit / Keras interface to transformers' pipelines. This method will forward to __call__().
        """
        return self(X=X)

    @contextmanager
    def device_placement(self):
        """
        Context Manager allowing tensor allocation on the user-specified device in a framework-agnostic way.
        example:
            # Explicitly ask for tensor allocation on CUDA device :0
            nlp = pipeline(..., device=0)
            with nlp.device_placement():
                # Every framework specific tensor allocation will be done on the requested device
                output = nlp(...)
        Returns:
            Context manager
        """
        if self.framework == "tf":
            with tf.device("/CPU:0" if self.device == -1 else "/device:GPU:{}".format(self.device)):
                yield
        else:
            if self.device.type == "cuda":
                torch.cuda.set_device(self.device)

            yield

    def ensure_tensor_on_device(self, **inputs):
        """
        Ensure PyTorch tensors are on the specified device.
        :param inputs:
        :return:
        """
        return {name: tensor.to(self.device) for name, tensor in inputs.items()}

    def _parse_and_tokenize(self, *texts, pad_to_max_length=False, **kwargs):
        """
        Parse arguments and tokenize
        """
        # Parse arguments
        inputs = self._args_parser(*texts, **kwargs)
        inputs = self.tokenizer.batch_encode_plus(
            inputs, add_special_tokens=True, return_tensors=self.framework, pad_to_max_length=pad_to_max_length,
        )

        return inputs

    def __call__(self, *texts, **kwargs):
        inputs = self._parse_and_tokenize(*texts, **kwargs)
        return self._forward(inputs)

    def _forward(self, inputs, return_tensors=False):
        """
        Internal framework specific forward dispatching.
        Args:
            inputs: dict holding all the keyword arguments required by the model forward method.
            return_tensors: Whether to return native framework (pt/tf) tensors rather than numpy arrays.
        Returns:
            Numpy array
        """
        # Encode for forward
        with self.device_placement():
            if self.framework == "tf":
                # TODO trace model
                predictions = self.model(inputs.data, training=False)[0]
            else:
                with torch.no_grad():
                    inputs = self.ensure_tensor_on_device(**inputs)
                    predictions = self.model(**inputs)[0].cpu()

        if return_tensors:
            return predictions
        else:
            return predictions.numpy()


class FeatureExtractionPipeline(Pipeline):
    """
    Feature extraction pipeline using no model head. This pipeline extracts the hidden states from the base transformer,
    which can be used as features in downstream tasks.

    This feature extraction pipeline can currently be loaded from the :func:`~transformers.pipeline` method using
    the following task identifier(s):

    - "feature-extraction", for extracting features of a sequence.

    All models may be used for this pipeline. See a list of all models, including community-contributed models on
    `huggingface.co/models <https://huggingface.co/models>`__.
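
    Usage::

        # A minimal sketch: returns the hidden states as nested lists, one vector per token
        nlp = pipeline("feature-extraction")
        features = nlp("Transformers pipelines extract features.")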

    Arguments:
        model (:obj:`~transformers.PreTrainedModel` or :obj:`~transformers.TFPreTrainedModel`):
            The model that will be used by the pipeline to make predictions. This needs to be a model inheriting from
            :class:`~transformers.PreTrainedModel` for PyTorch and :class:`~transformers.TFPreTrainedModel` for
            TensorFlow.
        tokenizer (:obj:`~transformers.PreTrainedTokenizer`):
            The tokenizer that will be used by the pipeline to encode data for the model. This object inherits from
            :class:`~transformers.PreTrainedTokenizer`.
        modelcard (:obj:`str` or :class:`~transformers.ModelCard`, `optional`, defaults to :obj:`None`):
            Model card attributed to the model for this pipeline.
        framework (:obj:`str`, `optional`, defaults to :obj:`None`):
            The framework to use, either "pt" for PyTorch or "tf" for TensorFlow. The specified framework must be
            installed.

            If no framework is specified, will default to the one currently installed. If no framework is specified
            and both frameworks are installed, will default to PyTorch.
        args_parser (:class:`~transformers.pipelines.ArgumentHandler`, `optional`, defaults to :obj:`None`):
            Reference to the object in charge of parsing supplied pipeline parameters.
        device (:obj:`int`, `optional`, defaults to :obj:`-1`):
            Device ordinal for CPU/GPU support. Setting this to -1 will leverage the CPU, >=0 will run the model
            on the associated CUDA device id.
    """

    def __init__(
        self,
        model: Union["PreTrainedModel", "TFPreTrainedModel"],
        tokenizer: PreTrainedTokenizer,
        modelcard: Optional[ModelCard] = None,
        framework: Optional[str] = None,
        args_parser: ArgumentHandler = None,
        device: int = -1,
        task: str = "",
    ):
        super().__init__(
            model=model,
            tokenizer=tokenizer,
            modelcard=modelcard,
            framework=framework,
            args_parser=args_parser,
            device=device,
            binary_output=True,
            task=task,
        )

    def __call__(self, *args, **kwargs):
        return super().__call__(*args, **kwargs).tolist()


class TextClassificationPipeline(Pipeline):
    """
    Text classification pipeline using ModelForSequenceClassification head. See the
    `sequence classification usage <../usage.html#sequence-classification>`__ examples for more information.

    This text classification pipeline can currently be loaded from the :func:`~transformers.pipeline` method using
    the following task identifier(s):

    - "sentiment-analysis", for classifying sequences according to positive or negative sentiments.

    The models that this pipeline can use are models that have been fine-tuned on a sequence classification task.
    See the up-to-date list of available models on
    `huggingface.co/models <https://huggingface.co/models?filter=text-classification>`__.
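
    Usage::

        # A minimal sketch: the default model for the task is resolved by the `pipeline` factory
        nlp = pipeline("sentiment-analysis")
        nlp("Pipelines are very easy to use!")  # e.g. [{'label': 'POSITIVE', 'score': ...}]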

    Arguments:
        model (:obj:`~transformers.PreTrainedModel` or :obj:`~transformers.TFPreTrainedModel`):
            The model that will be used by the pipeline to make predictions. This needs to be a model inheriting from
            :class:`~transformers.PreTrainedModel` for PyTorch and :class:`~transformers.TFPreTrainedModel` for
            TensorFlow.
        tokenizer (:obj:`~transformers.PreTrainedTokenizer`):
            The tokenizer that will be used by the pipeline to encode data for the model. This object inherits from
            :class:`~transformers.PreTrainedTokenizer`.
        modelcard (:obj:`str` or :class:`~transformers.ModelCard`, `optional`, defaults to :obj:`None`):
            Model card attributed to the model for this pipeline.
        framework (:obj:`str`, `optional`, defaults to :obj:`None`):
            The framework to use, either "pt" for PyTorch or "tf" for TensorFlow. The specified framework must be
            installed.

            If no framework is specified, will default to the one currently installed. If no framework is specified
            and both frameworks are installed, will default to PyTorch.
        args_parser (:class:`~transformers.pipelines.ArgumentHandler`, `optional`, defaults to :obj:`None`):
            Reference to the object in charge of parsing supplied pipeline parameters.
        device (:obj:`int`, `optional`, defaults to :obj:`-1`):
            Device ordinal for CPU/GPU support. Setting this to -1 will leverage the CPU, >=0 will run the model
            on the associated CUDA device id.
    """

    def __call__(self, *args, **kwargs):
        outputs = super().__call__(*args, **kwargs)
        scores = np.exp(outputs) / np.exp(outputs).sum(-1, keepdims=True)
        return [{"label": self.model.config.id2label[item.argmax()], "score": item.max()} for item in scores]


class FillMaskPipeline(Pipeline):
    """
    Masked language modeling prediction pipeline using ModelWithLMHead head. See the
    `masked language modeling usage <../usage.html#masked-language-modeling>`__ examples for more information.

    This mask filling pipeline can currently be loaded from the :func:`~transformers.pipeline` method using
    the following task identifier(s):

    - "fill-mask", for predicting masked tokens in a sequence.

    The models that this pipeline can use are models that have been trained with a masked language modeling objective,
    which includes the bi-directional models in the library.
    See the up-to-date list of available models on
    `huggingface.co/models <https://huggingface.co/models?filter=lm-head>`__.
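
    Usage::

        # A minimal sketch: the mask token depends on the tokenizer (e.g. "[MASK]" for BERT-like models)
        nlp = pipeline("fill-mask")
        nlp("The goal of life is {}.".format(nlp.tokenizer.mask_token))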

    Arguments:
        model (:obj:`~transformers.PreTrainedModel` or :obj:`~transformers.TFPreTrainedModel`):
            The model that will be used by the pipeline to make predictions. This needs to be a model inheriting from
            :class:`~transformers.PreTrainedModel` for PyTorch and :class:`~transformers.TFPreTrainedModel` for
            TensorFlow.
        tokenizer (:obj:`~transformers.PreTrainedTokenizer`):
            The tokenizer that will be used by the pipeline to encode data for the model. This object inherits from
            :class:`~transformers.PreTrainedTokenizer`.
        modelcard (:obj:`str` or :class:`~transformers.ModelCard`, `optional`, defaults to :obj:`None`):
            Model card attributed to the model for this pipeline.
        framework (:obj:`str`, `optional`, defaults to :obj:`None`):
            The framework to use, either "pt" for PyTorch or "tf" for TensorFlow. The specified framework must be
            installed.

            If no framework is specified, will default to the one currently installed. If no framework is specified
            and both frameworks are installed, will default to PyTorch.
        args_parser (:class:`~transformers.pipelines.ArgumentHandler`, `optional`, defaults to :obj:`None`):
            Reference to the object in charge of parsing supplied pipeline parameters.
        device (:obj:`int`, `optional`, defaults to :obj:`-1`):
            Device ordinal for CPU/GPU support. Setting this to -1 will leverage the CPU, >=0 will run the model
            on the associated CUDA device id.
    """

    def __init__(
        self,
        model: Union["PreTrainedModel", "TFPreTrainedModel"],
        tokenizer: PreTrainedTokenizer,
        modelcard: Optional[ModelCard] = None,
        framework: Optional[str] = None,
        args_parser: ArgumentHandler = None,
        device: int = -1,
        topk=5,
        task: str = "",
    ):
        super().__init__(
            model=model,
            tokenizer=tokenizer,
            modelcard=modelcard,
            framework=framework,
            args_parser=args_parser,
            device=device,
            binary_output=True,
            task=task,
        )

        self.topk = topk

    def __call__(self, *args, **kwargs):
        inputs = self._parse_and_tokenize(*args, **kwargs)
        outputs = self._forward(inputs, return_tensors=True)

        results = []
        batch_size = outputs.shape[0] if self.framework == "tf" else outputs.size(0)

        for i in range(batch_size):
            input_ids = inputs["input_ids"][i]
            result = []

            if self.framework == "tf":
                masked_index = tf.where(input_ids == self.tokenizer.mask_token_id).numpy().item()
                logits = outputs[i, masked_index, :]
                probs = tf.nn.softmax(logits)
                topk = tf.math.top_k(probs, k=self.topk)
                values, predictions = topk.values.numpy(), topk.indices.numpy()
            else:
                masked_index = (input_ids == self.tokenizer.mask_token_id).nonzero().item()
                logits = outputs[i, masked_index, :]
                probs = logits.softmax(dim=0)
                values, predictions = probs.topk(self.topk)

            for v, p in zip(values.tolist(), predictions.tolist()):
                tokens = input_ids.numpy()
                tokens[masked_index] = p
                # Filter padding out:
                tokens = tokens[np.where(tokens != self.tokenizer.pad_token_id)]
                result.append({"sequence": self.tokenizer.decode(tokens), "score": v, "token": p})

            # Append
            results += [result]

        if len(results) == 1:
            return results[0]
        return results


class NerPipeline(Pipeline):
    """
    Named Entity Recognition pipeline using ModelForTokenClassification head. See the
    `named entity recognition usage <../usage.html#named-entity-recognition>`__ examples for more information.

    This token recognition pipeline can currently be loaded from the :func:`~transformers.pipeline` method using
    the following task identifier(s):

    - "ner", for predicting the classes of tokens in a sequence: person, organisation, location or miscellaneous.

    The models that this pipeline can use are models that have been fine-tuned on a token classification task.
    See the up-to-date list of available models on
    `huggingface.co/models <https://huggingface.co/models?filter=token-classification>`__.
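
    Usage::

        # A minimal sketch: the entity label set depends on the underlying fine-tuned model
        nlp = pipeline("ner")
        nlp("Hugging Face is based in New York City.")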

    Arguments:
        model (:obj:`~transformers.PreTrainedModel` or :obj:`~transformers.TFPreTrainedModel`):
            The model that will be used by the pipeline to make predictions. This needs to be a model inheriting from
            :class:`~transformers.PreTrainedModel` for PyTorch and :class:`~transformers.TFPreTrainedModel` for
            TensorFlow.
        tokenizer (:obj:`~transformers.PreTrainedTokenizer`):
            The tokenizer that will be used by the pipeline to encode data for the model. This object inherits from
            :class:`~transformers.PreTrainedTokenizer`.
        modelcard (:obj:`str` or :class:`~transformers.ModelCard`, `optional`, defaults to :obj:`None`):
            Model card attributed to the model for this pipeline.
        framework (:obj:`str`, `optional`, defaults to :obj:`None`):
            The framework to use, either "pt" for PyTorch or "tf" for TensorFlow. The specified framework must be
            installed.

            If no framework is specified, will default to the one currently installed. If no framework is specified
            and both frameworks are installed, will default to PyTorch.
        args_parser (:class:`~transformers.pipelines.ArgumentHandler`, `optional`, defaults to :obj:`None`):
            Reference to the object in charge of parsing supplied pipeline parameters.
        device (:obj:`int`, `optional`, defaults to :obj:`-1`):
            Device ordinal for CPU/GPU support. Setting this to -1 will leverage the CPU, >=0 will run the model
            on the associated CUDA device id.
    """

    default_input_names = "sequences"

    def __init__(
        self,
        model: Union["PreTrainedModel", "TFPreTrainedModel"],
        tokenizer: PreTrainedTokenizer,
        modelcard: Optional[ModelCard] = None,
        framework: Optional[str] = None,
        args_parser: ArgumentHandler = None,
        device: int = -1,
        binary_output: bool = False,
        ignore_labels=["O"],
        task: str = "",
    ):
        super().__init__(
            model=model,
            tokenizer=tokenizer,
            modelcard=modelcard,
            framework=framework,
            args_parser=args_parser,
            device=device,
            binary_output=binary_output,
            task=task,
        )

        self._basic_tokenizer = BasicTokenizer(do_lower_case=False)
        self.ignore_labels = ignore_labels

    def __call__(self, *texts, **kwargs):
        inputs = self._args_parser(*texts, **kwargs)
        answers = []
        for sentence in inputs:

            # Manage correct placement of the tensors
            with self.device_placement():

                tokens = self.tokenizer.encode_plus(
                    sentence,
                    return_attention_mask=False,
                    return_tensors=self.framework,
                    max_length=self.tokenizer.max_len,
                )

                # Forward
                if self.framework == "tf":
                    entities = self.model(tokens.data)[0][0].numpy()
                    input_ids = tokens["input_ids"].numpy()[0]
                else:
                    with torch.no_grad():
                        tokens = self.ensure_tensor_on_device(**tokens)
                        entities = self.model(**tokens)[0][0].cpu().numpy()
                        input_ids = tokens["input_ids"].cpu().numpy()[0]

            score = np.exp(entities) / np.exp(entities).sum(-1, keepdims=True)
            labels_idx = score.argmax(axis=-1)

            answer = []
            for idx, label_idx in enumerate(labels_idx):
                if self.model.config.id2label[label_idx] not in self.ignore_labels:
                    answer += [
                        {
                            "word": self.tokenizer.convert_ids_to_tokens(int(input_ids[idx])),
                            "score": score[idx][label_idx].item(),
                            "entity": self.model.config.id2label[label_idx],
                        }
                    ]

            # Append
            answers += [answer]
        if len(answers) == 1:
            return answers[0]
        return answers


TokenClassificationPipeline = NerPipeline


class QuestionAnsweringArgumentHandler(ArgumentHandler):
    """
    QuestionAnsweringPipeline requires the user to provide multiple arguments (i.e. question & context) to be mapped
    to internal SquadExample / SquadFeature structures.

    QuestionAnsweringArgumentHandler manages all the possible ways to create a SquadExample from the command-line supplied
    arguments.
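
    For example, all of the following call forms are accepted (an illustrative sketch, with `nlp` a
    QuestionAnsweringPipeline instance)::

        nlp(question="...", context="...")
        nlp({"question": "...", "context": "..."})
        nlp(X=[{"question": "...", "context": "..."}])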
    """

    def __call__(self, *args, **kwargs):
        # Positional args: handling is essentially the same as X and data, so forward to avoid duplicating
        if args is not None and len(args) > 0:
            if len(args) == 1:
                kwargs["X"] = args[0]
            else:
                kwargs["X"] = list(args)

        # Generic compatibility with sklearn and Keras
        # Batched data
        if "X" in kwargs or "data" in kwargs:
            inputs = kwargs["X"] if "X" in kwargs else kwargs["data"]

            if isinstance(inputs, dict):
                inputs = [inputs]
            else:
                # Copy to avoid overriding arguments
                inputs = [i for i in inputs]

            for i, item in enumerate(inputs):
                if isinstance(item, dict):
                    if any(k not in item for k in ["question", "context"]):
                        raise KeyError("You need to provide a dictionary with keys {question:..., context:...}")

                    inputs[i] = QuestionAnsweringPipeline.create_sample(**item)

                elif not isinstance(item, SquadExample):
                    raise ValueError(
                        "{} argument needs to be of type (list[SquadExample | dict], SquadExample, dict)".format(
                            "X" if "X" in kwargs else "data"
                        )
                    )

            # Tabular input
        elif "question" in kwargs and "context" in kwargs:
            if isinstance(kwargs["question"], str):
                kwargs["question"] = [kwargs["question"]]

            if isinstance(kwargs["context"], str):
                kwargs["context"] = [kwargs["context"]]

            inputs = [
                QuestionAnsweringPipeline.create_sample(q, c) for q, c in zip(kwargs["question"], kwargs["context"])
            ]
        else:
            raise ValueError("Unknown arguments {}".format(kwargs))

        if not isinstance(inputs, list):
            inputs = [inputs]

        return inputs


class QuestionAnsweringPipeline(Pipeline):
    """
    Question Answering pipeline using ModelForQuestionAnswering head. See the
    `question answering usage <../usage.html#question-answering>`__ examples for more information.

    This question answering pipeline can currently be loaded from the :func:`~transformers.pipeline` method using
    the following task identifier(s):

    - "question-answering", for answering questions given a context.

    The models that this pipeline can use are models that have been fine-tuned on a question answering task.
    See the up-to-date list of available models on
    `huggingface.co/models <https://huggingface.co/models?filter=question-answering>`__.
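
    Usage::

        # A minimal sketch
        nlp = pipeline("question-answering")
        nlp(question="Where is Hugging Face based?", context="Hugging Face is based in New York City.")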

    Arguments:
        model (:obj:`~transformers.PreTrainedModel` or :obj:`~transformers.TFPreTrainedModel`):
            The model that will be used by the pipeline to make predictions. This needs to be a model inheriting from
            :class:`~transformers.PreTrainedModel` for PyTorch and :class:`~transformers.TFPreTrainedModel` for
            TensorFlow.
        tokenizer (:obj:`~transformers.PreTrainedTokenizer`):
            The tokenizer that will be used by the pipeline to encode data for the model. This object inherits from
            :class:`~transformers.PreTrainedTokenizer`.
        modelcard (:obj:`str` or :class:`~transformers.ModelCard`, `optional`, defaults to :obj:`None`):
            Model card attributed to the model for this pipeline.
        framework (:obj:`str`, `optional`, defaults to :obj:`None`):
            The framework to use, either "pt" for PyTorch or "tf" for TensorFlow. The specified framework must be
            installed.

            If no framework is specified, will default to the one currently installed. If no framework is specified
            and both frameworks are installed, will default to PyTorch.
        args_parser (:class:`~transformers.pipelines.ArgumentHandler`, `optional`, defaults to :obj:`None`):
            Reference to the object in charge of parsing supplied pipeline parameters.
        device (:obj:`int`, `optional`, defaults to :obj:`-1`):
            Device ordinal for CPU/GPU support. Setting this to -1 will leverage the CPU, >=0 will run the model
            on the associated CUDA device id.
    """

    default_input_names = "question,context"

    def __init__(
        self,
        model: Union["PreTrainedModel", "TFPreTrainedModel"],
        tokenizer: PreTrainedTokenizer,
        modelcard: Optional[ModelCard] = None,
        framework: Optional[str] = None,
        device: int = -1,
        task: str = "",
        **kwargs
    ):
        super().__init__(
            model=model,
            tokenizer=tokenizer,
            modelcard=modelcard,
            framework=framework,
            args_parser=QuestionAnsweringArgumentHandler(),
            device=device,
            task=task,
            **kwargs,
        )

    @staticmethod
    def create_sample(
        question: Union[str, List[str]], context: Union[str, List[str]]
    ) -> Union[SquadExample, List[SquadExample]]:
        """
        QuestionAnsweringPipeline leverages SquadExample/SquadFeatures internally.
        This helper method encapsulates all the logic for converting question(s) and context(s) to SquadExample(s).
        We currently support extractive question answering.
        Arguments:
             question: (str, List[str]) The question(s) to ask of the associated context(s)
             context: (str, List[str]) The context in which we will look for the answer.

        Returns:
            SquadExample initialized with the corresponding question and context.
        """
        if isinstance(question, list):
            return [SquadExample(None, q, c, None, None, None) for q, c in zip(question, context)]
        else:
            return SquadExample(None, question, context, None, None, None)

    def __call__(self, *texts, **kwargs):
        """
        Args:
            We support multiple use-cases; the following are exclusive:
            X: sequence of SquadExample
            data: sequence of SquadExample
            question: (str, List[str]), batch of question(s) to map along with context
            context: (str, List[str]), batch of context(s) associated with the provided question keyword argument
        Returns:
            dict: {'answer': str, 'score': float, 'start': int, 'end': int}
            answer: the textual answer in the initial context
            score: the probability assigned to the answer by the model
            start: the character index in the original string corresponding to the beginning of the answer's span
            end: the character index in the original string corresponding to the end of the answer's span
        """
        # Set default values
        kwargs.setdefault("topk", 1)
        kwargs.setdefault("doc_stride", 128)
        kwargs.setdefault("max_answer_len", 15)
        kwargs.setdefault("max_seq_len", 384)
        kwargs.setdefault("max_question_len", 64)
        kwargs.setdefault("handle_impossible_answer", False)

        if kwargs["topk"] < 1:
            raise ValueError("topk parameter should be >= 1 (got {})".format(kwargs["topk"]))

        if kwargs["max_answer_len"] < 1:
            raise ValueError("max_answer_len parameter should be >= 1 (got {})".format(kwargs["max_answer_len"]))

        # Convert inputs to features
        examples = self._args_parser(*texts, **kwargs)
        features_list = [
            squad_convert_examples_to_features(
                [example],
                self.tokenizer,
                kwargs["max_seq_len"],
                kwargs["doc_stride"],
                kwargs["max_question_len"],
                False,
            )
            for example in examples
        ]
        all_answers = []
        for features, example in zip(features_list, examples):
            model_input_names = self.tokenizer.model_input_names + ["input_ids"]
            fw_args = {k: [feature.__dict__[k] for feature in features] for k in model_input_names}

            # Manage tensor allocation on correct device
            with self.device_placement():
                if self.framework == "tf":
                    fw_args = {k: tf.constant(v) for (k, v) in fw_args.items()}
                    start, end = self.model(fw_args)
                    start, end = start.numpy(), end.numpy()
                else:
                    with torch.no_grad():
                        # Retrieve the score for the context tokens only (removing question tokens)
                        fw_args = {k: torch.tensor(v, device=self.device) for (k, v) in fw_args.items()}
                        start, end = self.model(**fw_args)
                        start, end = start.cpu().numpy(), end.cpu().numpy()

            min_null_score = 1000000  # large and positive
            answers = []
            for (feature, start_, end_) in zip(features, start, end):
                # Normalize logits and spans to retrieve the answer
                start_ = np.exp(start_) / np.sum(np.exp(start_))
                end_ = np.exp(end_) / np.sum(np.exp(end_))

                # Mask padding and question
                start_, end_ = (
                    start_ * np.abs(np.array(feature.p_mask) - 1),
                    end_ * np.abs(np.array(feature.p_mask) - 1),
                )

                if kwargs["handle_impossible_answer"]:
                    min_null_score = min(min_null_score, (start_[0] * end_[0]).item())

                start_[0] = end_[0] = 0

                starts, ends, scores = self.decode(start_, end_, kwargs["topk"], kwargs["max_answer_len"])
                char_to_word = np.array(example.char_to_word_offset)

                # Convert the answer (tokens) back to the original text
                answers += [
                    {
                        "score": score.item(),
                        "start": np.where(char_to_word == feature.token_to_orig_map[s])[0][0].item(),
                        "end": np.where(char_to_word == feature.token_to_orig_map[e])[0][-1].item(),
                        "answer": " ".join(
                            example.doc_tokens[feature.token_to_orig_map[s] : feature.token_to_orig_map[e] + 1]
                        ),
                    }
                    for s, e, score in zip(starts, ends, scores)
                ]

            if kwargs["handle_impossible_answer"]:
                answers.append({"score": min_null_score, "start": 0, "end": 0, "answer": ""})

            answers = sorted(answers, key=lambda x: x["score"], reverse=True)[: kwargs["topk"]]
            all_answers += answers

        if len(all_answers) == 1:
            return all_answers[0]
        return all_answers

    def decode(self, start: np.ndarray, end: np.ndarray, topk: int, max_answer_len: int) -> Tuple:
        """
        Takes the output of any QuestionAnswering head and generates probabilities for each span to be
        the actual answer.
        In addition, it filters out some unwanted/impossible cases, like an answer length greater than
        max_answer_len or an answer end position before the start position.
        The method supports outputting the k-best answers through the topk argument.

        Args:
            start: numpy array, holding individual start probabilities for each token
            end: numpy array, holding individual end probabilities for each token
            topk: int, indicates how many possible answer span(s) to extract from the model's output
            max_answer_len: int, maximum size of the answer to extract from the model's output
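
        For example (a worked sketch): with start = [0.1, 0.7, 0.2] and end = [0.1, 0.2, 0.7], the outer
        product below scores the span (1, 2) at 0.7 * 0.7 = 0.49, while spans with end < start or longer
        than max_answer_len are zeroed out by the triu/tril masking.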
        """
        # Ensure we have batch axis
        if start.ndim == 1:
            start = start[None]

        if end.ndim == 1:
            end = end[None]

        # Compute the score of each tuple(start, end) to be the real answer
        outer = np.matmul(np.expand_dims(start, -1), np.expand_dims(end, 1))

        # Remove candidates with end < start or end - start > max_answer_len
        candidates = np.tril(np.triu(outer), max_answer_len - 1)

        # Inspired by Chen et al. (https://github.com/facebookresearch/DrQA)
        scores_flat = candidates.flatten()
        if topk == 1:
            idx_sort = [np.argmax(scores_flat)]
        elif len(scores_flat) < topk:
            idx_sort = np.argsort(-scores_flat)
        else:
            idx = np.argpartition(-scores_flat, topk)[0:topk]
            idx_sort = idx[np.argsort(-scores_flat[idx])]

        start, end = np.unravel_index(idx_sort, candidates.shape)[1:]
        return start, end, candidates[0, start, end]

    def span_to_answer(self, text: str, start: int, end: int):
        """
        When decoding from token probabilities, this method maps token indexes to the actual words in
        the initial context.

        Args:
            text: str, the actual context to extract the answer from
            start: int, starting answer token index
            end: int, ending answer token index

        Returns:
            dict: {'answer': str, 'start': int, 'end': int}
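
        Example (an illustrative sketch, assuming every word maps to exactly one token, which
        does not generally hold for subword tokenizers)::

            # span_to_answer("The quick brown fox", start=1, end=2)
            # -> {'answer': 'quick brown', 'start': 4, 'end': 15}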
        """
        words = []
        token_idx = char_start_idx = char_end_idx = chars_idx = 0

        for i, word in enumerate(text.split(" ")):
            token = self.tokenizer.tokenize(word)

            # Append words if they are in the span
            if start <= token_idx <= end:
                if token_idx == start:
                    char_start_idx = chars_idx

                if token_idx == end:
                    char_end_idx = chars_idx + len(word)

                words += [word]

            # Stop if we went over the end of the answer
            if token_idx > end:
                break

            # Append the subtokenization length to the running index
            token_idx += len(token)
            chars_idx += len(word) + 1

        # Join text with spaces
        return {
            "answer": " ".join(words),
            "start": max(0, char_start_idx),
            "end": min(len(text), char_end_idx),
        }


class SummarizationPipeline(Pipeline):
    """
    Summarize news articles and other documents

    Usage::

        # use bart in pytorch
        summarizer = pipeline("summarization")
        summarizer("Sam Shleifer writes the best docstring examples in the whole world.", min_length=5, max_length=20)

        # use t5 in tf
        summarizer = pipeline("summarization", model="t5-base", tokenizer="t5-base", framework="tf")
        summarizer("Sam Shleifer writes the best docstring examples in the whole world.", min_length=5, max_length=20)

    The models that this pipeline can use are models that have been fine-tuned on a summarization task,
    currently '`bart-large-cnn`', '`t5-small`', '`t5-base`', '`t5-large`', '`t5-3b`' and '`t5-11b`'.
    See the up-to-date list of available models on
    `huggingface.co/models <https://huggingface.co/models?filter=summarization>`__.

    Arguments:
        model (:obj:`str` or :obj:`~transformers.PreTrainedModel` or :obj:`~transformers.TFPreTrainedModel`, `optional`, defaults to :obj:`None`):
            The model that will be used by the pipeline to make predictions. This can be :obj:`None`, a string
            checkpoint identifier or an actual pre-trained model inheriting from
            :class:`~transformers.PreTrainedModel` for PyTorch and :class:`~transformers.TFPreTrainedModel` for
            TensorFlow.

            If :obj:`None`, the default of the pipeline will be loaded.
        tokenizer (:obj:`str` or :obj:`~transformers.PreTrainedTokenizer`, `optional`, defaults to :obj:`None`):
            The tokenizer that will be used by the pipeline to encode data for the model. This can be :obj:`None`,
            a string checkpoint identifier or an actual pre-trained tokenizer inheriting from
            :class:`~transformers.PreTrainedTokenizer`.

            If :obj:`None`, the default of the pipeline will be loaded.
        modelcard (:obj:`str` or :class:`~transformers.ModelCard`, `optional`, defaults to :obj:`None`):
            Model card attributed to the model for this pipeline.
        framework (:obj:`str`, `optional`, defaults to :obj:`None`):
            The framework to use, either "pt" for PyTorch or "tf" for TensorFlow. The specified framework must be
            installed.

            If no framework is specified, will default to the one currently installed. If no framework is specified
            and both frameworks are installed, will default to PyTorch.
        args_parser (:class:`~transformers.pipelines.ArgumentHandler`, `optional`, defaults to :obj:`None`):
            Reference to the object in charge of parsing supplied pipeline parameters.
        device (:obj:`int`, `optional`, defaults to :obj:`-1`):
            Device ordinal for CPU/GPU support. Setting this to -1 will leverage CPU, >=0 will run the model
            on the associated CUDA device id.
    """

    def __call__(
        self, *documents, return_tensors=False, return_text=True, clean_up_tokenization_spaces=False, **generate_kwargs
    ):
        r"""
        Args:
            *documents: (list of strings) articles to be summarized
            return_text: (bool, default=True) whether to add a decoded "summary_text" to each result
            return_tensors: (bool, default=False) whether to add the raw "summary_token_ids" to each result
            clean_up_tokenization_spaces: (bool, `optional`, default=False) whether to clean up the potential extra spaces in the decoded text
            **generate_kwargs: extra kwargs passed to `self.model.generate`_

        Returns:
            list of dicts with 'summary_text' and/or 'summary_token_ids' for each document_to_summarize

        .. _`self.model.generate`:
            https://huggingface.co/transformers/model_doc/bart.html#transformers.BartForConditionalGeneration.generate

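        Example (an illustrative sketch; the outputs shown are made up)::

            summarizer = pipeline("summarization")
            # a list input is handled as a batch of documents
            summarizer(["first article ...", "second article ..."], min_length=5, max_length=20)
            # -> [{'summary_text': '...'}, {'summary_text': '...'}]
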
        """
        assert return_tensors or return_text, "You must specify return_tensors=True or return_text=True"
        assert len(documents) > 0, "Please provide a document to summarize"

        if self.framework == "tf" and "BartForConditionalGeneration" in self.model.__class__.__name__:
            raise NotImplementedError(
                "Tensorflow is not yet supported for Bart. Please consider using T5, e.g. `t5-base`"
            )

        prefix = self.model.config.prefix if self.model.config.prefix is not None else ""

        if isinstance(documents[0], list):
            assert (
                self.tokenizer.pad_token_id is not None
            ), "Please make sure that the tokenizer has a pad_token_id when using a batch input"

            documents = ([prefix + document for document in documents[0]],)
            pad_to_max_length = True

        elif isinstance(documents[0], str):
            documents = (prefix + documents[0],)
            pad_to_max_length = False
        else:
            raise ValueError(
                "`documents[0]`: {} has the wrong format. It should be either of type `str` or of type `list`".format(
                    documents[0]
                )
            )

        with self.device_placement():
            inputs = self._parse_and_tokenize(*documents, pad_to_max_length=pad_to_max_length)

            if self.framework == "pt":
                inputs = self.ensure_tensor_on_device(**inputs)
                input_length = inputs["input_ids"].shape[-1]
            elif self.framework == "tf":
                input_length = tf.shape(inputs["input_ids"])[-1].numpy()

            min_length = generate_kwargs.get("min_length", self.model.config.min_length)
            if input_length < min_length // 2:
                logger.warning(
                    "Your min_length is set to {}, but your input_length is only {}. You might consider decreasing min_length manually, e.g. summarizer('...', min_length=10)".format(
                        min_length, input_length
                    )
                )

            max_length = generate_kwargs.get("max_length", self.model.config.max_length)
            if input_length < max_length:
                logger.warning(
                    "Your max_length is set to {}, but your input_length is only {}. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=50)".format(
                        max_length, input_length
                    )
                )

            summaries = self.model.generate(
                inputs["input_ids"], attention_mask=inputs["attention_mask"], **generate_kwargs,
            )

            results = []
            for summary in summaries:
                record = {}
                if return_tensors:
                    record["summary_token_ids"] = summary
                if return_text:
                    record["summary_text"] = self.tokenizer.decode(
                        summary, skip_special_tokens=True, clean_up_tokenization_spaces=clean_up_tokenization_spaces,
                    )
                results.append(record)
            return results


class TranslationPipeline(Pipeline):
    """
    Translates from one language to another.

    Usage::

        en_fr_translator = pipeline("translation_en_to_fr")
        en_fr_translator("How old are you?")

    The models that this pipeline can use are models that have been fine-tuned on a translation task,
    currently "t5-small", "t5-base", "t5-large", "t5-3b" and "t5-11b".
    See the up-to-date list of available models on
    `huggingface.co/models <https://huggingface.co/models?filter=translation>`__.

    Arguments:
        model (:obj:`str` or :obj:`~transformers.PreTrainedModel` or :obj:`~transformers.TFPreTrainedModel`, `optional`, defaults to :obj:`None`):
            The model that will be used by the pipeline to make predictions. This can be :obj:`None`, a string
            checkpoint identifier or an actual pre-trained model inheriting from
            :class:`~transformers.PreTrainedModel` for PyTorch and :class:`~transformers.TFPreTrainedModel` for
            TensorFlow.
            If :obj:`None`, the default of the pipeline will be loaded.
        tokenizer (:obj:`str` or :obj:`~transformers.PreTrainedTokenizer`, `optional`, defaults to :obj:`None`):
            The tokenizer that will be used by the pipeline to encode data for the model. This can be :obj:`None`,
            a string checkpoint identifier or an actual pre-trained tokenizer inheriting from
            :class:`~transformers.PreTrainedTokenizer`.
            If :obj:`None`, the default of the pipeline will be loaded.
        modelcard (:obj:`str` or :class:`~transformers.ModelCard`, `optional`, defaults to :obj:`None`):
            Model card attributed to the model for this pipeline.
        framework (:obj:`str`, `optional`, defaults to :obj:`None`):
            The framework to use, either "pt" for PyTorch or "tf" for TensorFlow. The specified framework must be
            installed.
            If no framework is specified, will default to the one currently installed. If no framework is specified
            and both frameworks are installed, will default to PyTorch.
        args_parser (:class:`~transformers.pipelines.ArgumentHandler`, `optional`, defaults to :obj:`None`):
            Reference to the object in charge of parsing supplied pipeline parameters.
        device (:obj:`int`, `optional`, defaults to :obj:`-1`):
            Device ordinal for CPU/GPU support. Setting this to -1 will leverage CPU, >=0 will run the model
            on the associated CUDA device id.
    """

    def __call__(
        self, *texts, return_tensors=False, return_text=True, clean_up_tokenization_spaces=False, **generate_kwargs
    ):
        r"""
        Args:
            *texts: (list of strings) texts to be translated
            return_text: (bool, default=True) whether to add a decoded "translation_text" to each result
            return_tensors: (bool, default=False) whether to add the raw "translation_token_ids" to each result
            clean_up_tokenization_spaces: (bool, `optional`, default=False) whether to clean up the potential extra spaces in the decoded text
            **generate_kwargs: extra kwargs passed to `self.model.generate`_

        Returns:
            list of dicts with 'translation_text' and/or 'translation_token_ids' for each text_to_translate

        .. _`self.model.generate`:
            https://huggingface.co/transformers/model_doc/bart.html#transformers.BartForConditionalGeneration.generate
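
        Example (an illustrative sketch; the translation shown is made up)::

            en_fr_translator = pipeline("translation_en_to_fr")
            en_fr_translator("How old are you?", max_length=40)
            # -> [{'translation_text': 'Quel âge avez-vous ?'}]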
        """
        assert return_tensors or return_text, "You must specify return_tensors=True or return_text=True"

        prefix = self.model.config.prefix if self.model.config.prefix is not None else ""

        if isinstance(texts[0], list):
            assert (
                self.tokenizer.pad_token_id is not None
            ), "Please make sure that the tokenizer has a pad_token_id when using a batch input"
            texts = ([prefix + text for text in texts[0]],)
            pad_to_max_length = True

        elif isinstance(texts[0], str):
            texts = (prefix + texts[0],)
            pad_to_max_length = False
        else:
            raise ValueError(
                "`texts[0]`: {} has the wrong format. It should be either of type `str` or of type `list`".format(
                    texts[0]
                )
            )

        with self.device_placement():
            inputs = self._parse_and_tokenize(*texts, pad_to_max_length=pad_to_max_length)

            if self.framework == "pt":
                inputs = self.ensure_tensor_on_device(**inputs)
                input_length = inputs["input_ids"].shape[-1]

            elif self.framework == "tf":
                input_length = tf.shape(inputs["input_ids"])[-1].numpy()

            max_length = generate_kwargs.get("max_length", self.model.config.max_length)
            if input_length > 0.9 * max_length:
                logger.warning(
                    "Your input_length: {} is bigger than 0.9 * max_length: {}. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)".format(
                        input_length, max_length
                    )
                )

            translations = self.model.generate(
                inputs["input_ids"], attention_mask=inputs["attention_mask"], **generate_kwargs,
            )
            results = []
            for translation in translations:
                record = {}
                if return_tensors:
                    record["translation_token_ids"] = translation
                if return_text:
                    record["translation_text"] = self.tokenizer.decode(
                        translation,
                        skip_special_tokens=True,
                        clean_up_tokenization_spaces=clean_up_tokenization_spaces,
                    )
                results.append(record)
            return results


# Register all the supported tasks here
SUPPORTED_TASKS = {
    "feature-extraction": {
        "impl": FeatureExtractionPipeline,
        "tf": TFAutoModel if is_tf_available() else None,
        "pt": AutoModel if is_torch_available() else None,
        "default": {
            "model": {"pt": "distilbert-base-cased", "tf": "distilbert-base-cased"},
            "config": None,
            "tokenizer": "distilbert-base-cased",
        },
    },
    "sentiment-analysis": {
        "impl": TextClassificationPipeline,
        "tf": TFAutoModelForSequenceClassification if is_tf_available() else None,
        "pt": AutoModelForSequenceClassification if is_torch_available() else None,
        "default": {
            "model": {
                "pt": "distilbert-base-uncased-finetuned-sst-2-english",
                "tf": "distilbert-base-uncased-finetuned-sst-2-english",
            },
            "config": "distilbert-base-uncased-finetuned-sst-2-english",
            "tokenizer": "distilbert-base-cased",
        },
    },
    "ner": {
        "impl": NerPipeline,
        "tf": TFAutoModelForTokenClassification if is_tf_available() else None,
        "pt": AutoModelForTokenClassification if is_torch_available() else None,
        "default": {
            "model": {
                "pt": "dbmdz/bert-large-cased-finetuned-conll03-english",
                "tf": "dbmdz/bert-large-cased-finetuned-conll03-english",
            },
            "config": "dbmdz/bert-large-cased-finetuned-conll03-english",
            "tokenizer": "bert-large-cased",
        },
    },
    "question-answering": {
        "impl": QuestionAnsweringPipeline,
        "tf": TFAutoModelForQuestionAnswering if is_tf_available() else None,
        "pt": AutoModelForQuestionAnswering if is_torch_available() else None,
        "default": {
            "model": {"pt": "distilbert-base-cased-distilled-squad", "tf": "distilbert-base-cased-distilled-squad"},
            "config": None,
            "tokenizer": ("distilbert-base-cased", {"use_fast": False}),
        },
    },
    "fill-mask": {
        "impl": FillMaskPipeline,
        "tf": TFAutoModelWithLMHead if is_tf_available() else None,
        "pt": AutoModelWithLMHead if is_torch_available() else None,
        "default": {
            "model": {"pt": "distilroberta-base", "tf": "distilroberta-base"},
            "config": None,
            "tokenizer": ("distilroberta-base", {"use_fast": False}),
        },
    },
    "summarization": {
        "impl": SummarizationPipeline,
        "tf": TFAutoModelWithLMHead if is_tf_available() else None,
        "pt": AutoModelWithLMHead if is_torch_available() else None,
        "default": {
            "model": {"pt": "bart-large-cnn", "tf": None},
            "config": None,
            "tokenizer": ("bart-large-cnn", {"use_fast": False}),
        },
    },
    "translation_en_to_fr": {
        "impl": TranslationPipeline,
        "tf": TFAutoModelWithLMHead if is_tf_available() else None,
        "pt": AutoModelWithLMHead if is_torch_available() else None,
        "default": {
            "model": {"pt": "t5-base", "tf": "t5-base"},
            "config": None,
            "tokenizer": ("t5-base", {"use_fast": False}),
        },
    },
    "translation_en_to_de": {
        "impl": TranslationPipeline,
        "tf": TFAutoModelWithLMHead if is_tf_available() else None,
        "pt": AutoModelWithLMHead if is_torch_available() else None,
        "default": {
            "model": {"pt": "t5-base", "tf": "t5-base"},
            "config": None,
            "tokenizer": ("t5-base", {"use_fast": False}),
        },
    },
    "translation_en_to_ro": {
        "impl": TranslationPipeline,
        "tf": TFAutoModelWithLMHead if is_tf_available() else None,
        "pt": AutoModelWithLMHead if is_torch_available() else None,
        "default": {
            "model": {"pt": "t5-base", "tf": "t5-base"},
            "config": None,
            "tokenizer": ("t5-base", {"use_fast": False}),
        },
    },
}

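# A minimal sketch of how the registry above is consumed (illustrative only):
#
#     nlp = pipeline("sentiment-analysis")   # resolves SUPPORTED_TASKS["sentiment-analysis"]
#     nlp("We are very happy to include pipeline into the transformers repository.")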

def pipeline(
    task: str,
    model: Optional = None,
    config: Optional[Union[str, PretrainedConfig]] = None,
    tokenizer: Optional[Union[str, PreTrainedTokenizer]] = None,
    framework: Optional[str] = None,
    **kwargs
) -> Pipeline:
    """
    Utility factory method to build a pipeline.

    A Pipeline is made of:

        - A Tokenizer instance in charge of mapping raw textual input to tokens
        - A Model instance
        - Some (optional) post processing for enhancing the model's output


    Args:
        task (:obj:`str`):
            The task defining which pipeline will be returned. Currently accepted tasks are:

            - "feature-extraction": will return a :class:`~transformers.FeatureExtractionPipeline`
            - "sentiment-analysis": will return a :class:`~transformers.TextClassificationPipeline`
            - "ner": will return a :class:`~transformers.NerPipeline`
            - "question-answering": will return a :class:`~transformers.QuestionAnsweringPipeline`
            - "fill-mask": will return a :class:`~transformers.FillMaskPipeline`
            - "summarization": will return a :class:`~transformers.SummarizationPipeline`
            - "translation_xx_to_yy": will return a :class:`~transformers.TranslationPipeline`
        model (:obj:`str` or :obj:`~transformers.PreTrainedModel` or :obj:`~transformers.TFPreTrainedModel`, `optional`, defaults to :obj:`None`):
            The model that will be used by the pipeline to make predictions. This can be :obj:`None`,
            a model identifier or an actual pre-trained model inheriting from
            :class:`~transformers.PreTrainedModel` for PyTorch and :class:`~transformers.TFPreTrainedModel` for
            TensorFlow.

            If :obj:`None`, the default for this pipeline will be loaded.
        config (:obj:`str` or :obj:`~transformers.PretrainedConfig`, `optional`, defaults to :obj:`None`):
            The configuration that will be used by the pipeline to instantiate the model. This can be :obj:`None`,
            a model identifier or an actual pre-trained model configuration inheriting from
            :class:`~transformers.PretrainedConfig`.

            If :obj:`None`, the default for this pipeline will be loaded.
        tokenizer (:obj:`str` or :obj:`~transformers.PreTrainedTokenizer`, `optional`, defaults to :obj:`None`):
            The tokenizer that will be used by the pipeline to encode data for the model. This can be :obj:`None`,
            a model identifier or an actual pre-trained tokenizer inheriting from
            :class:`~transformers.PreTrainedTokenizer`.

            If :obj:`None`, the default for this pipeline will be loaded.
        framework (:obj:`str`, `optional`, defaults to :obj:`None`):
            The framework to use, either "pt" for PyTorch or "tf" for TensorFlow. The specified framework must be
            installed.

            If no framework is specified, will default to the one currently installed. If no framework is specified
            and both frameworks are installed, will default to PyTorch.

    Returns:
        :class:`~transformers.Pipeline`: Class inheriting from :class:`~transformers.Pipeline`, according to
        the task.

    Examples::

        from transformers import pipeline, AutoModelForTokenClassification, AutoTokenizer

        # Sentiment analysis pipeline
        pipeline('sentiment-analysis')

        # Question answering pipeline, specifying the checkpoint identifier
        pipeline('question-answering', model='distilbert-base-cased-distilled-squad', tokenizer='bert-base-cased')

        # Named entity recognition pipeline, passing in a specific model and tokenizer
        model = AutoModelForTokenClassification.from_pretrained("dbmdz/bert-large-cased-finetuned-conll03-english")
        tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
        pipeline('ner', model=model, tokenizer=tokenizer)
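
        # Summarization pipeline, using the default model registered for the task
        pipeline('summarization')

        # Translation pipeline; the task name encodes the language pair
        pipeline('translation_en_to_fr')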
    """
    # Retrieve the task
    if task not in SUPPORTED_TASKS:
        raise KeyError("Unknown task {}, available tasks are {}".format(task, list(SUPPORTED_TASKS.keys())))

    framework = framework or get_framework(model)

    targeted_task = SUPPORTED_TASKS[task]
    task_class, model_class = targeted_task["impl"], targeted_task[framework]

    # Use default model/config/tokenizer for the task if no model is provided
    if model is None:
        models, config, tokenizer = [targeted_task["default"][k] for k in ["model", "config", "tokenizer"]]
        model = models[framework]

    # Try to infer tokenizer from model or config name (if provided as str)
    if tokenizer is None:
        if isinstance(model, str) and model in ALL_PRETRAINED_CONFIG_ARCHIVE_MAP:
            tokenizer = model
        elif isinstance(config, str) and config in ALL_PRETRAINED_CONFIG_ARCHIVE_MAP:
            tokenizer = config
        else:
            # Impossible to guess which tokenizer to use here
            raise Exception(
                "Impossible to guess which tokenizer to use. "
                "Please provided a PretrainedTokenizer class or a path/identifier to a pretrained tokenizer."
            )

    modelcard = None
    # Try to infer modelcard from model or config name (if provided as str)
    if isinstance(model, str):
        modelcard = model
    elif isinstance(config, str):
        modelcard = config

    # Instantiate tokenizer if needed
    if isinstance(tokenizer, (str, tuple)):
        if isinstance(tokenizer, tuple):
            # For tuple we have (tokenizer name, {kwargs})
            tokenizer = AutoTokenizer.from_pretrained(tokenizer[0], **tokenizer[1])
        else:
            tokenizer = AutoTokenizer.from_pretrained(tokenizer)

    # Instantiate config if needed
    if isinstance(config, str):
        config = AutoConfig.from_pretrained(config)

    # Instantiate modelcard if needed
    if isinstance(modelcard, str):
        modelcard = ModelCard.from_pretrained(modelcard)

    # Instantiate model if needed
    if isinstance(model, str):
        # Handle transparent TF/PT model conversion
        model_kwargs = {}
        if framework == "pt" and model.endswith(".h5"):
            model_kwargs["from_tf"] = True
            logger.warning(
                "Model might be a TensorFlow model (ending with `.h5`) but TensorFlow is not available. "
                "Trying to load the model with PyTorch."
            )
        elif framework == "tf" and model.endswith(".bin"):
            model_kwargs["from_pt"] = True
            logger.warning(
                "Model might be a PyTorch model (ending with `.bin`) but PyTorch is not available. "
                "Trying to load the model with Tensorflow."
            )
        model = model_class.from_pretrained(model, config=config, **model_kwargs)

    return task_class(model=model, tokenizer=tokenizer, modelcard=modelcard, framework=framework, task=task, **kwargs)