# coding=utf-8
# Copyright 2018 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import csv
import json
import logging
import os
import pickle
import sys
from abc import ABC, abstractmethod
from contextlib import contextmanager
from os.path import abspath, exists
from typing import Dict, List, Optional, Tuple, Union

import numpy as np

from .configuration_auto import ALL_PRETRAINED_CONFIG_ARCHIVE_MAP, AutoConfig
from .configuration_utils import PretrainedConfig
from .data import SquadExample, squad_convert_examples_to_features
from .file_utils import is_tf_available, is_torch_available
from .modelcard import ModelCard
from .tokenization_auto import AutoTokenizer
from .tokenization_bert import BasicTokenizer
from .tokenization_utils import PreTrainedTokenizer


if is_tf_available():
    import tensorflow as tf
    from .modeling_tf_auto import (
        TFAutoModel,
        TFAutoModelForSequenceClassification,
        TFAutoModelForQuestionAnswering,
        TFAutoModelForTokenClassification,
    )

if is_torch_available():
    import torch
    from .modeling_auto import (
        AutoModel,
        AutoModelForSequenceClassification,
        AutoModelForQuestionAnswering,
        AutoModelForTokenClassification,
    )


logger = logging.getLogger(__name__)


def get_framework(model=None):
    """ Select framework (TensorFlow/PyTorch) to use.
        If both frameworks are installed and no specific model is provided, defaults to using PyTorch.
    """
    if is_tf_available() and is_torch_available() and model is not None and not isinstance(model, str):
        # Both frameworks are available but the user supplied a model class instance.
        # Try to guess which framework to use from the model class name
        framework = "tf" if model.__class__.__name__.startswith("TF") else "pt"
    elif not is_tf_available() and not is_torch_available():
        raise RuntimeError(
            "At least one of TensorFlow 2.0 or PyTorch should be installed. "
            "To install TensorFlow 2.0, read the instructions at https://www.tensorflow.org/install/ "
            "To install PyTorch, read the instructions at https://pytorch.org/."
        )
    else:
        framework = "pt" if is_torch_available() else "tf"
    return framework
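
# Illustrative sketch (not executed): how get_framework resolves the framework when
# both backends are installed. The model name is a placeholder, not a library default.
#
#     get_framework("distilbert-base-uncased")  # string input -> defaults to "pt"
#     get_framework(TFAutoModel.from_pretrained("distilbert-base-uncased"))  # "TF" class prefix -> "tf"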


class ArgumentHandler(ABC):
    """
    Base interface for handling varargs for each Pipeline
    """

    @abstractmethod
    def __call__(self, *args, **kwargs):
        raise NotImplementedError()


class DefaultArgumentHandler(ArgumentHandler):
    """
    Default varargs argument parser handling parameters for each Pipeline
    """

    def __call__(self, *args, **kwargs):
        if "X" in kwargs:
            return kwargs["X"]
        elif "data" in kwargs:
            return kwargs["data"]
        elif len(args) == 1:
            if isinstance(args[0], list):
                return args[0]
            else:
                return [args[0]]
        elif len(args) > 1:
            return list(args)
        raise ValueError("Unable to infer the format of the provided data (X=, data=, ...)")
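
# Illustrative sketch (not executed): the call shapes DefaultArgumentHandler accepts.
#
#     handler = DefaultArgumentHandler()
#     handler("one input")        # -> ["one input"]
#     handler("a", "b")           # -> ["a", "b"]
#     handler(X=["a", "b"])       # -> ["a", "b"]
#     handler(data=["a", "b"])    # -> ["a", "b"]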


class PipelineDataFormat:
    """
    Base class for all the pipeline supported data formats, both for reading and writing.
    Supported data formats currently include:
     - JSON
     - CSV
     - stdin/stdout (pipe)

    PipelineDataFormat also includes some utilities to work with multi-column inputs, like mapping from dataset
    columns to pipeline keyword arguments through the `dataset_kwarg_1=dataset_column_1` format.
    """

    SUPPORTED_FORMATS = ["json", "csv", "pipe"]

    def __init__(self, output_path: Optional[str], input_path: Optional[str], column: Optional[str], overwrite=False):
        self.output_path = output_path
        self.input_path = input_path
        self.column = column.split(",") if column is not None else [""]
        self.is_multi_columns = len(self.column) > 1

        if self.is_multi_columns:
            self.column = [tuple(c.split("=")) if "=" in c else (c, c) for c in self.column]

        if output_path is not None and not overwrite:
            if exists(abspath(self.output_path)):
                raise OSError("{} already exists on disk".format(self.output_path))

        if input_path is not None:
            if not exists(abspath(self.input_path)):
                raise OSError("{} doesn't exist on disk".format(self.input_path))

    @abstractmethod
    def __iter__(self):
        raise NotImplementedError()

    @abstractmethod
    def save(self, data: dict):
        """
        Save the provided data object with the representation for the current `DataFormat`.
        :param data: data to store
        :return:
        """
        raise NotImplementedError()

    def save_binary(self, data: Union[dict, List[dict]]) -> str:
        """
        Save the provided data object as a pickle-formatted binary data on the disk.
        :param data: data to store
        :return: (str) Path where the data has been saved
        """
        path, _ = os.path.splitext(self.output_path)
        binary_path = os.path.extsep.join((path, "pickle"))

        with open(binary_path, "wb+") as f_output:
            pickle.dump(data, f_output)

        return binary_path

    @staticmethod
    def from_str(
        format: str, output_path: Optional[str], input_path: Optional[str], column: Optional[str], overwrite=False
    ):
        if format == "json":
            return JsonPipelineDataFormat(output_path, input_path, column, overwrite=overwrite)
        elif format == "csv":
            return CsvPipelineDataFormat(output_path, input_path, column, overwrite=overwrite)
        elif format == "pipe":
            return PipedPipelineDataFormat(output_path, input_path, column, overwrite=overwrite)
        else:
            raise KeyError("Unknown reader {} (Available readers are json/csv/pipe)".format(format))
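
    # Illustrative sketch (not executed): building a format reader via from_str.
    # The file paths and column mapping below are placeholders.
    #
    #     reader = PipelineDataFormat.from_str(
    #         "csv", output_path="out.csv", input_path="in.csv", column="question=q,context=c"
    #     )
    #     for item in reader:
    #         ...  # each item is {"question": ..., "context": ...} via the kwarg=column mapping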


class CsvPipelineDataFormat(PipelineDataFormat):
    def __init__(self, output_path: Optional[str], input_path: Optional[str], column: Optional[str], overwrite=False):
        super().__init__(output_path, input_path, column, overwrite=overwrite)

    def __iter__(self):
        with open(self.input_path, "r") as f:
            reader = csv.DictReader(f)
            for row in reader:
                if self.is_multi_columns:
                    yield {k: row[c] for k, c in self.column}
                else:
                    yield row[self.column[0]]

    def save(self, data: List[dict]):
        with open(self.output_path, "w") as f:
            if len(data) > 0:
                writer = csv.DictWriter(f, list(data[0].keys()))
                writer.writeheader()
                writer.writerows(data)


class JsonPipelineDataFormat(PipelineDataFormat):
    def __init__(self, output_path: Optional[str], input_path: Optional[str], column: Optional[str], overwrite=False):
        super().__init__(output_path, input_path, column, overwrite=overwrite)

        with open(input_path, "r") as f:
            self._entries = json.load(f)

    def __iter__(self):
        for entry in self._entries:
            if self.is_multi_columns:
                yield {k: entry[c] for k, c in self.column}
            else:
                yield entry[self.column[0]]

    def save(self, data: dict):
        with open(self.output_path, "w") as f:
            json.dump(data, f)


class PipedPipelineDataFormat(PipelineDataFormat):
    """
    Read data from piped input to the python process.
    For multi columns data, columns should separated by \t

    If columns are provided, then the output will be a dictionary with {column_x: value_x}
    """

    def __iter__(self):
        for line in sys.stdin:
            # Split for multi-columns
            if "\t" in line:
                line = line.split("\t")
                if self.column:
                    # Dictionary to map arguments
                    yield {kwargs: l for (kwargs, _), l in zip(self.column, line)}
                else:
                    yield tuple(line)

            # No dictionary to map arguments
            else:
                yield line

    def save(self, data: dict):
        print(data)

    def save_binary(self, data: Union[dict, List[dict]]) -> str:
        if self.output_path is None:
            raise KeyError(
                "When using piped input on pipeline outputting large object requires an output file path. "
                "Please provide such output path through --output argument."
256
257
258
259
            )

        return super().save_binary(data)
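
    # Illustrative sketch (not executed): piping tab-separated columns through stdin,
    # assuming the instance was built with column="question=q,context=c". A stdin line
    # "who?\tsome context" then yields {"question": "who?", "context": "some context"}.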


class _ScikitCompat(ABC):
    """
    Interface layer for the Scikit and Keras compatibility.
    """

    @abstractmethod
    def transform(self, X):
        raise NotImplementedError()

    @abstractmethod
    def predict(self, X):
        raise NotImplementedError()


class Pipeline(_ScikitCompat):
    """
    Base class implementing pipelined operations.
    Pipeline workflow is defined as a sequence of the following operations:
        Input -> Tokenization -> Model Inference -> Post-Processing (Task dependent) -> Output

    Pipeline supports running on CPU or GPU through the device argument. Users can specify
    device argument as an integer, -1 meaning "CPU", >= 0 referring to the CUDA device ordinal.

    Some pipelines, like FeatureExtractionPipeline ('feature-extraction'), output large tensor
    objects as nested lists. To avoid dumping such large structures as textual data, we provide the
    binary_output constructor argument. If set to True, the output will be stored in pickle format.

    Arguments:
        **model**: ``(str, PretrainedModel, TFPretrainedModel)``:
            Reference to the model to use through this pipeline.

        **tokenizer**: ``(str, PreTrainedTokenizer)``:
            Reference to the tokenizer to use through this pipeline.

        **args_parser**: ``ArgumentHandler``:
            Reference to the object in charge of parsing supplied pipeline parameters.

        **device**: ``int``:
            Device ordinal for CPU/GPU supports. Setting this to -1 will leverage CPU, >=0 will run the model
            on the associated CUDA device id.

        **binary_output** ``bool`` (default: False):
            Flag indicating if the output of the pipeline should be in a binary format (i.e. pickle) or as raw text.

    Return:
        Pipeline returns a list or a dictionary depending on:
         - Whether the user provided multiple samples
         - Whether the pipeline exposes multiple fields in the output object

    Examples:
        nlp = pipeline('ner')
        nlp = pipeline('ner', model='...', config='...', tokenizer='...')
        nlp = NerPipeline(model='...', config='...', tokenizer='...')
        nlp = QuestionAnsweringPipeline(model=AutoModel.from_pretrained('...'), tokenizer='...')
    """

    default_input_names = None

    def __init__(
        self,
        model,
        tokenizer: PreTrainedTokenizer = None,
        modelcard: ModelCard = None,
        framework: Optional[str] = None,
        args_parser: ArgumentHandler = None,
        device: int = -1,
        binary_output: bool = False,
    ):

        if framework is None:
            framework = get_framework()

        self.model = model
        self.tokenizer = tokenizer
        self.modelcard = modelcard
        self.framework = framework
        self.device = device if framework == "tf" else torch.device("cpu" if device < 0 else "cuda:{}".format(device))
        self.binary_output = binary_output
        self._args_parser = args_parser or DefaultArgumentHandler()

        # Special handling: place the model on the requested CUDA device (PyTorch only)
        if self.framework == "pt" and self.device.type == "cuda":
            self.model = self.model.to(self.device)

    def save_pretrained(self, save_directory):
        """
        Save the pipeline's model and tokenizer to the specified save_directory
        """
        if not os.path.isdir(save_directory):
            logger.error("Provided path ({}) should be a directory".format(save_directory))
            return

        self.model.save_pretrained(save_directory)
        self.tokenizer.save_pretrained(save_directory)
        self.modelcard.save_pretrained(save_directory)

    def transform(self, X):
        """
        Scikit / Keras interface to transformers' pipelines. This method will forward to __call__().
        """
        return self(X=X)

    def predict(self, X):
        """
        Scikit / Keras interface to transformers' pipelines. This method will forward to __call__().
        """
        return self(X=X)

    @contextmanager
    def device_placement(self):
        """
        Context Manager allowing tensor allocation on the user-specified device in a framework-agnostic way.
        example:
            # Explicitly ask for tensor allocation on CUDA device :0
            nlp = pipeline(..., device=0)
            with nlp.device_placement():
                # Every framework specific tensor allocation will be done on the requested device
                output = nlp(...)
        Returns:
            Context manager
        """
        if self.framework == "tf":
            with tf.device("/CPU:0" if self.device == -1 else "/device:GPU:{}".format(self.device)):
                yield
        else:
            if self.device.type == "cuda":
                torch.cuda.set_device(self.device)

            yield

    def ensure_tensor_on_device(self, **inputs):
        """
        Ensure PyTorch tensors are on the specified device.
        :param inputs:
        :return:
        """
        return {name: tensor.to(self.device) for name, tensor in inputs.items()}
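
    # Illustrative sketch (not executed), assuming a pipeline constructed with device=0
    # under the PyTorch framework:
    #
    #     inputs = {"input_ids": torch.tensor([[101, 102]])}
    #     inputs = self.ensure_tensor_on_device(**inputs)  # every tensor now lives on cuda:0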

    def inputs_for_model(self, features: Union[dict, List[dict]]) -> Dict:
        """
        Generates the input dictionary with model-specific parameters.

        Returns:
            dict holding all the required parameters for model's forward
        """
        args = ["input_ids", "attention_mask"]
        model_type = type(self.model).__name__.lower()

        if "distilbert" not in model_type and "xlm" not in model_type:
            args += ["token_type_ids"]

        # PR #1548 (CLI) There is an issue with attention_mask
        # if 'xlnet' in model_type or 'xlm' in model_type:
        #     args += ['cls_index', 'p_mask']

        if isinstance(features, dict):
            return {k: features[k] for k in args}
        else:
            return {k: [feature[k] for feature in features] for k in args}
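
    # Illustrative sketch (not executed): distilbert-style models take no token_type_ids.
    #
    #     features = {"input_ids": [...], "attention_mask": [...], "token_type_ids": [...]}
    #     self.inputs_for_model(features)
    #     # -> {"input_ids": ..., "attention_mask": ...} for a DistilBert model,
    #     #    all three keys for e.g. a Bert model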

    def __call__(self, *texts, **kwargs):
        # Parse arguments
        inputs = self._args_parser(*texts, **kwargs)
        inputs = self.tokenizer.batch_encode_plus(
            inputs, add_special_tokens=True, return_tensors=self.framework, max_length=self.tokenizer.max_len
        )

        # Filter out features not available on specific models
        inputs = self.inputs_for_model(inputs)
        return self._forward(inputs)

    def _forward(self, inputs):
        """
        Internal framework specific forward dispatching.
        Args:
            inputs: dict holding all the keyword arguments required by the model forward method.
        Returns:
            Numpy array
        """
        # Encode for forward
        with self.device_placement():
            if self.framework == "tf":
                # TODO trace model
                predictions = self.model(inputs, training=False)[0]
            else:
                with torch.no_grad():
                    inputs = self.ensure_tensor_on_device(**inputs)
                    predictions = self.model(**inputs)[0].cpu()

        return predictions.numpy()


class FeatureExtractionPipeline(Pipeline):
    """
    Feature extraction pipeline using the base model, with no task-specific head; outputs the hidden states.
    """

    def __init__(
        self,
        model,
        tokenizer: PreTrainedTokenizer = None,
        modelcard: ModelCard = None,
        framework: Optional[str] = None,
        args_parser: ArgumentHandler = None,
        device: int = -1,
    ):
        super().__init__(
            model=model,
            tokenizer=tokenizer,
            modelcard=modelcard,
            framework=framework,
            args_parser=args_parser,
            device=device,
            binary_output=True,
        )

    def __call__(self, *args, **kwargs):
        return super().__call__(*args, **kwargs).tolist()
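
# Illustrative usage sketch (not executed); the default model for this task is the one
# registered in SUPPORTED_TASKS below:
#
#     nlp = pipeline("feature-extraction")
#     features = nlp("Hello world")  # nested list of floats, 1 x seq_len x hidden_size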


class TextClassificationPipeline(Pipeline):
    """
    Text classification pipeline using ModelForSequenceClassification head.
    """

    def __call__(self, *args, **kwargs):
        outputs = super().__call__(*args, **kwargs)
        scores = np.exp(outputs) / np.exp(outputs).sum(-1, keepdims=True)
        return [{"label": self.model.config.id2label[item.argmax()], "score": item.max()} for item in scores]
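
# Illustrative usage sketch (not executed); the actual label names come from the model's
# config (id2label), so the output shown is an assumption:
#
#     nlp = pipeline("sentiment-analysis")
#     nlp("This is great!")  # -> [{"label": "POSITIVE", "score": 0.99...}]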


class NerPipeline(Pipeline):
    """
    Named Entity Recognition pipeline using ModelForTokenClassification head.
    """

    default_input_names = "sequences"

    def __init__(
        self,
        model,
        tokenizer: PreTrainedTokenizer = None,
        modelcard: ModelCard = None,
        framework: Optional[str] = None,
        args_parser: ArgumentHandler = None,
        device: int = -1,
        binary_output: bool = False,
        ignore_labels=["O"],
    ):
        super().__init__(
            model=model,
            tokenizer=tokenizer,
            modelcard=modelcard,
            framework=framework,
            args_parser=args_parser,
            device=device,
            binary_output=binary_output,
        )

        self._basic_tokenizer = BasicTokenizer(do_lower_case=False)
        self.ignore_labels = ignore_labels

    def __call__(self, *texts, **kwargs):
        inputs, answers = self._args_parser(*texts, **kwargs), []
        for sentence in inputs:

            # Manage correct placement of the tensors
            with self.device_placement():
                tokens = self.tokenizer.encode_plus(
                    sentence,
                    return_attention_mask=False,
                    return_tensors=self.framework,
                    max_length=self.tokenizer.max_len,
                )

                # Forward
                if self.framework == "tf":
                    entities = self.model(tokens)[0][0].numpy()
                    input_ids = tokens["input_ids"].numpy()[0]
                else:
                    with torch.no_grad():
                        tokens = self.ensure_tensor_on_device(**tokens)
                        entities = self.model(**tokens)[0][0].cpu().numpy()
                        input_ids = tokens["input_ids"].cpu().numpy()[0]

            score = np.exp(entities) / np.exp(entities).sum(-1, keepdims=True)
            labels_idx = score.argmax(axis=-1)

            answer = []
            for idx, label_idx in enumerate(labels_idx):
                if self.model.config.id2label[label_idx] not in self.ignore_labels:
                    answer += [
                        {
                            "word": self.tokenizer.decode([int(input_ids[idx])]),
                            "score": score[idx][label_idx].item(),
                            "entity": self.model.config.id2label[label_idx],
                        }
                    ]

            # Append
            answers += [answer]

        if len(answers) == 1:
            return answers[0]
        return answers
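
# Illustrative usage sketch (not executed); entity labels depend on the model's config:
#
#     nlp = pipeline("ner")
#     nlp("My name is Wolfgang and I live in Berlin")
#     # -> [{"word": ..., "score": ..., "entity": "I-PER"}, ...] with "O" tokens filtered out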


class QuestionAnsweringArgumentHandler(ArgumentHandler):
    """
    QuestionAnsweringPipeline requires the user to provide multiple arguments (i.e. question & context) to be mapped
    to internal SquadExample / SquadFeature structures.

    QuestionAnsweringArgumentHandler manages all the possible ways to create a SquadExample from the command-line
    supplied arguments.
    """

    def __call__(self, *args, **kwargs):
        # Positional args, handled essentially the same as X and data, so forward them to avoid duplicating logic
        if args is not None and len(args) > 0:
            if len(args) == 1:
                kwargs["X"] = args[0]
            else:
                kwargs["X"] = list(args)

        # Generic compatibility with sklearn and Keras
        # Batched data
        if "X" in kwargs or "data" in kwargs:
            inputs = kwargs["X"] if "X" in kwargs else kwargs["data"]

            if isinstance(inputs, dict):
                inputs = [inputs]
            else:
                # Copy to avoid overriding arguments
                inputs = [i for i in inputs]

            for i, item in enumerate(inputs):
                if isinstance(item, dict):
                    if any(k not in item for k in ["question", "context"]):
                        raise KeyError("You need to provide a dictionary with keys {question:..., context:...}")

                    inputs[i] = QuestionAnsweringPipeline.create_sample(**item)

                elif not isinstance(item, SquadExample):
                    raise ValueError(
                        "{} argument needs to be of type (list[SquadExample | dict], SquadExample, dict)".format(
                            "X" if "X" in kwargs else "data"
                        )
                    )

        # Tabular input
        elif "question" in kwargs and "context" in kwargs:
            if isinstance(kwargs["question"], str):
                kwargs["question"] = [kwargs["question"]]

            if isinstance(kwargs["context"], str):
                kwargs["context"] = [kwargs["context"]]

            inputs = [
                QuestionAnsweringPipeline.create_sample(q, c) for q, c in zip(kwargs["question"], kwargs["context"])
            ]
        else:
            raise ValueError("Unknown arguments {}".format(kwargs))

        if not isinstance(inputs, list):
            inputs = [inputs]

        return inputs
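
# Illustrative sketch (not executed): equivalent call shapes this handler normalizes,
# each returning a list of SquadExample.
#
#     handler = QuestionAnsweringArgumentHandler()
#     handler(question="Who?", context="Some context.")
#     handler(X={"question": "Who?", "context": "Some context."})
#     handler(data=[{"question": "Who?", "context": "Some context."}])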


class QuestionAnsweringPipeline(Pipeline):
    """
    Question Answering pipeline using ModelForQuestionAnswering head.
    """

    default_input_names = "question,context"

    def __init__(
        self,
        model,
        tokenizer: Optional[PreTrainedTokenizer],
        modelcard: Optional[ModelCard],
        framework: Optional[str] = None,
        device: int = -1,
        **kwargs
    ):
        super().__init__(
            model=model,
            tokenizer=tokenizer,
            modelcard=modelcard,
            framework=framework,
            args_parser=QuestionAnsweringArgumentHandler(),
            device=device,
            **kwargs,
        )

    @staticmethod
    def create_sample(
        question: Union[str, List[str]], context: Union[str, List[str]]
    ) -> Union[SquadExample, List[SquadExample]]:
        """
        QuestionAnsweringPipeline leverages the SquadExample/SquadFeatures internally.
        This helper method encapsulates all the logic for converting question(s) and context(s) to SquadExample(s).
        We currently support extractive question answering.
        Arguments:
             question: (str, List[str]) The question(s) asked about the associated context(s)
             context: (str, List[str]) The context(s) in which we will look for the answer.

        Returns:
            SquadExample initialized with the corresponding question and context.
        """
        if isinstance(question, list):
            return [SquadExample(None, q, c, None, None, None) for q, c in zip(question, context)]
        else:
            return SquadExample(None, question, context, None, None, None)
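
    # Illustrative sketch (not executed): create_sample mirrors its input arity.
    #
    #     QuestionAnsweringPipeline.create_sample("Who?", "Some context.")  # -> SquadExample
    #     QuestionAnsweringPipeline.create_sample(["Who?", "When?"], ["ctx a", "ctx b"])  # -> list of SquadExample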

    def __call__(self, *texts, **kwargs):
        """
        Args:
            We support multiple use-cases, the following are exclusive:
            X: sequence of SquadExample
            data: sequence of SquadExample
            question: (str, List[str]), batch of question(s) to map along with context
            context: (str, List[str]), batch of context(s) associated with the provided question keyword argument
        Returns:
            dict: {'answer': str, 'score': float, 'start': int, 'end': int}
            answer: the textual answer in the initial context
            score: the score the current answer scored for the model
            start: the character index in the original string corresponding to the beginning of the answer span
            end: the character index in the original string corresponding to the end of the answer span
        """
        # Set default values
        kwargs.setdefault("topk", 1)
        kwargs.setdefault("doc_stride", 128)
        kwargs.setdefault("max_answer_len", 15)
        kwargs.setdefault("max_seq_len", 384)
        kwargs.setdefault("max_question_len", 64)

        if kwargs["topk"] < 1:
            raise ValueError("topk parameter should be >= 1 (got {})".format(kwargs["topk"]))

        if kwargs["max_answer_len"] < 1:
            raise ValueError("max_answer_len parameter should be >= 1 (got {})".format(kwargs["max_answer_len"]))

        # Convert inputs to features
        examples = self._args_parser(*texts, **kwargs)
        features_list = [
            squad_convert_examples_to_features(
                [example],
                self.tokenizer,
                kwargs["max_seq_len"],
                kwargs["doc_stride"],
                kwargs["max_question_len"],
                False,
            )
            for example in examples
        ]
        all_answers = []
        for features, example in zip(features_list, examples):
            fw_args = self.inputs_for_model([f.__dict__ for f in features])

            # Manage tensor allocation on correct device
            with self.device_placement():
                if self.framework == "tf":
                    fw_args = {k: tf.constant(v) for (k, v) in fw_args.items()}
                    start, end = self.model(fw_args)
                    start, end = start.numpy(), end.numpy()
                else:
                    with torch.no_grad():
                        # Retrieve the score for the context tokens only (removing question tokens)
                        fw_args = {k: torch.tensor(v, device=self.device) for (k, v) in fw_args.items()}
                        start, end = self.model(**fw_args)
                        start, end = start.cpu().numpy(), end.cpu().numpy()

            answers = []
            for (feature, start_, end_) in zip(features, start, end):
                # Normalize logits and spans to retrieve the answer
                start_ = np.exp(start_) / np.sum(np.exp(start_))
                end_ = np.exp(end_) / np.sum(np.exp(end_))

                # Mask padding and question
                start_, end_ = (
                    start_ * np.abs(np.array(feature.p_mask) - 1),
                    end_ * np.abs(np.array(feature.p_mask) - 1),
                )

                # TODO : What happens if not possible
                # Mask CLS
                start_[0] = end_[0] = 0

                starts, ends, scores = self.decode(start_, end_, kwargs["topk"], kwargs["max_answer_len"])
                char_to_word = np.array(example.char_to_word_offset)

                # Convert the answer (tokens) back to the original text
                answers += [
                    {
                        "score": score.item(),
                        "start": np.where(char_to_word == feature.token_to_orig_map[s])[0][0].item(),
                        "end": np.where(char_to_word == feature.token_to_orig_map[e])[0][-1].item(),
                        "answer": " ".join(
                            example.doc_tokens[feature.token_to_orig_map[s] : feature.token_to_orig_map[e] + 1]
                        ),
                    }
                    for s, e, score in zip(starts, ends, scores)
                ]
            answers = sorted(answers, key=lambda x: x["score"], reverse=True)[: kwargs["topk"]]
            all_answers += answers

        if len(all_answers) == 1:
            return all_answers[0]
        return all_answers
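
    # Illustrative usage sketch (not executed); topk > 1 always yields a list:
    #
    #     nlp = pipeline("question-answering")
    #     nlp(question="Where do I live?", context="I live in Berlin.", topk=2)
    #     # -> [{"answer": ..., "score": ..., "start": ..., "end": ...}, ...]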

    def decode(self, start: np.ndarray, end: np.ndarray, topk: int, max_answer_len: int) -> Tuple:
        """
        Takes the output of any QuestionAnswering head and generates probabilities for each span to be
        the actual answer.
        In addition, it filters out some unwanted/impossible cases, like the answer length being greater
        than max_answer_len or the answer end position coming before the start position.
        The method supports outputting the k-best answers through the topk argument.

        Args:
            start: numpy array, holding individual start probabilities for each token
            end: numpy array, holding individual end probabilities for each token
            topk: int, indicates how many possible answer span(s) to extract from the model's output
            max_answer_len: int, maximum size of the answer to extract from the model's output
        """
        # Ensure we have batch axis
        if start.ndim == 1:
            start = start[None]

        if end.ndim == 1:
            end = end[None]

        # Compute the score of each tuple(start, end) to be the real answer
        outer = np.matmul(np.expand_dims(start, -1), np.expand_dims(end, 1))

        # Remove candidate with end < start and end - start > max_answer_len
        candidates = np.tril(np.triu(outer), max_answer_len - 1)

        #  Inspired by Chen & al. (https://github.com/facebookresearch/DrQA)
        scores_flat = candidates.flatten()
        if topk == 1:
            idx_sort = [np.argmax(scores_flat)]
        elif len(scores_flat) < topk:
            idx_sort = np.argsort(-scores_flat)
        else:
            idx = np.argpartition(-scores_flat, topk)[0:topk]
            idx_sort = idx[np.argsort(-scores_flat[idx])]

        start, end = np.unravel_index(idx_sort, candidates.shape)[1:]
        return start, end, candidates[0, start, end]
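
    # Illustrative sketch (not executed): decode on toy start/end probabilities.
    #
    #     start = np.array([0.1, 0.7, 0.2])
    #     end = np.array([0.1, 0.2, 0.7])
    #     self.decode(start, end, topk=1, max_answer_len=15)
    #     # -> (array([1]), array([2]), array([0.49])): the best span covers tokens 1..2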

    def span_to_answer(self, text: str, start: int, end: int):
        """
        When decoding from token probabilities, this method maps token indexes back to actual words in
        the initial context.

        Args:
            text: str, the actual context to extract the answer from
            start: int, starting answer token index
            end: int, ending answer token index

        Returns:
            dict: {'answer': str, 'start': int, 'end': int}
        """
        words = []
        token_idx = char_start_idx = char_end_idx = chars_idx = 0

        for i, word in enumerate(text.split(" ")):
            token = self.tokenizer.tokenize(word)

            # Append words if they are in the span
            if start <= token_idx <= end:
                if token_idx == start:
                    char_start_idx = chars_idx

                if token_idx == end:
                    char_end_idx = chars_idx + len(word)

                words += [word]

            # Stop if we went over the end of the answer
            if token_idx > end:
                break

            # Append the subtokenization length to the running index
            token_idx += len(token)
            chars_idx += len(word) + 1

        # Join text with spaces
        return {"answer": " ".join(words), "start": max(0, char_start_idx), "end": min(len(text), char_end_idx)}
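
    # Illustrative sketch (not executed); exact indexes depend on the tokenizer's
    # subword splits (one token per word assumed here):
    #
    #     self.span_to_answer("My name is Wolfgang", start=2, end=3)
    #     # -> {"answer": "is Wolfgang", "start": 8, "end": 19}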


# Register all the supported tasks here
SUPPORTED_TASKS = {
    "feature-extraction": {
        "impl": FeatureExtractionPipeline,
        "tf": TFAutoModel if is_tf_available() else None,
        "pt": AutoModel if is_torch_available() else None,
        "default": {
            "model": {"pt": "distilbert-base-uncased", "tf": "distilbert-base-uncased"},
            "config": None,
            "tokenizer": "distilbert-base-uncased",
        },
    },
    "sentiment-analysis": {
        "impl": TextClassificationPipeline,
        "tf": TFAutoModelForSequenceClassification if is_tf_available() else None,
        "pt": AutoModelForSequenceClassification if is_torch_available() else None,
        "default": {
            "model": {
                "pt": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-finetuned-sst-2-english-pytorch_model.bin",
                "tf": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-finetuned-sst-2-english-tf_model.h5",
            },
            "config": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-finetuned-sst-2-english-config.json",
            "tokenizer": "distilbert-base-uncased",
        },
    },
    "ner": {
        "impl": NerPipeline,
        "tf": TFAutoModelForTokenClassification if is_tf_available() else None,
        "pt": AutoModelForTokenClassification if is_torch_available() else None,
        "default": {
            "model": {
                "pt": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-finetuned-conll03-english-pytorch_model.bin",
                "tf": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-finetuned-conll03-english-tf_model.h5",
            },
            "config": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-finetuned-conll03-english-config.json",
            "tokenizer": "bert-large-cased",
        },
    },
    "question-answering": {
        "impl": QuestionAnsweringPipeline,
        "tf": TFAutoModelForQuestionAnswering if is_tf_available() else None,
        "pt": AutoModelForQuestionAnswering if is_torch_available() else None,
        "default": {
            "model": {
                "pt": "distilbert-base-uncased-distilled-squad",
                "tf": "distilbert-base-uncased-distilled-squad",
            },
            "config": None,
            "tokenizer": "distilbert-base-uncased",
        },
    },
}
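
# Illustrative sketch (not executed): how the registry above is consumed by pipeline().
#
#     targeted_task = SUPPORTED_TASKS["ner"]
#     impl, model_class = targeted_task["impl"], targeted_task["pt"]  # or "tf"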


def pipeline(
    task: str,
    model: Optional = None,
    config: Optional[Union[str, PretrainedConfig]] = None,
    tokenizer: Optional[Union[str, PreTrainedTokenizer]] = None,
    modelcard: Optional[Union[str, ModelCard]] = None,
    **kwargs
) -> Pipeline:
    """
    Utility factory method to build a pipeline.
    Pipelines are made of:
        A Tokenizer instance in charge of mapping raw textual input to tokens
        A Model instance
        Some (optional) post processing for enhancing the model's output

    Examples:
        pipeline('sentiment-analysis')
        pipeline('question-answering', model='distilbert-base-uncased-distilled-squad', tokenizer='bert-base-cased')
        pipeline('ner', model=AutoModel.from_pretrained(...), tokenizer=AutoTokenizer.from_pretrained(...))
        pipeline('ner', model='https://...pytorch-model.bin', config='https://...config.json', tokenizer='bert-base-cased')
    """
    # Retrieve the task
    if task not in SUPPORTED_TASKS:
        raise KeyError("Unknown task {}, available tasks are {}".format(task, list(SUPPORTED_TASKS.keys())))

    framework = get_framework(model)

    targeted_task = SUPPORTED_TASKS[task]
    task, model_class = targeted_task["impl"], targeted_task[framework]

    # Use default model/config/tokenizer for the task if no model is provided
    if model is None:
        models, config, tokenizer = tuple(targeted_task["default"].values())
        model = models[framework]

    # Try to infer tokenizer from model or config name (if provided as str)
    if tokenizer is None:
        if isinstance(model, str) and model in ALL_PRETRAINED_CONFIG_ARCHIVE_MAP:
            tokenizer = model
        elif isinstance(config, str) and config in ALL_PRETRAINED_CONFIG_ARCHIVE_MAP:
            tokenizer = config
        else:
            # Impossible to guess what the right tokenizer is here
            raise Exception(
                "Impossible to guess which tokenizer to use. "
                "Please provide a PreTrainedTokenizer class or a path/url/shortcut name to a pretrained tokenizer."
            )

    # Try to infer modelcard from model or config name (if provided as str)
    if modelcard is None:
        # Try to fallback on one of the provided string for model or config (will replace the suffix)
        if isinstance(model, str):
            modelcard = model
        elif isinstance(config, str):
            modelcard = config

    # Instantiate tokenizer if needed
    if isinstance(tokenizer, str):
        tokenizer = AutoTokenizer.from_pretrained(tokenizer)

    # Instantiate config if needed
    if isinstance(config, str):
        config = AutoConfig.from_pretrained(config)

    # Instantiate modelcard if needed
    if isinstance(modelcard, str):
        modelcard = ModelCard.from_pretrained(modelcard)

    # Instantiate model if needed
    if isinstance(model, str):
        # Handle transparent TF/PT model conversion
        model_kwargs = {}
        if framework == "pt" and model.endswith(".h5"):
            model_kwargs["from_tf"] = True
            logger.warning(
                "Model might be a TensorFlow model (ending with `.h5`) but TensorFlow is not available. "
                "Trying to load the model with PyTorch."
            )
        elif framework == "tf" and model.endswith(".bin"):
            model_kwargs["from_pt"] = True
            logger.warning(
                "Model might be a PyTorch model (ending with `.bin`) but PyTorch is not available. "
                "Trying to load the model with Tensorflow."
            )
        model = model_class.from_pretrained(model, config=config, **model_kwargs)

    return task(model=model, tokenizer=tokenizer, modelcard=modelcard, framework=framework, **kwargs)