"examples/nas/git@developer.sourcefind.cn:OpenDAS/nni.git" did not exist on "6e6299084b5626b15bda09f882f30033baaa5ebc"
Commit e1d89cb2 authored by Morgan Funtowicz's avatar Morgan Funtowicz
Browse files

Added QuestionAnsweringPipeline with batch support.

parent 81babb22
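
A minimal usage sketch of the new entry point (the checkpoint name is illustrative, not pinned by this commit; the call shape follows the code added below):

    from transformers import pipeline

    # The tokenizer is inferred from the model name when only a string is given.
    nlp = pipeline('question-answering', 'bert-large-uncased-whole-word-masking-finetuned-squad')

    # Batch support: each item is a (question, context) pair; the answer span is
    # searched in the second sequence of the pair (token_type_ids == 1).
    answers = nlp([('Where did the cat sit?', 'The cat sat on the mat.'),
                   ('Who wrote the report?', 'The report was written by Jane.')])
    # -> list of {'start': ..., 'end': ..., 'answer': ...} dicts, one per pair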
@@ -65,9 +65,6 @@ from .configuration_distilbert import DistilBertConfig, DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
from .configuration_albert import AlbertConfig, ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
from .configuration_camembert import CamembertConfig, CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
# Pipelines
from .pipeline import TextClassificationPipeline
# Modeling
if is_torch_available():
from .modeling_utils import (PreTrainedModel, prune_layer, Conv1D)
@@ -193,6 +190,10 @@ from .modeling_tf_pytorch_utils import (convert_tf_weight_name_to_pt_weight_name,
load_tf2_weights_in_pytorch_model,
load_tf2_model_in_pytorch_model)
# Pipelines
# from .pipeline_ import TextClassificationPipeline
from .pipelines import Pipeline, pipeline, TextClassificationPipeline
if not is_tf_available() and not is_torch_available():
logger.warning("Neither PyTorch nor TensorFlow >= 2.0 have been found."
"Models won't be available and only tokenizers, configuration"
......
# coding=utf-8
# Copyright 2018 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Pipeline class: Tokenizer + Model. """
from __future__ import absolute_import, division, print_function, unicode_literals
import os
import logging
import six
from .tokenization_auto import AutoTokenizer
from .file_utils import add_start_docstrings, is_tf_available, is_torch_available
from .data.processors import SingleSentenceClassificationProcessor
if is_tf_available():
import tensorflow as tf
from .modeling_tf_auto import (TFAutoModel, TFAutoModelForQuestionAnswering,
TFAutoModelForSequenceClassification,
TFAutoModelWithLMHead)
if is_torch_available():
import torch
from .modeling_auto import (AutoModel, AutoModelForQuestionAnswering,
AutoModelForSequenceClassification,
AutoModelWithLMHead)
logger = logging.getLogger(__name__)
# TF training parameters
USE_XLA = False
USE_AMP = False
class TextClassificationPipeline(object):
r"""
:class:`~transformers.TextClassificationPipeline` is a class encapsulating a pretrained model and
its tokenizer and will be instantiated as one of the base model classes of the library
when created with the `Pipeline.from_pretrained(pretrained_model_name_or_path)`
class method.
The `from_pretrained()` method takes care of returning the correct model class instance
using pattern matching on the `pretrained_model_name_or_path` string.
The base model class to instantiate is selected as the first pattern matching
in the `pretrained_model_name_or_path` string (in the following order):
- contains `distilbert`: DistilBertModel (DistilBERT model)
- contains `roberta`: RobertaModel (RoBERTa model)
- contains `bert`: BertModel (Bert model)
- contains `openai-gpt`: OpenAIGPTModel (OpenAI GPT model)
- contains `gpt2`: GPT2Model (OpenAI GPT-2 model)
- contains `ctrl`: CTRLModel (Salesforce CTRL model)
- contains `transfo-xl`: TransfoXLModel (Transformer-XL model)
- contains `xlnet`: XLNetModel (XLNet model)
- contains `xlm`: XLMModel (XLM model)
"""
def __init__(self, tokenizer, model, is_compiled=False, is_trained=False):
self.tokenizer = tokenizer
self.model = model
self.is_compiled = is_compiled
self.is_trained = is_trained
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
r""" Instantiates a pipeline from a pre-trained tokenizer and model.
"""
        # Extract tokenizer and model arguments. Iterate over a snapshot of the
        # keys: popping from kwargs while iterating the dict directly would
        # raise a RuntimeError in Python 3.
        tokenizer_kwargs = {}
        for key in list(kwargs):
            if key.startswith('tokenizer_'):
                # Specific to the tokenizer
                tokenizer_kwargs[key.replace('tokenizer_', '')] = kwargs.pop(key)
            elif not key.startswith('model_'):
                # Used for both the tokenizer and the model
                tokenizer_kwargs[key] = kwargs[key]
        model_kwargs = {}
        for key in list(kwargs):
            if key.startswith('model_'):
                # Specific to the model
                model_kwargs[key.replace('model_', '')] = kwargs.pop(key)
            elif not key.startswith('tokenizer_'):
                # Used for both the tokenizer and the model
                model_kwargs[key] = kwargs[key]
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path, **tokenizer_kwargs)
model_kwargs['output_loading_info'] = True
if is_tf_available():
model, loading_info = TFAutoModelForSequenceClassification.from_pretrained(pretrained_model_name_or_path, **model_kwargs)
else:
model, loading_info = AutoModelForSequenceClassification.from_pretrained(pretrained_model_name_or_path, **model_kwargs)
return cls(tokenizer, model, is_trained=bool(not loading_info['missing_keys']))
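    # Illustrative call (not from the commit; the checkpoint name and kwargs are
    # hypothetical) showing how the prefixes above are routed:
    #
    #   TextClassificationPipeline.from_pretrained(
    #       'bert-base-uncased',           # hypothetical checkpoint
    #       tokenizer_do_lower_case=True,  # -> AutoTokenizer(..., do_lower_case=True)
    #       model_num_labels=2,            # -> model head built with num_labels=2
    #       cache_dir='/tmp/hf')           # un-prefixed: passed to both components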
def save_pretrained(self, save_directory):
if not os.path.isdir(save_directory):
logger.error("Saving directory ({}) should be a directory".format(save_directory))
return
self.model.save_pretrained(save_directory)
self.tokenizer.save_pretrained(save_directory)
def prepare_data(self, x, y=None,
validation_data=None,
validation_split=0.1, **kwargs):
dataset = x
if not isinstance(x, SingleSentenceClassificationProcessor):
dataset = SingleSentenceClassificationProcessor.create_from_examples(x, y)
num_data_samples = len(dataset)
if validation_data is not None:
valid_dataset = validation_data
if not isinstance(validation_data, SingleSentenceClassificationProcessor):
valid_dataset = SingleSentenceClassificationProcessor.create_from_examples(validation_data)
num_valid_samples = len(valid_dataset)
train_dataset = dataset
num_train_samples = num_data_samples
else:
assert 0.0 <= validation_split <= 1.0, "validation_split should be between 0.0 and 1.0"
num_valid_samples = max(int(num_data_samples * validation_split), 1)
num_train_samples = num_data_samples - num_valid_samples
train_dataset = dataset[num_valid_samples:]
valid_dataset = dataset[:num_valid_samples]
logger.info('Tokenizing and processing dataset')
train_dataset = train_dataset.get_features(self.tokenizer,
return_tensors='tf' if is_tf_available() else 'pt')
valid_dataset = valid_dataset.get_features(self.tokenizer,
return_tensors='tf' if is_tf_available() else 'pt')
return train_dataset, valid_dataset
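    # Example of the split arithmetic in prepare_data (illustrative numbers):
    # with 100 labelled examples and validation_split=0.1,
    # max(int(100 * 0.1), 1) == 10 examples become the validation set and the
    # remaining 90 are used for training.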
def compile(self, learning_rate=3e-5, adam_epsilon=1e-8, **kwargs):
if is_tf_available():
logger.info('Preparing model')
# Prepare training: Compile tf.keras model with optimizer, loss and learning rate schedule
opt = tf.keras.optimizers.Adam(learning_rate=learning_rate, epsilon=adam_epsilon)
if USE_AMP:
# loss scaling is currently required when using mixed precision
opt = tf.keras.mixed_precision.experimental.LossScaleOptimizer(opt, 'dynamic')
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
self.model.compile(optimizer=opt, loss=loss, metrics=[metric])
else:
raise NotImplementedError
self.is_compiled = True
def fit(self, X=None, y=None,
validation_data=None,
validation_split=0.1,
train_batch_size=None,
valid_batch_size=None,
**kwargs):
if not self.is_compiled:
self.compile(**kwargs)
train_dataset, valid_dataset = self.prepare_data(X, y=y,
validation_data=validation_data,
validation_split=validation_split)
        # Fall back to a modest default when no batch sizes are given; the
        # signature leaves them as None, which would crash the integer
        # divisions below (the value 32 is an assumption, not from the commit).
        train_batch_size = train_batch_size if train_batch_size is not None else 32
        valid_batch_size = valid_batch_size if valid_batch_size is not None else 32
        num_train_samples = len(train_dataset)
        num_valid_samples = len(valid_dataset)
        train_steps = num_train_samples // train_batch_size
        valid_steps = num_valid_samples // valid_batch_size
if is_tf_available():
            # Prepare the dataset as a tf.data.Dataset instance
train_dataset = train_dataset.shuffle(128).batch(train_batch_size).repeat(-1)
valid_dataset = valid_dataset.batch(valid_batch_size)
logger.info('Training TF 2.0 model')
history = self.model.fit(train_dataset, epochs=2, steps_per_epoch=train_steps,
validation_data=valid_dataset, validation_steps=valid_steps,
**kwargs)
else:
raise NotImplementedError
self.is_trained = True
def fit_transform(self, *texts, **kwargs):
# Generic compatibility with sklearn and Keras
self.fit(*texts, **kwargs)
return self(*texts, **kwargs)
def transform(self, *texts, **kwargs):
# Generic compatibility with sklearn and Keras
return self(*texts, **kwargs)
def predict(self, *texts, **kwargs):
# Generic compatibility with sklearn and Keras
return self(*texts, **kwargs)
def __call__(self, *texts, **kwargs):
# Generic compatibility with sklearn and Keras
if 'X' in kwargs and not texts:
texts = kwargs.pop('X')
if not self.is_trained:
logger.error("Some weights of the model are not trained. Please fine-tune the model on a classification task before using it.")
inputs = self.tokenizer.batch_encode_plus(texts,
add_special_tokens=True,
return_tensors='tf' if is_tf_available() else 'pt')
if is_tf_available():
# TODO trace model
predictions = self.model(**inputs)[0]
else:
with torch.no_grad():
predictions = self.model(**inputs)[0]
return predictions.numpy().tolist()
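# Illustrative end-to-end sketch for the class above (not from the commit;
# assumes TF 2.0 is installed, a hypothetical 'bert-base-uncased' checkpoint,
# and small in-memory lists of texts and integer labels):
#
#   clf = TextClassificationPipeline.from_pretrained('bert-base-uncased')
#   clf.fit(train_texts, train_labels, train_batch_size=32, valid_batch_size=64)
#   scores = clf('this library is great', 'this parser is terrible')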
# coding=utf-8
# Copyright 2018 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import, division, print_function, unicode_literals
import os
from abc import ABC, abstractmethod
from typing import Union, Optional, Tuple
import numpy as np
import logging

from transformers import AutoTokenizer, PreTrainedTokenizer, is_tf_available, is_torch_available

# Import the Auto classes for every available backend: SUPPORTED_TASKS below
# references both the TF and the PyTorch names, so an if/else import would
# raise a NameError when both backends are installed.
if is_tf_available():
    from transformers import TFAutoModelForSequenceClassification, TFAutoModelForQuestionAnswering
if is_torch_available():
    from transformers import AutoModelForSequenceClassification, AutoModelForQuestionAnswering

logger = logging.getLogger(__name__)
class Pipeline(ABC):
def __init__(self, model, tokenizer: PreTrainedTokenizer = None, **kwargs):
self.model = model
self.tokenizer = tokenizer
@classmethod
@abstractmethod
def from_config(cls, model, tokenizer: PreTrainedTokenizer, **kwargs):
raise NotImplementedError()
def save_pretrained(self, save_directory):
if not os.path.isdir(save_directory):
logger.error("Provided path ({}) should be a directory".format(save_directory))
return
self.model.save_pretrained(save_directory)
self.tokenizer.save_pretrained(save_directory)
def transform(self, *texts, **kwargs):
# Generic compatibility with sklearn and Keras
return self(*texts, **kwargs)
def predict(self, *texts, **kwargs):
# Generic compatibility with sklearn and Keras
return self(*texts, **kwargs)
@abstractmethod
def __call__(self, *texts, **kwargs):
raise NotImplementedError()
class TextClassificationPipeline(Pipeline):
def __init__(self, model, tokenizer: PreTrainedTokenizer, nb_classes: int = 2):
super().__init__(model, tokenizer)
        if nb_classes < 2:
            raise ValueError('Invalid parameter nb_classes: an int >= 2 is required (got: {})'.format(nb_classes))
self._nb_classes = nb_classes
@classmethod
def from_config(cls, model, tokenizer: PreTrainedTokenizer, **kwargs):
return cls(model, tokenizer, **kwargs)
def __call__(self, *texts, **kwargs):
# Generic compatibility with sklearn and Keras
if 'X' in kwargs and not texts:
texts = kwargs.pop('X')
inputs = self.tokenizer.batch_encode_plus(
texts, add_special_tokens=True, return_tensors='tf' if is_tf_available() else 'pt'
)
        # Remove special_tokens_mask before feeding the model (it is not a model input)
        _ = inputs.pop('special_tokens_mask')
if is_tf_available():
# TODO trace model
predictions = self.model(**inputs)[0]
else:
import torch
with torch.no_grad():
predictions = self.model(**inputs)[0]
return predictions.numpy().tolist()
class QuestionAnsweringPipeline(Pipeline):
@classmethod
def from_config(cls, model, tokenizer: PreTrainedTokenizer, **kwargs):
pass
def __call__(self, texts, **kwargs):
# Generic compatibility with sklearn and Keras
if 'X' in kwargs and not texts:
texts = kwargs.pop('X')
        # The answer span is searched in the second sequence of each pair (the
        # context, marked by token_type_ids == 1), so pairs are (question, context).
        if not isinstance(texts, (tuple, list)):
            raise Exception('QuestionAnsweringPipeline requires its argument to be a (question, context) tuple or a list of such tuples.')
if not isinstance(texts, list):
texts = [texts]
inputs = self.tokenizer.batch_encode_plus(
texts, add_special_tokens=True, return_tensors='tf' if is_tf_available() else 'pt'
)
# Remove special_tokens_mask to avoid KeyError
_ = inputs.pop('special_tokens_mask')
        if is_tf_available():
            # TODO trace model
            start, end = self.model(inputs)
            # Convert to numpy so the decoding below is backend-agnostic
            start, end = start.numpy(), end.numpy()
        else:
            import torch
            with torch.no_grad():
                start, end = self.model(**inputs)
                start, end = start.cpu().numpy(), end.cpu().numpy()
answers = []
        for i in range(len(texts)):
            # Keep the scores for the context tokens only (token_type_ids == 1),
            # dropping the question tokens. np.asarray makes the boolean mask
            # work for both TF and PyTorch tensors.
            context_idx = np.asarray(inputs['token_type_ids'][i]) == 1
start_, end_ = start[i, context_idx], end[i, context_idx]
# Normalize logits and spans to retrieve the answer
start_, end_ = self.decode(start_, end_)
# Convert the answer (tokens) back to the original text
answers += [{
'start': start_,
'end': end_,
'answer': self.span_to_answer(texts[i][1], start_, end_)
}]
return answers
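    # Illustrative call (names are examples, not part of the commit):
    #
    #   qa = QuestionAnsweringPipeline(model, tokenizer)
    #   qa([('Who wrote it?', 'It was written by Jane.')])
    #   # -> [{'start': ..., 'end': ..., 'answer': 'Jane.'}]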
def decode(self, start: np.ndarray, end: np.ndarray) -> Tuple:
# Ensure we have batch axis
if start.ndim == 1:
start = start[None]
if end.ndim == 1:
end = end[None]
# Compute the score of each tuple(start, end) to be the real answer
outer = np.matmul(np.expand_dims(start, -1), np.expand_dims(end, 1))
        # Remove candidates where end < start or end - start > 15
        candidates = np.tril(np.triu(outer), 15)
start = np.max(candidates, axis=2).argmax(-1)
end = np.max(candidates, axis=1).argmax(-1)
return start, end
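    # Worked example for decode (illustrative numbers): with
    # start = [0.1, 0.7, 0.2] and end = [0.1, 0.2, 0.7], outer[i, j] holds
    # start[i] * end[j]; np.triu zeroes spans with end < start, np.tril(..., 15)
    # zeroes spans longer than 15 tokens, and the two argmax calls recover the
    # best surviving pair, here (start, end) == (1, 2):
    #
    #   start, end = np.array([[0.1, 0.7, 0.2]]), np.array([[0.1, 0.2, 0.7]])
    #   outer = np.matmul(np.expand_dims(start, -1), np.expand_dims(end, 1))
    #   candidates = np.tril(np.triu(outer), 15)  # row 1, col 2 scores 0.49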
def span_to_answer(self, text: str, start: int, end: int):
words, token_idx = [], 0
for i, word in enumerate(text.split(" ")):
token = self.tokenizer.tokenize(word)
# Append words if they are in the span
if start <= token_idx <= end:
words += [word]
# Stop if we went over the end of the answer
if token_idx > end:
break
# Append the subtokenization length to the running index
token_idx += len(token)
# Join text with spaces
return ' '.join(words)
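    # Worked example (illustrative; actual sub-token counts depend on the
    # tokenizer): for text = 'New York City is big', suppose each word is a
    # single sub-token, so the running token offsets are New=0, York=1,
    # City=2, is=3, big=4. span_to_answer(text, 1, 2) keeps the words whose
    # first sub-token index falls inside [1, 2] and returns 'York City'.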
# Register all the supported task here
SUPPORTED_TASKS = {
'text-classification': {
'impl': TextClassificationPipeline,
'tf': TFAutoModelForSequenceClassification if is_tf_available() else None,
'pt': AutoModelForSequenceClassification if is_torch_available() else None
},
'question-answering': {
'impl': QuestionAnsweringPipeline,
'tf': TFAutoModelForQuestionAnswering if is_tf_available() else None,
'pt': AutoModelForQuestionAnswering if is_torch_available() else None
}
}
def pipeline(task: str, model, tokenizer: Optional[Union[str, PreTrainedTokenizer]] = None, **kwargs) -> Pipeline:
"""
    Utility factory method to build a Pipeline for the given task.
"""
    # Try to infer the tokenizer from the model name (if provided as str).
    # Only raise when no tokenizer is given AND the model is not a string:
    # raising whenever a tokenizer was provided (as the original if/else did)
    # would make every explicit-tokenizer call fail.
    if tokenizer is None:
        if isinstance(model, str):
            tokenizer = model
        else:
            # Impossible to guess which tokenizer matches an already-loaded model
            raise Exception('Tokenizer cannot be None if the provided model is a PreTrainedModel instance')
tokenizer = tokenizer if isinstance(tokenizer, PreTrainedTokenizer) else AutoTokenizer.from_pretrained(tokenizer)
if task not in SUPPORTED_TASKS:
raise KeyError("Unknown task {}, available tasks are {}".format(task, list(SUPPORTED_TASKS.keys())))
targeted_task = SUPPORTED_TASKS[task]
    # Avoid shadowing the `task` argument with the pipeline class
    task_cls, allocator = targeted_task['impl'], targeted_task['tf'] if is_tf_available() else targeted_task['pt']
    model = allocator.from_pretrained(model)
    return task_cls(model, tokenizer, **kwargs)
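# Illustrative factory usage (checkpoint names are examples, not pinned by
# this commit):
#
#   clf = pipeline('text-classification', 'bert-base-uncased')
#   qa = pipeline('question-answering', 'bert-large-uncased-whole-word-masking-finetuned-squad')
#
#   # With an already-instantiated model, the tokenizer must be passed explicitly:
#   qa = pipeline('question-answering', my_model, tokenizer=my_tokenizer)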