Commit 02110485 authored by Morgan Funtowicz

Added batching, topk, chars index and scores.

parent e1d89cb2
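As a quick orientation before the diff, here is a minimal, hypothetical usage sketch of what this commit adds (batched inputs, top-k answers, character-level start/end offsets and a score). The checkpoint name and construction details are assumptions, not part of the diff; only the calling conventions and the result format come from the code below.

# Hypothetical usage of QuestionAnsweringPipeline after this commit.
# Assumes QuestionAnsweringPipeline is importable from this module; the
# checkpoint name is a placeholder.
from transformers import AutoTokenizer, AutoModelForQuestionAnswering

name = 'bert-large-uncased-whole-word-masking-finetuned-squad'  # placeholder checkpoint
nlp = QuestionAnsweringPipeline(AutoModelForQuestionAnswering.from_pretrained(name),
                                AutoTokenizer.from_pretrained(name))

# Keyword style: 'question'/'context' are turned into samples by create_sample
answers = nlp(question='Where is HuggingFace based?',
              context='HuggingFace is based in New York City.',
              topk=2, max_answer_len=15)

# Batched style: a list of {'question': ..., 'context': ...} dicts
answers = nlp([
    {'question': 'Where is HuggingFace based?', 'context': 'HuggingFace is based in New York City.'},
    {'question': 'What does HuggingFace build?', 'context': 'HuggingFace builds NLP libraries.'},
])
# Each element of `answers` is a list of up to `topk` dicts:
# {'score': float, 'start': char_start, 'end': char_end, 'answer': str}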
@@ -16,7 +16,7 @@ from __future__ import absolute_import, division, print_function, unicode_literals
 import os
 from abc import ABC, abstractmethod
-from typing import Union, Optional, Tuple
+from typing import Union, Optional, Tuple, List, Dict
 import numpy as np
@@ -24,7 +24,8 @@ from transformers import is_tf_available, logger, AutoTokenizer, PreTrainedTokenizer
 if is_tf_available():
     from transformers import TFAutoModelForSequenceClassification, TFAutoModelForQuestionAnswering
-else:
+if is_torch_available():
     from transformers import AutoModelForSequenceClassification, AutoModelForQuestionAnswering
@@ -94,30 +95,71 @@ class TextClassificationPipeline(Pipeline):
 class QuestionAnsweringPipeline(Pipeline):
+    """
+    Question Answering pipeline involving tokenization and inference.
+    TODO:
+     - top-k answers
+     - return start/end chars
+     - return score
+    """
+
+    def __init__(self, model, tokenizer: Optional[PreTrainedTokenizer]):
+        super().__init__(model, tokenizer)
+
+    @staticmethod
+    def create_sample(question: Union[str, List[str]], context: Union[str, List[str]]) -> Union[dict, List[Dict]]:
+        is_list = isinstance(question, list)
+        if is_list:
+            return [{'question': q, 'context': c} for q, c in zip(question, context)]
+        else:
+            return {'question': question, 'context': context}
+
     @classmethod
     def from_config(cls, model, tokenizer: PreTrainedTokenizer, **kwargs):
         pass

-    def __call__(self, texts, **kwargs):
+    def __call__(self, *texts, **kwargs):
+        # Set default values
+        kwargs.setdefault('max_answer_len', 15)
+        kwargs.setdefault('topk', 1)
+
+        if kwargs['topk'] < 1:
+            raise ValueError('topk parameter should be >= 1 (got {})'.format(kwargs['topk']))
+
+        if kwargs['max_answer_len'] < 1:
+            raise ValueError('max_answer_len parameter should be >= 1 (got {})'.format(kwargs['max_answer_len']))
+
+        # Tabular input
+        if 'question' in kwargs and 'context' in kwargs:
+            texts = QuestionAnsweringPipeline.create_sample(kwargs['question'], kwargs['context'])
+        elif 'data' in kwargs:
+            texts = kwargs['data']
         # Generic compatibility with sklearn and Keras
-        if 'X' in kwargs and not texts:
+        elif 'X' in kwargs and not texts:
             texts = kwargs.pop('X')
+        else:
+            (texts, ) = texts

-        if not isinstance(texts, (tuple, list)):
-            raise Exception('QuestionAnsweringPipeline requires predict argument to be a tuple (context, question) or a List of tuple.')
+        if not isinstance(texts, (dict, list)):
+            raise Exception('QuestionAnsweringPipeline requires predict argument to be a dict (question, context) or a List of dict.')

         if not isinstance(texts, list):
             texts = [texts]

+        # Map to tuple (question, context)
+        texts = [(text['question'], text['context']) for text in texts]
+
         inputs = self.tokenizer.batch_encode_plus(
-            texts, add_special_tokens=True, return_tensors='tf' if is_tf_available() else 'pt'
+            # texts, add_special_tokens=True, return_tensors='tf' if is_tf_available() else 'pt'
+            texts, add_special_tokens=True, return_tensors='pt'
         )

         # Remove special_tokens_mask to avoid KeyError
         _ = inputs.pop('special_tokens_mask')

-        if is_tf_available():
+        # if is_tf_available():
+        if False:
             # TODO trace model
             start, end = self.model(inputs)
         else:
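The input-handling block above funnels every calling style into a list of {'question', 'context'} samples. A small illustrative sketch of what create_sample returns (the example texts are invented; the output shapes follow the method as written):

# Illustration only, not part of the diff.
single = QuestionAnsweringPipeline.create_sample(
    question='Who wrote it?', context='It was written by Ada.')
# -> {'question': 'Who wrote it?', 'context': 'It was written by Ada.'}

batch = QuestionAnsweringPipeline.create_sample(
    question=['Who wrote it?', 'When was it written?'],
    context=['It was written by Ada.', 'It was written in 1843.'])
# -> [{'question': 'Who wrote it?', 'context': 'It was written by Ada.'},
#     {'question': 'When was it written?', 'context': 'It was written in 1843.'}]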
@@ -133,18 +175,19 @@ class QuestionAnsweringPipeline(Pipeline):
             start_, end_ = start[i, context_idx], end[i, context_idx]

             # Normalize logits and spans to retrieve the answer
-            start_, end_ = self.decode(start_, end_)
+            start_ = np.exp(start_) / np.sum(np.exp(start_))
+            end_ = np.exp(end_) / np.sum(np.exp(end_))
+            starts, ends, scores = self.decode(start_, end_, kwargs['topk'], kwargs['max_answer_len'])

             # Convert the answer (tokens) back to the original text
-            answers += [{
-                'start': start_,
-                'end': end_,
-                'answer': self.span_to_answer(texts[i][1], start_, end_)
-            }]
+            answers += [[
+                {**{'score': score}, **self.span_to_answer(texts[i][1], s, e)}
+                for s, e, score in zip(starts, ends, scores)
+            ]]

         return answers

-    def decode(self, start: np.ndarray, end: np.ndarray) -> Tuple:
+    def decode(self, start: np.ndarray, end: np.ndarray, topk: int, max_answer_len: int) -> Tuple:
         # Ensure we have batch axis
         if start.ndim == 1:
             start = start[None]
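For reference, the normalization applied to the start/end logits in the hunk above is a plain softmax over the context tokens; a tiny self-contained sketch with invented numbers:

import numpy as np

# Same normalization as start_/end_ in __call__ (numerically naive, as in the
# diff; a stable variant would subtract the max logit before exponentiating).
logits = np.array([1.0, 2.0, 0.5])
probs = np.exp(logits) / np.sum(np.exp(logits))
assert np.isclose(probs.sum(), 1.0)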
@@ -155,22 +198,39 @@ class QuestionAnsweringPipeline(Pipeline):
         # Compute the score of each tuple(start, end) to be the real answer
         outer = np.matmul(np.expand_dims(start, -1), np.expand_dims(end, 1))

-        # Remove candidate with end < start and end - start > 15
-        candidates = np.tril(np.triu(outer), 15)
+        # Remove candidate with end < start and end - start > max_answer_len
+        candidates = np.tril(np.triu(outer), max_answer_len - 1)

-        start = np.max(candidates, axis=2).argmax(-1)
-        end = np.max(candidates, axis=1).argmax(-1)
+        # start = np.max(candidates, axis=2).argmax(-1)
+        # end = np.max(candidates, axis=1).argmax(-1)
+        scores_flat = candidates.flatten()
+        if topk == 1:
+            idx_sort = [np.argmax(scores_flat)]
+        elif len(scores_flat) < topk:
+            idx_sort = np.argsort(-scores_flat)
+        else:
+            idx = np.argpartition(-scores_flat, topk)[0:topk]
+            idx_sort = idx[np.argsort(-scores_flat[idx])]

-        return start, end
+        start, end = np.unravel_index(idx_sort, candidates.shape)[1:]
+        return start, end, candidates[0, start, end]

     def span_to_answer(self, text: str, start: int, end: int):
-        words, token_idx = [], 0
+        words = []
+        token_idx = char_start_idx = char_end_idx = chars_idx = 0

         for i, word in enumerate(text.split(" ")):
             token = self.tokenizer.tokenize(word)

             # Append words if they are in the span
             if start <= token_idx <= end:
+                if token_idx == start:
+                    char_start_idx = chars_idx
+
+                if token_idx == end:
+                    char_end_idx = chars_idx + len(word)
+
                 words += [word]

             # Stop if we went over the end of the answer
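A worked sketch of the top-k span selection performed by decode in the hunk above, using invented probabilities over four context tokens:

import numpy as np

# Toy start/end probabilities for a batch of one sample, four context tokens.
start = np.array([[0.1, 0.6, 0.2, 0.1]])
end = np.array([[0.1, 0.1, 0.7, 0.1]])
topk, max_answer_len = 2, 3

outer = np.matmul(np.expand_dims(start, -1), np.expand_dims(end, 1))   # shape (1, 4, 4)
candidates = np.tril(np.triu(outer), max_answer_len - 1)               # keep 0 <= end - start < max_answer_len

scores_flat = candidates.flatten()
idx = np.argpartition(-scores_flat, topk)[0:topk]                      # top-k flat indices, unordered
idx_sort = idx[np.argsort(-scores_flat[idx])]                          # ordered best first
starts, ends = np.unravel_index(idx_sort, candidates.shape)[1:]
scores = candidates[0, starts, ends]
# Best span: start=1, end=2 with score 0.42; runner-up: start=2, end=2 with score 0.14.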
@@ -179,9 +239,10 @@ class QuestionAnsweringPipeline(Pipeline):
             # Append the subtokenization length to the running index
             token_idx += len(token)
+            chars_idx += len(word) + 1

         # Join text with spaces
-        return ' '.join(words)
+        return {'answer': ' '.join(words), 'start': max(0, char_start_idx), 'end': min(len(text), char_end_idx)}


 # Register all the supported task here
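The char_start_idx / char_end_idx bookkeeping added to span_to_answer maps token-level span indices back to character offsets in the original text. A simplified, self-contained sketch that treats every whitespace-separated word as exactly one token (a real tokenizer may split a word into several sub-tokens, which is what len(token) accounts for in the diff):

# Simplified illustration of the character-offset bookkeeping, not the diff's code.
def toy_span_to_answer(text, start, end):
    words = []
    token_idx = char_start_idx = char_end_idx = chars_idx = 0
    for word in text.split(' '):
        if start <= token_idx <= end:
            if token_idx == start:
                char_start_idx = chars_idx
            if token_idx == end:
                char_end_idx = chars_idx + len(word)
            words.append(word)
        token_idx += 1                 # one token per word in this toy version
        chars_idx += len(word) + 1     # +1 for the separating space
    return {'answer': ' '.join(words), 'start': char_start_idx, 'end': char_end_idx}

print(toy_span_to_answer('HuggingFace is based in New York City', start=4, end=6))
# -> {'answer': 'New York City', 'start': 24, 'end': 37}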
@@ -193,7 +254,7 @@ SUPPORTED_TASKS = {
     },
     'question-answering': {
         'impl': QuestionAnsweringPipeline,
-        'tf': TFAutoModelForQuestionAnswering if is_tf_available() else None,
+        # 'tf': TFAutoModelForQuestionAnswering if is_tf_available() else None,
         'pt': AutoModelForQuestionAnswering if is_torch_available() else None
     }
 }
@@ -216,7 +277,8 @@ def pipeline(task: str, model, tokenizer: Optional[Union[str, PreTrainedTokenizer]]
         raise KeyError("Unknown task {}, available tasks are {}".format(task, list(SUPPORTED_TASKS.keys())))

     targeted_task = SUPPORTED_TASKS[task]
-    task, allocator = targeted_task['impl'], targeted_task['tf'] if is_tf_available() else targeted_task['pt']
+    # task, allocator = targeted_task['impl'], targeted_task['tf'] if is_tf_available() else targeted_task['pt']
+    task, allocator = targeted_task['impl'], targeted_task['pt']

     model = allocator.from_pretrained(model)
     return task(model, tokenizer, **kwargs)
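Finally, a hedged sketch of how the pipeline() factory in the last hunk would be invoked after this change. The checkpoint name is a placeholder and passing an already-instantiated tokenizer is an assumption; the diff only shows that the PyTorch allocator is used and that the tokenizer argument is forwarded to the task class.

# Hypothetical invocation of the pipeline() factory shown above.
from transformers import AutoTokenizer

name = 'distilbert-base-uncased-distilled-squad'  # placeholder checkpoint
nlp = pipeline('question-answering', name, tokenizer=AutoTokenizer.from_pretrained(name))

answers = nlp(question='Where is HuggingFace based?',
              context='HuggingFace is based in New York City.')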