Commit 02110485 authored by Morgan Funtowicz

Added batching, topk, chars index and scores.

parent e1d89cb2
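For context, a minimal usage sketch of what this commit enables. The checkpoint name is illustrative only (`bert-base-cased` has no fine-tuned QA head, so outputs are untrained), and it assumes the `pipeline()` factory defined at the bottom of this file; since the TF path is disabled below, PyTorch weights are loaded:

```python
from transformers import AutoTokenizer, pipeline

# Illustrative checkpoint; any model with a question-answering head works
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
nlp = pipeline('question-answering', 'bert-base-cased', tokenizer)

# Single sample via the new tabular kwargs; returns, per input, a list of
# `topk` dicts, each with 'score', 'answer' and character-level 'start'/'end'
answers = nlp(question='Who authored the commit?',
              context='Commit 02110485 was authored by Morgan Funtowicz.',
              topk=2, max_answer_len=5)
```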
@@ -16,7 +16,7 @@ from __future__ import absolute_import, division, print_function, unicode_literals
import os
from abc import ABC, abstractmethod
-from typing import Union, Optional, Tuple
+from typing import Union, Optional, Tuple, List, Dict
import numpy as np
@@ -24,7 +24,8 @@ from transformers import is_tf_available, logger, AutoTokenizer, PreTrainedTokenizer
if is_tf_available():
from transformers import TFAutoModelForSequenceClassification, TFAutoModelForQuestionAnswering
-else:
+if is_torch_available():
from transformers import AutoModelForSequenceClassification, AutoModelForQuestionAnswering
@@ -94,30 +95,71 @@ class TextClassificationPipeline(Pipeline):
class QuestionAnsweringPipeline(Pipeline):
"""
Question Answering pipeline involving Tokenization and Inference.
-TODO:
-    - top-k answers
-    - return start/end chars
-    - return score
"""
def __init__(self, model, tokenizer: Optional[PreTrainedTokenizer]):
super().__init__(model, tokenizer)
@staticmethod
def create_sample(question: Union[str, List[str]], context: Union[str, List[str]]) -> Union[Dict, List[Dict]]:
is_list = isinstance(question, list)
if is_list:
return [{'question': q, 'context': c} for q, c in zip(question, context)]
else:
return {'question': question, 'context': context}
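For reference, a sketch of the two shapes `create_sample` produces; the strings are made up:

```python
# Parallel lists of questions/contexts -> one dict per (question, context) pair
batch = QuestionAnsweringPipeline.create_sample(
    question=['Who wrote it?', 'What changed?'],
    context=['Morgan wrote it.', 'Batching and topk were added.'],
)
# [{'question': 'Who wrote it?', 'context': 'Morgan wrote it.'},
#  {'question': 'What changed?', 'context': 'Batching and topk were added.'}]

# Scalar inputs -> a single dict
single = QuestionAnsweringPipeline.create_sample(
    question='Who wrote it?', context='Morgan wrote it.',
)
# {'question': 'Who wrote it?', 'context': 'Morgan wrote it.'}
```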
@classmethod
def from_config(cls, model, tokenizer: PreTrainedTokenizer, **kwargs):
pass
-def __call__(self, texts, **kwargs):
+def __call__(self, *texts, **kwargs):
# Set default values
kwargs.setdefault('max_answer_len', 15)
kwargs.setdefault('topk', 1)
if kwargs['topk'] < 1:
raise ValueError('topk parameter should be >= 1 (got {})'.format(kwargs['topk']))
if kwargs['max_answer_len'] < 1:
raise ValueError('max_answer_len parameter should be >= 1 (got {})'.format(kwargs['max_answer_len']))
# Tabular input
if 'question' in kwargs and 'context' in kwargs:
texts = QuestionAnsweringPipeline.create_sample(kwargs['question'], kwargs['context'])
elif 'data' in kwargs:
texts = kwargs['data']
# Generic compatibility with sklearn and Keras
-if 'X' in kwargs and not texts:
+elif 'X' in kwargs and not texts:
texts = kwargs.pop('X')
else:
(texts, ) = texts
-if not isinstance(texts, (tuple, list)):
-    raise Exception('QuestionAnsweringPipeline requires predict argument to be a tuple (context, question) or a List of tuple.')
+if not isinstance(texts, (dict, list)):
+    raise ValueError('QuestionAnsweringPipeline requires predict argument to be a dict {question, context} or a list of such dicts.')
if not isinstance(texts, list):
texts = [texts]
# Map to tuple (question, context)
texts = [(text['question'], text['context']) for text in texts]
inputs = self.tokenizer.batch_encode_plus(
-texts, add_special_tokens=True, return_tensors='tf' if is_tf_available() else 'pt'
+# TODO: switch back to 'tf' tensors once the TF forward pass can be traced
+texts, add_special_tokens=True, return_tensors='pt'
)
# Remove special_tokens_mask before feeding inputs to the model
_ = inputs.pop('special_tokens_mask')
-if is_tf_available():
+# TF inference is disabled until the model can be traced (see TODO below)
+if False:  # was: is_tf_available()
# TODO trace model
start, end = self.model(inputs)
else:
......@@ -133,18 +175,19 @@ class QuestionAnsweringPipeline(Pipeline):
start_, end_ = start[i, context_idx], end[i, context_idx]
# Normalize logits and spans to retrieve the answer
-start_, end_ = self.decode(start_, end_)
+start_ = np.exp(start_) / np.sum(np.exp(start_))
+end_ = np.exp(end_) / np.sum(np.exp(end_))
+starts, ends, scores = self.decode(start_, end_, kwargs['topk'], kwargs['max_answer_len'])
# Convert the answer (tokens) back to the original text
-answers += [{
-    'start': start_,
-    'end': end_,
-    'answer': self.span_to_answer(texts[i][1], start_, end_)
-}]
+answers += [[
+    {'score': score, **self.span_to_answer(texts[i][1], s, e)}
+    for s, e, score in zip(starts, ends, scores)
+]]
return answers
-def decode(self, start: np.ndarray, end: np.ndarray) -> Tuple:
+def decode(self, start: np.ndarray, end: np.ndarray, topk: int, max_answer_len: int) -> Tuple:
# Ensure we have batch axis
if start.ndim == 1:
start = start[None]
@@ -155,22 +198,39 @@ class QuestionAnsweringPipeline(Pipeline):
# Compute the score of each tuple(start, end) to be the real answer
outer = np.matmul(np.expand_dims(start, -1), np.expand_dims(end, 1))
-# Remove candidate with end < start and end - start > 15
-candidates = np.tril(np.triu(outer), 15)
+# Zero out candidates with end < start or span length > max_answer_len
+candidates = np.tril(np.triu(outer), max_answer_len - 1)
-start = np.max(candidates, axis=2).argmax(-1)
-end = np.max(candidates, axis=1).argmax(-1)
-return start, end
+# Take the topk best (start, end) pairs by flattened score
+scores_flat = candidates.flatten()
+if topk == 1:
+    idx_sort = [np.argmax(scores_flat)]
+elif len(scores_flat) < topk:
+    idx_sort = np.argsort(-scores_flat)
+else:
+    idx = np.argpartition(-scores_flat, topk)[0:topk]
+    idx_sort = idx[np.argsort(-scores_flat[idx])]
+start, end = np.unravel_index(idx_sort, candidates.shape)[1:]
+return start, end, candidates[0, start, end]
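To make the span scoring above concrete, here is a self-contained numpy sketch of the same outer-product, triangular-mask and topk selection on toy probabilities (all values invented):

```python
import numpy as np

# Toy start/end probabilities over 5 token positions (batch of 1)
start = np.array([[0.1, 0.6, 0.1, 0.1, 0.1]])
end = np.array([[0.05, 0.1, 0.7, 0.1, 0.05]])

# Outer product: candidates[0, i, j] = P(start=i) * P(end=j)
outer = np.matmul(np.expand_dims(start, -1), np.expand_dims(end, 1))

# Keep only i <= j (triu) and j - i <= max_answer_len - 1 (tril)
max_answer_len = 3
candidates = np.tril(np.triu(outer), max_answer_len - 1)

# Top-2 spans by flattened score, same recipe as decode() above
scores_flat = candidates.flatten()
idx = np.argpartition(-scores_flat, 2)[:2]
idx_sort = idx[np.argsort(-scores_flat[idx])]
starts, ends = np.unravel_index(idx_sort, candidates.shape)[1:]
print(starts, ends, candidates[0, starts, ends])  # best span is (1, 2), score 0.42
```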
def span_to_answer(self, text: str, start: int, end: int):
-words, token_idx = [], 0
+words = []
+token_idx = char_start_idx = char_end_idx = chars_idx = 0
for i, word in enumerate(text.split(" ")):
token = self.tokenizer.tokenize(word)
# Append words if they are in the span
if start <= token_idx <= end:
if token_idx == start:
char_start_idx = chars_idx
if token_idx == end:
char_end_idx = chars_idx + len(word)
words += [word]
# Stop if we went over the end of the answer
@@ -179,9 +239,10 @@
# Append the subtokenization length to the running index
token_idx += len(token)
chars_idx += len(word) + 1
# Join text with spaces
-return ' '.join(words)
+return {'answer': ' '.join(words), 'start': max(0, char_start_idx), 'end': min(len(text), char_end_idx)}
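A quick illustration of how the new character offsets relate to the original context. The dict below mimics a `span_to_answer` result, with offsets hand-computed for this string:

```python
context = 'Commit 02110485 was authored by Morgan Funtowicz.'

# What span_to_answer would return for the span covering the last two words
result = {'answer': 'Morgan Funtowicz.', 'start': 32, 'end': 49}

# 'start'/'end' index directly into the original text, so slicing recovers
# the answer (note the trailing '.': the split here is whitespace-based)
assert context[result['start']:result['end']] == result['answer']
```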
# Register all the supported tasks here
@@ -193,7 +254,7 @@ SUPPORTED_TASKS = {
},
'question-answering': {
'impl': QuestionAnsweringPipeline,
-'tf': TFAutoModelForQuestionAnswering if is_tf_available() else None,
+# 'tf': TFAutoModelForQuestionAnswering if is_tf_available() else None,
'pt': AutoModelForQuestionAnswering if is_torch_available() else None
}
}
@@ -216,7 +277,8 @@ def pipeline(task: str, model, tokenizer: Optional[Union[str, PreTrainedTokenizer]]
raise KeyError("Unknown task {}, available tasks are {}".format(task, list(SUPPORTED_TASKS.keys())))
targeted_task = SUPPORTED_TASKS[task]
-task, allocator = targeted_task['impl'], targeted_task['tf'] if is_tf_available() else targeted_task['pt']
+# TODO: restore the 'tf' allocator once the TF path is traced
+task, allocator = targeted_task['impl'], targeted_task['pt']
model = allocator.from_pretrained(model)
return task(model, tokenizer, **kwargs)
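And a sketch of the batched entry point, reusing the `nlp` pipeline from the first example (sample texts invented; each input yields its own list of `topk` answers):

```python
samples = QuestionAnsweringPipeline.create_sample(
    question=['Who wrote the patch?', 'What does decode return?'],
    context=['Morgan Funtowicz wrote the patch.',
             'decode returns start indices, end indices and scores.'],
)

# `data=` accepts a list of {'question', 'context'} dicts in one forward pass
predictions = nlp(data=samples)
for answer_list in predictions:
    for a in answer_list:
        print(a['score'], a['answer'], a['start'], a['end'])
```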