Commit 02110485 authored by Morgan Funtowicz

Added batching, topk, chars index and scores.

parent e1d89cb2
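As a quick orientation before the diff, here is a minimal, hypothetical usage sketch of what this commit adds (batched inputs, top-k answers, character-level start/end offsets and a score). The checkpoint name and construction details are assumptions, not part of the diff; only the calling conventions and the result format come from the code below.

# Hypothetical usage of QuestionAnsweringPipeline after this commit.
# Assumes QuestionAnsweringPipeline is importable from this module; the
# checkpoint name is a placeholder.
from transformers import AutoTokenizer, AutoModelForQuestionAnswering

name = 'bert-large-uncased-whole-word-masking-finetuned-squad'  # placeholder checkpoint
nlp = QuestionAnsweringPipeline(AutoModelForQuestionAnswering.from_pretrained(name),
                                AutoTokenizer.from_pretrained(name))

# Keyword style: 'question'/'context' are turned into samples by create_sample
answers = nlp(question='Where is HuggingFace based?',
              context='HuggingFace is based in New York City.',
              topk=2, max_answer_len=15)

# Batched style: a list of {'question': ..., 'context': ...} dicts
answers = nlp([
    {'question': 'Where is HuggingFace based?', 'context': 'HuggingFace is based in New York City.'},
    {'question': 'What does HuggingFace build?', 'context': 'HuggingFace builds NLP libraries.'},
])
# Each element of `answers` is a list of up to `topk` dicts:
# {'score': float, 'start': char_start, 'end': char_end, 'answer': str}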
@@ -16,7 +16,7 @@ from __future__ import absolute_import, division, print_function, unicode_literals
 import os
 from abc import ABC, abstractmethod
-from typing import Union, Optional, Tuple
+from typing import Union, Optional, Tuple, List, Dict
 import numpy as np
@@ -24,7 +24,8 @@ from transformers import is_tf_available, logger, AutoTokenizer, PreTrainedTokenizer
 if is_tf_available():
     from transformers import TFAutoModelForSequenceClassification, TFAutoModelForQuestionAnswering
-else:
+if is_torch_available():
     from transformers import AutoModelForSequenceClassification, AutoModelForQuestionAnswering
@@ -94,30 +95,71 @@ class TextClassificationPipeline(Pipeline):
 class QuestionAnsweringPipeline(Pipeline):
+    """
+    Question Answering pipeline involving tokenization and inference.
+    TODO:
+     - top-k answers
+     - return start/end chars
+     - return score
+    """
+
+    def __init__(self, model, tokenizer: Optional[PreTrainedTokenizer]):
+        super().__init__(model, tokenizer)
+
+    @staticmethod
+    def create_sample(question: Union[str, List[str]], context: Union[str, List[str]]) -> Union[dict, List[Dict]]:
+        is_list = isinstance(question, list)
+        if is_list:
+            return [{'question': q, 'context': c} for q, c in zip(question, context)]
+        else:
+            return {'question': question, 'context': context}
+
     @classmethod
     def from_config(cls, model, tokenizer: PreTrainedTokenizer, **kwargs):
         pass

-    def __call__(self, texts, **kwargs):
+    def __call__(self, *texts, **kwargs):
+        # Set default values
+        kwargs.setdefault('max_answer_len', 15)
+        kwargs.setdefault('topk', 1)
+
+        if kwargs['topk'] < 1:
+            raise ValueError('topk parameter should be >= 1 (got {})'.format(kwargs['topk']))
+
+        if kwargs['max_answer_len'] < 1:
+            raise ValueError('max_answer_len parameter should be >= 1 (got {})'.format(kwargs['max_answer_len']))
+
+        # Tabular input
+        if 'question' in kwargs and 'context' in kwargs:
+            texts = QuestionAnsweringPipeline.create_sample(kwargs['question'], kwargs['context'])
+        elif 'data' in kwargs:
+            texts = kwargs['data']
         # Generic compatibility with sklearn and Keras
-        if 'X' in kwargs and not texts:
+        elif 'X' in kwargs and not texts:
             texts = kwargs.pop('X')
+        else:
+            (texts, ) = texts

-        if not isinstance(texts, (tuple, list)):
-            raise Exception('QuestionAnsweringPipeline requires predict argument to be a tuple (context, question) or a List of tuple.')
+        if not isinstance(texts, (dict, list)):
+            raise Exception('QuestionAnsweringPipeline requires predict argument to be a dict (question, context) or a List of dict.')

         if not isinstance(texts, list):
             texts = [texts]

+        # Map to tuple (question, context)
+        texts = [(text['question'], text['context']) for text in texts]
+
         inputs = self.tokenizer.batch_encode_plus(
-            texts, add_special_tokens=True, return_tensors='tf' if is_tf_available() else 'pt'
+            # texts, add_special_tokens=True, return_tensors='tf' if is_tf_available() else 'pt'
+            texts, add_special_tokens=True, return_tensors='pt'
         )

         # Remove special_tokens_mask to avoid KeyError
         _ = inputs.pop('special_tokens_mask')

-        if is_tf_available():
+        # if is_tf_available():
+        if False:
             # TODO trace model
             start, end = self.model(inputs)
         else:
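The input-handling block above funnels every calling style into a list of {'question', 'context'} samples. A small illustrative sketch of what create_sample returns (the example texts are invented; the output shapes follow the method as written):

# Illustration only, not part of the diff.
single = QuestionAnsweringPipeline.create_sample(
    question='Who wrote it?', context='It was written by Ada.')
# -> {'question': 'Who wrote it?', 'context': 'It was written by Ada.'}

batch = QuestionAnsweringPipeline.create_sample(
    question=['Who wrote it?', 'When was it written?'],
    context=['It was written by Ada.', 'It was written in 1843.'])
# -> [{'question': 'Who wrote it?', 'context': 'It was written by Ada.'},
#     {'question': 'When was it written?', 'context': 'It was written in 1843.'}]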
@@ -133,18 +175,19 @@ class QuestionAnsweringPipeline(Pipeline):
             start_, end_ = start[i, context_idx], end[i, context_idx]

             # Normalize logits and spans to retrieve the answer
-            start_, end_ = self.decode(start_, end_)
+            start_ = np.exp(start_) / np.sum(np.exp(start_))
+            end_ = np.exp(end_) / np.sum(np.exp(end_))
+            starts, ends, scores = self.decode(start_, end_, kwargs['topk'], kwargs['max_answer_len'])

             # Convert the answer (tokens) back to the original text
-            answers += [{
-                'start': start_,
-                'end': end_,
-                'answer': self.span_to_answer(texts[i][1], start_, end_)
-            }]
+            answers += [[
+                {**{'score': score}, **self.span_to_answer(texts[i][1], s, e)}
+                for s, e, score in zip(starts, ends, scores)
+            ]]

         return answers

-    def decode(self, start: np.ndarray, end: np.ndarray) -> Tuple:
+    def decode(self, start: np.ndarray, end: np.ndarray, topk: int, max_answer_len: int) -> Tuple:
         # Ensure we have batch axis
         if start.ndim == 1:
             start = start[None]
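For reference, the normalization applied to the start/end logits in the hunk above is a plain softmax over the context tokens; a tiny self-contained sketch with invented numbers:

import numpy as np

# Same normalization as start_/end_ in __call__ (numerically naive, as in the
# diff; a stable variant would subtract the max logit before exponentiating).
logits = np.array([1.0, 2.0, 0.5])
probs = np.exp(logits) / np.sum(np.exp(logits))
assert np.isclose(probs.sum(), 1.0)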
@@ -155,22 +198,39 @@ class QuestionAnsweringPipeline(Pipeline):
         # Compute the score of each tuple(start, end) to be the real answer
         outer = np.matmul(np.expand_dims(start, -1), np.expand_dims(end, 1))

-        # Remove candidate with end < start and end - start > 15
-        candidates = np.tril(np.triu(outer), 15)
+        # Remove candidate with end < start and end - start > max_answer_len
+        candidates = np.tril(np.triu(outer), max_answer_len - 1)

-        start = np.max(candidates, axis=2).argmax(-1)
-        end = np.max(candidates, axis=1).argmax(-1)
+        # start = np.max(candidates, axis=2).argmax(-1)
+        # end = np.max(candidates, axis=1).argmax(-1)
+        scores_flat = candidates.flatten()
+        if topk == 1:
+            idx_sort = [np.argmax(scores_flat)]
+        elif len(scores_flat) < topk:
+            idx_sort = np.argsort(-scores_flat)
+        else:
+            idx = np.argpartition(-scores_flat, topk)[0:topk]
+            idx_sort = idx[np.argsort(-scores_flat[idx])]

-        return start, end
+        start, end = np.unravel_index(idx_sort, candidates.shape)[1:]
+        return start, end, candidates[0, start, end]

     def span_to_answer(self, text: str, start: int, end: int):
-        words, token_idx = [], 0
+        words = []
+        token_idx = char_start_idx = char_end_idx = chars_idx = 0

         for i, word in enumerate(text.split(" ")):
             token = self.tokenizer.tokenize(word)

             # Append words if they are in the span
             if start <= token_idx <= end:
+                if token_idx == start:
+                    char_start_idx = chars_idx
+
+                if token_idx == end:
+                    char_end_idx = chars_idx + len(word)
+
                 words += [word]

             # Stop if we went over the end of the answer
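A worked sketch of the top-k span selection performed by decode in the hunk above, using invented probabilities over four context tokens:

import numpy as np

# Toy start/end probabilities for a batch of one sample, four context tokens.
start = np.array([[0.1, 0.6, 0.2, 0.1]])
end = np.array([[0.1, 0.1, 0.7, 0.1]])
topk, max_answer_len = 2, 3

outer = np.matmul(np.expand_dims(start, -1), np.expand_dims(end, 1))   # shape (1, 4, 4)
candidates = np.tril(np.triu(outer), max_answer_len - 1)               # keep 0 <= end - start < max_answer_len

scores_flat = candidates.flatten()
idx = np.argpartition(-scores_flat, topk)[0:topk]                      # top-k flat indices, unordered
idx_sort = idx[np.argsort(-scores_flat[idx])]                          # ordered best first
starts, ends = np.unravel_index(idx_sort, candidates.shape)[1:]
scores = candidates[0, starts, ends]
# Best span: start=1, end=2 with score 0.42; runner-up: start=2, end=2 with score 0.14.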
@@ -179,9 +239,10 @@ class QuestionAnsweringPipeline(Pipeline):
             # Append the subtokenization length to the running index
             token_idx += len(token)
+            chars_idx += len(word) + 1

         # Join text with spaces
-        return ' '.join(words)
+        return {'answer': ' '.join(words), 'start': max(0, char_start_idx), 'end': min(len(text), char_end_idx)}


 # Register all the supported task here
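The char_start_idx / char_end_idx bookkeeping added to span_to_answer maps token-level span indices back to character offsets in the original text. A simplified, self-contained sketch that treats every whitespace-separated word as exactly one token (a real tokenizer may split a word into several sub-tokens, which is what len(token) accounts for in the diff):

# Simplified illustration of the character-offset bookkeeping, not the diff's code.
def toy_span_to_answer(text, start, end):
    words = []
    token_idx = char_start_idx = char_end_idx = chars_idx = 0
    for word in text.split(' '):
        if start <= token_idx <= end:
            if token_idx == start:
                char_start_idx = chars_idx
            if token_idx == end:
                char_end_idx = chars_idx + len(word)
            words.append(word)
        token_idx += 1                 # one token per word in this toy version
        chars_idx += len(word) + 1     # +1 for the separating space
    return {'answer': ' '.join(words), 'start': char_start_idx, 'end': char_end_idx}

print(toy_span_to_answer('HuggingFace is based in New York City', start=4, end=6))
# -> {'answer': 'New York City', 'start': 24, 'end': 37}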
@@ -193,7 +254,7 @@ SUPPORTED_TASKS = {
     },
     'question-answering': {
         'impl': QuestionAnsweringPipeline,
-        'tf': TFAutoModelForQuestionAnswering if is_tf_available() else None,
+        # 'tf': TFAutoModelForQuestionAnswering if is_tf_available() else None,
         'pt': AutoModelForQuestionAnswering if is_torch_available() else None
     }
 }
@@ -216,7 +277,8 @@ def pipeline(task: str, model, tokenizer: Optional[Union[str, PreTrainedTokenizer]]
         raise KeyError("Unknown task {}, available tasks are {}".format(task, list(SUPPORTED_TASKS.keys())))

     targeted_task = SUPPORTED_TASKS[task]
-    task, allocator = targeted_task['impl'], targeted_task['tf'] if is_tf_available() else targeted_task['pt']
+    # task, allocator = targeted_task['impl'], targeted_task['tf'] if is_tf_available() else targeted_task['pt']
+    task, allocator = targeted_task['impl'], targeted_task['pt']

     model = allocator.from_pretrained(model)
     return task(model, tokenizer, **kwargs)
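Finally, a hedged sketch of how the pipeline() factory in the last hunk would be invoked after this change. The checkpoint name is a placeholder and passing an already-instantiated tokenizer is an assumption; the diff only shows that the PyTorch allocator is used and that the tokenizer argument is forwarded to the task class.

# Hypothetical invocation of the pipeline() factory shown above.
from transformers import AutoTokenizer

name = 'distilbert-base-uncased-distilled-squad'  # placeholder checkpoint
nlp = pipeline('question-answering', name, tokenizer=AutoTokenizer.from_pretrained(name))

answers = nlp(question='Where is HuggingFace based?',
              context='HuggingFace is based in New York City.')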