Unverified Commit 2e12d90b authored by Nicolas Patry's avatar Nicolas Patry Committed by GitHub
Browse files

Fixing Dataset for TQA + token-classification. (#14658)

* Fixing Dataset for TQA + token-classification.

* Fixing the tests.

* Making sure `offset_mappings` is a valid argument.
parent fae0b9fa
import collections import collections
import types
import numpy as np import numpy as np
...@@ -9,7 +10,7 @@ from ..file_utils import ( ...@@ -9,7 +10,7 @@ from ..file_utils import (
is_torch_available, is_torch_available,
requires_backends, requires_backends,
) )
from .base import PIPELINE_INIT_ARGS, ArgumentHandler, Pipeline, PipelineException from .base import PIPELINE_INIT_ARGS, ArgumentHandler, Dataset, Pipeline, PipelineException
if is_torch_available(): if is_torch_available():
...@@ -58,6 +59,8 @@ class TableQuestionAnsweringArgumentHandler(ArgumentHandler): ...@@ -58,6 +59,8 @@ class TableQuestionAnsweringArgumentHandler(ArgumentHandler):
f"If keyword argument `table` is a list of dictionaries, each dictionary should have a `table` " f"If keyword argument `table` is a list of dictionaries, each dictionary should have a `table` "
f"and `query` key, but only dictionary has keys {table[0].keys()} `table` and `query` keys." f"and `query` key, but only dictionary has keys {table[0].keys()} `table` and `query` keys."
) )
elif Dataset is not None and isinstance(table, Dataset) or isinstance(table, types.GeneratorType):
return table
else: else:
raise ValueError( raise ValueError(
f"Invalid input. Keyword argument `table` should be either of type `dict` or `list`, but " f"Invalid input. Keyword argument `table` should be either of type `dict` or `list`, but "
......
import types
import warnings import warnings
from typing import List, Optional, Tuple, Union from typing import List, Optional, Tuple, Union
...@@ -5,7 +6,7 @@ import numpy as np ...@@ -5,7 +6,7 @@ import numpy as np
from ..file_utils import ExplicitEnum, add_end_docstrings, is_tf_available, is_torch_available from ..file_utils import ExplicitEnum, add_end_docstrings, is_tf_available, is_torch_available
from ..models.bert.tokenization_bert import BasicTokenizer from ..models.bert.tokenization_bert import BasicTokenizer
from .base import PIPELINE_INIT_ARGS, ArgumentHandler, Pipeline from .base import PIPELINE_INIT_ARGS, ArgumentHandler, Dataset, Pipeline
if is_tf_available(): if is_tf_available():
...@@ -28,6 +29,8 @@ class TokenClassificationArgumentHandler(ArgumentHandler): ...@@ -28,6 +29,8 @@ class TokenClassificationArgumentHandler(ArgumentHandler):
elif isinstance(inputs, str): elif isinstance(inputs, str):
inputs = [inputs] inputs = [inputs]
batch_size = 1 batch_size = 1
elif Dataset is not None and isinstance(inputs, Dataset) or isinstance(inputs, types.GeneratorType):
return inputs, None
else: else:
raise ValueError("At least one input is required.") raise ValueError("At least one input is required.")
...@@ -112,8 +115,13 @@ class TokenClassificationPipeline(Pipeline): ...@@ -112,8 +115,13 @@ class TokenClassificationPipeline(Pipeline):
grouped_entities: Optional[bool] = None, grouped_entities: Optional[bool] = None,
ignore_subwords: Optional[bool] = None, ignore_subwords: Optional[bool] = None,
aggregation_strategy: Optional[AggregationStrategy] = None, aggregation_strategy: Optional[AggregationStrategy] = None,
offset_mapping: Optional[List[Tuple[int, int]]] = None,
): ):
preprocess_params = {}
if offset_mapping is not None:
preprocess_params["offset_mapping"] = offset_mapping
postprocess_params = {} postprocess_params = {}
if grouped_entities is not None or ignore_subwords is not None: if grouped_entities is not None or ignore_subwords is not None:
if grouped_entities and ignore_subwords: if grouped_entities and ignore_subwords:
...@@ -147,7 +155,7 @@ class TokenClassificationPipeline(Pipeline): ...@@ -147,7 +155,7 @@ class TokenClassificationPipeline(Pipeline):
postprocess_params["aggregation_strategy"] = aggregation_strategy postprocess_params["aggregation_strategy"] = aggregation_strategy
if ignore_labels is not None: if ignore_labels is not None:
postprocess_params["ignore_labels"] = ignore_labels postprocess_params["ignore_labels"] = ignore_labels
return {}, {}, postprocess_params return preprocess_params, {}, postprocess_params
def __call__(self, inputs: Union[str, List[str]], **kwargs): def __call__(self, inputs: Union[str, List[str]], **kwargs):
""" """
...@@ -174,12 +182,13 @@ class TokenClassificationPipeline(Pipeline): ...@@ -174,12 +182,13 @@ class TokenClassificationPipeline(Pipeline):
Only exists if the offsets are available within the tokenizer Only exists if the offsets are available within the tokenizer
""" """
_inputs, offset_mappings = self._args_parser(inputs, **kwargs) _inputs, offset_mapping = self._args_parser(inputs, **kwargs)
self.offset_mappings = offset_mappings if offset_mapping:
kwargs["offset_mapping"] = offset_mapping
return super().__call__(inputs, **kwargs) return super().__call__(inputs, **kwargs)
def preprocess(self, sentence): def preprocess(self, sentence, offset_mapping=None):
truncation = True if self.tokenizer.model_max_length and self.tokenizer.model_max_length > 0 else False truncation = True if self.tokenizer.model_max_length and self.tokenizer.model_max_length > 0 else False
model_inputs = self.tokenizer( model_inputs = self.tokenizer(
sentence, sentence,
...@@ -189,8 +198,7 @@ class TokenClassificationPipeline(Pipeline): ...@@ -189,8 +198,7 @@ class TokenClassificationPipeline(Pipeline):
return_special_tokens_mask=True, return_special_tokens_mask=True,
return_offsets_mapping=self.tokenizer.is_fast, return_offsets_mapping=self.tokenizer.is_fast,
) )
if self.offset_mappings: if offset_mapping:
offset_mapping = self.offset_mappings[0]
model_inputs["offset_mapping"] = offset_mapping model_inputs["offset_mapping"] = offset_mapping
model_inputs["sentence"] = sentence model_inputs["sentence"] = sentence
...@@ -262,6 +270,7 @@ class TokenClassificationPipeline(Pipeline): ...@@ -262,6 +270,7 @@ class TokenClassificationPipeline(Pipeline):
word = self.tokenizer.convert_ids_to_tokens(int(input_ids[idx])) word = self.tokenizer.convert_ids_to_tokens(int(input_ids[idx]))
if offset_mapping is not None: if offset_mapping is not None:
start_ind, end_ind = offset_mapping[idx] start_ind, end_ind = offset_mapping[idx]
if not isinstance(start_ind, int):
if self.framework == "pt": if self.framework == "pt":
start_ind = start_ind.item() start_ind = start_ind.item()
end_ind = end_ind.item() end_ind = end_ind.item()
......
...@@ -183,9 +183,12 @@ class PipelineTestCaseMeta(type): ...@@ -183,9 +183,12 @@ class PipelineTestCaseMeta(type):
# 10 examples with batch size 4 means there needs to be a unfinished batch # 10 examples with batch size 4 means there needs to be a unfinished batch
# which is important for the unbatcher # which is important for the unbatcher
dataset = [copy.deepcopy(random.choice(examples)) for i in range(10)] def data(n):
for _ in range(n):
# Need to copy because Conversation object is mutated
yield copy.deepcopy(random.choice(examples))
for item in pipeline(dataset, batch_size=4): for item in pipeline(data(10), batch_size=4):
pass pass
run_batch_test(pipeline, examples) run_batch_test(pipeline, examples)
......
...@@ -35,17 +35,16 @@ from transformers.testing_utils import ( ...@@ -35,17 +35,16 @@ from transformers.testing_utils import (
from .test_pipelines_common import PipelineTestCaseMeta from .test_pipelines_common import PipelineTestCaseMeta
@require_tensorflow_probability
@require_torch_scatter
@require_torch
@require_pandas
@is_pipeline_test @is_pipeline_test
class TQAPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta): class TQAPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta):
# Putting it there for consistency, but TQA do not have fast tokenizer # Putting it there for consistency, but TQA do not have fast tokenizer
# which are needed to generate automatic tests # which are needed to generate automatic tests
model_mapping = MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING model_mapping = MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING
@require_tensorflow_probability
@require_pandas
@require_tf @require_tf
@require_torch
def test_small_model_tf(self): def test_small_model_tf(self):
model_id = "lysandre/tiny-tapas-random-wtq" model_id = "lysandre/tiny-tapas-random-wtq"
model = TFAutoModelForTableQuestionAnswering.from_pretrained(model_id, from_pt=True) model = TFAutoModelForTableQuestionAnswering.from_pretrained(model_id, from_pt=True)
...@@ -147,6 +146,7 @@ class TQAPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta): ...@@ -147,6 +146,7 @@ class TQAPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta):
) )
@require_torch @require_torch
@require_torch_scatter
def test_small_model_pt(self): def test_small_model_pt(self):
model_id = "lysandre/tiny-tapas-random-wtq" model_id = "lysandre/tiny-tapas-random-wtq"
model = AutoModelForTableQuestionAnswering.from_pretrained(model_id) model = AutoModelForTableQuestionAnswering.from_pretrained(model_id)
...@@ -248,6 +248,7 @@ class TQAPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta): ...@@ -248,6 +248,7 @@ class TQAPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta):
) )
@require_torch @require_torch
@require_torch_scatter
def test_slow_tokenizer_sqa_pt(self): def test_slow_tokenizer_sqa_pt(self):
model_id = "lysandre/tiny-tapas-random-sqa" model_id = "lysandre/tiny-tapas-random-sqa"
model = AutoModelForTableQuestionAnswering.from_pretrained(model_id) model = AutoModelForTableQuestionAnswering.from_pretrained(model_id)
...@@ -366,6 +367,9 @@ class TQAPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta): ...@@ -366,6 +367,9 @@ class TQAPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta):
) )
@require_tf @require_tf
@require_tensorflow_probability
@require_pandas
@require_torch
def test_slow_tokenizer_sqa_tf(self): def test_slow_tokenizer_sqa_tf(self):
model_id = "lysandre/tiny-tapas-random-sqa" model_id = "lysandre/tiny-tapas-random-sqa"
model = TFAutoModelForTableQuestionAnswering.from_pretrained(model_id, from_pt=True) model = TFAutoModelForTableQuestionAnswering.from_pretrained(model_id, from_pt=True)
...@@ -484,6 +488,7 @@ class TQAPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta): ...@@ -484,6 +488,7 @@ class TQAPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta):
) )
@slow @slow
@require_torch_scatter
def test_integration_wtq_pt(self): def test_integration_wtq_pt(self):
table_querier = pipeline("table-question-answering") table_querier = pipeline("table-question-answering")
...@@ -528,6 +533,8 @@ class TQAPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta): ...@@ -528,6 +533,8 @@ class TQAPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta):
self.assertListEqual(results, expected_results) self.assertListEqual(results, expected_results)
@slow @slow
@require_tensorflow_probability
@require_pandas
def test_integration_wtq_tf(self): def test_integration_wtq_tf(self):
model_id = "google/tapas-base-finetuned-wtq" model_id = "google/tapas-base-finetuned-wtq"
model = TFAutoModelForTableQuestionAnswering.from_pretrained(model_id) model = TFAutoModelForTableQuestionAnswering.from_pretrained(model_id)
...@@ -575,6 +582,7 @@ class TQAPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta): ...@@ -575,6 +582,7 @@ class TQAPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta):
self.assertListEqual(results, expected_results) self.assertListEqual(results, expected_results)
@slow @slow
@require_torch_scatter
def test_integration_sqa_pt(self): def test_integration_sqa_pt(self):
table_querier = pipeline( table_querier = pipeline(
"table-question-answering", "table-question-answering",
...@@ -598,6 +606,8 @@ class TQAPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta): ...@@ -598,6 +606,8 @@ class TQAPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta):
self.assertListEqual(results, expected_results) self.assertListEqual(results, expected_results)
@slow @slow
@require_tensorflow_probability
@require_pandas
def test_integration_sqa_tf(self): def test_integration_sqa_tf(self):
model_id = "google/tapas-base-finetuned-sqa" model_id = "google/tapas-base-finetuned-sqa"
model = TFAutoModelForTableQuestionAnswering.from_pretrained(model_id) model = TFAutoModelForTableQuestionAnswering.from_pretrained(model_id)
......
...@@ -636,6 +636,19 @@ class TokenClassificationPipelineTests(unittest.TestCase, metaclass=PipelineTest ...@@ -636,6 +636,19 @@ class TokenClassificationPipelineTests(unittest.TestCase, metaclass=PipelineTest
[], [],
) )
token_classifier = pipeline(task="token-classification", model=model_name, framework="pt")
# Overload offset_mapping
outputs = token_classifier(
"This is a test !", offset_mapping=[(0, 0), (0, 1), (0, 2), (0, 0), (0, 0), (0, 0), (0, 0)]
)
self.assertEqual(
nested_simplify(outputs),
[
{"entity": "I-MISC", "score": 0.115, "index": 1, "word": "this", "start": 0, "end": 1},
{"entity": "I-MISC", "score": 0.115, "index": 2, "word": "is", "start": 0, "end": 2},
],
)
@require_torch @require_torch
def test_pt_ignore_subwords_slow_tokenizer_raises(self): def test_pt_ignore_subwords_slow_tokenizer_raises(self):
model_name = "sshleifer/tiny-dbmdz-bert-large-cased-finetuned-conll03-english" model_name = "sshleifer/tiny-dbmdz-bert-large-cased-finetuned-conll03-english"
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment