Unverified Commit 29b536a7 authored by Ceyda Cinarel, committed by GitHub

[WIP] Ner pipeline grouped_entities fixes (#5970)



* Bug fix: NER pipeline shouldn't group separate entities of the same type

* style fix

* [Bug Fix] Shouldn't group entities that are both 'B' even if they are the same type
	(B-type1 B-type1) != (B-type1 I-type1)
[Bug Fix] Add an option `ignore_subwords` to ignore subsequent ##wordpieces in predictions, because some models are trained only on the first token of a word and not on the subsequent wordpieces (the BERT NER default), so it makes sense to do the same at inference time (a short usage sketch follows the commit metadata below).
	The simplest fix is to just group the subwords with the first wordpiece.
	[TODO] How should ignored scores be handled? Just set them to 0 and compute a zero-invariant mean?
	[TODO] Handle a wordpiece prefix other than ##? Possible approaches:
		get it from the tokenizer? but currently most tokenizers don't expose a wordpiece_prefix property
		add an _is_subword(token) helper
[Feature add] Added a `skip_special_tokens` option, because removing special tokens after grouping was harder.
[Additional Changes] Remove the B/I prefix on returned grouped_entities
[Feature Request/TODO] Return indices?
[Bug TODO] Can't use a fast tokenizer with grouped_entities ('BertTokenizerFast' object has no attribute 'convert_tokens_to_string')

* use offset_mapping to fix [UNK] token problem

* ignore score for subwords

* modify ner_pipeline test

* modify ner_pipeline test

* modify ner_pipeline test

* ner_pipeline change ignore_subwords default to true

* add ner_pipeline ignore_subword=False test case

* fix offset_mapping index

* fix style again duh

* change is_subword and convert_tokens_to_string logic

* merge tests with new test structure

* change test names

* remove old tests

* ner tests for fast tokenizer

* fast tokenizers have convert_tokens_to_string

* Fix the incorrect merge
Co-authored-by: Ceyda Cinarel <snu-ceyda@users.noreply.github.com>
Co-authored-by: Lysandre Debut <lysandre@huggingface.co>
Co-authored-by: Lysandre <lysandre.debut@reseau.eseo.fr>
parent 1bb4bba5
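
For context, a minimal usage sketch of the two options this PR adds, modeled on the test setup further down. The checkpoint name is only an illustrative assumption and the printed output is abbreviated; this is not part of the committed change.

from transformers import AutoTokenizer, pipeline

# Illustrative checkpoint only; any BERT-style NER model with B-/I- labels behaves the same way.
model_name = "dbmdz/bert-large-cased-finetuned-conll03-english"
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

# grouped_entities merges adjacent tokens of the same entity type into one span;
# ignore_subwords folds "##" wordpieces into the first wordpiece of their word.
nlp = pipeline(
    task="ner",
    model=model_name,
    tokenizer=tokenizer,
    grouped_entities=True,
    ignore_subwords=True,
)

print(nlp("Enzo works at the UN"))
# e.g. [{'entity_group': 'PER', 'score': ..., 'word': 'Enzo'},
#       {'entity_group': 'ORG', 'score': ..., 'word': 'UN'}]

These are the same flags the updated tests below exercise for both the PyTorch and TensorFlow backends.
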
@@ -1324,6 +1324,29 @@ class FillMaskPipeline(Pipeline):
         return results


+class TokenClassificationArgumentHandler(ArgumentHandler):
+    """
+    Handles arguments for token classification.
+    """
+
+    def __call__(self, *args, **kwargs):
+
+        if args is not None and len(args) > 0:
+            if isinstance(args, str):
+                inputs = [args]
+            else:
+                inputs = args
+            batch_size = len(inputs)
+
+        offset_mapping = kwargs.get("offset_mapping", None)
+        if offset_mapping:
+            if isinstance(offset_mapping, list) and isinstance(offset_mapping[0], tuple):
+                offset_mapping = [offset_mapping]
+            if len(offset_mapping) != batch_size:
+                raise ValueError("offset_mapping should have the same batch size as the input")
+        return inputs, offset_mapping
+
+
 @add_end_docstrings(
     PIPELINE_INIT_ARGS,
     r"""
@@ -1361,13 +1384,14 @@ class TokenClassificationPipeline(Pipeline):
         ignore_labels=["O"],
         task: str = "",
         grouped_entities: bool = False,
+        ignore_subwords: bool = True,
     ):
         super().__init__(
             model=model,
             tokenizer=tokenizer,
             modelcard=modelcard,
             framework=framework,
-            args_parser=args_parser,
+            args_parser=TokenClassificationArgumentHandler(),
             device=device,
             binary_output=binary_output,
             task=task,
@@ -1382,6 +1406,7 @@ class TokenClassificationPipeline(Pipeline):
         self._basic_tokenizer = BasicTokenizer(do_lower_case=False)
         self.ignore_labels = ignore_labels
         self.grouped_entities = grouped_entities
+        self.ignore_subwords = ignore_subwords

     def __call__(self, inputs: Union[str, List[str]], **kwargs):
         """
@@ -1402,10 +1427,15 @@ class TokenClassificationPipeline(Pipeline):
                 - **index** (:obj:`int`, only present when ``self.grouped_entities=False``) -- The index of the
                   corresponding token in the sentence.
         """
         if isinstance(inputs, str):
             inputs = [inputs]

+        offset_mappings = kwargs.get("offset_mappings")
+
         answers = []
-        for sentence in inputs:
+
+        for i, sentence in enumerate(inputs):

             # Manage correct placement of the tensors
             with self.device_placement():
@@ -1415,7 +1445,18 @@ class TokenClassificationPipeline(Pipeline):
                     return_attention_mask=False,
                     return_tensors=self.framework,
                     truncation=True,
+                    return_special_tokens_mask=True,
+                    return_offsets_mapping=self.tokenizer.is_fast,
                 )
+                if self.tokenizer.is_fast:
+                    offset_mapping = tokens["offset_mapping"].cpu().numpy()[0]
+                    del tokens["offset_mapping"]
+                elif offset_mappings:
+                    offset_mapping = offset_mappings[i]
+                else:
+                    raise Exception("To decode [UNK] tokens use a fast tokenizer or provide offset_mapping parameter")
+                special_tokens_mask = tokens["special_tokens_mask"].cpu().numpy()[0]
+                del tokens["special_tokens_mask"]

                 # Forward
                 if self.framework == "tf":
@@ -1432,24 +1473,35 @@ class TokenClassificationPipeline(Pipeline):

             entities = []
             # Filter to labels not in `self.ignore_labels`
+            # Filter special_tokens
             filtered_labels_idx = [
                 (idx, label_idx)
                 for idx, label_idx in enumerate(labels_idx)
-                if self.model.config.id2label[label_idx] not in self.ignore_labels
+                if (self.model.config.id2label[label_idx] not in self.ignore_labels) and not special_tokens_mask[idx]
             ]

             for idx, label_idx in filtered_labels_idx:
+                start_ind, end_ind = offset_mapping[idx]
+                word_ref = sentence[start_ind:end_ind]
+                word = self.tokenizer.convert_ids_to_tokens([int(input_ids[idx])])[0]
+                is_subword = len(word_ref) != len(word)
+
+                if int(input_ids[idx]) == self.tokenizer.unk_token_id:
+                    word = word_ref
+                    is_subword = False

                 entity = {
-                    "word": self.tokenizer.convert_ids_to_tokens(int(input_ids[idx])),
+                    "word": word,
                     "score": score[idx][label_idx].item(),
                     "entity": self.model.config.id2label[label_idx],
                     "index": idx,
                 }

+                if self.grouped_entities and self.ignore_subwords:
+                    entity["is_subword"] = is_subword

                 entities += [entity]

+            # Append grouped entities
             if self.grouped_entities:
                 answers += [self.group_entities(entities)]
             # Append ungrouped entities
@@ -1468,8 +1520,8 @@ class TokenClassificationPipeline(Pipeline):
             entities (:obj:`dict`): The entities predicted by the pipeline.
         """
         # Get the first entity in the entity group
-        entity = entities[0]["entity"]
-        scores = np.mean([entity["score"] for entity in entities])
+        entity = entities[0]["entity"].split("-")[-1]
+        scores = np.nanmean([entity["score"] for entity in entities])
         tokens = [entity["word"] for entity in entities]

         entity_group = {
@@ -1494,7 +1546,9 @@ class TokenClassificationPipeline(Pipeline):
             last_idx = entities[-1]["index"]

         for entity in entities:
+
             is_last_idx = entity["index"] == last_idx
+            is_subword = self.ignore_subwords and entity["is_subword"]
             if not entity_group_disagg:
                 entity_group_disagg += [entity]
                 if is_last_idx:
@@ -1503,10 +1557,19 @@ class TokenClassificationPipeline(Pipeline):
             # If the current entity is similar and adjacent to the previous entity, append it to the disaggregated entity group
             # The split is meant to account for the "B" and "I" suffixes
+            # Shouldn't merge if both entities are B-type
             if (
-                entity["entity"].split("-")[-1] == entity_group_disagg[-1]["entity"].split("-")[-1]
+                (
+                    entity["entity"].split("-")[-1] == entity_group_disagg[-1]["entity"].split("-")[-1]
+                    and entity["entity"].split("-")[0] != "B"
+                )
                 and entity["index"] == entity_group_disagg[-1]["index"] + 1
-            ):
+            ) or is_subword:
+                # Modify subword type to be previous_type
+                if is_subword:
+                    entity["entity"] = entity_group_disagg[-1]["entity"].split("-")[-1]
+                    entity["score"] = np.nan  # set ignored scores to nan and use np.nanmean
                 entity_group_disagg += [entity]
                 # Group the entities at the last entity
                 if is_last_idx:
......
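
Outside the diff, the subword heuristic introduced above can be illustrated on its own. The following is a standalone sketch rather than pipeline code: the checkpoint name is an arbitrary assumption, and it simply mirrors the length comparison between the decoded token and the raw text span recovered from offset_mapping, plus the [UNK] fallback to the raw span.

from transformers import AutoTokenizer

# "bert-base-cased" is only an illustrative choice of WordPiece tokenizer.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased", use_fast=True)
sentence = "Consuelo Araújo Noguera"

enc = tokenizer(sentence, return_offsets_mapping=True, return_special_tokens_mask=True)

for token_id, (start, end), special in zip(
    enc["input_ids"], enc["offset_mapping"], enc["special_tokens_mask"]
):
    if special:
        continue  # [CLS]/[SEP] carry no entity prediction
    token = tokenizer.convert_ids_to_tokens(token_id)
    word_ref = sentence[start:end]
    # A "##" wordpiece is longer than the raw span it covers, so the length
    # comparison flags it as a subword; [UNK] is special-cased to the raw span.
    is_subword = len(word_ref) != len(token)
    if token_id == tokenizer.unk_token_id:
        token, is_subword = word_ref, False
    print(f"{token!r:12} covers {word_ref!r:10} is_subword={is_subword}")
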
 import unittest

-from transformers import pipeline
+from transformers import AutoTokenizer, pipeline
 from transformers.pipelines import Pipeline
-from transformers.testing_utils import require_tf
+from transformers.testing_utils import require_tf, require_torch

 from .test_pipelines_common import CustomInputPipelineCommonMixin
@@ -19,38 +19,54 @@ class NerPipelineTests(CustomInputPipelineCommonMixin, unittest.TestCase):

     def _test_pipeline(self, nlp: Pipeline):
         output_keys = {"entity", "word", "score"}
+        if nlp.grouped_entities:
+            output_keys = {"entity_group", "word", "score"}
+
         ungrouped_ner_inputs = [
             [
-                {"entity": "B-PER", "index": 1, "score": 0.9994944930076599, "word": "Cons"},
-                {"entity": "B-PER", "index": 2, "score": 0.8025449514389038, "word": "##uelo"},
-                {"entity": "I-PER", "index": 3, "score": 0.9993102550506592, "word": "Ara"},
-                {"entity": "I-PER", "index": 4, "score": 0.9993743896484375, "word": "##új"},
-                {"entity": "I-PER", "index": 5, "score": 0.9992871880531311, "word": "##o"},
-                {"entity": "I-PER", "index": 6, "score": 0.9993029236793518, "word": "No"},
-                {"entity": "I-PER", "index": 7, "score": 0.9981776475906372, "word": "##guera"},
-                {"entity": "B-PER", "index": 15, "score": 0.9998136162757874, "word": "Andrés"},
-                {"entity": "I-PER", "index": 16, "score": 0.999740719795227, "word": "Pas"},
-                {"entity": "I-PER", "index": 17, "score": 0.9997414350509644, "word": "##tran"},
-                {"entity": "I-PER", "index": 18, "score": 0.9996136426925659, "word": "##a"},
-                {"entity": "B-ORG", "index": 28, "score": 0.9989739060401917, "word": "Far"},
-                {"entity": "I-ORG", "index": 29, "score": 0.7188422083854675, "word": "##c"},
+                {"entity": "B-PER", "index": 1, "score": 0.9994944930076599, "is_subword": False, "word": "Cons"},
+                {"entity": "B-PER", "index": 2, "score": 0.8025449514389038, "is_subword": True, "word": "##uelo"},
+                {"entity": "I-PER", "index": 3, "score": 0.9993102550506592, "is_subword": False, "word": "Ara"},
+                {"entity": "I-PER", "index": 4, "score": 0.9993743896484375, "is_subword": True, "word": "##új"},
+                {"entity": "I-PER", "index": 5, "score": 0.9992871880531311, "is_subword": True, "word": "##o"},
+                {"entity": "I-PER", "index": 6, "score": 0.9993029236793518, "is_subword": False, "word": "No"},
+                {"entity": "I-PER", "index": 7, "score": 0.9981776475906372, "is_subword": True, "word": "##guera"},
+                {"entity": "B-PER", "index": 15, "score": 0.9998136162757874, "is_subword": False, "word": "Andrés"},
+                {"entity": "I-PER", "index": 16, "score": 0.999740719795227, "is_subword": False, "word": "Pas"},
+                {"entity": "I-PER", "index": 17, "score": 0.9997414350509644, "is_subword": True, "word": "##tran"},
+                {"entity": "I-PER", "index": 18, "score": 0.9996136426925659, "is_subword": True, "word": "##a"},
+                {"entity": "B-ORG", "index": 28, "score": 0.9989739060401917, "is_subword": False, "word": "Far"},
+                {"entity": "I-ORG", "index": 29, "score": 0.7188422083854675, "is_subword": True, "word": "##c"},
             ],
             [
-                {"entity": "I-PER", "index": 1, "score": 0.9968166351318359, "word": "En"},
-                {"entity": "I-PER", "index": 2, "score": 0.9957635998725891, "word": "##zo"},
-                {"entity": "I-ORG", "index": 7, "score": 0.9986497163772583, "word": "UN"},
+                {"entity": "I-PER", "index": 1, "score": 0.9968166351318359, "is_subword": False, "word": "En"},
+                {"entity": "I-PER", "index": 2, "score": 0.9957635998725891, "is_subword": True, "word": "##zo"},
+                {"entity": "I-ORG", "index": 7, "score": 0.9986497163772583, "is_subword": False, "word": "UN"},
             ],
         ]

         expected_grouped_ner_results = [
             [
-                {"entity_group": "B-PER", "score": 0.9710702640669686, "word": "Consuelo Araújo Noguera"},
-                {"entity_group": "B-PER", "score": 0.9997273534536362, "word": "Andrés Pastrana"},
-                {"entity_group": "B-ORG", "score": 0.8589080572128296, "word": "Farc"},
+                {"entity_group": "PER", "score": 0.999369223912557, "word": "Consuelo Araújo Noguera"},
+                {"entity_group": "PER", "score": 0.9997771680355072, "word": "Andrés Pastrana"},
+                {"entity_group": "ORG", "score": 0.9989739060401917, "word": "Farc"},
+            ],
+            [
+                {"entity_group": "PER", "score": 0.9968166351318359, "word": "Enzo"},
+                {"entity_group": "ORG", "score": 0.9986497163772583, "word": "UN"},
+            ],
+        ]
+
+        expected_grouped_ner_results_w_subword = [
+            [
+                {"entity_group": "PER", "score": 0.9994944930076599, "word": "Cons"},
+                {"entity_group": "PER", "score": 0.9663328925768534, "word": "##uelo Araújo Noguera"},
+                {"entity_group": "PER", "score": 0.9997273534536362, "word": "Andrés Pastrana"},
+                {"entity_group": "ORG", "score": 0.8589080572128296, "word": "Farc"},
             ],
             [
-                {"entity_group": "I-PER", "score": 0.9962901175022125, "word": "Enzo"},
-                {"entity_group": "I-ORG", "score": 0.9986497163772583, "word": "UN"},
+                {"entity_group": "PER", "score": 0.9962901175022125, "word": "Enzo"},
+                {"entity_group": "ORG", "score": 0.9986497163772583, "word": "UN"},
             ],
         ]
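
Side note on the expected scores in the fixtures above (the numbers are copied from the test data, nothing new is introduced): a grouped score is the nanmean of the member token scores, with subword scores replaced by NaN when ignore_subwords is active.

import numpy as np

# ignore_subwords=True: only the first-wordpiece scores of "Cons", "Ara" and "No"
# survive; the four "##" pieces contribute np.nan, which nanmean skips.
first_pieces = [0.9994944930076599, 0.9993102550506592, 0.9993029236793518]
print(np.nanmean(first_pieces + [np.nan] * 4))  # ≈ 0.999369223912557

# ignore_subwords=False: the "##uelo Araújo Noguera" group is a plain mean over
# all six of its member tokens.
all_pieces = [
    0.8025449514389038,
    0.9993102550506592,
    0.9993743896484375,
    0.9992871880531311,
    0.9993029236793518,
    0.9981776475906372,
]
print(np.mean(all_pieces))  # ≈ 0.9663328925768534
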
@@ -77,12 +93,80 @@ class NerPipelineTests(CustomInputPipelineCommonMixin, unittest.TestCase):
             for key in output_keys:
                 self.assertIn(key, result)

-        for ungrouped_input, grouped_result in zip(ungrouped_ner_inputs, expected_grouped_ner_results):
-            self.assertEqual(nlp.group_entities(ungrouped_input), grouped_result)
+        if nlp.grouped_entities:
+            if nlp.ignore_subwords:
+                for ungrouped_input, grouped_result in zip(ungrouped_ner_inputs, expected_grouped_ner_results):
+                    self.assertEqual(nlp.group_entities(ungrouped_input), grouped_result)
+            else:
+                for ungrouped_input, grouped_result in zip(
+                    ungrouped_ner_inputs, expected_grouped_ner_results_w_subword
+                ):
+                    self.assertEqual(nlp.group_entities(ungrouped_input), grouped_result)

     @require_tf
     def test_tf_only(self):
         model_name = "Narsil/small"  # This model only has a TensorFlow version
         # We test that if we don't specify framework='tf', it gets detected automatically
-        nlp = pipeline(task="ner", model=model_name, tokenizer=model_name)
+        tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
+        nlp = pipeline(task="ner", model=model_name, tokenizer=tokenizer)
         self._test_pipeline(nlp)

+    # offset=tokenizer(VALID_INPUTS[0],return_offsets_mapping=True)['offset_mapping']
+    # pipeline_running_kwargs = {"offset_mapping"}  # Additional kwargs to run the pipeline with
+
+    @require_tf
+    def test_tf_defaults(self):
+        for model_name in self.small_models:
+            tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
+            nlp = pipeline(task="ner", model=model_name, tokenizer=tokenizer, framework="tf")
+            self._test_pipeline(nlp)
+
+    @require_tf
+    def test_tf_small(self):
+        for model_name in self.small_models:
+            print(model_name)
+            tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
+            nlp = pipeline(
+                task="ner",
+                model=model_name,
+                tokenizer=tokenizer,
+                framework="tf",
+                grouped_entities=True,
+                ignore_subwords=True,
+            )
+            self._test_pipeline(nlp)
+
+        for model_name in self.small_models:
+            tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
+            nlp = pipeline(
+                task="ner",
+                model=model_name,
+                tokenizer=tokenizer,
+                framework="tf",
+                grouped_entities=True,
+                ignore_subwords=False,
+            )
+            self._test_pipeline(nlp)
+
+    @require_torch
+    def test_pt_defaults(self):
+        for model_name in self.small_models:
+            tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
+            nlp = pipeline(task="ner", model=model_name, tokenizer=tokenizer)
+            self._test_pipeline(nlp)
+
+    @require_torch
+    def test_torch_small(self):
+        for model_name in self.small_models:
+            tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
+            nlp = pipeline(
+                task="ner", model=model_name, tokenizer=tokenizer, grouped_entities=True, ignore_subwords=True
+            )
+            self._test_pipeline(nlp)
+
+        for model_name in self.small_models:
+            tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
+            nlp = pipeline(
+                task="ner", model=model_name, tokenizer=tokenizer, grouped_entities=True, ignore_subwords=False
+            )
+            self._test_pipeline(nlp)
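
Finally, for readers following the grouping rule itself rather than the diff, here is a simplified standalone re-implementation of the merge condition exercised by these tests. It is only a sketch under the same B-/I- and is_subword conventions, not the pipeline's actual group_entities method; in particular, word joining is approximated with a string replace instead of the tokenizer's convert_tokens_to_string.

import numpy as np

def group(entities, ignore_subwords=True):
    """Toy grouping: merge a token into the previous group only if it is adjacent,
    shares the entity type, and is not a fresh "B-" tag, or if it is a flagged
    subword (whose score is then ignored via NaN + nanmean)."""
    groups, current = [], []
    for entity in entities:
        if current:
            same_type = entity["entity"].split("-")[-1] == current[-1]["entity"].split("-")[-1]
            adjacent = entity["index"] == current[-1]["index"] + 1
            not_new_begin = entity["entity"].split("-")[0] != "B"
            is_subword = ignore_subwords and entity.get("is_subword", False)
            if (same_type and not_new_begin and adjacent) or is_subword:
                if is_subword:
                    entity = {**entity, "score": np.nan}
                current.append(entity)
                continue
            groups.append(current)
            current = []
        current.append(entity)
    if current:
        groups.append(current)
    return [
        {
            "entity_group": g[0]["entity"].split("-")[-1],
            "score": float(np.nanmean([e["score"] for e in g])),
            # crude join: glue "##" pieces back onto the preceding piece
            "word": " ".join(e["word"] for e in g).replace(" ##", ""),
        }
        for g in groups
    ]

tokens = [
    {"entity": "B-PER", "index": 1, "score": 0.99, "is_subword": False, "word": "Cons"},
    {"entity": "B-PER", "index": 2, "score": 0.80, "is_subword": True, "word": "##uelo"},
    {"entity": "I-PER", "index": 3, "score": 0.99, "is_subword": False, "word": "Ara"},
    {"entity": "B-ORG", "index": 28, "score": 0.99, "is_subword": False, "word": "Far"},
]
print(group(tokens))
# -> [{'entity_group': 'PER', 'score': 0.99, 'word': 'Consuelo Ara'},
#     {'entity_group': 'ORG', 'score': 0.99, 'word': 'Far'}]

Note how the two B-PER tokens still merge here only because the second one is a subword; two non-subword B- tags of the same type stay separate, which is exactly the (B-type1 B-type1) != (B-type1 I-type1) fix from the commit message.
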